码迷,mamicode.com
首页 > 其他好文 > 详细

数据定义和描述

时间:2018-10-22 16:40:49      阅读:171      评论:0      收藏:0      [点我收藏+]

标签:practice   move   设置   des   task   esc   date   mod   带宽   

create table employee (
name string,
work_place array<string>,         -- 调用样式 array_name[0]
gender_age struct<gender:string, age:int>,  --struct<col_name:type, ...>      类似于Hbase的family,调用样式 sruct_name.col_name
skills_score map<string, int>,    -- map_name[key]
apart_title map<string, array<string>>
)
row format delimited
fields terminated by "|"
collection items terminated by ","
map keys terminated by ":";

!table employee --不用
!column employee --不用
describe formatted employee;  --用这个,可读性更好
load data local inpath "/home/centos/hive essential/ch03/employee.txt" overwrite into table employee; 
# query the whole array 
select work_place from employee; 
select work_place[0] as col_1, work_place[1] as col_2, work_place[2] as col_3 from employee;
#query the whole map
select gender_age from employee;
select gender_age.gender , gender_age.age from employee;
#query the whole struct and each column in table:
select skills_score from employee;
select name, 
skills_score["DB"] as DB, 
skills_score["Perl"] as Perl,
skills_score["Python"] as Python,
skills_score["Sales"] as Sales,
skills_score["HR"] as HR
from employee;
#query composite type
select apart_title from employee;
select name,
apart_title["Product"] as Product,
apart_title["Test"] as Test,
apart_title["COE"] as COE,
apart_title["Sales"] as Sales
from employee;

DDL
操作数据库
create database if not exists myhivebook
comment "cho3 hive database in practice"  --添加描述
location "/hdfs/hive"   --hdfs上的路径
with dbproperties ("name"="MengRui", "date"="2018-08-20");

show databases;
describe database myhivebook;   --打印出指定数据库的信息

use myhivebook;

drop database if exists myhivebook; --删除空库
drop database if exists myhivebook cascade; --删除含表的库

alter database myhivebook   --设置数据库属性
set dbproperties ("edited by"="dog");
alter database myhivebook
set owner user dog;

操作表
create external table external_employee (
name string,
work_place array<string>,         -- 调用样式 array_name[0]
gender_age struct<gender:string, age:int>,  --struct<col_name:type, ...>      调用样式 sruct_name.col_name
skills_score map<string, int>,    -- map_name[key]
apart_title map<string, array<string>>
)
comment "this is a external table" --属性位置固定,否则会报错
row format delimited
fields terminated by "|"
collection items terminated by ","
map keys terminated by ":"
stored as textfile   --
location "/user/ch03/employee"; --此路径下不能包含其他文件夹,否则,在查询时会出错。若路径不存在,Hive会自动创建路径
    
load data local inpath "/home/centos/hive essential/ch03/employee.txt" overwrite into table external_employee;
create temporary table temporary_name... ???

--CTAS copy metadata and data to new table
create table ctas_employee as 
select * from external_employee;

--创建CTE
男性中选出名为"Michael"的姓名,并且选择出所有女性的姓名
create table cte_employee as  -- CTAS
with r1 as (select name from r2 where name = "Michael"), --CTE   
r2 as (select name from employee where agender_age.agender = "Male"),
r3 as (select name from employee where agender_age.agender = "Female")
select * from r1 union all select * from r3;
select * from cte_employee;

--创建空表
//create table empty_ctas_employee as  --CTAS  会使用mapper,耗时不推荐
//select * from employee where 1 = 2;  
create table empty_like_employee  -- use LIKE only metadata replication
like employee; --like [table or view]

-- 统计行数
select count(*) as row_counts from employee;

-- 完全地删除内部表, removes the metadata completely and moves date to Trash.
drop table if exists empty_ctas_employee; 

-- remove all the rows from a internal table
truncate table cte_employee;

ALTER 只改变元数据
--alter table rename
alter table internal_employee to empty_employee;

alter table employee set --添加或更新表属性
tblproperties("comment" = "this is internal table");

alter table employee set
serdeproperties("field.delim" = "$");

alter table employee set
location "hdfs://mycluster/user/hive/warehouse/new_employee";  -- 设置路径,hive不会自动创建路径,路径必须为hdfs中的绝对路径

alter table external_employee partition(year = 2012, month = 1, day = 1) enable no_drop;    --阻止删除分区表   
alter table external_employee enable off_line;      -- 阻止查询分区表中的data(not metadata)

alter table employee concatenate;  --merge small files into larger files,only RCFile and ORCFile Formats are supportted right now

alter table employee set fileformat rcfile;    --设置文件格式
alter table employee set fileformat textfile;

--check column type
desc employee;
 
alter table empty_employee  --下述操作只改变元数据,数据必须与更新后的字段匹配
change column name employee_name string  -- change the column
after work_place;  -- move the column

alter table empty_employee 
add columns (wife string);  --添加新列

alter table empty_employee 
replace columns(wife string);  --替换掉原来的所有列为单个列

分区表
--创建分区表,极大地降低查询时的时间和带宽
create table partition_employee (
name string,
work_place array<string>,         -- 调用样式 array_name[0]
gender_age struct<gender:string, age:int>,  --struct<col_name:type, ...>      调用样式 sruct_name.col_name
skills_score map<string, int>,    -- map_name[key]
apart_title map<string, array<string>>
)
partitioned by (year int, month int)
row format delimited
fields terminated by "|"          
collection items terminated by ","
map keys terminated by ":";
--检查分区
show partitions partition_employee;  
--1)首次创建表时无分区,需要手动添加分区
alter table partition_employee add
partition (year = 2017, month = 07)
partition (year = 2017, month = 08);
--2)load data into partitions
load data local inpath "/home/centos/hive essential/ch03/employee.txt". -- local:从本地文件系统加载数据
overwrite into table partition_employee 
partition (year = 2017, month = 7);

-- 查询分区数据时,需先设置:
hive.strict.checks.large.query=false
hive.mapred.mode=nonstrict

-- drop the partition
alter table partition_employee
drop if exists partition (year = 2017, month = 7);

分桶表
--1)Prepare another dataset and table for bucket table
create table employee_id (
name string,
employee_id int,   -- bucket column 
work_place array<string>,         -- 调用样式 array_name[0]
gender_age struct<gender:string, age:int>,  --struct<col_name:type, ...>      调用样式 sruct_name.col_name
skills_score map<string, int>,    -- map_name[key]
apart_title map<string, array<string>>
)
row format delimited
fields terminated by "|"
collection items terminated by ","
map keys terminated by ":";
load data local inpath "/home/centos/hive essential/ch03/employee_id.txt"
overwrite into table employee_id;
--2)create bucket table
create table employee_id_buckets (
name string,
employee_id int,   -- bucket column
work_place array<string>,         -- 调用样式 array_name[0] 
gender_age struct<gender:string, age:int>,  --struct<col_name:type, ...>      调用样式 sruct_name.col_name
skills_score map<string, int>,    -- map_name[key]
apart_title map<string, array<string>>
)
clustered by (employee_id) into 2 buckets  --桶的容量:near two blocks of data(256M) 桶的数量:2N
row format delimited
fields terminated by "|"
collection items terminated by ","           --tuple1,tuple2,...
map keys terminated by ":";

-- 3)
set map.reducer.max.tasks = 2; --reducer的数量等于桶数
set hive.enforce.bucketing = true; 

-- 4)populate data into buckets
insert overwrite table employee_id_buckets     -- insert的作用??: 根据元数据校验数据
select * from employee_id;
-- 5) verify the buckets in the HDFS
dfs -ls /user/hive/warehouse/employee_id_buckets;

视图
降低查询的复杂性,增加数据安全性

参考书籍

Programming_Hive
Apache Hive Essentials

数据定义和描述

标签:practice   move   设置   des   task   esc   date   mod   带宽   

原文地址:https://www.cnblogs.com/StephenMeng/p/9830534.html

(0)
(0)
   
举报
评论 一句话评论(0
登录后才能评论!
© 2014 mamicode.com 版权所有  联系我们:gaon5@hotmail.com
迷上了代码!