标签:practice move 设置 des task esc date mod 带宽
create table employee (
name string,
work_place array<string>, -- array element access: array_name[0]
gender_age struct<gender:string, age:int>, --struct<col_name:type, ...> similar to an HBase column family; field access: struct_name.col_name
skills_score map<string, int>, -- map value access: map_name[key]
apart_title map<string, array<string>>
)
row format delimited
fields terminated by "|"
collection items terminated by ","
map keys terminated by ":";
!table employee --not recommended (beeline shell command)
!column employee --not recommended (beeline shell command)
describe formatted employee; --use this one: more readable output
-- local: read from the local filesystem; overwrite: replace any existing data in the table
load data local inpath "/home/centos/hive essential/ch03/employee.txt" overwrite into table employee;
-- query the whole array column and individual elements
-- ("#" is not a valid HiveQL comment marker; "--" is)
select work_place from employee;
select work_place[0] as col_1, work_place[1] as col_2, work_place[2] as col_3 from employee;
-- query the whole struct and its fields (gender_age is a struct, not a map)
select gender_age from employee;
select gender_age.gender , gender_age.age from employee;
-- query the whole map and individual keys (skills_score is a map, not a struct)
select skills_score from employee;
select name,
skills_score["DB"] as DB,
skills_score["Perl"] as Perl,
skills_score["Python"] as Python,
skills_score["Sales"] as Sales,
skills_score["HR"] as HR
from employee;
-- query the composite type map<string, array<string>>
select apart_title from employee;
select name,
apart_title["Product"] as Product,
apart_title["Test"] as Test,
apart_title["COE"] as COE,
apart_title["Sales"] as Sales
from employee;
DDL
操作数据库
create database if not exists myhivebook
comment "cho3 hive database in practice" --database description (NOTE(review): "cho3" in the stored string is likely a typo for "ch03")
location "/hdfs/hive" --path on HDFS
with dbproperties ("name"="MengRui", "date"="2018-08-20");
show databases;
describe database myhivebook; --print information about the given database
use myhivebook;
drop database if exists myhivebook; --drops only an empty database
drop database if exists myhivebook cascade; --cascade also drops the tables inside
alter database myhivebook --set database properties (metadata only)
set dbproperties ("edited by"="dog");
alter database myhivebook
set owner user dog;
操作表
create external table external_employee (
name string,
work_place array<string>, -- array element access: array_name[0]
gender_age struct<gender:string, age:int>, --struct<col_name:type, ...> field access: struct_name.col_name
skills_score map<string, int>, -- map_name[key]
apart_title map<string, array<string>>
)
comment "this is a external table" --clause position is fixed; other orders raise an error
row format delimited
fields terminated by "|"
collection items terminated by ","
map keys terminated by ":"
stored as textfile --
location "/user/ch03/employee"; --the path must not contain other directories or queries will fail; if it does not exist Hive creates it automatically
load data local inpath "/home/centos/hive essential/ch03/employee.txt" overwrite into table external_employee;
-- Temporary tables live only for the current session, e.g.:
--   create temporary table tmp_employee as select * from employee;
--CTAS copies both metadata and data into the new table
create table ctas_employee as
select * from external_employee;
--创建CTE
需求: 从男性中选出名为"Michael"的姓名, 并且选出所有女性的姓名
create table cte_employee as -- CTAS combined with CTEs
-- A CTE may only reference CTEs defined BEFORE it, so r2 must precede r1.
-- Column is gender_age.gender (see the employee DDL); "agender_age.agender" did not exist.
with r2 as (select name from employee where gender_age.gender = "Male"), --CTE: all male names
r1 as (select name from r2 where name = "Michael"), --males named "Michael"
r3 as (select name from employee where gender_age.gender = "Female") --all female names
select * from r1 union all select * from r3;
select * from cte_employee;
--create an empty table
-- CTAS with an always-false predicate works but launches a mapper job; slow, not recommended
-- ("//" is not a valid HiveQL comment marker):
-- create table empty_ctas_employee as
-- select * from employee where 1 = 2;
create table empty_like_employee -- LIKE copies only the metadata, no data and no job
like employee; --like [table or view]
-- count rows
select count(*) as row_counts from employee;
-- completely drop an internal (managed) table: removes the metadata and moves the data to Trash
drop table if exists empty_ctas_employee;
-- remove all the rows from an internal table, keeping its metadata
truncate table cte_employee;
ALTER 只改变元数据
--alter table rename (metadata-only change)
alter table internal_employee rename to empty_employee; -- the RENAME keyword was missing
alter table employee set --add or update table properties
tblproperties("comment" = "this is internal table");
alter table employee set
serdeproperties("field.delim" = "$");
alter table employee set
location "hdfs://mycluster/user/hive/warehouse/new_employee"; -- set the path; hive does not create it automatically, and it must be an absolute hdfs path
alter table external_employee partition(year = 2012, month = 1, day = 1) enable no_drop; --prevent the partition from being dropped
alter table external_employee enable offline; -- block queries against the data (not the metadata); the keyword is OFFLINE, not off_line
alter table employee concatenate; --merge small files into larger files; only RCFile and ORCFile formats are supported right now
alter table employee set fileformat rcfile; --set the file format
alter table employee set fileformat textfile;
--check column type
desc employee;
alter table empty_employee --the following only changes metadata; data files must match the new layout
change column name employee_name string -- rename column "name" to "employee_name"
after work_place; -- and move it after work_place
alter table empty_employee
add columns (wife string); --append a new column
alter table empty_employee
replace columns(wife string); --replace ALL existing columns with this single column
分区表
--create a partitioned table; greatly reduces query time and bandwidth
create table partition_employee (
name string,
work_place array<string>, -- array element access: array_name[0]
gender_age struct<gender:string, age:int>, --struct<col_name:type, ...> field access: struct_name.col_name
skills_score map<string, int>, -- map_name[key]
apart_title map<string, array<string>>
)
partitioned by (year int, month int)
row format delimited
fields terminated by "|"
collection items terminated by ","
map keys terminated by ":";
--list partitions
show partitions partition_employee;
--1) a freshly created table has no partitions; add them manually
alter table partition_employee add
partition (year = 2017, month = 07)
partition (year = 2017, month = 08);
--2)load data into partitions
-- local: load from the local filesystem (a stray "." after the path string broke the statement)
load data local inpath "/home/centos/hive essential/ch03/employee.txt"
overwrite into table partition_employee
partition (year = 2017, month = 7);
-- before querying partitioned data, relax strict mode (the "set" keyword and semicolons were missing):
set hive.strict.checks.large.query=false;
set hive.mapred.mode=nonstrict;
-- drop the partition (removes the partition's metadata; for managed tables the data goes too)
alter table partition_employee
drop if exists partition (year = 2017, month = 7);
分桶表
--1)Prepare another dataset and table for bucket table
create table employee_id (
name string,
employee_id int, -- bucket column
work_place array<string>, -- array element access: array_name[0]
gender_age struct<gender:string, age:int>, --struct<col_name:type, ...> field access: struct_name.col_name
skills_score map<string, int>, -- map_name[key]
apart_title map<string, array<string>>
)
row format delimited
fields terminated by "|"
collection items terminated by ","
map keys terminated by ":";
load data local inpath "/home/centos/hive essential/ch03/employee_id.txt"
overwrite into table employee_id;
--2)create bucket table
create table employee_id_buckets (
name string,
employee_id int, -- bucket column
work_place array<string>, -- array element access: array_name[0]
gender_age struct<gender:string, age:int>, --struct<col_name:type, ...> field access: struct_name.col_name
skills_score map<string, int>, -- map_name[key]
apart_title map<string, array<string>>
)
clustered by (employee_id) into 2 buckets --bucket size: near two blocks of data (256M); bucket count: 2N (NOTE(review): likely meant a power of two, 2^N)
row format delimited
fields terminated by "|"
collection items terminated by "," --tuple1,tuple2,...
map keys terminated by ":";
-- 3) make the reducer count match the bucket count
set mapred.reduce.tasks = 2; -- correct property name; "map.reducer.max.tasks" does not exist
set hive.enforce.bucketing = true; -- with this on, hive derives the reducer count from the bucket spec automatically
-- 4)populate data into buckets
insert overwrite table employee_id_buckets -- why INSERT here: it validates/distributes the data per the table's metadata (LOAD would not bucket it)
select * from employee_id;
-- 5) verify the buckets in the HDFS
dfs -ls /user/hive/warehouse/employee_id_buckets;
视图
降低查询的复杂性,增加数据安全性
Programming_Hive
Apache Hive Essentials
标签:practice move 设置 des task esc date mod 带宽
原文地址:https://www.cnblogs.com/StephenMeng/p/9830534.html