标签:practice move 设置 des task esc date mod 带宽
create table employee (
name string,
work_place array<string>, -- array element access: array_name[0]
gender_age struct<gender:string, age:int>, --struct<col_name:type, ...> similar to an HBase column family; field access: struct_name.col_name
skills_score map<string, int>, -- map value access: map_name[key]
apart_title map<string, array<string>>
)
row format delimited
fields terminated by "|"
collection items terminated by ","
map keys terminated by ":";
!table employee --not recommended (beeline shell command)
!column employee --not recommended (beeline shell command)
describe formatted employee; --use this one: more readable output
-- local: read from the local filesystem; overwrite: replace any existing data in the table
load data local inpath "/home/centos/hive essential/ch03/employee.txt" overwrite into table employee;
-- query the whole array column and individual elements
-- ("#" is not a valid HiveQL comment marker; "--" is)
select work_place from employee;
select work_place[0] as col_1, work_place[1] as col_2, work_place[2] as col_3 from employee;
-- query the whole struct and its fields (gender_age is a struct, not a map)
select gender_age from employee;
select gender_age.gender , gender_age.age from employee;
-- query the whole map and individual keys (skills_score is a map, not a struct)
select skills_score from employee;
select name,
skills_score["DB"] as DB,
skills_score["Perl"] as Perl,
skills_score["Python"] as Python,
skills_score["Sales"] as Sales,
skills_score["HR"] as HR
from employee;
-- query the composite type map<string, array<string>>
select apart_title from employee;
select name,
apart_title["Product"] as Product,
apart_title["Test"] as Test,
apart_title["COE"] as COE,
apart_title["Sales"] as Sales
from employee;
DDL
操作数据库
create database if not exists myhivebook
comment "cho3 hive database in practice" --database description (NOTE(review): "cho3" in the stored string is likely a typo for "ch03")
location "/hdfs/hive" --path on HDFS
with dbproperties ("name"="MengRui", "date"="2018-08-20");
show databases;
describe database myhivebook; --print information about the given database
use myhivebook;
drop database if exists myhivebook; --drops only an empty database
drop database if exists myhivebook cascade; --cascade also drops the tables inside
alter database myhivebook --set database properties (metadata only)
set dbproperties ("edited by"="dog");
alter database myhivebook
set owner user dog;
操作表
create external table external_employee (
name string,
work_place array<string>, -- array element access: array_name[0]
gender_age struct<gender:string, age:int>, --struct<col_name:type, ...> field access: struct_name.col_name
skills_score map<string, int>, -- map_name[key]
apart_title map<string, array<string>>
)
comment "this is a external table" --clause position is fixed; other orders raise an error
row format delimited
fields terminated by "|"
collection items terminated by ","
map keys terminated by ":"
stored as textfile --
location "/user/ch03/employee"; --the path must not contain other directories or queries will fail; if it does not exist Hive creates it automatically
load data local inpath "/home/centos/hive essential/ch03/employee.txt" overwrite into table external_employee;
-- Temporary tables live only for the current session, e.g.:
--   create temporary table tmp_employee as select * from employee;
--CTAS copies both metadata and data into the new table
create table ctas_employee as
select * from external_employee;
--创建CTE
需求: 从男性中选出名为"Michael"的姓名, 并且选出所有女性的姓名
create table cte_employee as -- CTAS combined with CTEs
-- A CTE may only reference CTEs defined BEFORE it, so r2 must precede r1.
-- Column is gender_age.gender (see the employee DDL); "agender_age.agender" did not exist.
with r2 as (select name from employee where gender_age.gender = "Male"), --CTE: all male names
r1 as (select name from r2 where name = "Michael"), --males named "Michael"
r3 as (select name from employee where gender_age.gender = "Female") --all female names
select * from r1 union all select * from r3;
select * from cte_employee;
--create an empty table
-- CTAS with an always-false predicate works but launches a mapper job; slow, not recommended
-- ("//" is not a valid HiveQL comment marker):
-- create table empty_ctas_employee as
-- select * from employee where 1 = 2;
create table empty_like_employee -- LIKE copies only the metadata, no data and no job
like employee; --like [table or view]
-- count rows
select count(*) as row_counts from employee;
-- completely drop an internal (managed) table: removes the metadata and moves the data to Trash
drop table if exists empty_ctas_employee;
-- remove all the rows from an internal table, keeping its metadata
truncate table cte_employee;
ALTER 只改变元数据
--alter table rename (metadata-only change)
alter table internal_employee rename to empty_employee; -- the RENAME keyword was missing
alter table employee set --add or update table properties
tblproperties("comment" = "this is internal table");
alter table employee set
serdeproperties("field.delim" = "$");
alter table employee set
location "hdfs://mycluster/user/hive/warehouse/new_employee"; -- set the path; hive does not create it automatically, and it must be an absolute hdfs path
alter table external_employee partition(year = 2012, month = 1, day = 1) enable no_drop; --prevent the partition from being dropped
alter table external_employee enable offline; -- block queries against the data (not the metadata); the keyword is OFFLINE, not off_line
alter table employee concatenate; --merge small files into larger files; only RCFile and ORCFile formats are supported right now
alter table employee set fileformat rcfile; --set the file format
alter table employee set fileformat textfile;
--check column type
desc employee;
alter table empty_employee --the following only changes metadata; data files must match the new layout
change column name employee_name string -- rename column "name" to "employee_name"
after work_place; -- and move it after work_place
alter table empty_employee
add columns (wife string); --append a new column
alter table empty_employee
replace columns(wife string); --replace ALL existing columns with this single column
分区表
--create a partitioned table; greatly reduces query time and bandwidth
create table partition_employee (
name string,
work_place array<string>, -- array element access: array_name[0]
gender_age struct<gender:string, age:int>, --struct<col_name:type, ...> field access: struct_name.col_name
skills_score map<string, int>, -- map_name[key]
apart_title map<string, array<string>>
)
partitioned by (year int, month int)
row format delimited
fields terminated by "|"
collection items terminated by ","
map keys terminated by ":";
--list partitions
show partitions partition_employee;
--1) a freshly created table has no partitions; add them manually
alter table partition_employee add
partition (year = 2017, month = 07)
partition (year = 2017, month = 08);
--2)load data into partitions
-- local: load from the local filesystem (a stray "." after the path string broke the statement)
load data local inpath "/home/centos/hive essential/ch03/employee.txt"
overwrite into table partition_employee
partition (year = 2017, month = 7);
-- before querying partitioned data, relax strict mode (the "set" keyword and semicolons were missing):
set hive.strict.checks.large.query=false;
set hive.mapred.mode=nonstrict;
-- drop the partition (removes the partition's metadata; for managed tables the data goes too)
alter table partition_employee
drop if exists partition (year = 2017, month = 7);
分桶表
--1)Prepare another dataset and table for bucket table
create table employee_id (
name string,
employee_id int, -- bucket column
work_place array<string>, -- array element access: array_name[0]
gender_age struct<gender:string, age:int>, --struct<col_name:type, ...> field access: struct_name.col_name
skills_score map<string, int>, -- map_name[key]
apart_title map<string, array<string>>
)
row format delimited
fields terminated by "|"
collection items terminated by ","
map keys terminated by ":";
load data local inpath "/home/centos/hive essential/ch03/employee_id.txt"
overwrite into table employee_id;
--2)create bucket table
create table employee_id_buckets (
name string,
employee_id int, -- bucket column
work_place array<string>, -- array element access: array_name[0]
gender_age struct<gender:string, age:int>, --struct<col_name:type, ...> field access: struct_name.col_name
skills_score map<string, int>, -- map_name[key]
apart_title map<string, array<string>>
)
clustered by (employee_id) into 2 buckets --bucket size: near two blocks of data (256M); bucket count: 2N (NOTE(review): likely meant a power of two, 2^N)
row format delimited
fields terminated by "|"
collection items terminated by "," --tuple1,tuple2,...
map keys terminated by ":";
-- 3) make the reducer count match the bucket count
set mapred.reduce.tasks = 2; -- correct property name; "map.reducer.max.tasks" does not exist
set hive.enforce.bucketing = true; -- with this on, hive derives the reducer count from the bucket spec automatically
-- 4)populate data into buckets
insert overwrite table employee_id_buckets -- why INSERT here: it validates/distributes the data per the table's metadata (LOAD would not bucket it)
select * from employee_id;
-- 5) verify the buckets in the HDFS
dfs -ls /user/hive/warehouse/employee_id_buckets;
视图
降低查询的复杂性,增加数据安全性
Programming_Hive
Apache Hive Essentials
标签:practice move 设置 des task esc date mod 带宽
原文地址:https://www.cnblogs.com/StephenMeng/p/9830534.html