码迷,mamicode.com
首页 > 其他好文 > 详细

Hive拉链表实现

时间:2019-12-12 17:55:34      阅读:164      评论:0      收藏:0      [点我收藏+]

标签:拉链   path   sele   加载   union   span   订单   exists   数据   

拉链表测试:

有如下测试数据

--2019/12/1号订单的全量数据
id    status    create_time    operation_time
1    待支付    2019-12-01    
2    待支付    2019-12-01    
3    已支付    2019-12-01    

--2019/12/2号订单的全量数据
id    status    create_time    operation_time
1    待支付    2019-12-01    
2    已支付    2019-12-01    2019-12-02
3    已支付    2019-12-01    
4    待支付    2019-12-02    
5    已支付    2019-12-02    

 

--创建订单表
drop table if exists order_info;
create table order_info(
id int, 
status string,
create_time string,
operate_time string
)
partitioned by(dt string)
row format
delimited fields terminated by \t;

 

--加载12/1的数据
load data local inpath "/opt/data/order_info1" into table chain.order_info partition(dt=2019-12-01);

--加载12/2的数据
load data local inpath "/opt/data/order_info2" into table chain.order_info partition(dt=2019-12-02);

 

--创建拉链表
drop table if exists order_info_chain;
create table order_info_chain(
id int, 
status string,
create_time string,
operate_time string,
start_time string,
end_time string
)
row format 
delimited fields terminated by \t;

--初始化拉链表,加载12/1号的数据
insert overwrite table order_info_chain
select 
id,
status,
create_time,
operate_time,
2019-12-01,
9999-99-99
from order_info
where dt=2019-12-01;

 

--订单变化表
create table order_change(
id string,
status string,
create_time string,
operate_time string
) partitioned by (dt string);

--灌入12/2变化和新增的数据,通过create_time,create_time判断
insert overwrite table order_change partition(dt=2019-12-02)
select 
id,
status,
create_time,
operate_time
from order_info where create_time=2019-12-02 or operate_time=2019-12-02;

 

--创建临时拉链表
create table tmp_chain(
id string,
status string,
create_time string COMMENT 创建时间,
operate_time string COMMENT 修改时间,
start_time string COMMENT 有效开始时间,
end_time string COMMENT 有效结束时间
);

 

--向临时拉链表导入数据
insert overwrite table tmp_chain
select * from
(
    select
        id,
        status,
        create_time,
        operate_time,
        2019-12-02 as start_time,
        9999-99-99 as end_time
    from order_change where dt=2019-12-02
    
    union all
    
    --修改发生过更新的记录的end_time为前一天
    select 
        orch.id,
        orch.status,
        orch.create_time,
        orch.operate_time,
        orch.start_time,
        if(ch.id is null, orch.end_time, date_add(ch.dt,-1)) as end_time
    from order_info_chain orch
    left join
        (select * from order_change where dt=2019-12-02) ch
    on orch.id = ch.id and orch.end_time=9999-99-99
) t 
order by t.id,t.start_time;

 

--把临时表覆盖给拉链表
insert overwrite table order_info_chain
select * from tmp_chain;

 

Hive拉链表实现

标签:拉链   path   sele   加载   union   span   订单   exists   数据   

原文地址:https://www.cnblogs.com/lucas-zhao/p/12030594.html

(0)
(0)
   
举报
评论 一句话评论(0
登录后才能评论!
© 2014 mamicode.com 版权所有  联系我们:gaon5@hotmail.com
迷上了代码!