标签:pre 数据清洗 drop comm bsp tor cat date 三范式
业务数仓的DWD层一般有两个典型操作:
①因为是DWD层,所以要进行数据清洗。
②因为数据来源于web项目的数据库,标的设计遵循三范式,因此在数仓里需要进行降维,以减少join次数。
在示例的8张表中,订单表,订单详情表,用户表,支付流水表字段与ODS层一致。对商品表的分类进行降维。增加二级分类,一级分类的id和名称。
drop table if exists dwd_sku_info; create external table dwd_sku_info( `id` string COMMENT ‘skuId‘, `spu_id` string COMMENT ‘spuid‘, `price` decimal(10,2) COMMENT ‘‘, `sku_name` string COMMENT ‘‘, `sku_desc` string COMMENT ‘‘, `weight` string COMMENT ‘‘, `tm_id` string COMMENT ‘id‘, `category3_id` string COMMENT ‘1id‘, `category2_id` string COMMENT ‘2id‘, `category1_id` string COMMENT ‘3id‘, `category3_name` string COMMENT ‘3‘, `category2_name` string COMMENT ‘2‘, `category1_name` string COMMENT ‘1‘, `create_time` string COMMENT ‘‘ ) PARTITIONED BY (`dt` string) stored as parquet location ‘/warehouse/gmall/dwd/dwd_sku_info/‘ tblproperties ("parquet.compression"="snappy") ;
DWD层数据导入脚本,降维时,只需对相关表进行join即可。
#!/bin/bash # 定义变量方便修改 APP=gmall hive=/opt/module/hive/bin/hive # 如果是输入的日期按照取输入日期;如果没输入日期取当前时间的前一天 if [ -n "$1" ] ;then do_date=$1 else do_date=`date -d "-1 day" +%F` fi sql=" set hive.exec.dynamic.partition.mode=nonstrict; insert overwrite table "$APP".dwd_order_info partition(dt) select * from "$APP".ods_order_info where dt=‘$do_date‘ and id is not null; insert overwrite table "$APP".dwd_order_detail partition(dt) select * from "$APP".ods_order_detail where dt=‘$do_date‘ and id is not null; insert overwrite table "$APP".dwd_user_info partition(dt) select * from "$APP".ods_user_info where dt=‘$do_date‘ and id is not null; insert overwrite table "$APP".dwd_payment_info partition(dt) select * from "$APP".ods_payment_info where dt=‘$do_date‘ and id is not null; insert overwrite table "$APP".dwd_sku_info partition(dt) select sku.id, sku.spu_id, sku.price, sku.sku_name, sku.sku_desc, sku.weight, sku.tm_id, sku.category3_id, c2.id category2_id, c1.id category1_id, c3.name category3_name, c2.name category2_name, c1.name category1_name, sku.create_time, sku.dt from "$APP".ods_sku_info sku join "$APP".ods_base_category3 c3 on sku.category3_id=c3.id join "$APP".ods_base_category2 c2 on c3.category2_id=c2.id join "$APP".ods_base_category1 c1 on c2.category1_id=c1.id where sku.dt=‘$do_date‘ and c2.dt=‘$do_date‘ and c3.dt=‘$do_date‘ and c1.dt=‘$do_date‘ and sku.id is not null; " $hive -e "$sql"
标签:pre 数据清洗 drop comm bsp tor cat date 三范式
原文地址:https://www.cnblogs.com/noyouth/p/13222202.html