hive数据压缩脚本实现

时间：2017-09-15 18:51:13 阅读：175 评论：0 收藏：0 [点我收藏+]

标签：名称 cti hadoop from select blog path home hdfs

#!/bin/bash

# Compress (rewrite with output compression enabled) the etl_date partitions
# of every table in one Hive database.
# Usage: <script> <db_name> <compress_mode>
#   db_name       - Hive database whose warehouse directory is scanned
#   compress_mode - 1900 selects the "30 days back" cutoff; any other value
#                   selects today's date

# Validate the number of arguments.
if [[ $# -ne 2 ]]; then
    echo "输入库名:[/core/loan/md/lc/wt/adm] 压缩时间：[1900/非1900]"
    exit 1
fi
# Database to compress.
db_name="$1"
# Compression date mode.
iscompess_type="$2"
com_date=""
# Directory holding the load scripts that are read for the column lists.
script_path="/home/hadoop/hdp_script/load"
# Hive settings prepended to every statement. Plain ASCII single quotes are
# required here: with typographic quotes the shell would split the value at
# the space and try to execute the remainder as a command.
hive_conf='SET hive.exec.compress.output=true;SET hive.exec.parallel=true;'
# Collect the distinct partition directories that contain part-m files.
# Word splitting of the command output is intentional: one path per element.
public_arr=($(hdfs dfs -ls -R "/user/hive/warehouse/${db_name}.db" \
    | grep etl_date \
    | grep part-m \
    | awk -F "/part-m" '{print $1}' \
    | awk -F " " '{print $8}' \
    | sort -u))
# Counter used for concurrency control.
index=0
# Report the outcome of the command that ran immediately before this call.
# Reads:     $? as left by the caller's previous command, so the function has
#            to be invoked directly after the job with nothing in between.
# Arguments: $1 - label printed in front of the status message.
# Outputs:   one success or failure line on stdout.
# Exits:     terminates the current shell with status 1 when the job failed.
hive_job_status_checking() {
    # Must remain the first statement: any other command would overwrite $?.
    local job_rc=$?
    case "${job_rc}" in
        0)
            echo "$1 压缩成功!!!"
            ;;
        *)
            echo "$1 压缩失败!!!"
            exit 1
            ;;
    esac
}
# Run the compression job for one table partition: the partition is rewritten
# onto itself with an INSERT OVERWRITE so that the Hive settings in hive_conf
# apply to the new files.
# Globals (read): hive_conf, db_table_name, table_etl_date, table_field, flag
#   flag - any value other than 0 runs the job in the background,
#          0 runs it in the foreground.
# Outputs: whatever hive prints, followed by the status line written by
#          hive_job_status_checking.
# Exits:   a failed foreground job terminates the script (via
#          hive_job_status_checking); a failed background job only terminates
#          its own subshell after printing the failure line.
function hive_job_runing(){
    local job_label="${db_table_name}${table_etl_date}"
    # The partition value is a SQL string literal and needs ASCII single quotes.
    local job_sql="${hive_conf} insert overwrite table ${db_table_name} partition(etl_date='${table_etl_date}') select ${table_field} from ${db_table_name} where etl_date='${table_etl_date}';"

    if [ "${flag}" != 0 ];then
        # Group the job and its status check in one background subshell.
        # Checking $? right after "cmd &" only sees the status of launching
        # the job (always 0), so failures would be reported as success.
        (
            hive -e "${job_sql}"
            hive_job_status_checking "${job_label}"
        ) &
    else
        hive -e "${job_sql}"
        hive_job_status_checking "${job_label}"
    fi
}
# Run the compression job when the partition date is not later than the
# cutoff date (the caller sets the cutoff to 30 days before today).
# Globals (read): table_etl_date_unix, com_date_unix - both epoch seconds.
# Returns: 0 when the partition is skipped, otherwise the status of
#          hive_job_runing.
do_compess_30day() {
    [[ ${table_etl_date_unix} -le ${com_date_unix} ]] || return 0
    hive_job_runing
}
# Run the compression job only when the partition date equals the reference
# date (the caller sets the reference to today).
# Globals (read): table_etl_date_unix, com_date_unix - both epoch seconds.
# Returns: 0 when the partition is skipped, otherwise the status of
#          hive_job_runing.
do_compess() {
    [[ ${table_etl_date_unix} -eq ${com_date_unix} ]] || return 0
    hive_job_runing
}

# Derive the per-partition job variables from one HDFS partition directory and
# hand the partition to the date filter selected by iscompess_type.
# Arguments: $1 - partition directory, expected to look like
#                 /user/hive/warehouse/<db>.db/<table>/etl_date=<date>
# Globals:   script_path, iscompess_type (read);
#            db_table_name, table_etl_date, table_field, table_etl_date_unix
#            (written; they are read by hive_job_runing and the filters).
# Returns:   non-zero, without starting a job, when the partition date or the
#            column list cannot be determined.
function compress_partition(){
    local partition_dir="$1"
    local db_dir table_name db_short

    db_dir=$(awk -F '/' '{print $5}' <<< "${partition_dir}")
    table_name=$(awk -F '/' '{print $6}' <<< "${partition_dir}")
    # Strip only the literal ".db" suffix of the database directory. A global
    # regex such as s/.db/./g also rewrites table names containing "<char>db".
    db_short="${db_dir%.db}"
    db_table_name="${db_short}.${table_name}"
    table_etl_date=$(awk -F '=' '{print $2}' <<< "${partition_dir}")

    # An empty date must be rejected: "date -d ''" resolves to today and the
    # partition would wrongly match the "today" filter.
    if [[ -z "${table_etl_date}" ]]; then
        echo "跳过 ${partition_dir}: 无法解析 etl_date" >&2
        return 1
    fi
    if ! table_etl_date_unix=$(date -d "${table_etl_date}" +%s); then
        echo "跳过 ${partition_dir}: etl_date 不是有效日期" >&2
        return 1
    fi

    # The column list is the quoted value of query_str= in the table's load
    # script; the "*" is left unquoted on purpose so the shell expands it.
    table_field=$(grep -h 'query_str=' "${script_path}"/"${db_short}"/*"${table_name}".sh \
        | awk -F '"' '{print $2}')
    if [[ -z "${table_field}" ]]; then
        echo "跳过 ${partition_dir}: 未找到 query_str 字段列表" >&2
        return 1
    fi

    if [[ "${iscompess_type}" == "1900" ]]; then
        do_compess_30day
    else
        do_compess
    fi
}

# The reference date does not change between partitions, so compute it once.
# Mode 1900 uses the date 30 days ago; every other mode uses today.
if [[ "${iscompess_type}" == "1900" ]]; then
    com_date=$(date +%Y-%m-%d -d '-30 day')
else
    com_date=$(date +%Y-%m-%d)
fi
com_date_unix=$(date -d "${com_date}" +%s)

for elem in "${public_arr[@]}"
do
    # Concurrency control: flag is the last decimal digit of the counter, so
    # every tenth partition gets flag 0.
    flag=$((index % 10))
    index=$((index+1))
    compress_partition "${elem}"
done

# Do not let the script finish while background hive jobs are still running.
wait

hive数据压缩脚本实现

标签：名称 cti hadoop from select blog path home hdfs

原文地址：http://www.cnblogs.com/guotianqi/p/7527183.html

踩

(0)

评论一句话评论（0）

分享档案

更多>

2021年07月29日 (22)
2021年07月28日 (40)
2021年07月27日 (32)
2021年07月26日 (79)
2021年07月23日 (29)
2021年07月22日 (30)
2021年07月21日 (42)
2021年07月20日 (16)
2021年07月19日 (90)
2021年07月16日 (35)

周排行