标签:des style blog color io 使用 ar for 2014
知识点:
1、Hive的窗口和分析函数进阶
CUME_DIST 小于等于当前行值的行数 / 总行数
PERCENT_RANK 当前rank值-1 / 总行数-1
NTILE 将窗口分成n片
LEAD(col, n, default) 窗口内下n行值
LAG(col, n , default) 窗口内上n行值
FIRST_VALUE 窗口内第一个值
LAST_VALUE 窗口内最后一个值
2、分析函数中包含三个分析子句
分组(Partition By)
排序(Order By)
窗口(Window)
3、ROWS BETWEEN ... AND ...
CURRENT ROW 当前行
N-rows PRECEDING 前面N行
N-rows FOLLOWING 后面N行
count() / sum() / max() / min()
CREATE TABLE rates ( app_name STRING , star_rates STRING ) ROW FORMAT DELIMITED FIELDS TERMINATED BY ‘\t‘;
load data local inpath ‘/home/spark/software/data/rates.txt‘ overwrite into table rates;
select * from rates; app0 1 app1 2 app2 2 app3 3 app4 3 app5 3 app6 5 app7 5 app8 5 app9 5
create table app_ranks as select app_name , star_rates , NTILE(3) OVER (ORDER BY star_rates DESC) as nt , row_number() OVER (ORDER BY star_rates DESC) as rn , rank() OVER (ORDER BY star_rates DESC) as rk , dense_rank() OVER (ORDER BY star_rates DESC) as drk , CUME_DIST() OVER (ORDER BY star_rates) as cd , PERCENT_RANK() OVER (ORDER BY star_rates) as pr from rates order by star_rates desc;
select * from app_ranks; app_name star_rates nt rn rk drk cd pr app9 5 1 1 1 1 1.0 0.6666666666666666 app8 5 1 2 1 1 1.0 0.6666666666666666 app7 5 1 3 1 1 1.0 0.6666666666666666 app6 5 1 4 1 1 1.0 0.6666666666666666 app5 3 2 5 5 2 0.6 0.3333333333333333 app4 3 2 6 5 2 0.6 0.3333333333333333 app3 3 2 7 5 2 0.6 0.3333333333333333 app2 2 3 8 8 3 0.3 0.1111111111111111 app1 2 3 9 8 3 0.3 0.1111111111111111 app0 1 3 10 10 4 0.1 0.0
select app_name, star_rates, sum(cd) OVER (PARTITION BY star_rates ORDER BY rn ROWS BETWEEN UNBOUNDED PRECEDING AND CURRENT ROW) from app_ranks; app0 1 0.1 app2 2 0.3 app1 2 0.6 app5 3 0.6 app4 3 1.2 app3 3 1.7999999999999998 app9 5 1.0 app8 5 2.0 app7 5 3.0 app6 5 4.0
select app_name, star_rates, sum(cd) OVER (PARTITION BY star_rates ORDER BY rn ROWS BETWEEN CURRENT ROW AND UNBOUNDED FOLLOWING) from app_ranks; app0 1 0.1 app2 2 0.6 app1 2 0.3 app5 3 1.7999999999999998 app4 3 1.2 app3 3 0.6 app9 5 4.0 app8 5 3.0 app7 5 2.0 app6 5 1.0
select app_name, star_rates, sum(cd) OVER (PARTITION BY star_rates ORDER BY rn ROWS BETWEEN CURRENT ROW AND 1 FOLLOWING) from app_ranks; app0 1 0.1 app2 2 0.6 app1 2 0.3 app5 3 1.2 app4 3 1.2 app3 3 0.6 app9 5 2.0 app8 5 2.0 app7 5 2.0 app6 5 1.0
select app_name, star_rates, rn, lead(rn) OVER (PARTITION BY star_rates ORDER BY rn), lag(rn) OVER (PARTITION BY star_rates ORDER BY rn) from app_ranks; app0 1 10 NULL NULL app2 2 8 9 NULL app1 2 9 NULL 8 app5 3 5 6 NULL app4 3 6 7 5 app3 3 7 NULL 6 app9 5 1 2 NULL app8 5 2 3 1 app7 5 3 4 2 app6 5 4 NULL 3
CREATE EXTERNAL TABLE visitors ( domain STRING , month STRING , visitor STRING ) ROW FORMAT DELIMITED FIELDS TERMINATED BY ‘\t‘;
load data local inpath ‘/home/spark/software/data/visitors.txt‘ overwrite into table visitors;
select * from visitors where domain = ‘d001‘; d001 201301 101 d001 201302 111 d001 201303 121 d001 201304 131 d001 201305 141 d001 201306 151 d001 201307 201 d001 201308 211 d001 201309 221 d001 201310 231 d001 201311 241 d001 201312 301 d001 201401 301 d001 201402 211 d001 201403 271 d001 201404 331 d001 201405 351
select domain , month , visitor , first_value(visitor) OVER (PARTITION BY domain ORDER BY month DESC) , last_value(visitor) OVER (PARTITION BY domain ORDER BY month DESC) , lead(visitor) OVER (PARTITION BY domain ORDER BY month DESC) , lag(visitor) OVER (PARTITION BY domain ORDER BY month DESC) from visitors where domain = ‘d001‘; d001 201405 351 351 351 331 NULL d001 201404 331 351 331 271 351 d001 201403 271 351 271 211 331 d001 201402 211 351 211 301 271 d001 201401 301 351 301 301 211 d001 201312 301 351 301 241 301 d001 201311 241 351 241 231 301 d001 201310 231 351 231 221 241 d001 201309 221 351 221 211 231 d001 201308 211 351 211 201 221 d001 201307 201 351 201 151 211 d001 201306 151 351 151 141 201 d001 201305 141 351 141 131 151 d001 201304 131 351 131 121 141 d001 201303 121 351 121 111 131 d001 201302 111 351 111 101 121 d001 201301 101 351 101 NULL 111
select domain , month , visitor , first_value(visitor) OVER (PARTITION BY domain ORDER BY month DESC) , last_value(visitor) OVER (PARTITION BY domain ORDER BY month DESC) , lead(visitor, 1, 0) OVER (PARTITION BY domain ORDER BY month DESC) , lag(visitor, 1, 0) OVER (PARTITION BY domain ORDER BY month DESC) , lead(visitor, 12, 0) OVER (PARTITION BY domain ORDER BY month DESC) , lag(visitor, 12, 0) OVER (PARTITION BY domain ORDER BY month DESC) from visitors where domain = ‘d001‘; d001 201405 351 351 351 331 0 141 0 d001 201404 331 351 331 271 351 131 0 d001 201403 271 351 271 211 331 121 0 d001 201402 211 351 211 301 271 111 0 d001 201401 301 351 301 301 211 101 0 d001 201312 301 351 301 241 301 0 0 d001 201311 241 351 241 231 301 0 0 d001 201310 231 351 231 221 241 0 0 d001 201309 221 351 221 211 231 0 0 d001 201308 211 351 211 201 221 0 0 d001 201307 201 351 201 151 211 0 0 d001 201306 151 351 151 141 201 0 0 d001 201305 141 351 141 131 151 0 351 d001 201304 131 351 131 121 141 0 331 d001 201303 121 351 121 111 131 0 271 d001 201302 111 351 111 101 121 0 211 d001 201301 101 351 101 0 111 0 301
create table visitors_report as select domain , month , visitor , lead(visitor, 1, 0) OVER (PARTITION BY domain ORDER BY month DESC) as last_mon , visitor - lead(visitor, 1, 0) OVER (PARTITION BY domain ORDER BY month DESC) as delta_mon , lead(visitor, 12, 0) OVER (PARTITION BY domain ORDER BY month DESC) as last_year , visitor - lead(visitor, 12, 0) OVER (PARTITION BY domain ORDER BY month DESC) as delta_year from visitors ;
select * from visitors_report where domain = ‘d001‘ and month > ‘2014‘; d001 201405 351 331 20.0 141 210.0 d001 201404 331 271 60.0 131 200.0 d001 201403 271 211 60.0 121 150.0 d001 201402 211 301 -90.0 111 100.0 d001 201401 301 301 0.0 101 200.0
select month , domain , visitor , last_mon , last_year from visitors_report where (domain = ‘d001‘ or domain = ‘d002‘) and month > ‘2014‘ order by month desc, domain asc limit 100; 201405 d001 351 331 141 201405 d002 352 332 142 201404 d001 331 271 131 201404 d002 332 272 132 201403 d001 271 211 121 201403 d002 272 212 122 201402 d001 211 301 111 201402 d002 212 302 112 201401 d001 301 301 101 201401 d002 302 302 102
select month , domain , visitor , max(visitor) OVER (PARTITION BY month) as max_visitors , min(visitor) OVER (PARTITION BY month) as min_visitors from visitors where month > ‘2014‘ order by month desc, domain asc; 201405 d001 351 353 351 201405 d002 352 353 351 201405 d003 353 353 351 201404 d001 331 333 331 201404 d002 332 333 331 201404 d003 333 333 331 201403 d001 271 273 271 201403 d002 272 273 271 201403 d003 273 273 271 201402 d001 211 213 211 201402 d002 212 213 211 201402 d003 213 213 211 201401 d001 301 303 301 201401 d002 302 303 301 201401 d003 303 303 301
select * from ( select month , domain , visitor , max(visitor) OVER (PARTITION BY domain ORDER BY month DESC ROWS BETWEEN CURRENT ROW AND 12 FOLLOWING) as max_visitors_last_12_mon , min(visitor) OVER (PARTITION BY domain ORDER BY month DESC ROWS BETWEEN CURRENT ROW AND 12 FOLLOWING) as min_visitors_last_12_mon from visitors ) v where month > ‘20131‘ order by month desc, domain asc; 201405 d001 351 351 141 201405 d002 352 352 142 201405 d003 353 353 143 201404 d001 331 331 131 201404 d002 332 332 132 201404 d003 333 333 133 201403 d001 271 301 121 201403 d002 272 302 122 201403 d003 273 303 123 201402 d001 211 301 111 201402 d002 212 302 112 201402 d003 213 303 113 201401 d001 301 301 101 201401 d002 302 302 102 201401 d003 303 303 103 201312 d001 301 301 101 201312 d002 302 302 102 201312 d003 303 303 103 201311 d001 241 241 101 201311 d002 242 242 102 201311 d003 243 243 103 201310 d001 231 231 101 201310 d002 232 232 102 201310 d003 233 233 103
标签:des style blog color io 使用 ar for 2014
原文地址:http://www.cnblogs.com/luogankun/p/3962761.html