码迷,mamicode.com
首页 > 其他好文 > 详细

Hive综合案例分析之开窗函数使用

时间:2014-09-09 17:43:09      阅读:370      评论:0      收藏:0      [点我收藏+]

标签:des   style   blog   color   io   使用   ar   for   2014   

知识点:

1、Hive的窗口和分析函数进阶

  CUME_DIST 小于等于当前行值的行数 / 总行数

  PERCENT_RANK 当前rank值-1 / 总行数-1

  NTILE 将窗口分成n片

  LEAD(col, n, default) 窗口内下n行值

  LAG(col, n , default) 窗口内上n行值

  FIRST_VALUE 窗口内第一个值

  LAST_VALUE 窗口内最后一个值

2、分析函数中包含三个分析子句

  分组(Partition By)

  排序(Order By)

  窗口(Window)

3、ROWS BETWEEN ... AND ...

  CURRENT ROW 当前行

  N-rows PRECEDING 前面N行

  N-rows FOLLOWING 后面N行

  count() / sum() / max() / min()

 

CREATE TABLE rates (
    app_name    STRING 
  , star_rates  STRING
)
ROW FORMAT DELIMITED FIELDS TERMINATED BY \t;
load data local inpath /home/spark/software/data/rates.txt overwrite into table rates;
select * from rates;
app0    1
app1    2
app2    2
app3    3
app4    3
app5    3
app6    5
app7    5
app8    5
app9    5
create table app_ranks as
  select app_name
       , star_rates
       , NTILE(3) OVER (ORDER BY star_rates DESC) as nt
       , row_number() OVER (ORDER BY star_rates DESC) as rn
       , rank() OVER (ORDER BY star_rates DESC) as rk
       , dense_rank() OVER (ORDER BY star_rates DESC) as drk
       , CUME_DIST() OVER (ORDER BY star_rates) as cd
       , PERCENT_RANK() OVER (ORDER BY star_rates) as pr
  from rates
  order by star_rates desc;
select * from app_ranks;

app_name        star_rates      nt      rn      rk      drk     cd      pr
app9    5       1       1       1       1       1.0     0.6666666666666666
app8    5       1       2       1       1       1.0     0.6666666666666666
app7    5       1       3       1       1       1.0     0.6666666666666666
app6    5       1       4       1       1       1.0     0.6666666666666666
app5    3       2       5       5       2       0.6     0.3333333333333333
app4    3       2       6       5       2       0.6     0.3333333333333333
app3    3       2       7       5       2       0.6     0.3333333333333333
app2    2       3       8       8       3       0.3     0.1111111111111111
app1    2       3       9       8       3       0.3     0.1111111111111111
app0    1       3       10      10      4       0.1     0.0
select app_name, star_rates, sum(cd) OVER (PARTITION BY star_rates ORDER BY rn ROWS BETWEEN UNBOUNDED PRECEDING AND CURRENT ROW) from app_ranks;
app0    1       0.1
app2    2       0.3
app1    2       0.6
app5    3       0.6
app4    3       1.2
app3    3       1.7999999999999998
app9    5       1.0
app8    5       2.0
app7    5       3.0
app6    5       4.0

 

select app_name, star_rates, sum(cd) OVER (PARTITION BY star_rates ORDER BY rn ROWS BETWEEN CURRENT ROW AND UNBOUNDED FOLLOWING) from app_ranks;
app0    1       0.1
app2    2       0.6
app1    2       0.3
app5    3       1.7999999999999998
app4    3       1.2
app3    3       0.6
app9    5       4.0
app8    5       3.0
app7    5       2.0
app6    5       1.0
select app_name, star_rates, sum(cd) OVER (PARTITION BY star_rates ORDER BY rn ROWS BETWEEN CURRENT ROW AND 1 FOLLOWING) from app_ranks;
app0    1       0.1
app2    2       0.6
app1    2       0.3
app5    3       1.2
app4    3       1.2
app3    3       0.6
app9    5       2.0
app8    5       2.0
app7    5       2.0
app6    5       1.0
select app_name, star_rates, rn, lead(rn) OVER (PARTITION BY star_rates ORDER BY rn), lag(rn) OVER (PARTITION BY star_rates ORDER BY rn) from app_ranks;
app0    1       10      NULL    NULL
app2    2       8       9       NULL
app1    2       9       NULL    8
app5    3       5       6       NULL
app4    3       6       7       5
app3    3       7       NULL    6
app9    5       1       2       NULL
app8    5       2       3       1
app7    5       3       4       2
app6    5       4       NULL    3

 

 

CREATE EXTERNAL TABLE visitors (
    domain  STRING 
  , month   STRING
  , visitor STRING
)
ROW FORMAT DELIMITED FIELDS TERMINATED BY \t;
load data local inpath /home/spark/software/data/visitors.txt overwrite into table visitors;
select * from visitors where domain = d001;
d001    201301  101
d001    201302  111
d001    201303  121
d001    201304  131
d001    201305  141
d001    201306  151
d001    201307  201
d001    201308  211
d001    201309  221
d001    201310  231
d001    201311  241
d001    201312  301
d001    201401  301
d001    201402  211
d001    201403  271
d001    201404  331
d001    201405  351
select domain
     , month
     , visitor
     , first_value(visitor) OVER (PARTITION BY domain ORDER BY month DESC)
     , last_value(visitor) OVER (PARTITION BY domain ORDER BY month DESC)
     , lead(visitor) OVER (PARTITION BY domain ORDER BY month DESC)
     , lag(visitor) OVER (PARTITION BY domain ORDER BY month DESC)
from visitors
where domain = d001;
d001    201405  351     351     351     331     NULL
d001    201404  331     351     331     271     351
d001    201403  271     351     271     211     331
d001    201402  211     351     211     301     271
d001    201401  301     351     301     301     211
d001    201312  301     351     301     241     301
d001    201311  241     351     241     231     301
d001    201310  231     351     231     221     241
d001    201309  221     351     221     211     231
d001    201308  211     351     211     201     221
d001    201307  201     351     201     151     211
d001    201306  151     351     151     141     201
d001    201305  141     351     141     131     151
d001    201304  131     351     131     121     141
d001    201303  121     351     121     111     131
d001    201302  111     351     111     101     121
d001    201301  101     351     101     NULL    111
select domain
     , month
     , visitor
     , first_value(visitor) OVER (PARTITION BY domain ORDER BY month DESC)
     , last_value(visitor) OVER (PARTITION BY domain ORDER BY month DESC)
     , lead(visitor, 1, 0) OVER (PARTITION BY domain ORDER BY month DESC)
     , lag(visitor, 1, 0) OVER (PARTITION BY domain ORDER BY month DESC)
     , lead(visitor, 12, 0) OVER (PARTITION BY domain ORDER BY month DESC)
     , lag(visitor, 12, 0) OVER (PARTITION BY domain ORDER BY month DESC)
from visitors
where domain = d001;
d001    201405  351     351     351     331     0       141     0
d001    201404  331     351     331     271     351     131     0
d001    201403  271     351     271     211     331     121     0
d001    201402  211     351     211     301     271     111     0
d001    201401  301     351     301     301     211     101     0
d001    201312  301     351     301     241     301     0       0
d001    201311  241     351     241     231     301     0       0
d001    201310  231     351     231     221     241     0       0
d001    201309  221     351     221     211     231     0       0
d001    201308  211     351     211     201     221     0       0
d001    201307  201     351     201     151     211     0       0
d001    201306  151     351     151     141     201     0       0
d001    201305  141     351     141     131     151     0       351
d001    201304  131     351     131     121     141     0       331
d001    201303  121     351     121     111     131     0       271
d001    201302  111     351     111     101     121     0       211
d001    201301  101     351     101     0       111     0       301
create table visitors_report as
  select domain
       , month
       , visitor
       , lead(visitor, 1, 0) OVER (PARTITION BY domain ORDER BY month DESC)            as last_mon
       , visitor - lead(visitor, 1, 0) OVER (PARTITION BY domain ORDER BY month DESC)  as delta_mon
       , lead(visitor, 12, 0) OVER (PARTITION BY domain ORDER BY month DESC)           as last_year
       , visitor - lead(visitor, 12, 0) OVER (PARTITION BY domain ORDER BY month DESC) as delta_year
  from visitors
;
select * from visitors_report where domain = d001 and month > 2014;
d001    201405  351     331     20.0    141     210.0
d001    201404  331     271     60.0    131     200.0
d001    201403  271     211     60.0    121     150.0
d001    201402  211     301     -90.0   111     100.0
d001    201401  301     301     0.0     101     200.0
select month
     , domain
     , visitor
     , last_mon
     , last_year
from visitors_report
where (domain = d001 or domain = d002) and month > 2014
order by month desc, domain asc
limit 100;
201405  d001    351     331     141
201405  d002    352     332     142
201404  d001    331     271     131
201404  d002    332     272     132
201403  d001    271     211     121
201403  d002    272     212     122
201402  d001    211     301     111
201402  d002    212     302     112
201401  d001    301     301     101
201401  d002    302     302     102
select month
     , domain
     , visitor
     , max(visitor) OVER (PARTITION BY month) as max_visitors
     , min(visitor) OVER (PARTITION BY month) as min_visitors
from visitors
where month > 2014
order by month desc, domain asc;
201405  d001    351     353     351
201405  d002    352     353     351
201405  d003    353     353     351
201404  d001    331     333     331
201404  d002    332     333     331
201404  d003    333     333     331
201403  d001    271     273     271
201403  d002    272     273     271
201403  d003    273     273     271
201402  d001    211     213     211
201402  d002    212     213     211
201402  d003    213     213     211
201401  d001    301     303     301
201401  d002    302     303     301
201401  d003    303     303     301
select *
from (
select month
     , domain
     , visitor
     , max(visitor) OVER (PARTITION BY domain ORDER BY month DESC ROWS BETWEEN CURRENT ROW AND 12 FOLLOWING) as max_visitors_last_12_mon
     , min(visitor) OVER (PARTITION BY domain ORDER BY month DESC ROWS BETWEEN CURRENT ROW AND 12 FOLLOWING) as min_visitors_last_12_mon
from visitors
) v
where month > 20131
order by month desc, domain asc;
201405  d001    351     351     141
201405  d002    352     352     142
201405  d003    353     353     143
201404  d001    331     331     131
201404  d002    332     332     132
201404  d003    333     333     133
201403  d001    271     301     121
201403  d002    272     302     122
201403  d003    273     303     123
201402  d001    211     301     111
201402  d002    212     302     112
201402  d003    213     303     113
201401  d001    301     301     101
201401  d002    302     302     102
201401  d003    303     303     103
201312  d001    301     301     101
201312  d002    302     302     102
201312  d003    303     303     103
201311  d001    241     241     101
201311  d002    242     242     102
201311  d003    243     243     103
201310  d001    231     231     101
201310  d002    232     232     102
201310  d003    233     233     103

 

Hive综合案例分析之开窗函数使用

标签:des   style   blog   color   io   使用   ar   for   2014   

原文地址:http://www.cnblogs.com/luogankun/p/3962761.html

(0)
(0)
   
举报
评论 一句话评论(0
登录后才能评论!
© 2014 mamicode.com 版权所有  联系我们:gaon5@hotmail.com
迷上了代码!