码迷,mamicode.com
首页 > 其他好文 > 详细

Hive综合案例分析之简易推荐系统

时间:2014-09-09 15:27:09      阅读:1106      评论:0      收藏:0      [点我收藏+]

标签:des   style   blog   color   io   使用   ar   for   数据   

知识点:

1、Hive复合数据类型map与Lateral View的使用;

  map、str_to_map、map_keys、map_values,map与lateral view

2、通过translate进行简单数据保护;

  Hive转换函数进行数据保护,确保企业应用信息安全

3、Hive的窗口和分析函数入门;

  row_number、rank、dense_rank

 

创建订单表:

-- Order fact table: one row per order; the purchased items are kept in a map
-- from item SKU to quantity.
-- Text layout per line: fields separated by TAB, map entries separated by '|',
-- and each map key separated from its value by ':'.
-- The delimiters must be quoted string literals for the statement to parse.
CREATE EXTERNAL TABLE f_orders (
    user_id   STRING
  , ts        STRING
  , order_id  STRING
  , items     map<STRING,BIGINT>
)
ROW FORMAT DELIMITED
FIELDS TERMINATED BY '\t'
COLLECTION ITEMS TERMINATED BY '|'
MAP KEYS TERMINATED BY ':';

 加载数据:

-- Load the local sample file into f_orders, replacing any existing rows.
-- The file path must be a quoted string literal.
load data local inpath '/home/spark/software/data/f_orders.txt' overwrite into table f_orders;

 查询数据:

-- Show every row of f_orders to check the load.
SELECT *
FROM f_orders;
11      2014-05-01 06:01:12.334+01      10703007267488  {"item8":2,"item1":1}
22      2014-05-01 07:28:12.342+01      10101043505096  {"item6":3,"item3":2}
33      2014-05-01 07:50:12.33+01       10103043509747  {"item7":7}
11      2014-05-01 09:27:12.33+01       10103043501575  {"item5":5,"item1":1,"item4":1,"item9":1}
22      2014-05-01 09:03:12.324+01      10104043514061  {"item1":3}
33      2014-05-02 19:10:12.343+01      11003002067594  {"item4":2,"item1":1}
11      2014-05-02 09:07:12.344+01      10101043497459  {"item9":1}
35      2014-05-03 11:07:12.339+01      10203019269975  {"item5":1,"item1":1}
789     2014-05-03 12:59:12.743+01      10401003346256  {"item7":3,"item8":2,"item9":1}
77      2014-05-03 18:04:12.355+01      10203019262235  {"item5":2,"item1":1}
99      2014-05-04 00:36:39.713+01      10103044681799  {"item9":3,"item1":1}
33      2014-05-04 19:10:12.343+01      12345678901234  {"item5":1,"item1":1}
11      2014-05-05 09:07:12.344+01      12345678901235  {"item6":1,"item1":1}
35      2014-05-05 11:07:12.339+01      12345678901236  {"item5":2,"item1":1}
22      2014-05-05 12:59:12.743+01      12345678901237  {"item9":3,"item1":1}
77      2014-05-05 18:04:12.355+01      12345678901238  {"item8":3,"item1":1}
99      2014-05-05 20:36:39.713+01      12345678901239  {"item9":3,"item1":1}

从map中取值:map_keys, map_values

-- Split the items map of user 35's orders into its key array (SKUs) and its
-- value array (quantities).
-- user_id is declared STRING, so it is compared with a string literal rather
-- than relying on implicit string-to-number conversion.
SELECT
    map_keys(items),
    map_values(items)
FROM f_orders
WHERE user_id = '35';
["item5","item1"]       [1,1]
["item5","item1"]       [2,1]

查询订单条目中包含item8的订单

-- Orders whose items map has the key 'item8'.
-- The searched key is a string literal and must be quoted; unquoted it would
-- be read as a column reference.
SELECT *
FROM f_orders
WHERE array_contains(map_keys(items), 'item8');
11      2014-05-01 06:01:12.334+01      10703007267488  {"item1":1,"item8":2}
789     2014-05-03 12:59:12.743+01      10401003346256  {"item7":3,"item8":2,"item9":1}
77      2014-05-05 18:04:12.355+01      12345678901238  {"item1":1,"item8":3}

将f_orders中items列打开成横向视图

-- Flatten the items map: one output row per (order, item) with its quantity.
SELECT
    user_id,
    order_id,
    item,
    amount
FROM f_orders
LATERAL VIEW explode(items) exploded AS item, amount;
11      10703007267488  item8   2
11      10703007267488  item1   1
22      10101043505096  item6   3
22      10101043505096  item3   2
33      10103043509747  item7   7
11      10103043501575  item5   5
11      10103043501575  item1   1
11      10103043501575  item4   1
11      10103043501575  item9   1
22      10104043514061  item1   3
33      11003002067594  item4   2
33      11003002067594  item1   1
11      10101043497459  item9   1
35      10203019269975  item5   1
35      10203019269975  item1   1
789     10401003346256  item7   3
789     10401003346256  item8   2
789     10401003346256  item9   1
77      10203019262235  item5   2
77      10203019262235  item1   1
99      10103044681799  item9   3
99      10103044681799  item1   1
33      12345678901234  item5   1
33      12345678901234  item1   1
11      12345678901235  item6   1
11      12345678901235  item1   1
35      12345678901236  item5   2
35      12345678901236  item1   1
22      12345678901237  item9   3
22      12345678901237  item1   1
77      12345678901238  item8   3
77      12345678901238  item1   1
99      12345678901239  item9   3
99      12345678901239  item1   1

创建订单条目表:

-- Item dimension table: SKU, unit price and the list of catalogs the item
-- belongs to.
-- Text layout per line: fields separated by TAB, array elements separated by '|'.
-- The delimiters must be quoted string literals for the statement to parse.
CREATE EXTERNAL TABLE d_items (
  item_sku  STRING,
  price     DOUBLE,
  catalogs  array<STRING>
)
ROW FORMAT DELIMITED
FIELDS TERMINATED BY '\t'
COLLECTION ITEMS TERMINATED BY '|';

加载数据:

-- Load the local sample file into d_items, replacing any existing rows.
-- The file path must be a quoted string literal.
load data local inpath '/home/spark/software/data/d_items.txt' overwrite into table d_items;

查询数据:

-- Show every row of d_items to check the load.
SELECT *
FROM d_items;
item1   100.2   ["catalogA","catalogD","catalogX"]
item2   200.3   ["catalogA"]
item3   300.4   ["catalogA","catalogX"]
item4   400.5   ["catalogB"]
item5   500.6   ["catalogB","catalogX"]
item6   600.7   ["catalogB"]
item7   700.8   ["catalogC"]
item8   800.9   ["catalogC","catalogD"]
item9   899.99  ["catalogC","catalogA"]

求每个人的每个订单的金额

-- Total price of each order per user: flatten the items map into one row per
-- (order, item), look up each item's unit price in d_items, then sum
-- price * quantity and round to two decimals.
SELECT
    order_items.user_id,
    order_items.order_id,
    round(sum(d.price * order_items.amount), 2) AS order_price
FROM (
    SELECT user_id, order_id, item, amount
    FROM f_orders
    LATERAL VIEW explode(items) exploded AS item, amount
) order_items
JOIN d_items d
    ON (order_items.item = d.item_sku)
GROUP BY
    order_items.user_id,
    order_items.order_id;

11      10101043497459  899.99
11      10103043501575  3903.69
11      10703007267488  1702.0
11      12345678901235  700.9
22      10101043505096  2402.9
22      10104043514061  300.6
22      12345678901237  2800.17
33      10103043509747  4905.6
33      11003002067594  901.2
33      12345678901234  600.8
35      10203019269975  600.8
35      12345678901236  1101.4
77      10203019262235  1101.4
77      12345678901238  2502.9
789     10401003346256  4604.19
99      10103044681799  2800.17
99      12345678901239  2800.17

求用户、订单条目与条目数量的对应关系

-- One row per (user, item) pair with the purchased quantity, obtained by
-- flattening the items map of every order.
SELECT
    user_id,
    item,
    amount
FROM f_orders
LATERAL VIEW explode(items) exploded AS item, amount;
11      item8   2
11      item1   1
22      item6   3
22      item3   2
33      item7   7
11      item5   5
11      item1   1
11      item4   1
11      item9   1
22      item1   3
33      item4   2
33      item1   1
11      item9   1
35      item5   1
35      item1   1
789     item7   3
789     item8   2
789     item9   1
77      item5   2
77      item1   1
99      item9   3
99      item1   1
33      item5   1
33      item1   1
11      item6   1
11      item1   1
35      item5   2
35      item1   1
22      item9   3
22      item1   1
77      item8   3
77      item1   1
99      item9   3
99      item1   1

订单条目与类别(类别打散后)的关系

-- One row per (item, catalog) pair, obtained by flattening the catalogs array.
SELECT
    item_sku,
    catalog
FROM d_items
LATERAL VIEW explode(catalogs) exploded AS catalog;
item1   catalogA
item1   catalogD
item1   catalogX
item2   catalogA
item3   catalogA
item3   catalogX
item4   catalogB
item5   catalogB
item5   catalogX
item6   catalogB
item7   catalogC
item8   catalogC
item8   catalogD
item9   catalogC
item9   catalogA

用户、订单条目、条目数量与类别(类别打散后)的对应关系

-- Attach every catalog of an item to each purchased (user, item, quantity)
-- row: both the items map and the catalogs array are flattened first and then
-- joined on the item SKU.
SELECT
    order_items.user_id,
    order_items.item,
    order_items.amount,
    item_cats.catalog
FROM (
    SELECT user_id, item, amount
    FROM f_orders
    LATERAL VIEW explode(items) exploded AS item, amount
) order_items
JOIN (
    SELECT item_sku, catalog
    FROM d_items
    LATERAL VIEW explode(catalogs) exploded AS catalog
) item_cats
    ON (order_items.item = item_cats.item_sku);
11      item8   2       catalogC
11      item8   2       catalogD
11      item1   1       catalogA
11      item1   1       catalogD
11      item1   1       catalogX
22      item6   3       catalogB
22      item3   2       catalogA
22      item3   2       catalogX
33      item7   7       catalogC
11      item5   5       catalogB
11      item5   5       catalogX
11      item1   1       catalogA
11      item1   1       catalogD
11      item1   1       catalogX
11      item4   1       catalogB
11      item9   1       catalogC
11      item9   1       catalogA
22      item1   3       catalogA
22      item1   3       catalogD
22      item1   3       catalogX
33      item4   2       catalogB
33      item1   1       catalogA
33      item1   1       catalogD
33      item1   1       catalogX
11      item9   1       catalogC
11      item9   1       catalogA
35      item5   1       catalogB
35      item5   1       catalogX
35      item1   1       catalogA
35      item1   1       catalogD
35      item1   1       catalogX
789     item7   3       catalogC
789     item8   2       catalogC
789     item8   2       catalogD
789     item9   1       catalogC
789     item9   1       catalogA
77      item5   2       catalogB
77      item5   2       catalogX
77      item1   1       catalogA
77      item1   1       catalogD
77      item1   1       catalogX
99      item9   3       catalogC
99      item9   3       catalogA
99      item1   1       catalogA
99      item1   1       catalogD
99      item1   1       catalogX
33      item5   1       catalogB
33      item5   1       catalogX
33      item1   1       catalogA
33      item1   1       catalogD
33      item1   1       catalogX
11      item6   1       catalogB
11      item1   1       catalogA
11      item1   1       catalogD
11      item1   1       catalogX
35      item5   2       catalogB
35      item5   2       catalogX
35      item1   1       catalogA
35      item1   1       catalogD
35      item1   1       catalogX
22      item9   3       catalogC
22      item9   3       catalogA
22      item1   1       catalogA
22      item1   1       catalogD
22      item1   1       catalogX
77      item8   3       catalogC
77      item8   3       catalogD
77      item1   1       catalogA
77      item1   1       catalogD
77      item1   1       catalogX
99      item9   3       catalogC
99      item9   3       catalogA
99      item1   1       catalogA
99      item1   1       catalogD
99      item1   1       catalogX

将结果写到usr_cat_weight表中

-- Materialize the per-user catalog weight: for every (user, catalog) pair the
-- weight is the total quantity of purchased items belonging to that catalog.
-- Rows are written ordered by user and then by descending weight.
CREATE TABLE usr_cat_weight AS
SELECT
    order_items.user_id,
    item_cats.catalog,
    sum(order_items.amount) AS weight
FROM (
    SELECT user_id, item, amount
    FROM f_orders
    LATERAL VIEW explode(items) exploded AS item, amount
) order_items
JOIN (
    SELECT item_sku, catalog
    FROM d_items
    LATERAL VIEW explode(catalogs) exploded AS catalog
) item_cats
    ON (order_items.item = item_cats.item_sku)
GROUP BY
    order_items.user_id,
    item_cats.catalog
ORDER BY
    user_id,
    weight DESC;
-- Show the stored per-user catalog weights.
SELECT *
FROM usr_cat_weight;
11      catalogX        8
11      catalogB        7
11      catalogD        5
11      catalogA        5
11      catalogC        4
22      catalogA        9
22      catalogX        6
22      catalogD        4
22      catalogB        3
22      catalogC        3
33      catalogC        7
33      catalogX        3
33      catalogB        3
33      catalogA        2
33      catalogD        2
35      catalogX        5
35      catalogB        3
35      catalogA        2
35      catalogD        2
77      catalogD        5
77      catalogX        4
77      catalogC        3
77      catalogA        2
77      catalogB        2
789     catalogC        6
789     catalogD        2
789     catalogA        1
99      catalogA        8
99      catalogC        6
99      catalogD        2
99      catalogX        2

row_number: 行号

-- row_number(): numbers each user's catalogs 1, 2, 3, ... by descending
-- weight; rows with equal weight still receive distinct numbers.
-- NOTE(review): user_id originates from a STRING column, so the numeric
-- comparison below relies on implicit type conversion — confirm this is intended.
SELECT
    user_id,
    catalog,
    weight,
    row_number() OVER (PARTITION BY user_id ORDER BY weight DESC) AS row_num
FROM usr_cat_weight
WHERE user_id < 33;
11      catalogX        8       1
11      catalogB        7       2
11      catalogA        5       3
11      catalogD        5       4
11      catalogC        4       5
22      catalogA        9       1
22      catalogX        6       2
22      catalogD        4       3
22      catalogC        3       4
22      catalogB        3       5

rank:相同的值排名相同,其后的排名会跳过被并列占用的名次

-- rank(): equal weights share the same rank and the following rank is skipped
-- (e.g. 1, 2, 3, 3, 5).
-- NOTE(review): user_id originates from a STRING column, so the numeric
-- comparison below relies on implicit type conversion — confirm this is intended.
SELECT
    user_id,
    catalog,
    weight,
    rank() OVER (PARTITION BY user_id ORDER BY weight DESC) AS rnk
FROM usr_cat_weight
WHERE user_id < 33;
11      catalogX        8       1
11      catalogB        7       2
11      catalogA        5       3
11      catalogD        5       3
11      catalogC        4       5
22      catalogA        9       1
22      catalogX        6       2
22      catalogD        4       3
22      catalogC        3       4
22      catalogB        3       4

dense_rank:相同的值排名相同,其后的排名连续、不会跳号

-- dense_rank(): equal weights share the same rank and no rank is skipped
-- afterwards (e.g. 1, 2, 3, 3, 4).
-- NOTE(review): user_id originates from a STRING column, so the numeric
-- comparison below relies on implicit type conversion — confirm this is intended.
SELECT
    user_id,
    catalog,
    weight,
    dense_rank() OVER (PARTITION BY user_id ORDER BY weight DESC) AS drnk
FROM usr_cat_weight
WHERE user_id < 33;
11      catalogX        8       1
11      catalogB        7       2
11      catalogA        5       3
11      catalogD        5       3
11      catalogC        4       4
22      catalogA        9       1
22      catalogX        6       2
22      catalogD        4       3
22      catalogC        3       4
22      catalogB        3       4
-- Materialize each user's catalog ranking: aggregate the purchased quantity
-- per (user, catalog), then number the catalogs of every user from the
-- highest weight to the lowest.
-- NOTE(review): the ORDER BY inside the subquery looks redundant because the
-- window function applies its own ordering — confirm before removing it.
CREATE TABLE usr_cat AS
SELECT
    user_id,
    catalog,
    row_number() OVER (PARTITION BY user_id ORDER BY weight DESC) AS row_num
FROM (
    SELECT
        order_items.user_id,
        item_cats.catalog,
        sum(order_items.amount) AS weight
    FROM (
        SELECT user_id, item, amount
        FROM f_orders
        LATERAL VIEW explode(items) exploded AS item, amount
    ) order_items
    JOIN (
        SELECT item_sku, catalog
        FROM d_items
        LATERAL VIEW explode(catalogs) exploded AS catalog
    ) item_cats
        ON (order_items.item = item_cats.item_sku)
    GROUP BY
        order_items.user_id,
        item_cats.catalog
    ORDER BY
        user_id,
        weight
) weights
ORDER BY
    user_id,
    row_num;
-- Show the stored per-user catalog ranking.
SELECT *
FROM usr_cat;
11      catalogX        1
11      catalogB        2
11      catalogA        3
11      catalogD        4
11      catalogC        5
22      catalogA        1
22      catalogX        2
22      catalogD        3
22      catalogC        4
22      catalogB        5
33      catalogC        1
33      catalogB        2
33      catalogX        3
33      catalogD        4
33      catalogA        5
35      catalogX        1
35      catalogB        2
35      catalogA        3
35      catalogD        4
77      catalogD        1
77      catalogX        2
77      catalogC        3
77      catalogA        4
77      catalogB        5
789     catalogC        1
789     catalogD        2
789     catalogA        3
99      catalogA        1
99      catalogC        2
99      catalogD        3
99      catalogX        4

创建用户表:

-- User dimension table: id, gender, birthday, e-mail and registration day,
-- all stored as strings.
-- Fields are separated by ';', written as the octal escape \073 (presumably
-- because a literal ';' would end the statement in the Hive CLI — confirm).
-- The delimiter must be a quoted string literal for the statement to parse.
CREATE EXTERNAL TABLE d_users (
    user_id  STRING
  , gender   STRING
  , birthday STRING
  , email    STRING
  , regday   STRING
)
ROW FORMAT DELIMITED FIELDS TERMINATED BY '\073';

加载数据:

-- Load the local sample file into d_users, replacing any existing rows.
-- The file path must be a quoted string literal.
load data local inpath '/home/spark/software/data/d_users.txt' overwrite into table d_users;

查询:

-- Show every row of d_users to check the load.
SELECT *
FROM d_users;
11      m       1981-01-01      user11@gmail.com        2014-04-21
22      w       1982-01-01      user22@abcn.net 2014-04-22
33      m       1983-01-01      user33@fxlive.de        2014-04-23
77      w       1977-01-01      user77@fxlive.fr        2014-05-01
88      m       1988-01-01      user88@fxlive.eu        2014-05-02
99      w       1999-01-01      user99@abcn.net 2014-05-03
789     m       2008-01-01      admin@abcn.net  2014-05-03

Hive转换函数translate进行简单数据保护

-- Simple data masking with translate(): each character of the input that
-- occurs in the second argument is replaced by the character at the same
-- position in the third argument (e.g. '1981-01-01' becomes '2092-12-12').
-- Both mapping arguments are string literals and must be quoted; unquoted
-- they would be read as numbers or column names.
SELECT
    user_id,
    birthday,
    translate(birthday, '0123456789', '1234567890'),
    email,
    translate(email, 'userfxgmail1234567890', '1234567890userfxgmail')
FROM d_users;
11      1981-01-01      2092-12-12      user11@gmail.com        1234ss@7890u.co8
22      1982-01-01      2093-12-12      user22@abcn.net 1234ee@9bcn.n3t
33      1983-01-01      2094-12-12      user33@fxlive.de        1234rr@56u0v3.d3
77      1977-01-01      2088-12-12      user77@fxlive.fr        1234mm@56u0v3.54
88      1988-01-01      2099-12-12      user88@fxlive.eu        1234aa@56u0v3.31
99      1999-01-01      2000-12-12      user99@abcn.net 1234ii@9bcn.n3t
789     2008-01-01      3119-12-12      admin@abcn.net  9d80n@9bcn.n3t

 

Hive综合案例分析之简易推荐系统

标签:des   style   blog   color   io   使用   ar   for   数据   

原文地址:http://www.cnblogs.com/luogankun/p/3962377.html

(0)
(0)
   
举报
评论 一句话评论(0
登录后才能评论!
© 2014 mamicode.com 版权所有  联系我们:gaon5@hotmail.com
迷上了代码!