码迷,mamicode.com
首页 > 其他好文 > 详细

电影表

时间:2019-05-13 16:15:32      阅读:103      评论:0      收藏:0      [点我收藏+]

标签:field   div   准备   insert   cti   com   要求   sqoop   最大的   

技术图片

movies表
电影id,电影名称,电影类型

ratings表
用户id,电影id,评分,评论时间(毫秒值)

users表
用户id,性别,年龄段,工作类型,电话

age表
年龄段,年龄范围

job表
工作类型,工作岗位名称


1.清洗数据 mr

2.hive计算
表结构准备 外部表

create external table age(
aid string,
agescore string
)
row format delimited fields terminated by \t
lines terminated by \n
stored as textfile
location /project/age;

create external table job(
jid string,
jobname string
)
row format delimited fields terminated by \t
lines terminated by \n
stored as textfile
location /project/job;

create external table movies(
mid string,
mname string,
type string
)
row format delimited fields terminated by ,
lines terminated by \n
stored as textfile
location /project/movies;

create external table ratings(
uid string,
mid string,
score int,
mytime string
)
row format delimited fields terminated by ,
lines terminated by \n
stored as textfile
location /project/ratings;


create external table users(
uid string,
sex string,
aid string,
jid string,
tel string
)
row format delimited fields terminated by ,
lines terminated by \n
stored as textfile
location /project/users;


要求:
1.算出所有电影的平均评分。
每个电影的平均分
select m.mname,round(t.avg,2)
from
movies m,
(select avg(score) avg,mid from ratings group by mid)t
where m.mid = t.mid




中间过程  含有有效 和 脏  score is null
create table result1(
mid string,
mname string,
score double
)
row format delimited fields terminated by \t
lines terminated by \n
stored as textfile;

//3883  3882 score  4000 null
insert into result1
select mov.mid,mov.mname,t1.score
from movies mov left join(
   select mid,round(avg(score),1) score
   from ratings
   group by mid) t1 on mov.mid=t1.mid;

//hive和mysql 不同 update 并且 最不值钱的就是 表
create table result2(
mid string,
mname string,
score double
)
row format delimited fields terminated by \t
lines terminated by \n
stored as textfile;

insert into table result2
select mid,mname,score
from result1
where score is not null;

insert into table result2
select mid,mname,0
from result10
where score is null;



2.算出所有用户对那些类型的电影最感兴趣,对每个用户推送该类型前三名评分最高的电影。
每个用户最喜欢哪个类型的电影
row_number()
ratings表中 一个电影出现几次?
一个电影有几个类型?
类型在movies   评分或者观看次数ratings

movie join ratings

select t5.uid,t5.lx,m.mname
from
(select t3.uid,t3.lx,t4.mid
from
(select t2.uid,t2.lx
from
(select row_number() over (partition by t1.uid order by t1.cnt desc) rn,t1.uid,t1.lx
from
(select count(*) cnt,r.uid,t.lx
from
(select mid,mname,lx from movies lateral view explode(split(type,"\\|")) types as lx)t
,ratings r
where r.mid=t.mid
group by r.uid,t.lx)t1)t2
where t2.rn=1)t3
,
(select t2.lx,t2.mid 
from
(select row_number() over (partition by t1.lx order by t1.cnt desc) rn,t1.lx,t1.mid
from
(select count(*) cnt,t.lx,r.mid
from
(select mid,mname,lx from movies lateral view explode(split(type,"\\|")) types as lx)t
,
ratings r
where r.mid = t.mid
group by t.lx,r.mid)t1)t2
where t2.rn<4)t4
where t3.lx =t4.lx)t5
,movies m
where t5.mid=m.mid 


每个类型最受欢迎的前三个电影

类型== 类型










select t3.uid,t3.lx,t4.mid
from
(select t2.lx,t2.mid,t2.cnt
from
(select row_number() over (partition by t1.lx order by t1.cnt desc)rn,t1.lx,t1.mid,t1.cnt
from
(select count(*) cnt,t.lx,t.mid
from ratings r,
(select mid,mname,lx from movies lateral view explode(split(type,"\\|")) types as lx) t
where r.mid = t.mid
group by t.lx,t.mid)t1)t2
where t2.rn<4)t4
join
(select t2.lx,t2.uid,t2.cnt
from
(select row_number() over (partition by t1.uid order by t1.cnt desc)rn,t1.lx,t1.uid,t1.cnt
from
(select count(*) cnt,t.lx,r.uid
from ratings r,
(select mid,mname,lx from movies lateral view explode(split(type,"\\|")) types as lx) t
where r.mid = t.mid
group by t.lx,r.uid)t1)t2
where t2.rn=1)t3
where t3.lx = t4.lx

//hive 最不值钱的是表 一半 人 和 类型
create table result3(
uid string,
type string
)
row format delimited fields terminated by \t
lines terminated by \n
stored as textfile;

insert into result3
select users.uid,t4.type
from(
select t3.uid uid,t3.type type
   from(
      select t2.uid uid,t2.type type,t2.num num,row_number() over(partition by t2.uid order by num desc) rk
      from(
         select t1.uid uid,t1.type type,count(*) num
         from(
            select rat.uid uid,mov.type type
            from ratings rat,movies mov
            where rat.mid=mov.mid) t1
         group by t1.uid,t1.type) t2) t3
   where t3.rk=1) t4,users
where t4.uid=users.uid;

create table result4(
type string,
mname string
)
row format delimited fields terminated by \t
lines terminated by \n
stored as textfile;

insert into result4
select t2.type,t2.mname
from(
select t1.type type,t1.mname mname,row_number() over(partition by t1.type order by t1.score desc) rk
from(
   select mov.type type,mov.mname mname,avg(rat.score) score
   from ratings rat,movies mov
   where rat.mid=mov.mid
   group by mov.type,mov.mname) t1 ) t2
where t2.rk<4;   

select res3.uid,res4.mname
from result3 res3,result4 res4
where res3.type=res4.type;

3.每个年龄层次,前3种最受欢迎的电影类型。
select t3.agescore,t3.lx,t3.cnt
from
(select row_number() over (partition by t2.agescore order by t2.cnt desc) rn,t2.agescore,t2.lx,t2.cnt
from
(select t.lx,r.agescore,count(*) cnt
from
(select mid,mname,lx from movies lateral view explode(split(type,"\\|")) types as lx) t
,ratings r
where t.mid = r.mid)t1,users u,age a
where u.uid=t1.uid and u.aid=a.aid
group by a.agescore,t1.lx)t2)t3
where t3.rn<=3






movies ratings users age


ratings users movie
uid mid    aid,type

基础 方式 两两关联 没有优化 job数量是最大的
select age.agescore,t4.type
from(
select t3.aid aid,t3.type type
from(
   select t2.aid aid,t2.type type,t2.num num,row_number() over(partition by t2.aid order by num desc) rk
   from(
      select t1.aid aid,t1.type type,count(*) num
      from(
         select users.aid aid,mov.type type
         from ratings rat,users,movies mov
         where rat.uid=users.uid and rat.mid=mov.mid) t1
      group by t1.aid,t1.type) t2 ) t3
where t3.rk<4) t4,age
where t4.aid=age.aid;

最终目标 1sql 做所有
sql编程 sql语句由复杂的sql嵌套组成  能在一次sql当中 尽量的做做些事情 hive-》job 分布式-》提交 计算资源(数据在哪 需要哪些资源)->run

create table result5(
agescore string,
type string
)
row format delimited fields terminated by \t
lines terminated by \n
stored as textfile;

insert into result5
select age.agescore,t2.type
from(
   select t1.aid aid,t1.type type,t1.num num,row_number() over(partition by t1.aid order by t1.num desc) rk
   from(
      select users.aid aid,mov.type type,count(*) num
      from ratings rat,users,movies mov
      where rat.uid=users.uid and rat.mid=mov.mid
      group by users.aid,mov.type) t1 ) t2,age
where t2.rk<4 and t2.aid=age.aid;



4.算出所有电影都有那些工作岗位的人评论过,相对应的人数是多少?

select count(*),j.jobname,r.mid
from
ratings r,users u,job j
where r.uid = u.uid and j.jid=u.jid
group by j.jobname,r.mid limit 10;











rat->user->job

select t1.mname,t1.jobname,count(*) num
from(
   select job.jobname jobname,mov.mname
   from ratings rat,users,job,movies mov
   where rat.uid=users.uid and users.jid=job.jid and rat.mid=mov.mid) t1
group by t1.jobname,t1.mname
order by t1.mname,num desc;

from
where
group by
having
select
order by
limit

5.根据电影上映时间,得出当年最受欢迎和最不受欢迎的 10个名单
select t5.year,m1.mname,t5.cnt1,m2.mname,t5.cnt2,t5.rn
from
(select t3.year,t3.mid mid1,t3.cnt cnt1,t4.mid mid2,t4.cnt cnt2,t3.rn
from
(select t2.year,t2.mid,t2.cnt,t2.rn
from
(select row_number() over (partition by t1.year order by t1.cnt desc) rn, t1.year,t1.mid,t1.cnt
from
(select getDate(m.mname) year,r.mid,count(*) cnt
from movies m,ratings r
where m.mid = r.mid
group by getDate(m.mname),r.mid)t1)t2
where t2.rn<11)t3
,
(select t2.year,t2.mid,t2.cnt,t2.rn
from
(select row_number() over (partition by t1.year order by t1.cnt )rn, t1.year,t1.mid,t1.cnt
from
(select getDate(m.mname) year,r.mid,count(*) cnt
from movies m,ratings r
where m.mid = r.mid
group by getDate(m.mname),r.mid)t1)t2
where t2.rn<11)t4
where t4.year=t3.year and t3.rn=t4.rn) t5,movies m1,movies m2
where m1.mid = t5.mid1 and t5.mid2=m2.mid




1.hive udf

add jar /home/data/myTime.jar;
create temporary function myTime as com.beiwang.project.MyTime;


2.每年最受欢迎和最不受欢迎 top10
最受欢迎top10
create table result6(
year string,
mname string,
rk string,
flag string
)
row format delimited fields terminated by \t
lines terminated by \n
stored as textfile;

insert into result6
select t3.year,t3.mname,t3.rk,good
from(
   select t2.year year,t2.mname mname,row_number() over(partition by year order by num desc) rk
   from(
      select t1.year year,t1.mname mname,count(*) num
      from(
         select myTime(mov.mname) year,mov.mname mname
         from ratings rat,movies mov
         where rat.mid=mov.mid) t1
      group by t1.year,t1.mname ) t2 ) t3
where t3.rk<=10;

最不受欢迎top10

insert into result6
select t3.year,t3.mname,t3.rk,bad
from(
   select t2.year year,t2.mname mname,row_number() over(partition by year order by num) rk
   from(
      select t1.year year,t1.mname mname,count(*) num
      from(
         select myTime(mov.mname) year,mov.mname mname
         from ratings rat,movies mov
         where rat.mid=mov.mid) t1
      group by t1.year,t1.mname ) t2 ) t3
where t3.rk<=10;

sqoop hive --》 mysql


6.得出每个岗位 最喜欢的电影类型 前3名


7.每年 每个类型 最受欢迎的电影 前3名

(
group by year type
)
where rk<3

age.txt

1    "Under 18"
18    "18-24"
25    "25-34"
35    "35-44"
45    "45-49"
50    "50-55"
56    "56+"

job.txt

0    "other" or not specified
1    "academic/educator"
2    "artist"
3    "clerical/admin"
4    "college/grad student"
5    "customer service"
6    "doctor/health care"
7    "executive/managerial"
8    "farmer"
9    "homemaker"
10    "K-12 student"
11    "lawyer"
12    "programmer"
13    "retired"
14    "sales/marketing"
15    "scientist"
16    "self-employed"
17    "technician/engineer"
18    "tradesman/craftsman"
19    "unemployed"
20    "writer"

 

电影表

标签:field   div   准备   insert   cti   com   要求   sqoop   最大的   

原文地址:https://www.cnblogs.com/JBLi/p/10856921.html

(0)
(0)
   
举报
评论 一句话评论(0
登录后才能评论!
© 2014 mamicode.com 版权所有  联系我们:gaon5@hotmail.com
迷上了代码!