标签:field div 准备 insert cti com 要求 sqoop 最大的
movies表 电影id,电影名称,电影类型 ratings表 用户id,电影id,评分,评论时间(毫秒值) users表 用户id,性别,年龄段,工作类型,电话 age表 年龄段,年龄范围 job表 工作类型,工作岗位名称 1.清洗数据 mr 2.hive计算 表结构准备 外部表 create external table age( aid string, agescore string ) row format delimited fields terminated by ‘\t‘ lines terminated by ‘\n‘ stored as textfile location ‘/project/age‘; create external table job( jid string, jobname string ) row format delimited fields terminated by ‘\t‘ lines terminated by ‘\n‘ stored as textfile location ‘/project/job‘; create external table movies( mid string, mname string, type string ) row format delimited fields terminated by ‘,‘ lines terminated by ‘\n‘ stored as textfile location ‘/project/movies‘; create external table ratings( uid string, mid string, score int, mytime string ) row format delimited fields terminated by ‘,‘ lines terminated by ‘\n‘ stored as textfile location ‘/project/ratings‘; create external table users( uid string, sex string, aid string, jid string, tel string ) row format delimited fields terminated by ‘,‘ lines terminated by ‘\n‘ stored as textfile location ‘/project/users‘; 要求: 1.算出所有电影的平均评分。 每个电影的平均分 select m.mname,round(t.avg,2) from movies m, (select avg(score) avg,mid from ratings group by mid)t where m.mid = t.mid 中间过程 含有有效 和 脏 score is null create table result1( mid string, mname string, score double ) row format delimited fields terminated by ‘\t‘ lines terminated by ‘\n‘ stored as textfile; //3883 3882 score 4000 null insert into result1 select mov.mid,mov.mname,t1.score from movies mov left join( select mid,round(avg(score),1) score from ratings group by mid) t1 on mov.mid=t1.mid; //hive和mysql 不同 update 并且 最不值钱的就是 表 create table result2( mid string, mname string, score double ) row format delimited fields terminated by ‘\t‘ lines terminated by ‘\n‘ stored as textfile; insert into table result2 select mid,mname,score from result1 where score is not null; insert into table result2 select mid,mname,0 from result10 where score is null; 2.算出所有用户对那些类型的电影最感兴趣,对每个用户推送该类型前三名评分最高的电影。 每个用户最喜欢哪个类型的电影 row_number() ratings表中 一个电影出现几次? 一个电影有几个类型? 类型在movies 评分或者观看次数ratings movie join ratings select t5.uid,t5.lx,m.mname from (select t3.uid,t3.lx,t4.mid from (select t2.uid,t2.lx from (select row_number() over (partition by t1.uid order by t1.cnt desc) rn,t1.uid,t1.lx from (select count(*) cnt,r.uid,t.lx from (select mid,mname,lx from movies lateral view explode(split(type,"\\|")) types as lx)t ,ratings r where r.mid=t.mid group by r.uid,t.lx)t1)t2 where t2.rn=1)t3 , (select t2.lx,t2.mid from (select row_number() over (partition by t1.lx order by t1.cnt desc) rn,t1.lx,t1.mid from (select count(*) cnt,t.lx,r.mid from (select mid,mname,lx from movies lateral view explode(split(type,"\\|")) types as lx)t , ratings r where r.mid = t.mid group by t.lx,r.mid)t1)t2 where t2.rn<4)t4 where t3.lx =t4.lx)t5 ,movies m where t5.mid=m.mid 每个类型最受欢迎的前三个电影 类型== 类型 select t3.uid,t3.lx,t4.mid from (select t2.lx,t2.mid,t2.cnt from (select row_number() over (partition by t1.lx order by t1.cnt desc)rn,t1.lx,t1.mid,t1.cnt from (select count(*) cnt,t.lx,t.mid from ratings r, (select mid,mname,lx from movies lateral view explode(split(type,"\\|")) types as lx) t where r.mid = t.mid group by t.lx,t.mid)t1)t2 where t2.rn<4)t4 join (select t2.lx,t2.uid,t2.cnt from (select row_number() over (partition by t1.uid order by t1.cnt desc)rn,t1.lx,t1.uid,t1.cnt from (select count(*) cnt,t.lx,r.uid from ratings r, (select mid,mname,lx from movies lateral view explode(split(type,"\\|")) types as lx) t where r.mid = t.mid group by t.lx,r.uid)t1)t2 where t2.rn=1)t3 where t3.lx = t4.lx //hive 最不值钱的是表 一半 人 和 类型 create table result3( uid string, type string ) row format delimited fields terminated by ‘\t‘ lines terminated by ‘\n‘ stored as textfile; insert into result3 select users.uid,t4.type from( select t3.uid uid,t3.type type from( select t2.uid uid,t2.type type,t2.num num,row_number() over(partition by t2.uid order by num desc) rk from( select t1.uid uid,t1.type type,count(*) num from( select rat.uid uid,mov.type type from ratings rat,movies mov where rat.mid=mov.mid) t1 group by t1.uid,t1.type) t2) t3 where t3.rk=1) t4,users where t4.uid=users.uid; create table result4( type string, mname string ) row format delimited fields terminated by ‘\t‘ lines terminated by ‘\n‘ stored as textfile; insert into result4 select t2.type,t2.mname from( select t1.type type,t1.mname mname,row_number() over(partition by t1.type order by t1.score desc) rk from( select mov.type type,mov.mname mname,avg(rat.score) score from ratings rat,movies mov where rat.mid=mov.mid group by mov.type,mov.mname) t1 ) t2 where t2.rk<4; select res3.uid,res4.mname from result3 res3,result4 res4 where res3.type=res4.type; 3.每个年龄层次,前3种最受欢迎的电影类型。 select t3.agescore,t3.lx,t3.cnt from (select row_number() over (partition by t2.agescore order by t2.cnt desc) rn,t2.agescore,t2.lx,t2.cnt from (select t.lx,r.agescore,count(*) cnt from (select mid,mname,lx from movies lateral view explode(split(type,"\\|")) types as lx) t ,ratings r where t.mid = r.mid)t1,users u,age a where u.uid=t1.uid and u.aid=a.aid group by a.agescore,t1.lx)t2)t3 where t3.rn<=3 movies ratings users age ratings users movie uid mid aid,type 基础 方式 两两关联 没有优化 job数量是最大的 select age.agescore,t4.type from( select t3.aid aid,t3.type type from( select t2.aid aid,t2.type type,t2.num num,row_number() over(partition by t2.aid order by num desc) rk from( select t1.aid aid,t1.type type,count(*) num from( select users.aid aid,mov.type type from ratings rat,users,movies mov where rat.uid=users.uid and rat.mid=mov.mid) t1 group by t1.aid,t1.type) t2 ) t3 where t3.rk<4) t4,age where t4.aid=age.aid; 最终目标 1sql 做所有 sql编程 sql语句由复杂的sql嵌套组成 能在一次sql当中 尽量的做做些事情 hive-》job 分布式-》提交 计算资源(数据在哪 需要哪些资源)->run create table result5( agescore string, type string ) row format delimited fields terminated by ‘\t‘ lines terminated by ‘\n‘ stored as textfile; insert into result5 select age.agescore,t2.type from( select t1.aid aid,t1.type type,t1.num num,row_number() over(partition by t1.aid order by t1.num desc) rk from( select users.aid aid,mov.type type,count(*) num from ratings rat,users,movies mov where rat.uid=users.uid and rat.mid=mov.mid group by users.aid,mov.type) t1 ) t2,age where t2.rk<4 and t2.aid=age.aid; 4.算出所有电影都有那些工作岗位的人评论过,相对应的人数是多少? select count(*),j.jobname,r.mid from ratings r,users u,job j where r.uid = u.uid and j.jid=u.jid group by j.jobname,r.mid limit 10; rat->user->job select t1.mname,t1.jobname,count(*) num from( select job.jobname jobname,mov.mname from ratings rat,users,job,movies mov where rat.uid=users.uid and users.jid=job.jid and rat.mid=mov.mid) t1 group by t1.jobname,t1.mname order by t1.mname,num desc; from where group by having select order by limit 5.根据电影上映时间,得出当年最受欢迎和最不受欢迎的 10个名单 select t5.year,m1.mname,t5.cnt1,m2.mname,t5.cnt2,t5.rn from (select t3.year,t3.mid mid1,t3.cnt cnt1,t4.mid mid2,t4.cnt cnt2,t3.rn from (select t2.year,t2.mid,t2.cnt,t2.rn from (select row_number() over (partition by t1.year order by t1.cnt desc) rn, t1.year,t1.mid,t1.cnt from (select getDate(m.mname) year,r.mid,count(*) cnt from movies m,ratings r where m.mid = r.mid group by getDate(m.mname),r.mid)t1)t2 where t2.rn<11)t3 , (select t2.year,t2.mid,t2.cnt,t2.rn from (select row_number() over (partition by t1.year order by t1.cnt )rn, t1.year,t1.mid,t1.cnt from (select getDate(m.mname) year,r.mid,count(*) cnt from movies m,ratings r where m.mid = r.mid group by getDate(m.mname),r.mid)t1)t2 where t2.rn<11)t4 where t4.year=t3.year and t3.rn=t4.rn) t5,movies m1,movies m2 where m1.mid = t5.mid1 and t5.mid2=m2.mid 1.hive udf add jar /home/data/myTime.jar; create temporary function myTime as ‘com.beiwang.project.MyTime‘; 2.每年最受欢迎和最不受欢迎 top10 最受欢迎top10 create table result6( year string, mname string, rk string, flag string ) row format delimited fields terminated by ‘\t‘ lines terminated by ‘\n‘ stored as textfile; insert into result6 select t3.year,t3.mname,t3.rk,‘good‘ from( select t2.year year,t2.mname mname,row_number() over(partition by year order by num desc) rk from( select t1.year year,t1.mname mname,count(*) num from( select myTime(mov.mname) year,mov.mname mname from ratings rat,movies mov where rat.mid=mov.mid) t1 group by t1.year,t1.mname ) t2 ) t3 where t3.rk<=10; 最不受欢迎top10 insert into result6 select t3.year,t3.mname,t3.rk,‘bad‘ from( select t2.year year,t2.mname mname,row_number() over(partition by year order by num) rk from( select t1.year year,t1.mname mname,count(*) num from( select myTime(mov.mname) year,mov.mname mname from ratings rat,movies mov where rat.mid=mov.mid) t1 group by t1.year,t1.mname ) t2 ) t3 where t3.rk<=10; sqoop hive --》 mysql 6.得出每个岗位 最喜欢的电影类型 前3名 7.每年 每个类型 最受欢迎的电影 前3名 ( group by year type ) where rk<3
age.txt
1 "Under 18" 18 "18-24" 25 "25-34" 35 "35-44" 45 "45-49" 50 "50-55" 56 "56+"
job.txt
0 "other" or not specified 1 "academic/educator" 2 "artist" 3 "clerical/admin" 4 "college/grad student" 5 "customer service" 6 "doctor/health care" 7 "executive/managerial" 8 "farmer" 9 "homemaker" 10 "K-12 student" 11 "lawyer" 12 "programmer" 13 "retired" 14 "sales/marketing" 15 "scientist" 16 "self-employed" 17 "technician/engineer" 18 "tradesman/craftsman" 19 "unemployed" 20 "writer"
标签:field div 准备 insert cti com 要求 sqoop 最大的
原文地址:https://www.cnblogs.com/JBLi/p/10856921.html