标签:hadoop mapreducer partitioner
Partitioner就是对map输出的key进行分组,不同的组可以指定不同的reduce task处理;Partition功能由partitioner的实现子类来实现
每写一段代码都会加深理解,程序里记录了自己的理解
FlowBean类源码:
package cn.zxl.flowcountpartitioner; import java.io.DataInput; import java.io.DataOutput; import java.io.IOException; import org.apache.hadoop.io.WritableComparable; public class FlowBean implements WritableComparable<FlowBean>{ private long upflow;//上行流量 private long downflow;//下行流量 private long sumflow;//总流量 public long getUpflow() { return upflow; } public void setUpflow(long upflow) { this.upflow = upflow; } public long getDownflow() { return downflow; } public void setDownflow(long downflow) { this.downflow = downflow; } public long getSumflow() { return sumflow; } public void setSumflow(long sumflow) { this.sumflow = sumflow; } public FlowBean() { } public FlowBean(long upflow, long downflow) { super(); this.upflow = upflow; this.downflow = downflow; this.sumflow = upflow+downflow; } @Override public void readFields(DataInput in) throws IOException { upflow=in.readLong(); downflow=in.readLong(); sumflow=in.readLong(); } @Override public void write(DataOutput out) throws IOException { out.writeLong(upflow); out.writeLong(downflow); out.writeLong(sumflow); } @Override public int compareTo(FlowBean bean) { return sumflow>bean.getSumflow()?-1:1; } @Override public String toString() { return upflow+"\t"+downflow+"\t"+sumflow; } }ProvicePartition类源码:
package cn.zxl.flowcountpartitioner; import java.util.HashMap; import org.apache.hadoop.io.Text; import org.apache.hadoop.mapreduce.Partitioner; public class ProvicePartition extends Partitioner<Text, FlowBean>{ //根据手机号前三位划分分组 //Partitioner就是对key进行分组 private static HashMap<String, Integer> pmap = new HashMap<String, Integer>(); static{ pmap.put("136", 0); pmap.put("137", 1); pmap.put("138", 2); pmap.put("139", 3); } @Override public int getPartition(Text key, FlowBean bean, int numPartitions) { String prex=key.toString().substring(0,3); Integer partNum=pmap.get(prex);//根据key截取的前三位做key和map的值是否匹配 return partNum==null?4:partNum; } }
package cn.zxl.flowcountpartitioner; import java.io.IOException; import org.apache.hadoop.conf.Configuration; import org.apache.hadoop.fs.FileSystem; import org.apache.hadoop.fs.Path; import org.apache.hadoop.io.LongWritable; import org.apache.hadoop.io.Text; import org.apache.hadoop.mapreduce.Job; import org.apache.hadoop.mapreduce.Mapper; import org.apache.hadoop.mapreduce.Reducer; import org.apache.hadoop.mapreduce.lib.input.FileInputFormat; import org.apache.hadoop.mapreduce.lib.output.FileOutputFormat; public class FlowCount { static class FlowCountMapper extends Mapper<LongWritable, Text, Text, FlowBean>{ @Override protected void map(LongWritable key, Text value,Context context) throws IOException, InterruptedException { String line=value.toString(); String[] phoneinfo=line.split("\t"); String phoneN=phoneinfo[0]; String upflow=phoneinfo[phoneinfo.length-3]; String downflow=phoneinfo[phoneinfo.length-2]; FlowBean fb=new FlowBean(Long.parseLong(upflow),Long.parseLong(downflow)); context.write(new Text(phoneN),fb); } } //reducer里的值是<key,list(value)>,也就是相同的键里对应一个集合 //reducer是根据key排序的,而不是value,要根据什么排序,那就得已什么作为key输出 static class FlowCountReducer extends Reducer<Text, FlowBean, Text, FlowBean>{ @Override protected void reduce(Text key, Iterable<FlowBean> values,Context context) throws IOException, InterruptedException { long upflow_sum=0; long downflow_sum=0; for(FlowBean bean:values){ upflow_sum+=bean.getUpflow(); downflow_sum+=bean.getDownflow(); } FlowBean fb=new FlowBean(upflow_sum,downflow_sum); context.write(new Text(key), fb); } } public static void main(String[] args) throws Exception { Configuration conf=new Configuration();// cn.zxl.flowcountpartitioner.FlowCount Job job=Job.getInstance(conf); job.setJarByClass(FlowCount.class); job.setMapperClass(FlowCountMapper.class); job.setReducerClass(FlowCountReducer.class); job.setOutputKeyClass(Text.class); job.setOutputValueClass(FlowBean.class); //job指定自定义的Partitioner组件 
//job.setPartitionerClass(ProvicePartition.class); /*job中指定reducertask的数量,说明:这里的reducertask数量可以指定为1个,如果是1个reducertask, *那么所有的分区数据都输入到一个文件里,如果指定个数小于分区个数(这里是5个),那么程序会报错, *因为不知道对应的一个分区数据放置到哪里,如果指定个数超过分区个数,那么后面产生的文件是空的 */ //job.setNumReduceTasks(5); FileInputFormat.setInputPaths(job, new Path(args[0])); Path output = new Path(args[1]); FileSystem fs = output.getFileSystem(conf); //看输出是否存在,存在就删除,特别说明:安全起见正式的线上建议最好不要做这个判断,如果这样做,会把以前产生的数据删除 //补充:正式生产环境最好指定删除多久后正式删除数据,以便错删时可以恢复数据 /* * 添加在hdfs-site的配置文件里 * <property> <name>fs.trash.interval</name> <value>60</value><!-- 回收站过期机制检查频率(分钟) --> </property> <property> <name>fs.trash.checkpoint.interval</name> <value>20</value><!-- 回收站中文件过期的时间限制(分钟) --> </property> */ if(fs.exists(output)){ fs.delete(output, true); } FileOutputFormat.setOutputPath(job, new Path(args[1])); job.waitForCompletion(true); } }
测试数据:
1363157985066 1372623050300-FD-07-A4-72-B8:CMCC
120.196.100.82i02.c.aliimg.com
2427
248124681
200
1363157995052 138265441015C-0E-8B-C7-F1-E0:CMCC
120.197.40.44
0 264
0 200
1363157991076 1392643565620-10-7A-28-CC-0A:CMCC
120.196.100.992
4 132
1512 200
1363154400022 139262511065C-0E-8B-8B-B1-50:CMCC
120.197.40.44
0 240
0 200
1363157993044 1821157596194-71-AC-CD-E6-18:CMCC-EASY
120.196.100.99iface.qiyi.com
视频网站15
12 1527
2106 200
1363157995074 841384135C-0E-8B-8C-E8-20:7DaysInn
120.197.40.4122.72.52.12
2016
41161432
200
1363157993055 13560439658C4-17-FE-BA-DE-D9:CMCC
120.196.100.9918
15 1116
954 200
1363157995033 159201332575C-0E-8B-C7-BA-20:CMCC
120.197.40.4sug.so.360.cn
信息安全20
20 3156
2936 200
1363157983019 1371919941968-A1-B7-03-07-B1:CMCC-EASY
120.196.100.824
0 240
0 200
1363157984041 136605779915C-0E-8B-92-5C-20:CMCC-EASY
120.197.40.4s19.cnzz.com
站点统计24
9 6960
690 200
1363157973098 150136858585C-0E-8B-C7-F7-90:CMCC
120.197.40.4rank.ie.sogou.com
搜索引擎28
27 3659
3538 200
1363157986029 15989002119E8-99-C4-4E-93-E0:CMCC-EASY
120.196.100.99www.umeng.com
站点统计3
3 1938
180 200
1363157992093 13560439658C4-17-FE-BA-DE-D9:CMCC
120.196.100.9915
9 918
4938 200
1363157986041 134802531045C-0E-8B-C7-FC-80:CMCC-EASY
120.197.40.43
3 180
180 200
1363157984040 136028465655C-0E-8B-8B-B6-00:CMCC
120.197.40.42052.flash2-http.qq.com
综合门户15
12 1938
2910 200
1363157995093 1392231446600-FD-07-A2-EC-BA:CMCC
120.196.100.82img.qfc.cn
1212
30083720
200
1363157982040 135024688235C-0A-5B-6A-0B-D4:CMCC-EASY
120.196.100.99y0.ifengimg.com
综合门户57
102 7335
110349 200
1363157986072 1832017338284-25-DB-4F-10-1A:CMCC-EASY
120.196.100.99input.shouji.sogou.com
搜索引擎21
18 9531
2412 200
1363157990043 1392505741300-1F-64-E1-E6-9A:CMCC
120.196.100.55t3.baidu.com
搜索引擎69
63 11058
48243 200
1363157988072 1376077871000-FD-07-A4-7B-08:CMCC
120.196.100.822
2 120
120 200
1363157985066 1372623888800-FD-07-A4-72-B8:CMCC
120.196.100.82i02.c.aliimg.com
2427
248124681
200
1363157993055 13560436666C4-17-FE-BA-DE-D9:CMCC
120.196.100.9918
15 1116
954 200
1373157985066 1372623050300-FD-07-A4-72-B8:CMCC
120.196.100.82i02.c.aliimg.com
2427
248124681
200
1373157995052 138265441015C-0E-8B-C7-F1-E0:CMCC
120.197.40.44
0 264
0 200
1373157991076 1392643565620-10-7A-28-CC-0A:CMCC
120.196.100.992
4 132
1512 200
1373154400022 139262511065C-0E-8B-8B-B1-50:CMCC
120.197.40.44
0 240
0 200
1373157993044 1821157596194-71-AC-CD-E6-18:CMCC-EASY
120.196.100.99iface.qiyi.com
视频网站15
12 1527
2106 200
1373157995074 841384135C-0E-8B-8C-E8-20:7DaysInn
120.197.40.4122.72.52.12
2016
41161432
200
1373157993055 13560439658C4-17-FE-BA-DE-D9:CMCC
120.196.100.9918
15 1116
954 200
1373157995033 159201332575C-0E-8B-C7-BA-20:CMCC
120.197.40.4sug.so.360.cn
信息安全20
20 3156
2936 200
1373157983019 1371919941968-A1-B7-03-07-B1:CMCC-EASY
120.196.100.824
0 240
0 200
1373157984041 136605779915C-0E-8B-92-5C-20:CMCC-EASY
120.197.40.4s19.cnzz.com
站点统计24
9 6960
690 200
1373157973098 150136858585C-0E-8B-C7-F7-90:CMCC
120.197.40.4rank.ie.sogou.com
搜索引擎28
27 3659
3538 200
1373157986029 15989002119E8-99-C4-4E-93-E0:CMCC-EASY
120.196.100.99www.umeng.com
站点统计3
3 1938
180 200
1373157992093 13560439658C4-17-FE-BA-DE-D9:CMCC
120.196.100.9915
9 918
4938 200
1373157986041 134802531045C-0E-8B-C7-FC-80:CMCC-EASY
120.197.40.43
3 180
180 200
1373157984040 136028465655C-0E-8B-8B-B6-00:CMCC
120.197.40.42052.flash2-http.qq.com
综合门户15
12 1938
2910 200
1373157995093 1392231446600-FD-07-A2-EC-BA:CMCC
120.196.100.82img.qfc.cn
1212
30083720
200
1373157982040 135024688235C-0A-5B-6A-0B-D4:CMCC-EASY
120.196.100.99y0.ifengimg.com
综合门户57
102 7335
110349 200
1373157986072 1832017338284-25-DB-4F-10-1A:CMCC-EASY
120.196.100.99input.shouji.sogou.com
搜索引擎21
18 9531
2412 200
1373157990043 1392505741300-1F-64-E1-E6-9A:CMCC
120.196.100.55t3.baidu.com
搜索引擎69
63 11058
48243 200
1373157988072 1376077871000-FD-07-A4-7B-08:CMCC
120.196.100.822
2 120
120 200
1373157985066 1372623888800-FD-07-A4-72-B8:CMCC
120.196.100.82i02.c.aliimg.com
2427
248124681
200
1373157993055 13560436666C4-17-FE-BA-DE-D9:CMCC
120.196.100.9918
15 1116
954 200
1383157985066 1372623050300-FD-07-A4-72-B8:CMCC
120.196.100.82i02.c.aliimg.com
2427
248124681
200
1383157995052 138265441015C-0E-8B-C7-F1-E0:CMCC
120.197.40.44
0 264
0 200
1383157991076 1392643565620-10-7A-28-CC-0A:CMCC
120.196.100.992
4 132
1512 200
1383154400022 139262511065C-0E-8B-8B-B1-50:CMCC
120.197.40.44
0 240
0 200
1383157993044 1821157596194-71-AC-CD-E6-18:CMCC-EASY
120.196.100.99iface.qiyi.com
视频网站15
12 1527
2106 200
1383157995074 841384135C-0E-8B-8C-E8-20:7DaysInn
120.197.40.4122.72.52.12
2016
41161432
200
1383157993055 13560439658C4-17-FE-BA-DE-D9:CMCC
120.196.100.9918
15 1116
954 200
1383157995033 159201332575C-0E-8B-C7-BA-20:CMCC
120.197.40.4sug.so.360.cn
信息安全20
20 3156
2936 200
1383157983019 1371919941968-A1-B7-03-07-B1:CMCC-EASY
120.196.100.824
0 240
0 200
1383157984041 136605779915C-0E-8B-92-5C-20:CMCC-EASY
120.197.40.4s19.cnzz.com
站点统计24
9 6960
690 200
1383157973098 150136858585C-0E-8B-C7-F7-90:CMCC
120.197.40.4rank.ie.sogou.com
搜索引擎28
27 3659
3538 200
1383157986029 15989002119E8-99-C4-4E-93-E0:CMCC-EASY
120.196.100.99www.umeng.com
站点统计3
3 1938
180 200
1383157992093 13560439658C4-17-FE-BA-DE-D9:CMCC
120.196.100.9915
9 918
4938 200
1383157986041 134802531045C-0E-8B-C7-FC-80:CMCC-EASY
120.197.40.43
3 180
180 200
1383157984040 136028465655C-0E-8B-8B-B6-00:CMCC
120.197.40.42052.flash2-http.qq.com
综合门户15
12 1938
2910 200
1383157995093 1392231446600-FD-07-A2-EC-BA:CMCC
120.196.100.82img.qfc.cn
1212
30083720
200
1383157982040 135024688235C-0A-5B-6A-0B-D4:CMCC-EASY
120.196.100.99y0.ifengimg.com
综合门户57
102 7335
110349 200
1383157986072 1832017338284-25-DB-4F-10-1A:CMCC-EASY
120.196.100.99input.shouji.sogou.com
搜索引擎21
18 9531
2412 200
1383157990043 1392505741300-1F-64-E1-E6-9A:CMCC
120.196.100.55t3.baidu.com
搜索引擎69
63 11058
48243 200
1383157988072 1376077871000-FD-07-A4-7B-08:CMCC
120.196.100.822
2 120
120 200
1383157985066 1372623888800-FD-07-A4-72-B8:CMCC
120.196.100.82i02.c.aliimg.com
2427
248124681
200
1383157993055 13560436666C4-17-FE-BA-DE-D9:CMCC
120.196.100.9918
15 1116
954 200
1393157985066 1372623050300-FD-07-A4-72-B8:CMCC
120.196.100.82i02.c.aliimg.com
2427
248124681
200
1393157995052 138265441015C-0E-8B-C7-F1-E0:CMCC
120.197.40.44
0 264
0 200
1393157991076 1392643565620-10-7A-28-CC-0A:CMCC
120.196.100.992
4 132
1512 200
13963154400022 139262511065C-0E-8B-8B-B1-50:CMCC
120.197.40.44
0 240
0 200
1393157993044 1821157596194-71-AC-CD-E6-18:CMCC-EASY
120.196.100.99iface.qiyi.com
视频网站15
12 1527
2106 200
1393157995074 841384135C-0E-8B-8C-E8-20:7DaysInn
120.197.40.4122.72.52.12
2016
41161432
200
1393157993055 13560439658C4-17-FE-BA-DE-D9:CMCC
120.196.100.9918
15 1116
954 200
1393157995033 159201332575C-0E-8B-C7-BA-20:CMCC
120.197.40.4sug.so.360.cn
信息安全20
20 3156
2936 200
1393157983019 1371919941968-A1-B7-03-07-B1:CMCC-EASY
120.196.100.824
0 240
0 200
1393157984041 136605779915C-0E-8B-92-5C-20:CMCC-EASY
120.197.40.4s19.cnzz.com
站点统计24
9 6960
690 200
1393157973098 150136858585C-0E-8B-C7-F7-90:CMCC
120.197.40.4rank.ie.sogou.com
搜索引擎28
27 3659
3538 200
1393157986029 15989002119E8-99-C4-4E-93-E0:CMCC-EASY
120.196.100.99www.umeng.com
站点统计3
3 1938
180 200
1393157992093 13560439658C4-17-FE-BA-DE-D9:CMCC
120.196.100.9915
9 918
4938 200
1393157986041 134802531045C-0E-8B-C7-FC-80:CMCC-EASY
120.197.40.43
3 180
180 200
1393157984040 136028465655C-0E-8B-8B-B6-00:CMCC
120.197.40.42052.flash2-http.qq.com
综合门户15
12 1938
2910 200
1393157995093 1392231446600-FD-07-A2-EC-BA:CMCC
120.196.100.82img.qfc.cn
1212
30083720
200
1393157982040 135024688235C-0A-5B-6A-0B-D4:CMCC-EASY
120.196.100.99y0.ifengimg.com
综合门户57
102 7335
110349 200
1393157986072 1832017338284-25-DB-4F-10-1A:CMCC-EASY
120.196.100.99input.shouji.sogou.com
搜索引擎21
18 9531
2412 200
1393157990043 1392505741300-1F-64-E1-E6-9A:CMCC
120.196.100.55t3.baidu.com
搜索引擎69
63 11058
48243 200
1393157988072 1376077871000-FD-07-A4-7B-08:CMCC
120.196.100.822
2 120
120 200
1393157985066 1372623888800-FD-07-A4-72-B8:CMCC
120.196.100.82i02.c.aliimg.com
2427
248124681
200
1393157993055 13560436666C4-17-FE-BA-DE-D9:CMCC
120.196.100.9918
15 1116
954 200
1503157985066 1372623050300-FD-07-A4-72-B8:CMCC
120.196.100.82i02.c.aliimg.com
2427
248124681
200
1503157995052 138265441015C-0E-8B-C7-F1-E0:CMCC
120.197.40.44
0 264
0 200
1503157991076 1392643565620-10-7A-28-CC-0A:CMCC
120.196.100.992
4 132
1512 200
1503154400022 139262511065C-0E-8B-8B-B1-50:CMCC
120.197.40.44
0 240
0 200
1503157993044 1821157596194-71-AC-CD-E6-18:CMCC-EASY
120.196.100.99iface.qiyi.com
视频网站15
12 1527
2106 200
1513157995074 841384135C-0E-8B-8C-E8-20:7DaysInn
120.197.40.4122.72.52.12
2016
41161432
200
1513157993055 13560439658C4-17-FE-BA-DE-D9:CMCC
120.196.100.9918
15 1116
954 200
1513157995033 159201332575C-0E-8B-C7-BA-20:CMCC
120.197.40.4sug.so.360.cn
信息安全20
20 3156
2936 200
1513157983019 1371919941968-A1-B7-03-07-B1:CMCC-EASY
120.196.100.824
0 240
0 200
1513157984041 136605779915C-0E-8B-92-5C-20:CMCC-EASY
120.197.40.4s19.cnzz.com
站点统计24
9 6960
690 200
1523157973098 150136858585C-0E-8B-C7-F7-90:CMCC
120.197.40.4rank.ie.sogou.com
搜索引擎28
27 3659
3538 200
1523157986029 15989002119E8-99-C4-4E-93-E0:CMCC-EASY
120.196.100.99www.umeng.com
站点统计3
3 1938
180 200
1523157992093 13560439658C4-17-FE-BA-DE-D9:CMCC
120.196.100.9915
9 918
4938 200
1523157986041 134802531045C-0E-8B-C7-FC-80:CMCC-EASY
120.197.40.43
3 180
180 200
1523157984040 136028465655C-0E-8B-8B-B6-00:CMCC
120.197.40.42052.flash2-http.qq.com
综合门户15
12 1938
2910 200
1533157995093 1392231446600-FD-07-A2-EC-BA:CMCC
120.196.100.82img.qfc.cn
1212
30083720
200
1533157982040 135024688235C-0A-5B-6A-0B-D4:CMCC-EASY
120.196.100.99y0.ifengimg.com
综合门户57
102 7335
110349 200
1533157986072 1832017338284-25-DB-4F-10-1A:CMCC-EASY
120.196.100.99input.shouji.sogou.com
搜索引擎21
18 9531
2412 200
1533157990043 1392505741300-1F-64-E1-E6-9A:CMCC
120.196.100.55t3.baidu.com
搜索引擎69
63 11058
48243 200
1533157988072 1376077871000-FD-07-A4-7B-08:CMCC
120.196.100.822
2 120
120 200
1533157985066 1372623888800-FD-07-A4-72-B8:CMCC
120.196.100.82i02.c.aliimg.com
2427
248124681
200
1533157993055 13560436666C4-17-FE-BA-DE-D9:CMCC
120.196.100.9918
15 1116
954 200
Hadoop学习---第四篇:MapReduce里的Partitioner
标签:hadoop mapreducer partitioner
原文地址:http://blog.csdn.net/zxl333/article/details/46286561