码迷,mamicode.com
首页 > 其他好文 > 详细

Hadoop学习---第四篇 MapReduce里的Partitioner

时间:2015-05-31 12:30:41      阅读:274      评论:0      收藏:0      [点我收藏+]

标签:hadoop   mapreducer   partitioner   

Partitioner的作用是对map输出的key进行分区,不同的分区可以交给不同的reduce task处理。

分区功能由Partitioner的具体子类来实现。

每写一段代码都会加深理解,程序里记录了自己的理解

FlowBean类源码:

package cn.zxl.flowcountpartitioner;

import java.io.DataInput;
import java.io.DataOutput;
import java.io.IOException;

import org.apache.hadoop.io.WritableComparable;

/**
 * Hadoop-serializable bean carrying the upstream, downstream and total traffic
 * figures of one record.
 *
 * <p>The natural ordering is by total traffic, largest first. The two-argument
 * constructor derives the total from its arguments; the individual setters do
 * NOT recompute it, so callers using setters must keep {@code sumflow} in sync
 * themselves.
 */
public class FlowBean implements WritableComparable<FlowBean>{
	private long upflow;// upstream traffic
	private long downflow;// downstream traffic
	private long sumflow;// total traffic
	public long getUpflow() {
		return upflow;
	}
	public void setUpflow(long upflow) {
		this.upflow = upflow;
	}
	public long getDownflow() {
		return downflow;
	}
	public void setDownflow(long downflow) {
		this.downflow = downflow;
	}
	public long getSumflow() {
		return sumflow;
	}
	public void setSumflow(long sumflow) {
		this.sumflow = sumflow;
	}
	/** No-argument constructor; all three counters start at zero. */
	public FlowBean() {
	}
	/**
	 * Creates a bean whose total is {@code upflow + downflow}.
	 *
	 * @param upflow   upstream traffic
	 * @param downflow downstream traffic
	 */
	public FlowBean(long upflow, long downflow) {
		this.upflow = upflow;
		this.downflow = downflow;
		this.sumflow = upflow+downflow;
	}
	/**
	 * Restores the three counters from {@code in}, in the same order that
	 * {@link #write(DataOutput)} emits them: upflow, downflow, sumflow.
	 */
	@Override
	public void readFields(DataInput in) throws IOException {
		upflow=in.readLong();
		downflow=in.readLong();
		sumflow=in.readLong();
	}
	/** Serializes the three counters as longs: upflow, downflow, sumflow. */
	@Override
	public void write(DataOutput out) throws IOException {
		out.writeLong(upflow);
		out.writeLong(downflow);
		out.writeLong(sumflow);
	}
	/**
	 * Orders beans by total traffic in descending order.
	 *
	 * @return a negative value if this bean has the larger total, a positive
	 *         value if it has the smaller total, and 0 when the totals are equal
	 */
	@Override
	public int compareTo(FlowBean bean) {
		long other=bean.getSumflow();
		if(sumflow>other){
			return -1;
		}
		if(sumflow<other){
			return 1;
		}
		// Equal totals must compare as 0; otherwise a.compareTo(b) and
		// b.compareTo(a) would both be positive, which breaks the
		// Comparable contract.
		return 0;
	}

	/** Formats the bean as {@code upflow<TAB>downflow<TAB>sumflow}. */
	@Override
	public String toString() {
		return upflow+"\t"+downflow+"\t"+sumflow;
	}
}
ProvicePartition类源码:

package cn.zxl.flowcountpartitioner;

import java.util.HashMap;

import org.apache.hadoop.io.Text;
import org.apache.hadoop.mapreduce.Partitioner;

/**
 * Partitioner that routes records by the first three characters of the key
 * (the phone-number prefix).
 *
 * <p>Prefixes 136, 137, 138 and 139 map to partitions 0 to 3; every other key
 * goes to partition 4. The returned index does not depend on
 * {@code numPartitions}.
 */
public class ProvicePartition extends Partitioner<Text, FlowBean>{
	// Partition used for any key whose prefix is not in the table.
	private static final int DEFAULT_PARTITION = 4;
	// Number of leading characters of the key used as the lookup prefix.
	private static final int PREFIX_LENGTH = 3;
	// Phone-number prefix -> partition index. Filled once in the static
	// initializer and only read afterwards.
	private static final HashMap<String, Integer> pmap = new HashMap<String, Integer>();
	static{
		pmap.put("136", 0);
		pmap.put("137", 1);
		pmap.put("138", 2);
		pmap.put("139", 3);
	}
	/**
	 * Looks up the partition for {@code key} by its three-character prefix.
	 *
	 * @param key           map output key; its string form is expected to start
	 *                      with the phone-number prefix
	 * @param bean          map output value (not used)
	 * @param numPartitions number of partitions of the job (not used)
	 * @return the table entry for the prefix, or 4 when the prefix is unknown
	 *         or the key is shorter than three characters
	 */
	@Override
	public int getPartition(Text key, FlowBean bean, int numPartitions) {
		String phone=key.toString();
		// A key shorter than the prefix cannot match any table entry; send it
		// to the default partition instead of letting substring() throw.
		if(phone.length()<PREFIX_LENGTH){
			return DEFAULT_PARTITION;
		}
		Integer partNum=pmap.get(phone.substring(0,PREFIX_LENGTH));
		return partNum==null?DEFAULT_PARTITION:partNum;
	}
}

FlowCount类源码:

package cn.zxl.flowcountpartitioner;

import java.io.IOException;

import org.apache.hadoop.conf.Configuration;
import org.apache.hadoop.fs.FileSystem;
import org.apache.hadoop.fs.Path;
import org.apache.hadoop.io.LongWritable;
import org.apache.hadoop.io.Text;
import org.apache.hadoop.mapreduce.Job;
import org.apache.hadoop.mapreduce.Mapper;
import org.apache.hadoop.mapreduce.Reducer;
import org.apache.hadoop.mapreduce.lib.input.FileInputFormat;
import org.apache.hadoop.mapreduce.lib.output.FileOutputFormat;

/**
 * MapReduce job that sums upstream and downstream traffic per phone number.
 *
 * <p>Usage: {@code FlowCount <input path> <output path>}. An existing output
 * path is deleted before the job starts.
 */
public class FlowCount {
	/**
	 * Splits each input line on tabs and emits (first field, FlowBean), where
	 * upstream and downstream traffic are taken from the third-from-last and
	 * second-from-last fields.
	 */
	static class FlowCountMapper extends Mapper<LongWritable, Text, Text, FlowBean>{
		@Override
		protected void map(LongWritable key, Text value,Context context)
				throws IOException, InterruptedException {
			String[] phoneinfo=value.toString().split("\t");
			// The traffic columns are addressed from the end of the line, so at
			// least three fields are required. Count and skip shorter lines
			// rather than failing the whole task.
			if(phoneinfo.length<3){
				context.getCounter("FlowCount", "MALFORMED_LINES").increment(1);
				return;
			}
			long upflow;
			long downflow;
			try{
				upflow=Long.parseLong(phoneinfo[phoneinfo.length-3]);
				downflow=Long.parseLong(phoneinfo[phoneinfo.length-2]);
			}catch(NumberFormatException e){
				// Non-numeric traffic column: count and skip the line.
				context.getCounter("FlowCount", "MALFORMED_LINES").increment(1);
				return;
			}
			context.write(new Text(phoneinfo[0]),new FlowBean(upflow,downflow));
		}
	}
	// The reducer receives <key, list(value)>: all values sharing a key arrive
	// together as one collection.
	// Reducer input is sorted by key, not by value; to sort by some quantity,
	// that quantity has to be emitted as the key.
	/** Sums the upstream and downstream traffic of all beans of one key. */
	static class FlowCountReducer extends Reducer<Text, FlowBean, Text, FlowBean>{
		@Override
		protected void reduce(Text key, Iterable<FlowBean> values,Context context)
				throws IOException, InterruptedException {
			long upflow_sum=0;
			long downflow_sum=0;
			for(FlowBean bean:values){
				upflow_sum+=bean.getUpflow();
				downflow_sum+=bean.getDownflow();
			}
			context.write(key, new FlowBean(upflow_sum,downflow_sum));
		}
	}

	/**
	 * Configures and runs the job, then exits with status 0 on success and 1
	 * on failure (2 when the arguments are missing).
	 *
	 * @param args {@code args[0]} is the input path, {@code args[1]} the output path
	 * @throws Exception if job setup or execution fails
	 */
	public static void main(String[] args) throws Exception {
		if(args.length<2){
			System.err.println("Usage: FlowCount <input path> <output path>");
			System.exit(2);
		}
		Configuration conf=new Configuration();// cn.zxl.flowcountpartitioner.FlowCount

		Job job=Job.getInstance(conf);

		job.setJarByClass(FlowCount.class);

		job.setMapperClass(FlowCountMapper.class);
		job.setReducerClass(FlowCountReducer.class);

		job.setOutputKeyClass(Text.class);
		job.setOutputValueClass(FlowBean.class);
		// Register the custom Partitioner with the job.
		//job.setPartitionerClass(ProvicePartition.class);
		/* Number of reduce tasks. It may be set to 1, in which case the data of
		 * all partitions ends up in a single output file. If it is smaller than
		 * the number of partitions (5 here) the job fails, because some
		 * partition has no reduce task to go to. If it is larger, the extra
		 * output files are empty.
		 */
		//job.setNumReduceTasks(5);

		FileInputFormat.setInputPaths(job, new Path(args[0]));

		Path output = new Path(args[1]);
		FileSystem fs = output.getFileSystem(conf);
		// Delete the output path if it already exists. For production use it is
		// safer not to do this, since it removes previously generated data.
		// Addendum: in production, enable the trash so that an accidental
		// delete can still be recovered for a while.
		/*
		 * Add to the site configuration (the original post says hdfs-site;
		 * NOTE(review): these are core-default.xml properties, so core-site.xml
		 * is the usual place - confirm for your cluster):
		 * <property>
			<name>fs.trash.interval</name>
			<value>60</value><!-- minutes a trash checkpoint is kept before it is deleted -->
			</property>

			<property>
			<name>fs.trash.checkpoint.interval</name>
			<value>20</value><!-- minutes between trash checkpoints -->
			</property>
		 * NOTE(review): FileSystem.delete() as called below appears to remove
		 * data directly without going through the trash - confirm.
		 */
		if(fs.exists(output)){
			fs.delete(output, true);
		}
		FileOutputFormat.setOutputPath(job, output);

		// Propagate the job result to the caller through the exit status.
		System.exit(job.waitForCompletion(true)?0:1);
	}
}

测试数据:

1363157985066 1372623050300-FD-07-A4-72-B8:CMCC 120.196.100.82i02.c.aliimg.com 2427 248124681 200
1363157995052 138265441015C-0E-8B-C7-F1-E0:CMCC 120.197.40.44 0 264 0 200
1363157991076 1392643565620-10-7A-28-CC-0A:CMCC 120.196.100.992 4 132 1512 200
1363154400022 139262511065C-0E-8B-8B-B1-50:CMCC 120.197.40.44 0 240 0 200
1363157993044 1821157596194-71-AC-CD-E6-18:CMCC-EASY 120.196.100.99iface.qiyi.com 视频网站15 12 1527 2106 200
1363157995074 841384135C-0E-8B-8C-E8-20:7DaysInn 120.197.40.4122.72.52.12 2016 41161432 200
1363157993055 13560439658C4-17-FE-BA-DE-D9:CMCC 120.196.100.9918 15 1116 954 200
1363157995033 159201332575C-0E-8B-C7-BA-20:CMCC 120.197.40.4sug.so.360.cn 信息安全20 20 3156 2936 200
1363157983019 1371919941968-A1-B7-03-07-B1:CMCC-EASY 120.196.100.824 0 240 0 200
1363157984041 136605779915C-0E-8B-92-5C-20:CMCC-EASY 120.197.40.4s19.cnzz.com 站点统计24 9 6960 690 200
1363157973098 150136858585C-0E-8B-C7-F7-90:CMCC 120.197.40.4rank.ie.sogou.com 搜索引擎28 27 3659 3538 200
1363157986029 15989002119E8-99-C4-4E-93-E0:CMCC-EASY 120.196.100.99www.umeng.com 站点统计3 3 1938 180 200
1363157992093 13560439658C4-17-FE-BA-DE-D9:CMCC 120.196.100.9915 9 918 4938 200
1363157986041 134802531045C-0E-8B-C7-FC-80:CMCC-EASY 120.197.40.43 3 180 180 200
1363157984040 136028465655C-0E-8B-8B-B6-00:CMCC 120.197.40.42052.flash2-http.qq.com 综合门户15 12 1938 2910 200
1363157995093 1392231446600-FD-07-A2-EC-BA:CMCC 120.196.100.82img.qfc.cn 1212 30083720 200
1363157982040 135024688235C-0A-5B-6A-0B-D4:CMCC-EASY 120.196.100.99y0.ifengimg.com 综合门户57 102 7335 110349 200
1363157986072 1832017338284-25-DB-4F-10-1A:CMCC-EASY 120.196.100.99input.shouji.sogou.com 搜索引擎21 18 9531 2412 200
1363157990043 1392505741300-1F-64-E1-E6-9A:CMCC 120.196.100.55t3.baidu.com 搜索引擎69 63 11058 48243 200
1363157988072 1376077871000-FD-07-A4-7B-08:CMCC 120.196.100.822 2 120 120 200
1363157985066 1372623888800-FD-07-A4-72-B8:CMCC 120.196.100.82i02.c.aliimg.com 2427 248124681 200
1363157993055 13560436666C4-17-FE-BA-DE-D9:CMCC 120.196.100.9918 15 1116 954 200
1373157985066 1372623050300-FD-07-A4-72-B8:CMCC 120.196.100.82i02.c.aliimg.com 2427 248124681 200
1373157995052 138265441015C-0E-8B-C7-F1-E0:CMCC 120.197.40.44 0 264 0 200
1373157991076 1392643565620-10-7A-28-CC-0A:CMCC 120.196.100.992 4 132 1512 200
1373154400022 139262511065C-0E-8B-8B-B1-50:CMCC 120.197.40.44 0 240 0 200
1373157993044 1821157596194-71-AC-CD-E6-18:CMCC-EASY 120.196.100.99iface.qiyi.com 视频网站15 12 1527 2106 200
1373157995074 841384135C-0E-8B-8C-E8-20:7DaysInn 120.197.40.4122.72.52.12 2016 41161432 200
1373157993055 13560439658C4-17-FE-BA-DE-D9:CMCC 120.196.100.9918 15 1116 954 200
1373157995033 159201332575C-0E-8B-C7-BA-20:CMCC 120.197.40.4sug.so.360.cn 信息安全20 20 3156 2936 200
1373157983019 1371919941968-A1-B7-03-07-B1:CMCC-EASY 120.196.100.824 0 240 0 200
1373157984041 136605779915C-0E-8B-92-5C-20:CMCC-EASY 120.197.40.4s19.cnzz.com 站点统计24 9 6960 690 200
1373157973098 150136858585C-0E-8B-C7-F7-90:CMCC 120.197.40.4rank.ie.sogou.com 搜索引擎28 27 3659 3538 200
1373157986029 15989002119E8-99-C4-4E-93-E0:CMCC-EASY 120.196.100.99www.umeng.com 站点统计3 3 1938 180 200
1373157992093 13560439658C4-17-FE-BA-DE-D9:CMCC 120.196.100.9915 9 918 4938 200
1373157986041 134802531045C-0E-8B-C7-FC-80:CMCC-EASY 120.197.40.43 3 180 180 200
1373157984040 136028465655C-0E-8B-8B-B6-00:CMCC 120.197.40.42052.flash2-http.qq.com 综合门户15 12 1938 2910 200
1373157995093 1392231446600-FD-07-A2-EC-BA:CMCC 120.196.100.82img.qfc.cn 1212 30083720 200
1373157982040 135024688235C-0A-5B-6A-0B-D4:CMCC-EASY 120.196.100.99y0.ifengimg.com 综合门户57 102 7335 110349 200
1373157986072 1832017338284-25-DB-4F-10-1A:CMCC-EASY 120.196.100.99input.shouji.sogou.com 搜索引擎21 18 9531 2412 200
1373157990043 1392505741300-1F-64-E1-E6-9A:CMCC 120.196.100.55t3.baidu.com 搜索引擎69 63 11058 48243 200
1373157988072 1376077871000-FD-07-A4-7B-08:CMCC 120.196.100.822 2 120 120 200
1373157985066 1372623888800-FD-07-A4-72-B8:CMCC 120.196.100.82i02.c.aliimg.com 2427 248124681 200
1373157993055 13560436666C4-17-FE-BA-DE-D9:CMCC 120.196.100.9918 15 1116 954 200
1383157985066 1372623050300-FD-07-A4-72-B8:CMCC 120.196.100.82i02.c.aliimg.com 2427 248124681 200
1383157995052 138265441015C-0E-8B-C7-F1-E0:CMCC 120.197.40.44 0 264 0 200
1383157991076 1392643565620-10-7A-28-CC-0A:CMCC 120.196.100.992 4 132 1512 200
1383154400022 139262511065C-0E-8B-8B-B1-50:CMCC 120.197.40.44 0 240 0 200
1383157993044 1821157596194-71-AC-CD-E6-18:CMCC-EASY 120.196.100.99iface.qiyi.com 视频网站15 12 1527 2106 200
1383157995074 841384135C-0E-8B-8C-E8-20:7DaysInn 120.197.40.4122.72.52.12 2016 41161432 200
1383157993055 13560439658C4-17-FE-BA-DE-D9:CMCC 120.196.100.9918 15 1116 954 200
1383157995033 159201332575C-0E-8B-C7-BA-20:CMCC 120.197.40.4sug.so.360.cn 信息安全20 20 3156 2936 200
1383157983019 1371919941968-A1-B7-03-07-B1:CMCC-EASY 120.196.100.824 0 240 0 200
1383157984041 136605779915C-0E-8B-92-5C-20:CMCC-EASY 120.197.40.4s19.cnzz.com 站点统计24 9 6960 690 200
1383157973098 150136858585C-0E-8B-C7-F7-90:CMCC 120.197.40.4rank.ie.sogou.com 搜索引擎28 27 3659 3538 200
1383157986029 15989002119E8-99-C4-4E-93-E0:CMCC-EASY 120.196.100.99www.umeng.com 站点统计3 3 1938 180 200
1383157992093 13560439658C4-17-FE-BA-DE-D9:CMCC 120.196.100.9915 9 918 4938 200
1383157986041 134802531045C-0E-8B-C7-FC-80:CMCC-EASY 120.197.40.43 3 180 180 200
1383157984040 136028465655C-0E-8B-8B-B6-00:CMCC 120.197.40.42052.flash2-http.qq.com 综合门户15 12 1938 2910 200
1383157995093 1392231446600-FD-07-A2-EC-BA:CMCC 120.196.100.82img.qfc.cn 1212 30083720 200
1383157982040 135024688235C-0A-5B-6A-0B-D4:CMCC-EASY 120.196.100.99y0.ifengimg.com 综合门户57 102 7335 110349 200
1383157986072 1832017338284-25-DB-4F-10-1A:CMCC-EASY 120.196.100.99input.shouji.sogou.com 搜索引擎21 18 9531 2412 200
1383157990043 1392505741300-1F-64-E1-E6-9A:CMCC 120.196.100.55t3.baidu.com 搜索引擎69 63 11058 48243 200
1383157988072 1376077871000-FD-07-A4-7B-08:CMCC 120.196.100.822 2 120 120 200
1383157985066 1372623888800-FD-07-A4-72-B8:CMCC 120.196.100.82i02.c.aliimg.com 2427 248124681 200
1383157993055 13560436666C4-17-FE-BA-DE-D9:CMCC 120.196.100.9918 15 1116 954 200
1393157985066 1372623050300-FD-07-A4-72-B8:CMCC 120.196.100.82i02.c.aliimg.com 2427 248124681 200
1393157995052 138265441015C-0E-8B-C7-F1-E0:CMCC 120.197.40.44 0 264 0 200
1393157991076 1392643565620-10-7A-28-CC-0A:CMCC 120.196.100.992 4 132 1512 200
13963154400022 139262511065C-0E-8B-8B-B1-50:CMCC 120.197.40.44 0 240 0 200
1393157993044 1821157596194-71-AC-CD-E6-18:CMCC-EASY 120.196.100.99iface.qiyi.com 视频网站15 12 1527 2106 200
1393157995074 841384135C-0E-8B-8C-E8-20:7DaysInn 120.197.40.4122.72.52.12 2016 41161432 200
1393157993055 13560439658C4-17-FE-BA-DE-D9:CMCC 120.196.100.9918 15 1116 954 200
1393157995033 159201332575C-0E-8B-C7-BA-20:CMCC 120.197.40.4sug.so.360.cn 信息安全20 20 3156 2936 200
1393157983019 1371919941968-A1-B7-03-07-B1:CMCC-EASY 120.196.100.824 0 240 0 200
1393157984041 136605779915C-0E-8B-92-5C-20:CMCC-EASY 120.197.40.4s19.cnzz.com 站点统计24 9 6960 690 200
1393157973098 150136858585C-0E-8B-C7-F7-90:CMCC 120.197.40.4rank.ie.sogou.com 搜索引擎28 27 3659 3538 200
1393157986029 15989002119E8-99-C4-4E-93-E0:CMCC-EASY 120.196.100.99www.umeng.com 站点统计3 3 1938 180 200
1393157992093 13560439658C4-17-FE-BA-DE-D9:CMCC 120.196.100.9915 9 918 4938 200
1393157986041 134802531045C-0E-8B-C7-FC-80:CMCC-EASY 120.197.40.43 3 180 180 200
1393157984040 136028465655C-0E-8B-8B-B6-00:CMCC 120.197.40.42052.flash2-http.qq.com 综合门户15 12 1938 2910 200
1393157995093 1392231446600-FD-07-A2-EC-BA:CMCC 120.196.100.82img.qfc.cn 1212 30083720 200
1393157982040 135024688235C-0A-5B-6A-0B-D4:CMCC-EASY 120.196.100.99y0.ifengimg.com 综合门户57 102 7335 110349 200
1393157986072 1832017338284-25-DB-4F-10-1A:CMCC-EASY 120.196.100.99input.shouji.sogou.com 搜索引擎21 18 9531 2412 200
1393157990043 1392505741300-1F-64-E1-E6-9A:CMCC 120.196.100.55t3.baidu.com 搜索引擎69 63 11058 48243 200
1393157988072 1376077871000-FD-07-A4-7B-08:CMCC 120.196.100.822 2 120 120 200
1393157985066 1372623888800-FD-07-A4-72-B8:CMCC 120.196.100.82i02.c.aliimg.com 2427 248124681 200
1393157993055 13560436666C4-17-FE-BA-DE-D9:CMCC 120.196.100.9918 15 1116 954 200
1503157985066 1372623050300-FD-07-A4-72-B8:CMCC 120.196.100.82i02.c.aliimg.com 2427 248124681 200
1503157995052 138265441015C-0E-8B-C7-F1-E0:CMCC 120.197.40.44 0 264 0 200
1503157991076 1392643565620-10-7A-28-CC-0A:CMCC 120.196.100.992 4 132 1512 200
1503154400022 139262511065C-0E-8B-8B-B1-50:CMCC 120.197.40.44 0 240 0 200
1503157993044 1821157596194-71-AC-CD-E6-18:CMCC-EASY 120.196.100.99iface.qiyi.com 视频网站15 12 1527 2106 200
1513157995074 841384135C-0E-8B-8C-E8-20:7DaysInn 120.197.40.4122.72.52.12 2016 41161432 200
1513157993055 13560439658C4-17-FE-BA-DE-D9:CMCC 120.196.100.9918 15 1116 954 200
1513157995033 159201332575C-0E-8B-C7-BA-20:CMCC 120.197.40.4sug.so.360.cn 信息安全20 20 3156 2936 200
1513157983019 1371919941968-A1-B7-03-07-B1:CMCC-EASY 120.196.100.824 0 240 0 200
1513157984041 136605779915C-0E-8B-92-5C-20:CMCC-EASY 120.197.40.4s19.cnzz.com 站点统计24 9 6960 690 200
1523157973098 150136858585C-0E-8B-C7-F7-90:CMCC 120.197.40.4rank.ie.sogou.com 搜索引擎28 27 3659 3538 200
1523157986029 15989002119E8-99-C4-4E-93-E0:CMCC-EASY 120.196.100.99www.umeng.com 站点统计3 3 1938 180 200
1523157992093 13560439658C4-17-FE-BA-DE-D9:CMCC 120.196.100.9915 9 918 4938 200
1523157986041 134802531045C-0E-8B-C7-FC-80:CMCC-EASY 120.197.40.43 3 180 180 200
1523157984040 136028465655C-0E-8B-8B-B6-00:CMCC 120.197.40.42052.flash2-http.qq.com 综合门户15 12 1938 2910 200
1533157995093 1392231446600-FD-07-A2-EC-BA:CMCC 120.196.100.82img.qfc.cn 1212 30083720 200
1533157982040 135024688235C-0A-5B-6A-0B-D4:CMCC-EASY 120.196.100.99y0.ifengimg.com 综合门户57 102 7335 110349 200
1533157986072 1832017338284-25-DB-4F-10-1A:CMCC-EASY 120.196.100.99input.shouji.sogou.com 搜索引擎21 18 9531 2412 200
1533157990043 1392505741300-1F-64-E1-E6-9A:CMCC 120.196.100.55t3.baidu.com 搜索引擎69 63 11058 48243 200
1533157988072 1376077871000-FD-07-A4-7B-08:CMCC 120.196.100.822 2 120 120 200
1533157985066 1372623888800-FD-07-A4-72-B8:CMCC 120.196.100.82i02.c.aliimg.com 2427 248124681 200
1533157993055 13560436666C4-17-FE-BA-DE-D9:CMCC 120.196.100.9918 15 1116 954 200

Hadoop学习---第四篇Mapreducer里的Partitioner

标签:hadoop   mapreducer   partitioner   

原文地址:http://blog.csdn.net/zxl333/article/details/46286561

(0)
(0)
   
举报
评论 一句话评论(0
登录后才能评论!
© 2014 mamicode.com 版权所有  联系我们:gaon5@hotmail.com
迷上了代码!