MapReduce 中的两表 join 实例(二)

时间：2014-10-04 03:32:06 阅读：303 评论：0 收藏：0 [点我收藏+]

package com.baidu.uilt;
import java.io.*;

import org.apache.hadoop.io.*;

public class TextPair implements WritableComparable<TextPair> {

  private Text first;
  private Text second;
  
  public TextPair() {
    set(new Text(), new Text());
  }
  
  public TextPair(String first, String second) {
    set(new Text(first), new Text(second));
  }
  
  public TextPair(Text first, Text second) {
    set(first, second);
  }
  
  public void set(Text first, Text second) {
    this.first = first;
    this.second = second;
  }
  
  public Text getFirst() {
    return first;
  }

  public Text getSecond() {
    return second;
  }

  @Override
  public void write(DataOutput out) throws IOException {
    first.write(out);
    second.write(out);
  }

  @Override
  public void readFields(DataInput in) throws IOException {
    first.readFields(in);
    second.readFields(in);
  }
  
  @Override
  public int hashCode() {
    return first.hashCode() * 163 + second.hashCode();
  }
  
  @Override
  public boolean equals(Object o) {
    if (o instanceof TextPair) {
      TextPair tp = (TextPair) o;
      return first.equals(tp.first) && second.equals(tp.second);
    }
    return false;
  }

  @Override
  public String toString() {
    return first + "\t" + second;
  }
  
  @Override
  public int compareTo(TextPair tp) {
    int cmp = first.compareTo(tp.first);
    if (cmp != 0) {
      return cmp;
    }
    return second.compareTo(tp.second);
  }

  public static class Comparator extends WritableComparator {
    
    private static final Text.Comparator TEXT_COMPARATOR = new Text.Comparator();
    
    public Comparator() {
      super(TextPair.class);
    }

    @Override
    public int compare(byte[] b1, int s1, int l1,
                       byte[] b2, int s2, int l2) {
      
      try {
        int firstL1 = WritableUtils.decodeVIntSize(b1[s1]) + readVInt(b1, s1);
        int firstL2 = WritableUtils.decodeVIntSize(b2[s2]) + readVInt(b2, s2);
        int cmp = TEXT_COMPARATOR.compare(b1, s1, firstL1, b2, s2, firstL2);
        if (cmp != 0) {
          return cmp;
        }
        return TEXT_COMPARATOR.compare(b1, s1 + firstL1, l1 - firstL1,
                                       b2, s2 + firstL2, l2 - firstL2);
      } catch (IOException e) {
        throw new IllegalArgumentException(e);
      }
    }
  }

  static {
    WritableComparator.define(TextPair.class, new Comparator());
  }
  public static class FirstComparator extends WritableComparator {
    
    private static final Text.Comparator TEXT_COMPARATOR = new Text.Comparator();
    
    public FirstComparator() {
      super(TextPair.class);
    }

    @Override
    public int compare(byte[] b1, int s1, int l1,
                       byte[] b2, int s2, int l2) {
      
      try {
        int firstL1 = WritableUtils.decodeVIntSize(b1[s1]) + readVInt(b1, s1);
        int firstL2 = WritableUtils.decodeVIntSize(b2[s2]) + readVInt(b2, s2);
        return TEXT_COMPARATOR.compare(b1, s1, firstL1, b2, s2, firstL2);
      } catch (IOException e) {
        throw new IllegalArgumentException(e);
      }
    }
    
    @Override
    public int compare(WritableComparable a, WritableComparable b) {
      if (a instanceof TextPair && b instanceof TextPair) {
        return ((TextPair) a).first.compareTo(((TextPair) b).first);
      }
      return super.compare(a, b);
    }
  }

}

package com.baidu.loan;
/*
 * Example invocation (arguments: <unit input path> <iterial input path> <output path>):
 *
 * /home/users/ouerqiang/hadoop/hadoop-client-palo/hadoop/bin/hadoop jar LoanIdeaInfoText.jar  com.baidu.loan.LoanIdeainfoJoinIterialByDAILI6  /test/fbiz/loan/ideainfo/LoanIdeainfoByDAILIUnitID_0928  /test/fbiz/loan/ideainfo/LoanIterialByDAI_0928  /test/fbiz/loan/ideainfo/LoanIdeainfoJoinIterialByDAILI6_1_0928
 */
import java.io.IOException;
import java.util.Iterator;
import org.apache.hadoop.mapred.FileOutputFormat;
import org.apache.hadoop.mapred.JobClient;
import org.apache.hadoop.mapred.JobConf;
import org.apache.hadoop.mapred.MapReduceBase;
import org.apache.hadoop.mapred.Mapper;
import org.apache.hadoop.mapred.OutputCollector;
import org.apache.hadoop.mapred.Partitioner;
import org.apache.hadoop.mapred.Reducer;
import org.apache.hadoop.mapred.Reporter;
import org.apache.hadoop.mapred.TextInputFormat;
import org.apache.hadoop.mapred.lib.MultipleInputs;
import org.apache.hadoop.util.Tool;
import org.apache.hadoop.util.ToolRunner;
import org.apache.hadoop.conf.Configured;
import org.apache.hadoop.fs.Path;
import org.apache.hadoop.io.LongWritable;
import org.apache.hadoop.io.Text;

import com.baidu.uilt.TextPair;

public class LoanIdeainfoJoinIterialByDAILI6 extends Configured implements Tool {

	public static class JoinUnitMapper extends MapReduceBase implements
			Mapper<LongWritable, Text, TextPair, Text> {

		public void map(LongWritable key, Text value,
				OutputCollector<TextPair, Text> output, Reporter reporter)
				throws IOException {
			String gbkStr = value.toString();
			if (gbkStr.split("\t").length < 2 && gbkStr.split(",").length == 4) {
				String[] strs = gbkStr.split(",");
				output.collect(new TextPair(strs[0], "0"), value);
			}

		}
	}

	public static class JoinIterialMapper extends MapReduceBase implements
			Mapper<LongWritable, Text, TextPair, Text> {

		public void map(LongWritable key, Text value,
				OutputCollector<TextPair, Text> output, Reporter reporter)
				throws IOException {
			String gbkStr = value.toString();
			if (gbkStr.split("\t").length > 4) {// LoanIterial
				String[] strs = gbkStr.split("\t");
				output.collect(new TextPair(strs[0], "1"), value);
			}
		}
	}

	public static class JoinReducer extends MapReduceBase implements
			Reducer<TextPair, Text, Text, Text> {

		public void reduce(TextPair key, Iterator<Text> values,
				OutputCollector<Text, Text> output, Reporter reporter)
				throws IOException {

			Text stationName = new Text(values.next());
			while (values.hasNext()) {
				Text record = values.next();
				Text outValue = new Text(stationName.toString() + "\t"
						+ record.toString());
				output.collect(stationName, record);
				//output.collect(key.getFirst(), outValue);
			}
		}
	}
	
	public static class KeyPartitioner implements Partitioner<TextPair, Text> {
	    @Override
	    public void configure(JobConf job) {}
	    
	    @Override
	    public int getPartition(TextPair key, Text value, int numPartitions) {
	      return (key.getFirst().hashCode() & Integer.MAX_VALUE) % numPartitions;
	    }
	  }
	
	@Override
	public int run(String[] args) throws Exception {
		if (args.length != 3) {
		      return -1;
		    }
		    
		    JobConf conf = new JobConf(getConf(), getClass());
		    conf.setJobName("Join record with station name");
		    
		    String strPathUnit =args[0];
		    String strPathIterial =args[1];
		    Path outputPath= new Path(args[2]);
		    
		    MultipleInputs.addInputPath(conf, new Path(strPathUnit),
		        TextInputFormat.class, JoinUnitMapper.class);
		    MultipleInputs.addInputPath(conf, new Path(strPathIterial),
		        TextInputFormat.class, JoinIterialMapper.class);
		    FileOutputFormat.setOutputPath(conf, outputPath);

		    conf.setPartitionerClass(KeyPartitioner.class);
		    conf.setOutputValueGroupingComparator(TextPair.FirstComparator.class);
		    
		    conf.setMapOutputKeyClass(TextPair.class);
		    
		    conf.setReducerClass(JoinReducer.class);

		    conf.setOutputKeyClass(Text.class);
		    
		    JobClient.runJob(conf);
		    return 0;
	}
	
	public static void main(String[] args) throws Exception {
		int exitCode = ToolRunner.run(new LoanIdeainfoJoinIterialByDAILI6(), args);
	    System.exit(exitCode);
	}

	

}

本文出自 “梦朝思夕” 博客，请务必保留此出处：http://qiangmzsx.blog.51cto.com/2052549/1560553

标签：hadoop join

原文地址：http://qiangmzsx.blog.51cto.com/2052549/1560553

踩

(0)

评论一句话评论（0）

分享档案

更多>

2021年07月29日 (22)
2021年07月28日 (40)
2021年07月27日 (32)
2021年07月26日 (79)
2021年07月23日 (29)
2021年07月22日 (30)
2021年07月21日 (42)
2021年07月20日 (16)
2021年07月19日 (90)
2021年07月16日 (35)

周排行