// Tags:
import java.io.IOException; import java.util.StringTokenizer; import org.apache.hadoop.conf.Configuration; import org.apache.hadoop.fs.Path; import org.apache.hadoop.io.Text; import org.apache.hadoop.mapreduce.Job; import org.apache.hadoop.mapreduce.Mapper; import org.apache.hadoop.mapreduce.Reducer; import org.apache.hadoop.mapreduce.lib.input.FileInputFormat; import org.apache.hadoop.mapreduce.lib.input.FileSplit; import org.apache.hadoop.mapreduce.lib.output.FileOutputFormat; public class InvertedIndex { public static class InversedIndexMapper extends Mapper<Object, Text, Text, Text> { private Text outKey = new Text(); private Text outVal = new Text(); @Override public void map (Object key,Text value,Context context) { StringTokenizer tokens = new StringTokenizer(value.toString()); FileSplit split = (FileSplit) context.getInputSplit(); while(tokens.hasMoreTokens()) { String token = tokens.nextToken(); try { outKey.set(token + ":" + split.getPath()); outVal.set("1"); context.write(outKey, outVal); } catch (IOException e) { e.printStackTrace(); } catch (InterruptedException e) { e.printStackTrace(); } } System.out.println("Map phase finished ..."); } } public static class InversedIndexCombiner extends Reducer<Text, Text, Text, Text> { private Text outKey = new Text(); private Text outVal = new Text(); @Override public void reduce(Text key,Iterable<Text> values,Context context) { String[] keys = key.toString().split(":"); int sum = 0; for(Text val : values) { sum += Integer.parseInt(val.toString()); } try { outKey.set(keys[0]); int index = keys[keys.length-1].lastIndexOf(‘/‘); outVal.set(keys[keys.length-1].substring(index+1) + ":" + sum); context.write(outKey, outVal); } catch (IOException e) { e.printStackTrace(); } catch (InterruptedException e) { e.printStackTrace(); } System.out.println("Combine phase finished ..."); } } public static class InversedIndexReducer extends Reducer<Text, Text, Text, Text> { @Override public void reduce (Text key,Iterable<Text> 
values,Context context) { StringBuffer sb = new StringBuffer(); for(Text text : values) { sb.append(text.toString() + " ,"); } try { context.write(key, new Text(sb.toString())); } catch (IOException e) { e.printStackTrace(); } catch (InterruptedException e) { e.printStackTrace(); } System.out.println("Reduce phase finished ..."); } } public static void main(String[] args) throws IOException, InterruptedException, ClassNotFoundException { Configuration conf = new Configuration(); @SuppressWarnings("deprecation") Job job = new Job(conf,"index inverted"); job.setJarByClass(InvertedIndex.class); job.setMapperClass(InversedIndexMapper.class); job.setCombinerClass(InversedIndexCombiner.class); job.setReducerClass(InversedIndexReducer.class); job.setMapOutputKeyClass(Text.class); job.setMapOutputValueClass(Text.class); job.setOutputKeyClass(Text.class); job.setOutputValueClass(Text.class); job.setNumReduceTasks(1); FileInputFormat.addInputPath(job, new Path("input")); FileOutputFormat.setOutputPath(job, new Path("output")); if(job.waitForCompletion(true)) { System.out.println("All job finished ..."); System.exit(0); } } }
// Tags:
// Original article: http://www.cnblogs.com/shenbingyu/p/4940676.html