码迷,mamicode.com
首页 > 其他好文 > 详细

hadoop 不同URLTitle文件提取关联URL

时间:2014-06-05 19:23:04      阅读:335      评论:0      收藏:0      [点我收藏+]

标签:c   style   class   blog   code   java   

bubuko.com,布布扣
package com.sogou.web.selector.updana.wapPc;

import org.apache.hadoop.conf.Configured;
import org.apache.hadoop.fs.Path;
import org.apache.hadoop.io.Text;
import org.apache.hadoop.io.WritableComparable;
import org.apache.hadoop.io.WritableComparator;
import org.apache.hadoop.mapreduce.Job;
import org.apache.hadoop.mapreduce.Partitioner;
import org.apache.hadoop.mapreduce.lib.input.FileInputFormat;
import org.apache.hadoop.mapreduce.lib.input.TextInputFormat;
import org.apache.hadoop.mapreduce.lib.output.FileOutputFormat;
import org.apache.hadoop.util.Tool;
import org.apache.hadoop.util.ToolRunner;

import com.sogou.web.selector.wapcoverage.GBKOutputFormat;

/**
 * MapReduce driver for the "URL_Title_Analysis" job.
 *
 * <p>The job maps with {@code WapPCMapper}, emitting {@code TextPair} keys and
 * {@code Text} values, and reduces with {@code WapPcReducer} to {@code Text}/{@code Text}
 * output written through {@code GBKOutputFormat}. Records are partitioned and grouped
 * by the first component of the {@code TextPair} key only, so all records sharing that
 * first component reach the same {@code reduce} call.
 *
 * <p>NOTE(review): {@code TextPair}, {@code WapPCMapper} and {@code WapPcReducer} are
 * declared outside this file; presumably the second key component is a source tag used
 * for secondary ordering -- confirm against those classes.
 */
public class URLTitle extends Configured implements Tool {

    /** Routes a record to a reducer using only the first component of its key. */
    private static class KeyPartitioner extends Partitioner<TextPair, Text> {

        /**
         * @param key           composite map-output key; only {@code getFirst()} is consulted
         * @param value         map-output value (ignored)
         * @param numPartitions number of reduce partitions
         * @return a partition index in {@code [0, numPartitions)}
         */
        @Override
        public int getPartition(TextPair key, Text value, int numPartitions) {
            // Masking with Integer.MAX_VALUE clears the sign bit so the modulo is never negative.
            return (key.getFirst().hashCode() & Integer.MAX_VALUE) % numPartitions;
        }
    }

    /**
     * Grouping comparator: two keys are considered equal for grouping purposes when
     * their first components compare equal. (Despite its name this is a comparator,
     * not a partitioner.)
     */
    private static class GroupPartitioner extends WritableComparator {

        protected GroupPartitioner() {
            // 'true' asks WritableComparator to create key instances for deserialization.
            super(TextPair.class, true);
        }

        /** Compares two {@code TextPair} keys by their first component only. */
        @Override
        @SuppressWarnings("rawtypes") // signature is dictated by WritableComparator
        public int compare(WritableComparable a, WritableComparable b) {
            TextPair left = (TextPair) a;
            TextPair right = (TextPair) b;
            return left.getFirst().compareTo(right.getFirst());
        }
    }

    /**
     * Configures and runs the job, blocking until it finishes.
     *
     * @param args remaining command-line arguments after generic-option parsing; when at
     *             least two are given, {@code args[0]} is added as an input path and
     *             {@code args[1]} is set as the output path. With fewer arguments the
     *             paths are left to whatever the job configuration already provides.
     * @return {@code 0} if the job completed successfully, {@code 1} otherwise
     * @throws Exception if job setup or submission fails
     */
    @Override
    public int run(String[] args) throws Exception {
        Job job = new Job(this.getConf(), "URL_Title_Analysis");
        // Jar containing the job classes.
        job.setJarByClass(this.getClass());

        // Map side.
        job.setMapperClass(WapPCMapper.class);
        job.setMapOutputKeyClass(TextPair.class);
        job.setMapOutputValueClass(Text.class);

        // Reduce side.
        job.setReducerClass(WapPcReducer.class);
        job.setOutputKeyClass(Text.class);
        job.setOutputValueClass(Text.class);

        // Input / output formats.
        job.setInputFormatClass(TextInputFormat.class);
        job.setOutputFormatClass(GBKOutputFormat.class);

        // Input / output locations, when supplied on the command line.
        if (args != null && args.length >= 2) {
            FileInputFormat.addInputPath(job, new Path(args[0]));
            FileOutputFormat.setOutputPath(job, new Path(args[1]));
        }

        // Partition and group by the first key component.
        job.setPartitionerClass(KeyPartitioner.class);
        job.setGroupingComparatorClass(GroupPartitioner.class);

        // Return the status instead of calling System.exit here, so that callers
        // (ToolRunner, tests, chained jobs) keep control of the JVM.
        return job.waitForCompletion(true) ? 0 : 1;
    }

    /**
     * Command-line entry point; exits the JVM with the status returned by {@link #run}.
     *
     * @param args command-line arguments, passed through {@code ToolRunner}
     * @throws Exception if the tool fails with an exception
     */
    public static void main(String[] args) throws Exception {
        Tool urlTitle = new URLTitle();
        System.exit(ToolRunner.run(urlTitle, args));
    }
}
bubuko.com,布布扣

该程序可以从 A、B 两个文件的 URL 和 Title 数据中提取出相同的 Title,并输出所需 Title 关联的 URL。

hadoop 不同URLTitle文件提取关联URL,布布扣,bubuko.com

hadoop 不同URLTitle文件提取关联URL

标签:c   style   class   blog   code   java   

原文地址:http://www.cnblogs.com/csxf/p/3768503.html

(0)
(0)
   
举报
评论 一句话评论(0)
登录后才能评论!
© 2014 mamicode.com 版权所有  联系我们:gaon5@hotmail.com
迷上了代码!