码迷,mamicode.com
首页 > 其他好文 > 详细

对节目微博进行强过滤之后的处理

时间:2014-07-16 20:11:53      阅读:120      评论:0      收藏:0      [点我收藏+]

标签:style   blog   http   java   color   文件   

1,对原始数据.data进行过滤,利用java实现

bubuko.com,布布扣
package com.bobo.DataPre;

import java.io.BufferedReader;
import java.io.BufferedWriter;
import java.io.File;
import java.io.FileReader;
import java.io.FileWriter;
import java.io.IOException;
import java.io.PrintWriter;

import com.bobo.util.Constants;
import com.bobo.util.Fenci;
import com.bobo.util.StopwordsRemover;
import com.bobo.util.StringUtil;
import com.bobo.util.UtilityForRemoveAtName;

/**
 * Filters the per-program weibo files down to posts that are judged relevant
 * to the program, then segments each remaining post, strips stop words and
 * "@name" mentions, and writes the result to a per-program output file.
 *
 * <p>Input files are read from {@code Constants.TitleDir} and are expected to
 * hold three tab-separated columns per line; output files are written to
 * {@code Constants.FilterDir} as {@code <first column> \t <processed text>}.
 */
public class ProgramDataFilter {

    /**
     * Runs the filter for every program listed in
     * {@code Constants.ProgramNameList} and prints the time spent on each one.
     *
     * @param args unused
     */
    public static void main(String[] args) {
        ProgramDataFilter pre = new ProgramDataFilter();
        for (int i = 0; i < Constants.ProgramNameList.length; i++) {
            // Restart the clock for every program so the printed figure is the
            // time spent on this program, not the running total.
            long start = System.currentTimeMillis();

            String programName = Constants.ProgramNameList[i];
            // NOTE(review): assumes keywordsList is index-aligned with
            // ProgramNameList - confirm against Constants.
            String[] keywords = Constants.keywordsList[i];
            String inFilePath = Constants.TitleDir + File.separator
                    + programName + ".title.uniqByWeiboId";
            String outFilePath = Constants.FilterDir + File.separator
                    + programName + ".filter.fenci";

            pre.dataSetAndRmStop(inFilePath, outFilePath, programName, keywords);

            long end = System.currentTimeMillis();
            System.out.println(programName + "数据预处理,分词、去处停用时、去除@花费的时间为:"
                    + (end - start) / 1000);
        }
    }

    /**
     * Decides whether a weibo text is considered to be about a program.
     *
     * @param weiboText   the raw weibo text
     * @param programName the program name that must occur in the text
     * @param filterWords additional keywords (e.g. cast names or category);
     *                    {@code null} or empty means the name alone suffices
     * @return {@code true} if the text is accepted, {@code false} otherwise
     */
    public boolean isRelative(String weiboText, String programName,
            String[] filterWords) {
        // The text must mention the program name at all.
        if (!weiboText.contains(programName)) {
            return false;
        }
        // For unambiguous names the name alone is enough. A null keyword
        // array is treated like an empty one instead of throwing.
        if (filterWords == null || filterWords.length < 1) {
            return true;
        }

        // NOTE(review): the two empty strings look like delimiters that were
        // lost (e.g. title marks or hashtags around the name). As written this
        // test is always true here, so the keyword loop below is never
        // reached. Behaviour is kept as-is; restore the intended delimiters
        // once they are confirmed.
        if (weiboText.contains("" + programName + "")) {
            return true;
        }

        // Name plus at least one keyword (cast member or program category).
        for (String keyword : filterWords) {
            if (weiboText.contains(keyword)) {
                return true;
            }
        }

        return false;
    }

    /**
     * Reads {@code inFilePath} line by line, keeps the lines whose second
     * column passes {@link #isRelative}, and writes the first column plus the
     * processed text (mentions removed, segmented, stop words removed) to
     * {@code outFilePath}. Lines that do not have exactly three tab-separated
     * columns, or whose text is empty before or after processing, are skipped.
     *
     * <p>Failures are reported on the console and do not propagate, so one
     * bad file does not stop the remaining programs from being processed.
     *
     * @param inFilePath  path of the tab-separated input file
     * @param outFilePath path of the output file (overwritten)
     * @param programName program name passed to {@link #isRelative}
     * @param keywords    keywords passed to {@link #isRelative}
     */
    private void dataSetAndRmStop(String inFilePath, String outFilePath,
            String programName, String[] keywords) {
        Fenci fenci = new Fenci();
        fenci.initial();
        StopwordsRemover stop = new StopwordsRemover();
        stop.setStoppingListSet(stop
                .loadStoppingListSet("./conf/stopwords.list"));

        BufferedReader br = null;
        PrintWriter pw = null;
        try {
            // Open the input first so a missing input file does not leave an
            // empty output file behind.
            br = new BufferedReader(new FileReader(inFilePath));
            pw = new PrintWriter(new BufferedWriter(new FileWriter(outFilePath)));

            String line;
            while ((line = br.readLine()) != null) {
                String[] lineArr = line.split("\t");
                if (lineArr.length != 3) {
                    continue;
                }
                String weiboText = lineArr[1];
                if (StringUtil.isNullOrEmpty(weiboText)) {
                    continue;
                }
                if (!isRelative(weiboText, programName, keywords)) {
                    continue;
                }

                // Pipeline: drop "@name" mentions -> segment -> drop stop words.
                String withoutNames = UtilityForRemoveAtName.removeName(weiboText);
                String segmented = fenci.testICTCLAS_ParagraphProcess(withoutNames);
                String fenciString = stop.removeStoppingWords(segmented);
                if (!StringUtil.isNullOrEmpty(fenciString)) {
                    // First column is presumably the user id - confirm
                    // against the producer of the input files.
                    pw.println(lineArr[0] + "\t" + fenciString);
                }
            }
        } catch (Exception e) {
            e.printStackTrace();
            System.out.println("ProgramDataFilter.java文件过滤、分词出现异常:" + inFilePath);
        } finally {
            // Either stream may still be null if opening a file failed; guard
            // so that cleanup itself cannot throw a NullPointerException.
            if (br != null) {
                try {
                    br.close();
                } catch (IOException e) {
                    e.printStackTrace();
                }
            }
            if (pw != null) {
                pw.close();
            }
        }
    }

}
过滤和分词

生成的文件格式是

用户id “\t” 分词并去除停用词后的微博文本

2,提取每个节目下的用户列表,并将用户的列表和用户的profile进行对应

 

 

对节目微博进行强过滤之后的处理,布布扣,bubuko.com

对节目微博进行强过滤之后的处理

标签:style   blog   http   java   color   文件   

原文地址:http://www.cnblogs.com/bobodeboke/p/3836982.html

(0)
(0)
   
举报
评论 一句话评论(0)
登录后才能评论!
© 2014 mamicode.com 版权所有  联系我们:gaon5@hotmail.com
迷上了代码!