码迷,mamicode.com
首页 > Web开发 > 详细

lucene 3.0.2 + 多文件夹微博数据(时间,微博)构建索引

时间:2015-04-18 17:26:39      阅读:174      评论:0      收藏:0      [点我收藏+]

标签:

 

技术分享
package lia.meetlucene;

import java.io.File;
import java.io.IOException;
import java.util.LinkedList;

import javax.xml.parsers.DocumentBuilder;
import javax.xml.parsers.DocumentBuilderFactory;

import org.apache.lucene.analysis.cn.smart.SmartChineseAnalyzer;
import org.apache.lucene.analysis.standard.StandardAnalyzer;
import org.apache.lucene.document.Field;
import org.apache.lucene.index.CorruptIndexException;
//import org.apache.lucene.document.Document;
import org.apache.lucene.index.IndexWriter;
import org.apache.lucene.store.Directory;
import org.apache.lucene.store.FSDirectory;
import org.apache.lucene.util.Version;
import org.w3c.dom.Document;
import org.w3c.dom.Element;
import org.w3c.dom.NodeList;

/**
 * Builds a Lucene index from XML files of micro-blog posts.
 *
 * <p>{@link #main(String[])} looks at every immediate sub-directory of a hard-coded data
 * directory, parses each entry in those sub-directories as XML, and walks the DOM tree. Each
 * time both a {@code timestamp} element and a {@code text} element have been seen, one Lucene
 * document with the fields {@code time} and {@code context} is added to the index.
 *
 * <p>The pending record is kept in static fields, so this class is not safe for concurrent use.
 */
public class Unicode1 {

    /** True once a {@code timestamp} element has been read for the pending record. */
    static boolean numTime = false;
    /** True once a {@code text} element has been read for the pending record. */
    static boolean numText = false;
    /** Text content of the most recently seen {@code timestamp} element. */
    static String timeTmp = null;
    /** Text content of the most recently seen {@code text} element. */
    static String textTmp = null;

    /**
     * Adds one document built from the pending record ({@link #textTmp}, {@link #timeTmp}) to
     * the index. Both static fields must have been set before this is called.
     *
     * @param writer index writer that receives the document
     * @throws CorruptIndexException propagated from {@code writer.addDocument}
     * @throws IOException propagated from {@code writer.addDocument}
     */
    static void indexer(IndexWriter writer) throws CorruptIndexException, IOException
    {
        org.apache.lucene.document.Document doc = new org.apache.lucene.document.Document();

        // The post body is stored and analyzed; the timestamp is stored as a single token.
        doc.add(new Field("context", textTmp, Field.Store.YES, Field.Index.ANALYZED));
        doc.add(new Field("time", timeTmp, Field.Store.YES, Field.Index.NOT_ANALYZED));

        writer.addDocument(doc);
    }

    /**
     * Walks the given nodes depth-first. The text content of {@code timestamp} and
     * {@code text} nodes is remembered; as soon as both have been seen, the pair is indexed
     * through {@link #indexer(IndexWriter)} and the pending record is cleared.
     *
     * <p>I/O failures while indexing are printed and do not stop the traversal.
     *
     * @param nodecur nodes to visit; {@code null} is treated as an empty list
     * @param writer index writer that receives completed records
     */
    static void Dfs(NodeList nodecur, IndexWriter writer) {
        if (nodecur == null) {
            return;
        }
        for (int j = 0; j < nodecur.getLength(); j++) {
            org.w3c.dom.Node node = nodecur.item(j);
            String nodeName = node.getNodeName();

            if ("timestamp".equals(nodeName)) {
                timeTmp = node.getTextContent();
                numTime = true;
            } else if ("text".equals(nodeName)) {
                textTmp = node.getTextContent();
                numText = true;
            }

            if (numText && numTime) {
                try {
                    indexer(writer);
                } catch (IOException e) {
                    // CorruptIndexException is an IOException, so this covers both failures.
                    e.printStackTrace();
                } finally {
                    // Clear the pending record even on failure; otherwise the same stale pair
                    // would be retried on every following node.
                    numText = false;
                    numTime = false;
                }
            }

            Dfs(node.getChildNodes(), writer);
        }
    }

    /**
     * Indexes every entry found in the immediate sub-directories of the data directory and
     * prints the elapsed milliseconds and the number of successfully parsed files after each
     * sub-directory.
     *
     * @param args ignored
     * @throws IOException if the data directory cannot be listed or the index cannot be
     *         opened or closed
     */
    public static void main(String[] args) throws IOException {

        long a = System.currentTimeMillis();

        File dataDir = new File("E:/xdj/tengxun");
        String indexDir = "E:/xdj/tengxunsuoying";

        // List the input first: listFiles() returns null when the path is not a readable
        // directory, and failing here avoids recreating the index for nothing.
        File[] file = dataDir.listFiles();
        if (file == null) {
            throw new IOException("Cannot list data directory: " + dataDir.getAbsolutePath());
        }

        // Queue of sub-directories whose entries are to be parsed.
        LinkedList<File> list = new LinkedList<File>();
        for (File entry : file) {
            if (entry.isDirectory()) {
                list.add(entry);
            }
        }

        // One factory is enough; a fresh DocumentBuilder is still created per file below.
        DocumentBuilderFactory dbf = DocumentBuilderFactory.newInstance();

        Directory dir = FSDirectory.open(new File(indexDir));
        try {
            IndexWriter writer = new IndexWriter(dir,
                    new SmartChineseAnalyzer(Version.LUCENE_20),
                    true,
                    IndexWriter.MaxFieldLength.UNLIMITED);
            try {
                int num = 0;
                while (!list.isEmpty()) {
                    File tmp = list.removeFirst();
                    File[] entries = tmp.listFiles();
                    if (entries == null) {
                        System.err.println("Cannot list directory, skipping: "
                                + tmp.getAbsolutePath());
                    } else {
                        for (File entry : entries) {
                            System.out.println(entry.getAbsolutePath());

                            // Start every file with an empty pending record so that a lone
                            // timestamp/text of a previous file is not paired with this one.
                            numText = false;
                            numTime = false;

                            try {
                                // Parse the XML file into a DOM document.
                                DocumentBuilder db = dbf.newDocumentBuilder();
                                Document dt = db.parse(entry);
                                // Root element of the document.
                                Element element = dt.getDocumentElement();
                                System.out.println("根元素:" + element.getNodeName());

                                // Visit everything below the root element.
                                Dfs(element.getChildNodes(), writer);

                                num++;
                            } catch (Exception e) {
                                // A bad entry must not abort the whole run.
                                e.printStackTrace();
                            }
                        }
                    }

                    System.out.println(System.currentTimeMillis() - a + "    " + num);
                }
            } finally {
                // Always release the writer, even when indexing fails.
                writer.close();
            }
        } finally {
            dir.close();
        }
    }
}
View Code

 

lucene 3.0.2 + 多文件夹微博数据(时间,微博)构建索引

标签:

原文地址:http://www.cnblogs.com/XDJjy/p/4437539.html

(0)
(0)
   
举报
评论 一句话评论(0)
登录后才能评论!
© 2014 mamicode.com 版权所有  联系我们:gaon5@hotmail.com
迷上了代码!