标签:
package lia.meetlucene; import java.io.File; import java.io.IOException; import java.util.LinkedList; import javax.xml.parsers.DocumentBuilder; import javax.xml.parsers.DocumentBuilderFactory; import org.apache.lucene.analysis.cn.smart.SmartChineseAnalyzer; import org.apache.lucene.analysis.standard.StandardAnalyzer; import org.apache.lucene.document.Field; import org.apache.lucene.index.CorruptIndexException; //import org.apache.lucene.document.Document; import org.apache.lucene.index.IndexWriter; import org.apache.lucene.store.Directory; import org.apache.lucene.store.FSDirectory; import org.apache.lucene.util.Version; import org.w3c.dom.Document; import org.w3c.dom.Element; import org.w3c.dom.NodeList; public class Unicode1 { static boolean numTime = false; static boolean numText = false; static String timeTmp = null; static String textTmp = null; static void indexer(IndexWriter writer) throws CorruptIndexException, IOException { org.apache.lucene.document.Document doc = new org.apache.lucene.document.Document(); Field field = new Field("context",textTmp,Field.Store.YES, Field.Index.ANALYZED); doc.add(field); field = new Field("time",timeTmp,Field.Store.YES, Field.Index.NOT_ANALYZED); doc.add(field); writer.addDocument(doc); //System.out.println("微博: " + textTmp+ " "+timeTmp); } static void Dfs(NodeList nodecur,IndexWriter writer) { for (int j = 0; j < nodecur.getLength(); j++) { if ("timestamp".equals(nodecur.item(j).getNodeName())) // 输出pass { //System.out.println("时间: " + nodecur.item(j).getTextContent()); timeTmp = nodecur.item(j).getTextContent(); numTime = true; } /* * else if ("origtext".equals(nodecur.item(j).getNodeName())) * System.out.println("原微博: " + nodecur.item(j).getTextContent()); */ else if ("text".equals(nodecur.item(j).getNodeName())) // 输出code { //System.out.println("微博: " + nodecur.item(j).getTextContent()); textTmp = nodecur.item(j).getTextContent(); numText = true; } if(numText&&numTime) try { indexer(writer); numText = false; numTime = false; } catch (CorruptIndexException e) { // TODO Auto-generated catch block e.printStackTrace(); } catch (IOException e) { // TODO Auto-generated catch block e.printStackTrace(); } NodeList childNodes = nodecur.item(j).getChildNodes(); Dfs(childNodes,writer); } } public static void main(String[] args) throws IOException { long a = System.currentTimeMillis(); // String dataDir ="C:/Users/Administrator/Desktop/xdj/tengxun/A__Vae"; //File dataDir = new File("C:/Users/Administrator/Desktop/xdj/tengxun"); //String indexDir = "C:/Users/Administrator/Desktop/xdj/suoyin"; File dataDir = new File("E:/xdj/tengxun"); String indexDir = "E:/xdj/tengxunsuoying"; Directory dir = FSDirectory.open(new File(indexDir)); IndexWriter writer = new IndexWriter(dir, new SmartChineseAnalyzer(Version.LUCENE_20), //new StandardAnalyzer(Version.LUCENE_30), true, IndexWriter.MaxFieldLength.UNLIMITED); LinkedList list = new LinkedList(); File file[] = dataDir.listFiles(); for (int i = 0; i < file.length; i++) { if (file[i].isDirectory()) list.add(file[i]); } File tmp; int num = 0; while (!list.isEmpty()) { tmp = (File) list.removeFirst(); file = tmp.listFiles(); for (int i = 0; i < file.length; i++) { System.out.println(file[i].getAbsolutePath()); Element element = null; // documentBuilder为抽象不能直接实例化(将XML文件转换为DOM文件) DocumentBuilder db = null; DocumentBuilderFactory dbf = null; try { // 返回documentBuilderFactory对象 dbf = DocumentBuilderFactory.newInstance(); // 返回db对象用documentBuilderFatory对象获得返回documentBuildr对象 db = dbf.newDocumentBuilder(); // 得到一个DOM并返回给document对象 Document dt = db.parse(file[i]); // 得到一个elment根元素 element = dt.getDocumentElement(); // 获得根节点 System.out.println("根元素:" + element.getNodeName()); // 获得根元素下的子节点 Dfs(element.getChildNodes(),writer); num++; } catch (Exception e) { e.printStackTrace(); } } System.out.println(System.currentTimeMillis() - a + " " + num); } writer.close(); } }
lucene 3.0.2 + 多文件夹微博数据(时间,微博)构建索引
标签:
原文地址:http://www.cnblogs.com/XDJjy/p/4437539.html