码迷,mamicode.com
首页 > Web开发 > 详细

Lucene实战-Indexer索引创建

时间:2014-12-22 16:14:01      阅读:128      评论:0      收藏:0      [点我收藏+]

标签:

package com.lin.util;

import java.io.File;
import java.io.FileFilter;
import java.io.FileReader;
import java.io.IOException;

import org.apache.lucene.document.Document;
import org.apache.lucene.document.Field;
import org.apache.lucene.index.IndexWriter;
import org.apache.lucene.index.IndexWriterConfig;
import org.apache.lucene.store.Directory;
import org.apache.lucene.store.FSDirectory;
import org.apache.lucene.util.Version;
import org.wltea.analyzer.lucene.IKAnalyzer;

/**
 * Builds a Lucene index from the plain-text files found directly inside a
 * data directory (sub-directories are not traversed).
 *
 * <p>Each accepted file becomes one document with three fields:
 * {@code contents} (tokenized from the file's text), {@code filename} and
 * {@code fullpath} (both stored and not analyzed).
 */
public class Indexer {

	/** Directory that holds the index; kept so that {@link #close()} can release it. */
	private final Directory directory;

	/** Writer through which every document is added to the index. */
	private final IndexWriter writer;

	/**
	 * Indexes all {@code .txt} files of {@code dataDir} into the index located
	 * at {@code indexDir} and prints how many files were indexed and how long
	 * it took.
	 *
	 * @param indexDir
	 *            location of the index
	 * @param dataDir
	 *            directory containing the files to index
	 * @throws IllegalArgumentException
	 *             if either argument is {@code null}
	 * @throws Exception
	 *             if the index cannot be opened, written or closed
	 */
	public static void index(String indexDir, String dataDir) throws Exception {
		if (indexDir == null || dataDir == null) {
			throw new IllegalArgumentException("请检查你的参数是否正确");
		}
		long start = System.currentTimeMillis();
		Indexer indexer = new Indexer(indexDir);
		int numIndexed;
		try {
			numIndexed = indexer.index(dataDir, new TextFilesFilter());
		} finally {
			// Always release the writer (and its lock), even when indexing fails.
			indexer.close();
		}
		long end = System.currentTimeMillis();
		System.out.println("Indexing " + numIndexed + " files took "
				+ (end - start) + " milliseconds");
	}

	/**
	 * Opens the index directory and creates the writer used to build the index.
	 *
	 * @param indexDir
	 *            location of the index
	 * @throws IOException
	 *             if the directory or the writer cannot be opened
	 */
	private Indexer(String indexDir) throws IOException {
		directory = FSDirectory.open(new File(indexDir));
		boolean opened = false;
		try {
			IndexWriterConfig config = new IndexWriterConfig(
					Version.LUCENE_4_10_2, new IKAnalyzer());
			writer = new IndexWriter(directory, config);
			opened = true;
		} finally {
			if (!opened) {
				// The writer never took ownership; do not leave the directory open.
				directory.close();
			}
		}
	}

	/**
	 * Indexes every regular, visible, readable file located directly in
	 * {@code dataDir} that the filter accepts.
	 *
	 * @param dataDir
	 *            directory containing the files to index
	 * @param filter
	 *            decides which files are indexed; {@code null} accepts all
	 * @return the number of files indexed by this call
	 * @throws IOException
	 *             if {@code dataDir} cannot be listed (e.g. it does not exist
	 *             or is not a directory) or a file cannot be indexed
	 */
	public int index(String dataDir, FileFilter filter) throws IOException {
		File[] files = new File(dataDir).listFiles();
		if (files == null) {
			// listFiles() signals "not a directory" / I/O failure with null.
			throw new IOException("Cannot list files in data directory: "
					+ dataDir);
		}
		// Count locally: writer.numDocs() would also include documents that
		// were already in the index before this call.
		int indexed = 0;
		for (File f : files) {
			// canRead() is false for non-existent files, so no exists() check.
			if (!f.isDirectory() && !f.isHidden() && f.canRead()
					&& (filter == null || filter.accept(f))) {
				indexFile(f);
				indexed++;
			}
		}
		return indexed;
	}

	/**
	 * Logs the file's path and adds the file to the index as one document.
	 *
	 * @param f
	 *            file to index
	 * @throws IOException
	 *             if the file cannot be read or the document cannot be added
	 */
	private void indexFile(File f) throws IOException {
		System.out.println("indexing " + f.getCanonicalPath());
		Document doc = getDocument(f);
		writer.addDocument(doc);
	}

	/**
	 * Builds the document that represents a file in the index.
	 *
	 * @param f
	 *            file to convert
	 * @return a document with the fields {@code contents}, {@code filename}
	 *         and {@code fullpath}
	 * @throws IOException
	 *             if the file cannot be opened or its canonical path cannot be
	 *             resolved
	 */
	@SuppressWarnings("deprecation")
	protected Document getDocument(File f) throws IOException {
		// Resolve everything that can throw before opening the reader, so a
		// failure here cannot leave an open file handle behind.
		String fileName = f.getName();
		String fullPath = f.getCanonicalPath();

		Document doc = new Document();
		// NOTE(review): FileReader decodes with the platform default charset,
		// and the reader is handed over to Lucene, which is presumably
		// responsible for closing it once the document is added — confirm.
		doc.add(new Field("contents", new FileReader(f)));
		doc.add(new Field("filename", fileName, Field.Store.YES,
				Field.Index.NOT_ANALYZED));
		doc.add(new Field("fullpath", fullPath, Field.Store.YES,
				Field.Index.NOT_ANALYZED));
		return doc;
	}

	/**
	 * File filter that accepts only files whose name ends with {@code .txt}
	 * (case-insensitive).
	 *
	 * @author zan
	 */
	private static class TextFilesFilter implements FileFilter {

		@Override
		public boolean accept(File f) {
			return f.getName().toLowerCase().endsWith(".txt");
		}

	}

	/**
	 * Closes the writer and then the underlying index directory.
	 *
	 * @throws IOException
	 *             if closing the writer or the directory fails
	 */
	public void close() throws IOException {
		try {
			writer.close();
		} finally {
			directory.close();
		}
	}

	/**
	 * Command-line entry point.
	 *
	 * @param args
	 *            optional: {@code args[0]} is the index location and
	 *            {@code args[1]} the data directory; when fewer than two
	 *            arguments are given the built-in default paths are used
	 * @throws Exception
	 *             if indexing fails
	 */
	public static void main(String[] args) throws Exception {
		String indexDir = "d:\\index";
		String dataDir = "D:\\Program Files\\TortoiseSVN";
		if (args != null && args.length >= 2) {
			indexDir = args[0];
			dataDir = args[1];
		}
		Indexer.index(indexDir, dataDir);
	}
}

Lucene实战-Indexer索引创建

标签:

原文地址:http://blog.csdn.net/hackcoder/article/details/42080877

(0)
(0)
   
举报
评论 一句话评论(0)
登录后才能评论!
© 2014 mamicode.com 版权所有  联系我们:gaon5@hotmail.com
迷上了代码!