标签:
首先创建索引:
package com.wp.util;

import java.io.File;
import java.io.FileReader;
import java.nio.file.Paths;

import org.apache.lucene.analysis.Analyzer;
import org.apache.lucene.analysis.standard.StandardAnalyzer;
import org.apache.lucene.document.Document;
import org.apache.lucene.document.Field;
import org.apache.lucene.document.TextField;
import org.apache.lucene.index.IndexWriter;
import org.apache.lucene.index.IndexWriterConfig;
import org.apache.lucene.store.Directory;
import org.apache.lucene.store.FSDirectory;

/**
 * Builds a Lucene index from the regular files found directly inside a data directory.
 *
 * <p>Each file becomes one document with three fields: {@code contents} (tokenized from the
 * file, not stored), {@code fileName} and {@code fullPath} (both stored in the index so they
 * can be read back from search hits).
 *
 * <p>The class is {@link AutoCloseable} so it can be used in a try-with-resources statement.
 */
public class Indexer implements AutoCloseable {

    /** Directory that holds the index files; closed together with the writer. */
    private final Directory dir;

    /** Writer used to add documents to the index. */
    private final IndexWriter writer;

    /**
     * Opens (or creates) the index stored at {@code indexDir} and prepares a writer for it.
     *
     * @param indexDir file-system path of the directory that stores the index
     * @throws Exception if the directory or the index writer cannot be opened
     */
    public Indexer(String indexDir) throws Exception {
        dir = FSDirectory.open(Paths.get(indexDir));
        try {
            // StandardAnalyzer is the general-purpose tokenizer; the config attaches it to the writer.
            Analyzer analyzer = new StandardAnalyzer();
            IndexWriterConfig iwc = new IndexWriterConfig(analyzer);
            writer = new IndexWriter(dir, iwc);
        } catch (Exception e) {
            // Do not leak the directory handle when the writer cannot be created.
            dir.close();
            throw e;
        }
    }

    /**
     * Closes the index writer and then the underlying directory.
     *
     * @throws Exception if closing either resource fails
     */
    @Override
    public void close() throws Exception {
        try {
            writer.close();
        } finally {
            dir.close();
        }
    }

    /**
     * Indexes every regular file located directly inside {@code dataDir}.
     * Subdirectories and other non-regular entries are skipped because their
     * contents cannot be read through a {@link FileReader}.
     *
     * @param dataDir path of the directory whose files should be indexed
     * @return the number of documents in the index after indexing
     * @throws IllegalArgumentException if {@code dataDir} is not a listable directory
     * @throws Exception if reading a file or writing to the index fails
     */
    public int index(String dataDir) throws Exception {
        File[] files = new File(dataDir).listFiles();
        if (files == null) {
            // listFiles() returns null when the path is not a directory or an I/O error occurs.
            throw new IllegalArgumentException("Cannot list files of data directory: " + dataDir);
        }
        for (File f : files) {
            if (f.isFile()) {
                indexFile(f);
            }
        }
        return writer.numDocs();
    }

    /**
     * Adds a single file to the index.
     *
     * @param f the file to index
     * @throws Exception if the file cannot be read or the document cannot be added
     */
    private void indexFile(File f) throws Exception {
        // getCanonicalPath() yields the absolute path with "." and ".." resolved.
        System.out.println("索引文件:" + f.getCanonicalPath());
        // NOTE(review): FileReader decodes with the platform default charset on older JDKs;
        // confirm that matches the encoding of the data files.
        try (FileReader contents = new FileReader(f)) {
            // The reader is consumed while the document is added, so it is safe to close afterwards.
            writer.addDocument(getDocument(f, contents));
        }
    }

    /**
     * Builds the Lucene document describing one file.
     *
     * @param f the file being indexed
     * @param contents an open reader over the file's text
     * @return a document with {@code contents}, {@code fileName} and {@code fullPath} fields
     * @throws Exception if the canonical path cannot be resolved
     */
    private Document getDocument(File f, FileReader contents) throws Exception {
        Document doc = new Document();
        // Reader-based field: tokenized for searching, but the text itself is not stored.
        doc.add(new TextField("contents", contents));
        // Field.Store.YES keeps the original value in the index so it can be retrieved from hits.
        doc.add(new TextField("fileName", f.getName(), Field.Store.YES));
        doc.add(new TextField("fullPath", f.getCanonicalPath(), Field.Store.YES));
        return doc;
    }

    /**
     * Indexes the sample data directory and prints how many documents the index holds
     * and how long the run took.
     *
     * @param args unused
     */
    public static void main(String[] args) {
        String indexDir = "D:\\lucene\\luceneIndex";
        String dataDir = "D:\\lucene\\data";
        int numIndexed = 0;
        long start = System.currentTimeMillis();
        // try-with-resources closes the indexer only if it was actually created,
        // avoiding a NullPointerException when the constructor fails.
        try (Indexer indexer = new Indexer(indexDir)) {
            numIndexed = indexer.index(dataDir);
        } catch (Exception e) {
            e.printStackTrace();
        }
        long end = System.currentTimeMillis();
        System.out.println("索引:" + numIndexed + " 个文件 花费了" + (end - start) + " 毫秒");
    }
}
增加的知识:
getPath():
返回的是定义时的路径,可能是相对路径,也可能是绝对路径,这个取决于定义时用的是相对路径还是绝对路径。如果定义时用的是绝对路径,那么使用getPath()返回的结果跟用getAbsolutePath()返回的结果一样
getAbsolutePath():
返回的是定义时的路径对应的绝对路径,但不会处理“.”和“..”的情况
getCanonicalPath():
返回的是规范化的绝对路径,相当于将getAbsolutePath()中的“.”和“..”解析成对应的正确的路径
搜索:
package com.wp.util;

import java.nio.file.Paths;

import org.apache.lucene.analysis.Analyzer;
import org.apache.lucene.analysis.standard.StandardAnalyzer;
import org.apache.lucene.document.Document;
import org.apache.lucene.index.DirectoryReader;
import org.apache.lucene.index.IndexReader;
import org.apache.lucene.index.Term;
import org.apache.lucene.queryparser.classic.QueryParser;
import org.apache.lucene.search.IndexSearcher;
import org.apache.lucene.search.Query;
import org.apache.lucene.search.ScoreDoc;
import org.apache.lucene.search.TermQuery;
import org.apache.lucene.search.TopDocs;
import org.apache.lucene.store.Directory;
import org.apache.lucene.store.FSDirectory;
import org.junit.After;
import org.junit.Before;
import org.junit.Test;

/**
 * Search examples that run against the index stored at {@code D:\lucene\luceneIndex}:
 * an exact term lookup and a parsed query expression.
 */
public class SearchTest {

    /** Directory holding the index files. */
    private Directory dir;

    /** Reader over the index. */
    private IndexReader reader;

    /** Searcher built on top of {@link #reader}. */
    private IndexSearcher is;

    /**
     * Opens the index directory, a reader over it, and a searcher on that reader.
     *
     * @throws Exception if the index cannot be opened
     */
    @Before
    public void setUp() throws Exception {
        dir = FSDirectory.open(Paths.get("D:\\lucene\\luceneIndex"));
        reader = DirectoryReader.open(dir);
        is = new IndexSearcher(reader);
    }

    /**
     * Releases the reader and the directory. Both are null-checked because
     * {@link #setUp()} may have failed before assigning them.
     *
     * @throws Exception if closing a resource fails
     */
    @After
    public void tearDown() throws Exception {
        try {
            if (reader != null) {
                reader.close();
            }
        } finally {
            if (dir != null) {
                dir.close();
            }
        }
    }

    /**
     * Searches for an exact term. A {@link TermQuery} matches only the whole indexed
     * term, not a substring of it.
     *
     * @throws Exception if the search fails
     */
    @Test
    public void testTermQuery() throws Exception {
        String searchField = "contents";
        String q = "particular";
        // Look for the term "particular" in the "contents" field.
        Term t = new Term(searchField, q);
        Query query = new TermQuery(t);
        // Retrieve at most the 10 best-scoring hits.
        TopDocs hits = is.search(query, 10);
        System.out.println("匹配 ‘" + q + "‘,总共查询到" + hits.totalHits + "个文档");
        for (ScoreDoc scoreDoc : hits.scoreDocs) {
            // Load the stored fields of the hit by its document id.
            Document doc = is.doc(scoreDoc.doc);
            System.out.println(doc.get("fullPath"));
        }
    }

    /**
     * Parses a query expression with the classic {@link QueryParser} and prints the hits.
     *
     * <p>Other expressions worth trying: {@code "particular AND SA"} requires both terms
     * (boolean operators must be upper case); NOTE(review): the earlier example
     * {@code "particular or a"} uses a lower-case "or", which is presumably parsed as a
     * plain term rather than an operator.
     *
     * @throws Exception if parsing or searching fails
     */
    @Test
    public void testQueryParser() throws Exception {
        String searchField = "contents";
        // A trailing "~" requests a fuzzy match, i.e. terms similar to "particular".
        String q = "particular~";
        // The analyzer is closeable; release it once the query has been parsed and run.
        try (Analyzer analyzer = new StandardAnalyzer()) {
            QueryParser parser = new QueryParser(searchField, analyzer);
            Query query = parser.parse(q);
            TopDocs hits = is.search(query, 100);
            System.out.println("匹配 " + q + "查询到" + hits.totalHits + "个记录");
            for (ScoreDoc scoreDoc : hits.scoreDocs) {
                Document doc = is.doc(scoreDoc.doc);
                System.out.println(doc.get("fullPath"));
            }
        }
    }
}
标签:
原文地址:http://www.cnblogs.com/lirenzhujiu/p/5914034.html