花了一段时间学习Lucene,今天有时间把所学的写下来。网上有很多文章,但大部分都是2.X和3.X版本的(当前最新版本4.9),希望这篇文章对自己和初学者有所帮助。
private String filePath = "F:/myEclipse10/workspace/luceneTest/src/resource.txt";// location of the source text file
private String indexDir = "F:/myEclipse10/workspace/luceneTest/src/index";// index directory
private static final Version VERSION = Version.LUCENE_47;// Lucene version
3、创建索引方法
/** * 创建索引 * * @throws IOException */ @Test public void createIndex() throws IOException { Directory director = FSDirectory.open(new File(indexDir));// 创建Directory关联源文件 Analyzer analyzer = new StandardAnalyzer(VERSION);// 创建一个分词器 IndexWriterConfig indexWriterConfig = new IndexWriterConfig(VERSION, analyzer);// 创建索引的配置信息 IndexWriter indexWriter = new IndexWriter(director, indexWriterConfig); Document doc = new Document();// 创建文档 String str = fileToString();// 读取txt中内容 Field field1 = new StringField("title", "lucene测试", Store.YES);// 标题 StringField索引存储不分词 Field field2 = new TextField("content", str, Store.NO);// 内容 TextField索引分词不存储 Field field3 = new DoubleField("version", 1.2, Store.YES);// 版本 DoubleField类型 Field field4 = new IntField("score", 90, Store.YES);// 评分 IntField类型 doc.add(field1);// 添加field域到文档中 doc.add(field2); doc.add(field3); doc.add(field4); indexWriter.addDocument(doc);// 添加文本到索引中 indexWriter.close();// 关闭索引 }4、查询搜索方法
/** * 查询搜索 * * @throws IOException * @throws ParseException */ @Test public void query() throws IOException, ParseException { IndexReader reader = DirectoryReader.open(FSDirectory.open(new File(indexDir)));// 索引读取类 IndexSearcher search = new IndexSearcher(reader);// 搜索入口工具类 String queryStr = "life";// 搜索关键字 QueryParser queryParser = new QueryParser(VERSION, "content", new StandardAnalyzer(VERSION));// 实例查询条件类 Query query = queryParser.parse(queryStr); TopDocs topdocs = search.search(query, 100);// 查询前100条 System.out.println("查询结果总数---" + topdocs.totalHits); ScoreDoc scores[] = topdocs.scoreDocs;// 得到所有结果集 for (int i = 0; i < scores.length; i++) { int num = scores[i].doc;// 得到文档id Document document = search.doc(num);// 拿到指定的文档 System.out.println("内容====" + document.get("content"));// 由于内容没有存储所以执行结果为null System.out.println("标题====" + document.get("title")); System.out.println("版本====" + document.get("version")); System.out.println("评分====" + document.get("score")); System.out.println("id--" + num + "---scors--" + scores[i].score + "---index--" + scores[i].shardIndex); } }
/**
 * Reads the whole text file at {@code filePath} into a string.
 *
 * <p>Each line is followed by a line feed so that the last word of one line and the
 * first word of the next stay separate tokens when the text is analyzed.
 *
 * @return the file content, one {@code '\n'} after every line
 * @throws IOException if the file cannot be opened or read
 */
public String fileToString() throws IOException {
    StringBuilder sb = new StringBuilder();
    // NOTE(review): decodes with the platform default charset, as before; pass an explicit
    // charset to InputStreamReader once the encoding of the source file is confirmed.
    BufferedReader br = new BufferedReader(new InputStreamReader(new FileInputStream(new File(filePath))));
    try {
        String line;
        while ((line = br.readLine()) != null) {
            sb.append(line).append('\n');
        }
    } finally {
        // Closing the outer reader also closes the wrapped streams, even when reading fails.
        br.close();
    }
    return sb.toString();
}
public class AnalyzerTest { private static final Version VERSION = Version.LUCENE_47;// lucene版本 @Test public void test() throws IOException { String txt = "我是中国人"; Analyzer analyzer1 = new StandardAnalyzer(VERSION);// 标准分词器 // Analyzer analyzer2 = new SimpleAnalyzer(VERSION);// 简单分词器 // Analyzer analyzer3 = new CJKAnalyzer(VERSION);// 二元切分 // Analyzer analyzer4 = new IKAnalyzer(false);// 语意分词 TokenStream tokenstream = analyzer1.tokenStream("content", new StringReader(txt));// 生成一个分词流 // TokenStream tokenstream = analyzer2.tokenStream("content", new StringReader(txt)); // TokenStream tokenstream = analyzer3.tokenStream("content", new StringReader(txt)); // TokenStream tokenstream = analyzer4.tokenStream("content", new StringReader(txt)); CharTermAttribute termAttribute = tokenstream.addAttribute(CharTermAttribute.class);// 为token设置属性类 tokenstream.reset();// 重新设置 while (tokenstream.incrementToken()) {// 遍历得到token System.out.print(new String(termAttribute.buffer(), 0, termAttribute.length()) + " "); } } }
public class MultiseQueryTest { private String indexDir = "F:/myEclipse10/workspace/luceneTest/src/index";// 索引目录 private static final Version VERSION = Version.LUCENE_47;// lucene版本 /** * 多条件查询 查询内容必须包含life内容和评分大于等于80分的结果 * * @throws IOException * @throws ParseException */ @Test public void query() throws IOException, ParseException { IndexReader reader = DirectoryReader.open(FSDirectory.open(new File(indexDir)));// 索引读取类 IndexSearcher search = new IndexSearcher(reader);// 搜索入口工具类 String queryStr1 = "life";// 搜索关键字 BooleanQuery booleanQuery = new BooleanQuery(); // 条件一内容中必须要有life内容 QueryParser queryParser = new QueryParser(VERSION, "content", new StandardAnalyzer(VERSION));// 实例查询条件类 Query query1 = queryParser.parse(queryStr1); // 条件二评分大于等于80 Query query2 = NumericRangeQuery.newIntRange("score", 80, null, true, false); booleanQuery.add(query1, BooleanClause.Occur.MUST); booleanQuery.add(query2, BooleanClause.Occur.MUST); TopDocs topdocs = search.search(booleanQuery, 100);// 查询前100条 System.out.println("查询结果总数---" + topdocs.totalHits); ScoreDoc scores[] = topdocs.scoreDocs;// 得到所有结果集 for (int i = 0; i < scores.length; i++) { int num = scores[i].doc;// 得到文档id Document document = search.doc(num);// 拿到指定的文档 System.out.println("内容====" + document.get("content"));// 由于内容没有存储所以执行结果为null System.out.println("标题====" + document.get("title")); System.out.println("版本====" + document.get("version")); System.out.println("评分====" + document.get("score")); System.out.println("id--" + num + "---scors--" + scores[i].score + "---index--" + scores[i].shardIndex); } } }运行结果就不粘上去了。本例子运用了NumericRangeQuery创建一个查询条件(还有很多其他的类有兴趣可以一一实验下),五个参数分别为字段域、最小值、最大值、是否包含最小值、是否包含最大值。一开始很迷茫为什么必须要设置最大值和最小值呢?如果我是单范围查询呢?后来看api才发现单范围时可以用null或者把是否包含范围值设为false就行了。
/** * 修改索引 * * @throws IOException */ @Test public void updateIndex() throws IOException { Directory director = FSDirectory.open(new File(indexDir));// 创建Directory关联源文件 Analyzer analyzer = new StandardAnalyzer(VERSION);// 创建一个分词器 IndexWriterConfig indexWriterConfig = new IndexWriterConfig(VERSION, analyzer);// 创建索引的配置信息 IndexWriter indexWriter = new IndexWriter(director, indexWriterConfig); Document doc = new Document();// 创建文档 Field field1 = new StringField("title", "lucene", Store.YES);// 标题 StringField索引存储不分词 Field field2 = new TextField("content", "Is there life on Mars", Store.NO);// 内容 TextField索引分词不存储 Field field3 = new DoubleField("version", 2.0, Store.YES);// 版本 DoubleField类型 Field field4 = new IntField("score", 90, Store.YES);// 评分 IntField类型 doc.add(field1);// 添加field域到文档中 doc.add(field2); doc.add(field3); doc.add(field4); indexWriter.updateDocument(new Term("title", "lucene测试"), doc); indexWriter.commit(); indexWriter.close(); }运行结果:前后对比
/** * 删除索引 * * @throws IOException */ @Test public void deleteIndex() throws IOException { Directory director = FSDirectory.open(new File(indexDir));// 创建Directory关联源文件 Analyzer analyzer = new StandardAnalyzer(VERSION);// 创建一个分词器 IndexWriterConfig indexWriterConfig = new IndexWriterConfig(VERSION, analyzer);// 创建索引的配置信息 IndexWriter indexWriter = new IndexWriter(director, indexWriterConfig); indexWriter.deleteDocuments(new Term("title", "lucene")); indexWriter.commit(); // indexWriter.rollback(); indexWriter.close(); }
/** * 优化 * * @throws IOException */ @Test public void optimize() throws IOException { Directory director = FSDirectory.open(new File(indexDir));// 创建Directory关联源文件 Analyzer analyzer = new StandardAnalyzer(VERSION);// 创建一个分词器 IndexWriterConfig indexWriterConfig = new IndexWriterConfig(VERSION, analyzer);// 创建索引的配置信息 IndexWriter indexWriter = new IndexWriter(director, indexWriterConfig); indexWriter.forceMerge(1);// 当小文件达到多少个时,就自动合并多个小文件为一个大文件 indexWriter.close(); }(2)排序lucene lucene默认情况下是根据“评分机制”来进行排序的,也就是scores[i].score属性值。如果两个文档得分相同,那么就按照发布时间倒序排列;否则就按照分数排列。
/** * 创建索引 * * @throws IOException */ @Test public void createIndex() throws IOException { Directory director = FSDirectory.open(new File(indexDir));// 创建Directory关联源文件 Analyzer analyzer = new StandardAnalyzer(VERSION);// 创建一个分词器 IndexWriterConfig indexWriterConfig = new IndexWriterConfig(VERSION, analyzer);// 创建索引的配置信息 IndexWriter indexWriter = new IndexWriter(director, indexWriterConfig); for (int i = 1; i <= 5; i++) { Document doc = new Document();// 创建文档 Field field1 = new StringField("title", "标题" + i, Store.YES);// 标题 StringField索引存储不分词 Field field2 = new TextField("content", "201" + i + "文章内容", Store.NO);// 内容 TextField索引分词不存储 Field field3 = new DoubleField("version", 1.2, Store.YES);// 版本 DoubleField类型 Field field4 = new IntField("score", 90 + i, Store.YES);// 评分 IntField类型 Field field5 = new StringField("date", "2014-07-0" + i, Store.YES);// 评分 IntField类型 doc.add(field1);// 添加field域到文档中 doc.add(field2); doc.add(field3); doc.add(field4); doc.add(field5); indexWriter.addDocument(doc);// 添加文本到索引中 } indexWriter.close();// 关闭索引 }
/** * 排序 * * @throws IOException * @throws ParseException */ @Test public void defaultSortTest() throws IOException, ParseException { IndexReader reader = DirectoryReader.open(FSDirectory.open(new File(indexDir)));// 索引读取类 IndexSearcher search = new IndexSearcher(reader);// 搜索入口工具类 String queryStr = "文章";// 搜索关键字 QueryParser queryParser = new QueryParser(VERSION, "content", new StandardAnalyzer(VERSION));// 实例查询条件类 Query query = queryParser.parse(queryStr); TopDocs topdocs = search.search(query, 100);// 查询前100条 System.out.println("查询结果总数---" + topdocs.totalHits); ScoreDoc scores[] = topdocs.scoreDocs;// 得到所有结果集 for (int i = 0; i < scores.length; i++) { int num = scores[i].doc;// 得到文档id Document document = search.doc(num);// 拿到指定的文档 System.out.println("内容====" + document.get("content"));// 由于内容没有存储所以执行结果为null System.out.println("标题====" + document.get("title")); System.out.println("版本====" + document.get("version")); System.out.println("评分====" + document.get("score")); System.out.println("日期====" + document.get("date")); System.out.println("id--" + num + "---scors--" + scores[i].score + "---index--" + scores[i].shardIndex); } } }
// Order hits by the int field "score"; the last argument is the reverse flag
// (false = ascending, true = descending).
SortField scoreDescending = new SortField("score", SortField.Type.INT, true);
Sort sort = new Sort(scoreDescending);
// Fetch the top 100 hits using that sort.
TopDocs topdocs = search.search(query, 100, sort);
/** * 高亮 * * @throws IOException * @throws ParseException * @throws InvalidTokenOffsetsException */ @Test public void highlighter() throws IOException, ParseException, InvalidTokenOffsetsException { IndexReader reader = DirectoryReader.open(FSDirectory.open(new File(indexDir)));// 索引读取类 IndexSearcher search = new IndexSearcher(reader);// 搜索入口工具类 Analyzer analyzer = new StandardAnalyzer(Version.LUCENE_47);// 分词器 QueryParser qp = new QueryParser(Version.LUCENE_47, "content", analyzer);// 实例查询条件类 Query query = qp.parse("文章"); TopDocs topDocs = search.search(query, 100);// 查询前100条 System.out.println("共查询出:" + topDocs.totalHits + "条数据"); ScoreDoc scoreDoc[] = topDocs.scoreDocs;// 结果集 // 高亮 Formatter formatter = new SimpleHTMLFormatter("<font color='red'>", "</font>");// 高亮html格式 Scorer score = new QueryScorer(query);// 检索评份 Fragmenter fragmenter = new SimpleFragmenter(100);// 设置最大片断为100 Highlighter highlighter = new Highlighter(formatter, score);// 高亮显示类 highlighter.setTextFragmenter(fragmenter);// 设置格式 for (int i = 0; i < scoreDoc.length; i++) {// 遍历结果集 int docnum = scoreDoc[i].doc; Document doc = search.doc(docnum); String content = doc.get("content"); System.out.println(content);// 原内容 if (content != null) { TokenStream tokenStream = analyzer.tokenStream("content", new StringReader(content)); String str = highlighter.getBestFragment(tokenStream, content);// 得到高亮显示后的内容 System.out.println(str); } } }
/** * 分页 * * @throws IOException * @throws ParseException */ @Test public void pageTest() throws IOException, ParseException { IndexReader reader = DirectoryReader.open(FSDirectory.open(new File(indexDir)));// 索引读取类 IndexSearcher search = new IndexSearcher(reader);// 搜索入口工具类 String queryStr = "文章";// 搜索关键字 QueryParser queryParser = new QueryParser(VERSION, "content", new StandardAnalyzer(VERSION));// 实例查询条件类 Query query = queryParser.parse(queryStr);// 查询 TopScoreDocCollector results = TopScoreDocCollector.create(100, false);// 结果集 search.search(query, results);// 查询前100条 TopDocs topdocs = results.topDocs(1, 2);// 从结果集中第1条开始取2条 ScoreDoc scores[] = topdocs.scoreDocs;// 得到所有结果集 for (int i = 0; i < scores.length; i++) { int num = scores[i].doc;// 得到文档id Document document = search.doc(num);// 拿到指定的文档 System.out.println("内容====" + document.get("content"));// 由于内容没有存储所以执行结果为null System.out.println("标题====" + document.get("title")); System.out.println("版本====" + document.get("version")); System.out.println("评分====" + document.get("score")); System.out.println("id--" + num + "---scors--" + scores[i].score + "---index--" + scores[i].shardIndex); } }
原文地址:http://blog.csdn.net/mdcmy/article/details/38167955