花了一段时间学习lucene,今天有时间把所学的写下来。网上有很多文章,但大部分都是2.X和3.X版本的(当前最新版本4.9),希望这篇文章对自己和初学者有所帮助。
private String filePath = "F:/myEclipse10/workspace/luceneTest/src/resource.txt";// 源文件所在位置 private String indexDir = "F:/myEclipse10/workspace/luceneTest/src/index";// 索引目录 private static final Version VERSION = Version.LUCENE_47;// lucene版本3、创建索引方法
/**
 * Builds the index: adds one document with the fields title, content, version
 * and score to the index stored under {@code indexDir}.
 *
 * @throws IOException if the source file cannot be read or the index cannot be written
 */
@Test
public void createIndex() throws IOException {
    Directory director = FSDirectory.open(new File(indexDir));// the on-disk index directory
    try {
        Analyzer analyzer = new StandardAnalyzer(VERSION);// tokenizer used for analyzed fields
        IndexWriterConfig indexWriterConfig = new IndexWriterConfig(VERSION, analyzer);// writer configuration
        IndexWriter indexWriter = new IndexWriter(director, indexWriterConfig);
        try {
            String str = fileToString();// text of the source file
            Document doc = new Document();
            doc.add(new StringField("title", "lucene测试", Store.YES));// title: indexed without tokenizing, stored
            doc.add(new TextField("content", str, Store.NO));// content: tokenized and indexed, not stored
            doc.add(new DoubleField("version", 1.2, Store.YES));// version: numeric double field
            doc.add(new IntField("score", 90, Store.YES));// score: numeric int field
            indexWriter.addDocument(doc);
        } finally {
            // Close the writer even when reading the file or adding the document fails.
            indexWriter.close();
        }
    } finally {
        director.close();
    }
}
4、查询搜索方法/**
* 查询搜索
*
* @throws IOException
* @throws ParseException
*/
// Searches the "content" field for the keyword "life" and prints the
// stored fields of up to 100 hits.
@Test
public void query() throws IOException, ParseException {
    IndexReader reader = DirectoryReader.open(FSDirectory.open(new File(indexDir)));// reads the index
    try {
        IndexSearcher search = new IndexSearcher(reader);// entry point for searching
        String queryStr = "life";// search keyword
        QueryParser queryParser = new QueryParser(VERSION, "content", new StandardAnalyzer(VERSION));
        Query query = queryParser.parse(queryStr);
        TopDocs topdocs = search.search(query, 100);// fetch at most 100 hits
        System.out.println("查询结果总数---" + topdocs.totalHits);
        for (ScoreDoc hit : topdocs.scoreDocs) {
            int num = hit.doc;// document id of this hit
            Document document = search.doc(num);// load the stored fields of that document
            // createIndex() adds "content" with Store.NO, so this prints null for such documents.
            System.out.println("内容====" + document.get("content"));
            System.out.println("标题====" + document.get("title"));
            System.out.println("版本====" + document.get("version"));
            System.out.println("评分====" + document.get("score"));
            System.out.println("id--" + num + "---scors--" + hit.score + "---index--" + hit.shardIndex);
        }
    } finally {
        // The original never closed the reader; release it on every path.
        reader.close();
    }
}
/**
* 读取文件的内容
*
* @return
* @throws IOException
*/
// Reads the file at filePath and returns its text.
// Each line is followed by '\n' in the result: readLine() strips the line
// terminator, and without a separator the last word of one line would be
// glued to the first word of the next.
// Throws IOException if the file cannot be opened or read.
public String fileToString() throws IOException {
    StringBuilder sb = new StringBuilder();
    // NOTE(review): decodes with the platform default charset, exactly as before;
    // confirm the encoding of the source file and pass it explicitly if it differs.
    BufferedReader br = new BufferedReader(new InputStreamReader(new FileInputStream(new File(filePath))));
    try {
        String line;
        while ((line = br.readLine()) != null) {
            sb.append(line).append('\n');
        }
    } finally {
        // Closing the outer reader also closes the wrapped streams.
        br.close();
    }
    return sb.toString();
}
public class AnalyzerTest {
private static final Version VERSION = Version.LUCENE_47;// Lucene compatibility version

// Prints, separated by spaces, the tokens an analyzer produces for a short
// Chinese sentence. Swap in one of the commented-out analyzers to compare.
@Test
public void test() throws IOException {
    String txt = "我是中国人";
    Analyzer analyzer1 = new StandardAnalyzer(VERSION);// standard analyzer
    // Analyzer analyzer2 = new SimpleAnalyzer(VERSION);// simple analyzer
    // Analyzer analyzer3 = new CJKAnalyzer(VERSION);// bigram (two-character) splitting
    // Analyzer analyzer4 = new IKAnalyzer(false);// IK analyzer (the author calls it semantic segmentation)
    TokenStream tokenstream = analyzer1.tokenStream("content", new StringReader(txt));// token stream for the text
    // TokenStream tokenstream = analyzer2.tokenStream("content", new StringReader(txt));
    // TokenStream tokenstream = analyzer3.tokenStream("content", new StringReader(txt));
    // TokenStream tokenstream = analyzer4.tokenStream("content", new StringReader(txt));
    try {
        CharTermAttribute termAttribute = tokenstream.addAttribute(CharTermAttribute.class);// exposes the term text
        tokenstream.reset();// must be called before the first incrementToken()
        while (tokenstream.incrementToken()) {// advance token by token
            System.out.print(termAttribute.toString() + " ");
        }
        tokenstream.end();// finish the stream after the last token
    } finally {
        tokenstream.close();// release the stream's resources
    }
}
}public class MultiseQueryTest {
private String indexDir = "F:/myEclipse10/workspace/luceneTest/src/index";// index directory
private static final Version VERSION = Version.LUCENE_47;// Lucene compatibility version
/**
 * Multi-condition search: a hit must match "life" in the content field AND
 * have a score greater than or equal to 80.
 *
 * @throws IOException if the index cannot be opened or read
 * @throws ParseException if the keyword cannot be parsed into a query
 */
@Test
public void query() throws IOException, ParseException {
    IndexReader reader = DirectoryReader.open(FSDirectory.open(new File(indexDir)));// reads the index
    try {
        IndexSearcher search = new IndexSearcher(reader);// entry point for searching
        String queryStr1 = "life";// search keyword
        BooleanQuery booleanQuery = new BooleanQuery();
        // Condition 1: the content must match "life".
        QueryParser queryParser = new QueryParser(VERSION, "content", new StandardAnalyzer(VERSION));
        Query query1 = queryParser.parse(queryStr1);
        // Condition 2: score >= 80; the null upper bound leaves the range open at the top.
        Query query2 = NumericRangeQuery.newIntRange("score", 80, null, true, false);
        booleanQuery.add(query1, BooleanClause.Occur.MUST);
        booleanQuery.add(query2, BooleanClause.Occur.MUST);
        TopDocs topdocs = search.search(booleanQuery, 100);// fetch at most 100 hits
        System.out.println("查询结果总数---" + topdocs.totalHits);
        for (ScoreDoc hit : topdocs.scoreDocs) {
            int num = hit.doc;// document id of this hit
            Document document = search.doc(num);// load the stored fields of that document
            // Prints null when the content field was indexed without being stored.
            System.out.println("内容====" + document.get("content"));
            System.out.println("标题====" + document.get("title"));
            System.out.println("版本====" + document.get("version"));
            System.out.println("评分====" + document.get("score"));
            System.out.println("id--" + num + "---scors--" + hit.score + "---index--" + hit.shardIndex);
        }
    } finally {
        // The original never closed the reader; release it on every path.
        reader.close();
    }
}
}运行结果就不粘上去了。本例子运用了NumericRangeQuery创建一个查询条件(还有很多其他的类,有兴趣可以一一实验下),五个参数分别为字段域、最小值、最大值、是否包含最小值、是否包含最大值。一开始很迷茫:为什么必须要设置最大值和最小值呢?如果我是单范围查询呢?后来看api才发现,单范围查询时把不需要的那一端(最小值或最大值)传null即可,此时该端对应的“是否包含”参数不起作用。/**
* 修改索引
*
* @throws IOException
*/
// Replaces the document whose title term is exactly "lucene测试" with a new
// document (title "lucene", version 2.0).
@Test
public void updateIndex() throws IOException {
    Directory director = FSDirectory.open(new File(indexDir));// the on-disk index directory
    try {
        Analyzer analyzer = new StandardAnalyzer(VERSION);// tokenizer used for analyzed fields
        IndexWriterConfig indexWriterConfig = new IndexWriterConfig(VERSION, analyzer);// writer configuration
        IndexWriter indexWriter = new IndexWriter(director, indexWriterConfig);
        try {
            Document doc = new Document();
            doc.add(new StringField("title", "lucene", Store.YES));// title: indexed without tokenizing, stored
            doc.add(new TextField("content", "Is there life on Mars", Store.NO));// content: tokenized, not stored
            doc.add(new DoubleField("version", 2.0, Store.YES));// version: numeric double field
            doc.add(new IntField("score", 90, Store.YES));// score: numeric int field
            // Deletes every document containing the term, then adds the new document.
            indexWriter.updateDocument(new Term("title", "lucene测试"), doc);
            indexWriter.commit();
        } finally {
            // Close the writer even when the update fails.
            indexWriter.close();
        }
    } finally {
        director.close();
    }
}
运行结果:/**
* 删除索引
*
* @throws IOException
*/
// Deletes every document whose title term is exactly "lucene" and commits.
@Test
public void deleteIndex() throws IOException {
    Directory director = FSDirectory.open(new File(indexDir));// the on-disk index directory
    try {
        Analyzer analyzer = new StandardAnalyzer(VERSION);// tokenizer used for analyzed fields
        IndexWriterConfig indexWriterConfig = new IndexWriterConfig(VERSION, analyzer);// writer configuration
        IndexWriter indexWriter = new IndexWriter(director, indexWriterConfig);
        try {
            indexWriter.deleteDocuments(new Term("title", "lucene"));
            indexWriter.commit();
            // indexWriter.rollback();// alternative to commit(): discard the pending delete
        } finally {
            // Close the writer even when the delete fails.
            indexWriter.close();
        }
    } finally {
        director.close();
    }
}
/**
* 优化
*
* @throws IOException
*/
// Optimizes the index by merging its segments.
@Test
public void optimize() throws IOException {
    Directory director = FSDirectory.open(new File(indexDir));// the on-disk index directory
    try {
        Analyzer analyzer = new StandardAnalyzer(VERSION);// tokenizer used for analyzed fields
        IndexWriterConfig indexWriterConfig = new IndexWriterConfig(VERSION, analyzer);// writer configuration
        IndexWriter indexWriter = new IndexWriter(director, indexWriterConfig);
        try {
            // The argument is the maximum number of segments to keep, not a trigger
            // threshold: 1 merges the whole index down to a single segment.
            indexWriter.forceMerge(1);
        } finally {
            // Close the writer even when the merge fails.
            indexWriter.close();
        }
    } finally {
        director.close();
    }
}
(2)排序lucene lucene默认情况下是根据“评分机制”来进行排序的,也就是scores[i].score属性值。如果两个文档得分相同,那么就按照发布时间倒序排列;否则就按照分数排列。/**
* 创建索引
*
* @throws IOException
*/
// Builds the index used by the sorting examples: five documents whose title,
// content, score and date vary with the loop counter i (1..5).
@Test
public void createIndex() throws IOException {
    Directory director = FSDirectory.open(new File(indexDir));// the on-disk index directory
    try {
        Analyzer analyzer = new StandardAnalyzer(VERSION);// tokenizer used for analyzed fields
        IndexWriterConfig indexWriterConfig = new IndexWriterConfig(VERSION, analyzer);// writer configuration
        IndexWriter indexWriter = new IndexWriter(director, indexWriterConfig);
        try {
            for (int i = 1; i <= 5; i++) {
                Document doc = new Document();
                doc.add(new StringField("title", "标题" + i, Store.YES));// title: indexed without tokenizing, stored
                doc.add(new TextField("content", "201" + i + "文章内容", Store.NO));// content: tokenized, not stored
                doc.add(new DoubleField("version", 1.2, Store.YES));// version: numeric double field
                doc.add(new IntField("score", 90 + i, Store.YES));// score: numeric int field, 91..95
                doc.add(new StringField("date", "2014-07-0" + i, Store.YES));// date: kept as an untokenized string
                indexWriter.addDocument(doc);
            }
        } finally {
            // Close the writer even when adding a document fails.
            indexWriter.close();
        }
    } finally {
        director.close();
    }
}
/**
* 排序
*
* @throws IOException
* @throws ParseException
*/
// Searches the "content" field for "文章" with the default ordering and
// prints the stored fields of up to 100 hits.
@Test
public void defaultSortTest() throws IOException, ParseException {
    IndexReader reader = DirectoryReader.open(FSDirectory.open(new File(indexDir)));// reads the index
    try {
        IndexSearcher search = new IndexSearcher(reader);// entry point for searching
        String queryStr = "文章";// search keyword
        QueryParser queryParser = new QueryParser(VERSION, "content", new StandardAnalyzer(VERSION));
        Query query = queryParser.parse(queryStr);
        TopDocs topdocs = search.search(query, 100);// fetch at most 100 hits
        System.out.println("查询结果总数---" + topdocs.totalHits);
        for (ScoreDoc hit : topdocs.scoreDocs) {
            int num = hit.doc;// document id of this hit
            Document document = search.doc(num);// load the stored fields of that document
            // Prints null when the content field was indexed without being stored.
            System.out.println("内容====" + document.get("content"));
            System.out.println("标题====" + document.get("title"));
            System.out.println("版本====" + document.get("version"));
            System.out.println("评分====" + document.get("score"));
            System.out.println("日期====" + document.get("date"));
            System.out.println("id--" + num + "---scors--" + hit.score + "---index--" + hit.shardIndex);
        }
    } finally {
        // The original never closed the reader; release it on every path.
        reader.close();
    }
}
}Sort sort = new Sort(new SortField("score", SortField.Type.INT, true));// false升序true降序
TopDocs topdocs = search.search(query, 100, sort);// 查询前100条/**
* 高亮
*
* @throws IOException
* @throws ParseException
* @throws InvalidTokenOffsetsException
*/
// Searches the "content" field for "文章" and, for every hit that has stored
// content, prints the best fragment with matches wrapped in red <font> tags.
@Test
public void highlighter() throws IOException, ParseException, InvalidTokenOffsetsException {
    IndexReader reader = DirectoryReader.open(FSDirectory.open(new File(indexDir)));// reads the index
    try {
        IndexSearcher search = new IndexSearcher(reader);// entry point for searching
        Analyzer analyzer = new StandardAnalyzer(Version.LUCENE_47);// tokenizer
        QueryParser qp = new QueryParser(Version.LUCENE_47, "content", analyzer);
        Query query = qp.parse("文章");
        TopDocs topDocs = search.search(query, 100);// fetch at most 100 hits
        System.out.println("共查询出:" + topDocs.totalHits + "条数据");
        // Highlighting setup
        Formatter formatter = new SimpleHTMLFormatter("<font color='red'>", "</font>");// markup put around matches
        Scorer score = new QueryScorer(query);// scores fragments against the query
        Fragmenter fragmenter = new SimpleFragmenter(100);// fragment size of 100
        Highlighter highlighter = new Highlighter(formatter, score);
        highlighter.setTextFragmenter(fragmenter);
        for (ScoreDoc hit : topDocs.scoreDocs) {
            Document doc = search.doc(hit.doc);
            String content = doc.get("content");
            System.out.println(content);// original content
            // content is null when the field was not stored; then there is nothing to highlight.
            if (content != null) {
                TokenStream tokenStream = analyzer.tokenStream("content", new StringReader(content));
                String str = highlighter.getBestFragment(tokenStream, content);// highlighted fragment
                System.out.println(str);
            }
        }
    } finally {
        // The original never closed the reader; release it on every path.
        reader.close();
    }
}
/**
* 分页
*
* @throws IOException
* @throws ParseException
*/
// Paging: collects up to 100 hits for "文章" and prints a two-hit slice of them.
@Test
public void pageTest() throws IOException, ParseException {
    IndexReader reader = DirectoryReader.open(FSDirectory.open(new File(indexDir)));// reads the index
    try {
        IndexSearcher search = new IndexSearcher(reader);// entry point for searching
        String queryStr = "文章";// search keyword
        QueryParser queryParser = new QueryParser(VERSION, "content", new StandardAnalyzer(VERSION));
        Query query = queryParser.parse(queryStr);
        TopScoreDocCollector results = TopScoreDocCollector.create(100, false);// collects at most 100 hits
        search.search(query, results);
        // start is zero-based: (1, 2) skips the first hit and returns the 2nd and 3rd.
        TopDocs topdocs = results.topDocs(1, 2);
        for (ScoreDoc hit : topdocs.scoreDocs) {
            int num = hit.doc;// document id of this hit
            Document document = search.doc(num);// load the stored fields of that document
            // Prints null when the content field was indexed without being stored.
            System.out.println("内容====" + document.get("content"));
            System.out.println("标题====" + document.get("title"));
            System.out.println("版本====" + document.get("version"));
            System.out.println("评分====" + document.get("score"));
            System.out.println("id--" + num + "---scors--" + hit.score + "---index--" + hit.shardIndex);
        }
    } finally {
        // The original never closed the reader; release it on every path.
        reader.close();
    }
}
原文地址:http://blog.csdn.net/mdcmy/article/details/38167955