标签:
对索引的文档进行增删改查
http://mvnrepository.com/
<dependencies>
<!-- JUnit 4: provides the @Test and @Before annotations used by the test classes below. -->
<!-- NOTE(review): only JUnit is listed here, while the code below imports org.apache.lucene.*; the Lucene artifacts must be declared too - confirm against the full pom.xml. -->
<dependency>
<groupId>junit</groupId>
<artifactId>junit</artifactId>
<version>4.12</version>
</dependency>
</dependencies>
IndexingTest.java
package com.matrix.lucene;
import java.nio.file.Paths;
import org.apache.lucene.analysis.Analyzer;
import org.apache.lucene.analysis.standard.StandardAnalyzer;
import org.apache.lucene.document.Document;
import org.apache.lucene.document.Field;
import org.apache.lucene.document.StringField;
import org.apache.lucene.document.TextField;
import org.apache.lucene.index.IndexWriter;
import org.apache.lucene.index.IndexWriterConfig;
import org.apache.lucene.store.Directory;
import org.apache.lucene.store.FSDirectory;
import org.junit.Before;
import org.junit.Test;
/**
 * JUnit 4 walkthrough of basic Lucene index maintenance: three small
 * city documents are written to an on-disk index before each test, and the
 * tests then inspect that index.
 */
public class IndexingTest {

    // Test data: the i-th entries of the three arrays together form one document.
    private final String[] ids = { "1", "2", "3" };
    private final String[] citys = { "qingdao", "nanjing", "shanghai" };
    private final String[] descs = { "Qingdao is a beautiful city.", "Nanjing is a city of culture",
            "Shanghai is a bustling city" };

    // Index directory shared by setUp() and the tests.
    private Directory dir;

    /**
     * Opens the index directory and (re)builds the index from the test data.
     *
     * <p>This runs before every test and the writer is opened with the
     * default open mode, which appends to an existing index. The index is
     * therefore cleared first so that each test starts from exactly
     * {@code ids.length} documents instead of an ever-growing pile of
     * duplicates.
     *
     * @throws Exception if the directory cannot be opened or written
     */
    @Before
    public void setUp() throws Exception {
        dir = FSDirectory.open(Paths.get("E:\\software\\lucene\\demo2"));
        // try-with-resources releases the writer (and its write lock) even if indexing fails.
        try (IndexWriter writer = getWriter()) {
            // Drop whatever a previous test or run left behind.
            writer.deleteAll();
            for (int i = 0; i < ids.length; i++) {
                Document doc = new Document();
                // StringField: indexed as a single token; Store.YES keeps the value retrievable from hits.
                doc.add(new StringField("id", ids[i], Field.Store.YES));
                doc.add(new StringField("city", citys[i], Field.Store.YES));
                // TextField: analyzed full text; Store.NO means it is searchable but not stored.
                doc.add(new TextField("desc", descs[i], Field.Store.NO));
                // Added documents are buffered by the writer and flushed on commit/close.
                writer.addDocument(doc);
            }
        }
    }

    /**
     * Creates an {@link IndexWriter} on {@link #dir} that analyzes text with a
     * {@link StandardAnalyzer}. The caller is responsible for closing it.
     *
     * @return a new writer for the current index directory
     * @throws Exception if the writer cannot be created
     */
    private IndexWriter getWriter() throws Exception {
        // The analyzer tokenizes field text while documents are written.
        Analyzer analyzer = new StandardAnalyzer();
        IndexWriterConfig iwc = new IndexWriterConfig(analyzer);
        return new IndexWriter(dir, iwc);
    }

    /**
     * Prints how many documents the index currently holds.
     *
     * @throws Exception if the index cannot be opened
     */
    @Test
    public void testIndexWriter() throws Exception {
        try (IndexWriter writer = getWriter()) {
            System.out.println("写入了" + writer.numDocs() + "个文档");
        }
    }
}
E:\software\lucene\demo2
前提:安装JDK1.8,并配置环境变量
/**
 * Opens a reader on the index and prints its document counts:
 * {@code maxDoc()} still includes documents that are only marked as
 * deleted, {@code numDocs()} counts live documents only.
 *
 * @throws Exception if the index cannot be opened
 */
@Test
public void testIndexReader() throws Exception {
    // try-with-resources closes the reader even if one of the calls below throws.
    try (IndexReader reader = DirectoryReader.open(dir)) {
        System.out.println("最大文档数:" + reader.maxDoc());
        System.out.println("实际文档数:" + reader.numDocs());
    }
}
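// Approach 1: delete without merging - the document is only marked as deleted; the index segments are left untouched.
// Approach 2: delete and then force a merge of deletes, so the marked documents are physically removed.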
/**
 * Deletes the document with id "1" and commits without merging, then prints
 * {@code maxDoc()} and {@code numDocs()} so the two counts can be compared.
 *
 * @throws Exception if the index cannot be opened or written
 */
@Test
public void testDeleteBeforeMerge() throws Exception {
    // try-with-resources releases the writer and its write lock even if a step fails.
    try (IndexWriter writer = getWriter()) {
        System.out.println("删除前:" + writer.numDocs());
        writer.deleteDocuments(new Term("id", "1"));
        writer.commit();
        System.out.println("删除后:最大文档数即writer.maxDoc()" + writer.maxDoc());
        System.out.println("删除后:实际文档数即writer.numDocs()" + writer.numDocs());
    }
}
/**
 * Deletes the document with id "1", forces the pending deletes to be merged
 * away, commits, and prints {@code maxDoc()} and {@code numDocs()}.
 *
 * @throws Exception if the index cannot be opened or written
 */
@Test
public void testDeleteAfterMerge() throws Exception {
    // try-with-resources releases the writer and its write lock even if a step fails.
    try (IndexWriter writer = getWriter()) {
        System.out.println("删除前:" + writer.numDocs());
        writer.deleteDocuments(new Term("id", "1"));
        // Merge segments that contain deletions so the deleted documents are expunged.
        writer.forceMergeDeletes();
        writer.commit();
        System.out.println("writer.maxDocs():" + writer.maxDoc());
        System.out.println("writer.numDocs():" + writer.numDocs());
    }
}
使用工具查看索引是否删除
/**
 * Replaces the document whose id is "1" with a new document describing
 * Shenzhen. {@code updateDocument} deletes every document matching the term
 * and then adds the new one.
 *
 * @throws Exception if the index cannot be opened or written
 */
@Test
public void testUpdate() throws Exception {
    // try-with-resources releases the writer and its write lock even if the update fails.
    try (IndexWriter writer = getWriter()) {
        Document doc = new Document();
        doc.add(new StringField("id", "1", Field.Store.YES));
        doc.add(new StringField("city", "shenzhen", Field.Store.YES));
        // The description belongs in "desc", the same field setUp() indexes it under;
        // adding it as a second "city" field would mix analyzed text into the
        // un-analyzed city field and leave the replacement without a description.
        doc.add(new TextField("desc", "shenzhen is a great city", Field.Store.NO));
        writer.updateDocument(new Term("id", "1"), doc);
    }
}
加权之后,搜索排名会提高
生成索引
/**
 * Builds the index: writes one document per entry of the test-data arrays
 * into the demo3 index directory.
 *
 * @throws Exception if the directory cannot be opened or written
 */
@Test
public void index() throws Exception {
    // Open the directory that holds the generated index files.
    dir = FSDirectory.open(Paths.get("E:\\software\\lucene\\demo3"));
    // try-with-resources releases the writer and its write lock even if indexing fails.
    try (IndexWriter writer = getWriter()) {
        for (int i = 0; i < ids.length; i++) {
            Document doc = new Document();
            // StringField: indexed as a single token, not analyzed; Store.YES keeps the value retrievable.
            doc.add(new StringField("id", ids[i], Field.Store.YES));
            doc.add(new StringField("author", authors[i], Field.Store.YES));
            doc.add(new StringField("position", positions[i], Field.Store.YES));
            // TextField: the title is tokenized so individual words can be searched.
            doc.add(new TextField("title", titles[i], Field.Store.YES));
            // Long body text: searchable but not stored.
            doc.add(new TextField("content", contents[i], Field.Store.NO));
            // Added documents are buffered by the writer and flushed on commit/close.
            writer.addDocument(doc);
        }
    }
}
查询
/**
 * Runs a term query for "java" against the "title" field of the demo3 index
 * and prints the author of each of the top 10 hits.
 *
 * @throws Exception if the index cannot be opened or searched
 */
@Test
public void search() throws Exception {
    dir = FSDirectory.open(Paths.get("E:\\software\\lucene\\demo3"));
    // try-with-resources closes the reader even if the search throws.
    try (IndexReader reader = DirectoryReader.open(dir)) {
        IndexSearcher is = new IndexSearcher(reader);
        // Field to search and the term to look for.
        String searchField = "title";
        String q = "java";
        // A TermQuery is not analyzed, so the term is given in the lowercase
        // form the StandardAnalyzer produced at index time.
        Query query = new TermQuery(new Term(searchField, q));
        TopDocs hits = is.search(query, 10);
        System.out.println("匹配‘" + q + "‘,总共查询到" + hits.totalHits + "个文档");
        // Hits are returned in descending score order.
        for (ScoreDoc scoreDoc : hits.scoreDocs) {
            Document doc = is.doc(scoreDoc.doc);
            System.out.println("document:" + doc.get("author"));
        }
    }
}
加权操作
IndexingText2.java
package com.matrix.lucene;
import java.nio.file.Paths;
import org.apache.lucene.analysis.Analyzer;
import org.apache.lucene.analysis.standard.StandardAnalyzer;
import org.apache.lucene.document.Document;
import org.apache.lucene.document.Field;
import org.apache.lucene.document.StringField;
import org.apache.lucene.document.TextField;
import org.apache.lucene.index.DirectoryReader;
import org.apache.lucene.index.IndexReader;
import org.apache.lucene.index.IndexWriter;
import org.apache.lucene.index.IndexWriterConfig;
import org.apache.lucene.index.Term;
import org.apache.lucene.search.IndexSearcher;
import org.apache.lucene.search.Query;
import org.apache.lucene.search.ScoreDoc;
import org.apache.lucene.search.TermQuery;
import org.apache.lucene.search.TopDocs;
import org.apache.lucene.store.Directory;
import org.apache.lucene.store.FSDirectory;
import org.junit.Test;
/**
 * JUnit 4 demo of index-time field boosting: four documents are indexed, the
 * title of the one whose position is "boss" is boosted, and a term search on
 * the title then prints the matching authors in ranked order.
 */
public class IndexingText2 {

    // Test data: the i-th entries of the arrays together form one document.
    private final String[] ids = { "1", "2", "3", "4" };
    private final String[] authors = { "Jack", "Marry", "John", "Json" };
    private final String[] positions = { "accounting", "technician", "salesperson", "boss" };
    private final String[] titles = { "Java is a good language.", "Java is a cross platform language", "Java powerful",
            "You should learn java" };
    private final String[] contents = { "If possible, use the same JRE major version at both index and search time.",
            "When upgrading to a different JRE major version, consider re-indexing. ",
            "Different JRE major versions may implement different versions of Unicode,",
            "For example: with Java 1.4, `LetterTokenizer` will split around the character U+02C6," };

    // Index directory; assigned by index() and search() before use.
    private Directory dir;

    /**
     * Creates an {@link IndexWriter} on {@link #dir} that analyzes text with a
     * {@link StandardAnalyzer}. {@code dir} must have been opened first, and
     * the caller is responsible for closing the returned writer.
     *
     * @return a new writer for the current index directory
     * @throws Exception if the writer cannot be created
     */
    public IndexWriter getWriter() throws Exception {
        // The analyzer tokenizes field text while documents are written.
        Analyzer analyzer = new StandardAnalyzer();
        IndexWriterConfig wc = new IndexWriterConfig(analyzer);
        return new IndexWriter(dir, wc);
    }

    /**
     * Builds the index, boosting the title field of the "boss" document.
     *
     * <p>NOTE(review): the writer uses the default open mode, so running this
     * test repeatedly appears to append duplicate documents to the existing
     * index - confirm whether the directory is expected to be cleared between runs.
     *
     * @throws Exception if the directory cannot be opened or written
     */
    @Test
    public void index() throws Exception {
        // Open the directory that holds the generated index files.
        dir = FSDirectory.open(Paths.get("E:\\software\\lucene\\demo3"));
        // try-with-resources releases the writer and its write lock even if indexing fails.
        try (IndexWriter writer = getWriter()) {
            for (int i = 0; i < ids.length; i++) {
                Document doc = new Document();
                // StringField: indexed as a single token, not analyzed; Store.YES keeps the value retrievable.
                doc.add(new StringField("id", ids[i], Field.Store.YES));
                doc.add(new StringField("author", authors[i], Field.Store.YES));
                doc.add(new StringField("position", positions[i], Field.Store.YES));
                // TextField: the title is tokenized so individual words can be searched.
                TextField title = new TextField("title", titles[i], Field.Store.YES);
                // Boost the boss's title so it ranks higher for matching queries.
                // NOTE(review): index-time Field.setBoost only exists in older Lucene
                // releases - confirm the Lucene version before upgrading.
                if ("boss".equals(positions[i])) {
                    title.setBoost(1.5f);
                }
                doc.add(title);
                // Long body text: searchable but not stored.
                doc.add(new TextField("content", contents[i], Field.Store.NO));
                // Added documents are buffered by the writer and flushed on commit/close.
                writer.addDocument(doc);
            }
        }
    }

    /**
     * Runs a term query for "java" against the "title" field and prints the
     * author of each of the top 10 hits.
     *
     * @throws Exception if the index cannot be opened or searched
     */
    @Test
    public void search() throws Exception {
        dir = FSDirectory.open(Paths.get("E:\\software\\lucene\\demo3"));
        // try-with-resources closes the reader even if the search throws.
        try (IndexReader reader = DirectoryReader.open(dir)) {
            IndexSearcher is = new IndexSearcher(reader);
            // Field to search and the term to look for.
            String searchField = "title";
            String q = "java";
            // A TermQuery is not analyzed, so the term is given in the lowercase
            // form the StandardAnalyzer produced at index time.
            Query query = new TermQuery(new Term(searchField, q));
            TopDocs hits = is.search(query, 10);
            System.out.println("匹配‘" + q + "‘,总共查询到" + hits.totalHits + "个文档");
            // Hits are returned in descending score order.
            for (ScoreDoc scoreDoc : hits.scoreDocs) {
                Document doc = is.doc(scoreDoc.doc);
                System.out.println("document:" + doc.get("author"));
            }
        }
    }

    /**
     * Intentionally empty; the demo is driven through the JUnit tests above.
     *
     * @param args unused
     */
    public static void main(String[] args) {
    }
}
Json在最前面是因为权值变高了
标签:
原文地址:http://blog.csdn.net/qq_25371579/article/details/51252084