Tags: lucene
Every full-text indexing tool is made up of the same three parts:
1. Indexing
2. Analysis (tokenization)
3. Search
----------------------------------
Field options when creating an index
----------------------------------
Field.Store.YES or NO (store option)
YES: the content of this field is stored in full in the index files, so it can be recovered later with doc.get() [suitable for primary keys, titles, and the like]
NO: the content of this field is not stored; it can still be indexed, but the original value can no longer be recovered with doc.get() [for large body text there is usually no need to store it, so Store.NO is fine]
Field.Index (index option)
Index.ANALYZED: analyze (tokenize) and index; suitable for titles, body text, etc.
Index.NOT_ANALYZED: index without analyzing; suitable for exact-match fields such as ID-card numbers, names, and IDs
Index.ANALYZED_NO_NORMS: analyze and index but do not store norms; the norms hold per-document normalization data such as the index-time boost used for scoring (ranking)
Index.NOT_ANALYZED_NO_NORMS: neither analyze nor store norms
Index.NO: do not index at all
Best practices (illustrated by the sketch below)
NOT_ANALYZED_NO_NORMS + Store.YES : identifiers (primary keys, file names), phone numbers, ID-card numbers, person names, dates
ANALYZED + Store.YES : document titles and abstracts
ANALYZED + Store.NO : document body text
NO + Store.YES : document type, database primary key (not indexed at all)
NOT_ANALYZED + Store.NO : hidden keywords
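As a rough illustration of the combinations above, here is a minimal sketch of building a Document with them (the field names and values are made up for this example):

Document doc = new Document();
// identifier: indexed exactly (no analysis, no norms) and stored
doc.add(new Field("id", "1", Field.Store.YES, Field.Index.NOT_ANALYZED_NO_NORMS));
// title: analyzed and stored
doc.add(new Field("title", "hello lucene", Field.Store.YES, Field.Index.ANALYZED));
// body: analyzed but not stored (cannot be recovered with doc.get("body"))
doc.add(new Field("body", "the full text of the document ...", Field.Store.NO, Field.Index.ANALYZED));
// database primary key kept only for display: stored but not indexed
doc.add(new Field("dbKey", "42", Field.Store.YES, Field.Index.NO));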
Anatomy of the index files
.fnm stores the field information
.fdt and .fdx store the data of fields created with Store.YES
.frq stores term frequencies, i.e. how often each term occurs (used for scoring and ranking)
.nrm stores the norms (scoring/boost) information
.tii and .tis store the term dictionary, i.e. all the indexed terms and where to find them
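If you want to see these files for yourself, a minimal sketch like the following (assuming the hypothetical index directory "indexdata" used elsewhere in this post) simply lists what Lucene has written to disk:

File indexDir = new File("indexdata");
for (File f : indexDir.listFiles()) {
    // typical names: _0.fnm, _0.fdt, _0.fdx, _0.frq, _0.nrm, _0.tii, _0.tis, segments_N ...
    System.out.println(f.getName() + " (" + f.length() + " bytes)");
}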
Documents and fields
A Document corresponds to a row (record) in a database table, and a Field corresponds to a column.
----------------------------------
Deleting and updating the index
----------------------------------
1. Deleting documents
writer = new IndexWriter(directory, new IndexWriterConfig(Version.LUCENE_35, new StandardAnalyzer(Version.LUCENE_35)));
// The argument can be a Query or a Term; a Term matches an exact value
// The matching documents are not removed right away; they are kept in a "recycle bin" and can still be recovered
writer.deleteDocuments(new Term("id", "1"));
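To confirm that the document only went to the recycle bin, a minimal sketch (assuming the same directory variable and that the writer has already been closed or committed) compares the live-document count with the total count:

IndexReader reader = IndexReader.open(directory);
System.out.println("numDocs = " + reader.numDocs());           // live documents only
System.out.println("maxDoc  = " + reader.maxDoc());            // includes documents in the recycle bin
System.out.println("hasDeletions = " + reader.hasDeletions()); // true after deleteDocuments()
reader.close();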
2. Recovering deleted documents
// Recover them with an IndexReader
// open the reader with readOnly = false
IndexReader reader = IndexReader.open(directory, false);
reader.undeleteAll();
reader.close();
3. Force-deleting (emptying the recycle bin)
writer = new IndexWriter(directory, new IndexWriterConfig(Version.LUCENE_35, new StandardAnalyzer(Version.LUCENE_35)));
// Physically remove the deleted documents and empty the recycle bin
// Before Lucene 3.5 this was done with optimize(), which has been deprecated because it consumes a lot of resources
writer.forceMergeDeletes();
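After forceMergeDeletes() has run and the writer has been closed, the recycle bin should be empty; a minimal check is that maxDoc() and numDocs() agree again:

IndexReader reader = IndexReader.open(directory);
System.out.println(reader.maxDoc() == reader.numDocs()); // expected: true, nothing left in the recycle bin
reader.close();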
4. Optimizing and merging segments
writer = new IndexWriter(directory, new IndexWriterConfig(Version.LUCENE_35, new StandardAnalyzer(Version.LUCENE_35)));
// After many additions you can cap the number of segments; this merges the index down to two segments and purges the deleted documents they contain
// Note: calling this by hand is not recommended because it is very expensive; Lucene merges segments automatically as needed
writer.forceMerge(2);
5. Updating documents
writer = new IndexWriter(directory, new IndexWriterConfig(Version.LUCENE_35, new StandardAnalyzer(Version.LUCENE_35)));
/*
 * Lucene has no dedicated update operation; an "update" is two operations combined: delete first, then add
*/
Document doc = new Document();
// Delete the document whose id is 1, then add the new document in its place
// i.e. a delete-then-replace operation (see the sketch below for populating the replacement document)
writer.updateDocument(new Term("id", "1"), doc);
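Because updateDocument() is delete-then-add, the replacement Document must carry every field you still want; otherwise the new version ends up empty. A minimal sketch, reusing the field names from the sample data in this post (the values here are made up):

Document newDoc = new Document();
newDoc.add(new Field("id", "11", Field.Store.YES, Field.Index.NOT_ANALYZED_NO_NORMS));
newDoc.add(new Field("email", "new@qq.com", Field.Store.YES, Field.Index.NOT_ANALYZED));
newDoc.add(new Field("content", "a brand new content", Field.Store.NO, Field.Index.ANALYZED));
newDoc.add(new Field("name", "newName", Field.Store.YES, Field.Index.NOT_ANALYZED));
// deletes every document matching id:1, then adds newDoc
writer.updateDocument(new Term("id", "1"), newDoc);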
--------------------------------------------------
Lucene indexing: boosting
--------------------------------------------------
The boost values are configured through a Map<String, Float> scores = new HashMap<String, Float>();
Suppose we want to rank mail from certain domains higher:
/*
*
document.setBoost(float) sets the boost of the whole document (used in scoring)
*/
String et = emails[i].substring(emails[i].lastIndexOf("@") + 1);
//System.out.println(et);
if (scores.containsKey(et)) {
    document.setBoost(scores.get(et));
} else {
    document.setBoost(0.5f);
}
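For reference, this is how the scores map is filled in the full IndexUtil example further down; the boost ends up in the .nrm (norms) file and raises those documents in the ranking:

Map<String, Float> scores = new HashMap<String, Float>();
scores.put("sina", 2.0f);   // documents from sina.com rank higher
scores.put("google", 1.5f); // documents from google.com rank slightly higher
// every other domain falls through to document.setBoost(0.5f) in the loop above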
--------------------------------------------------
Indexing dates and numbers
--------------------------------------------------
private int[] attachs = { 2, 3, 1, 4, 5, 5 };
private Date[] dates = null;
SimpleDateFormat sdf = new SimpleDateFormat("yyyy-MM-dd");
dates = new Date[ids.length];
dates[0] = sdf.parse("2015-1-1");
dates[1] = sdf.parse("2015-2-1");
dates[2] = sdf.parse("2015-3-1");
dates[3] = sdf.parse("2015-4-1");
dates[4] = sdf.parse("2015-5-1");
dates[5] = sdf.parse("2015-6-1");
// Index a numeric value
document.add(new NumericField("attach", Field.Store.YES, true).setIntValue(attachs[i]));
// Index a date (stored as a millisecond timestamp)
document.add(new NumericField("date", Field.Store.YES, true).setLongValue(dates[i].getTime()));
1. Code for creating an index
/*
 * 1. Building the index
 */
public void index() {
    IndexWriter writer = null;
    try {
        // 1. Create the Directory (where the index lives)
        // In-memory index:
        // Directory directory = new RAMDirectory();
        // Index at a custom location on disk:
        Directory directory = FSDirectory.open(new File(
                "F:/BaiduYunDownload/Cache/lucune/LuceneExamples/indexdata"));
        // 2. Create the IndexWriter (writes the index)
        IndexWriterConfig conf = new IndexWriterConfig(Version.LUCENE_35,
                new StandardAnalyzer(Version.LUCENE_35)); // the second argument is the Analyzer (tokenizer)
        writer = new IndexWriter(directory, conf);
        // 3. Create Document objects
        Document document = null;
        // 4. Add Fields to each Document (like adding attributes)
        File fs = new File(
                "F:/BaiduYunDownload/Cache/lucune/LuceneExamples/testdata");
        // Walk over all files in the directory
        for (File f : fs.listFiles()) {
            document = new Document();
            // Index the file content
            document.add(new Field("content", new FileReader(f)));
            // Add the file name; the third argument stores the name in the index, the fourth controls analysis
            document.add(new Field("fileName", f.getName(),
                    Field.Store.YES, Field.Index.NOT_ANALYZED));
            // Add the file path
            document.add(new Field("path", f.getAbsolutePath(),
                    Field.Store.YES, Field.Index.NOT_ANALYZED));
            // 5. Add the document to the index through the IndexWriter
            writer.addDocument(document);
        }
    } catch (CorruptIndexException e) {
        e.printStackTrace();
    } catch (LockObtainFailedException e) {
        e.printStackTrace();
    } catch (IOException e) {
        e.printStackTrace();
    } finally {
        if (writer != null) {
            try {
                writer.close();
                writer = null;
            } catch (IOException e) {
                e.printStackTrace();
            }
        }
    }
}
2. Add, delete, and update operations on the index
/*
 * A utility class dedicated to building and maintaining the index
 */
public class IndexUtil {
    /*
     * Sample data: 6 documents
     */
    private String[] ids = { "1", "2", "3", "4", "5", "6" };
    private String[] emails = { "aa@qq.com", "bb@sina.com", "cc@163.com",
            "dd@google.com", "ee@baidu.com", "ff@heima.com" };
    private String[] contents = { "hello boy,i like pingpang", "like boy",
            "xx bye i like swim", "hehe, i like basketball",
            "dd fsfs, i like movie", "hello xxx,i like game" };
    private int[] attachs = { 2, 3, 1, 4, 5, 5 };
    private Date[] dates = null;
    private String[] names = { "lili", "wangwu", "lisi", "jack", "tom", "mark" };
    // Map holding the boost values
    private Map<String, Float> scores = new HashMap<String, Float>();
    /*
     * Index location
     */
    private Directory directory = null;

    public IndexUtil() throws Exception {
        // Initialize the dates used for the date index
        createDate();
        // Boost values for particular mail domains
        scores.put("sina", 2.0f);
        scores.put("google", 1.5f);
        directory = FSDirectory.open(new File(
                "F:/BaiduYunDownload/Cache/lucune/Code/code01/indexdata"));
    }

    /*
     * Initialize the date values
     */
    private void createDate() throws Exception {
        SimpleDateFormat sdf = new SimpleDateFormat("yyyy-MM-dd");
        dates = new Date[ids.length];
        dates[0] = sdf.parse("2015-1-1");
        dates[1] = sdf.parse("2015-2-1");
        dates[2] = sdf.parse("2015-3-1");
        dates[3] = sdf.parse("2015-4-1");
        dates[4] = sdf.parse("2015-5-1");
        dates[5] = sdf.parse("2015-6-1");
    }

    public void index() {
        IndexWriter writer = null;
        try {
            writer = new IndexWriter(directory, new IndexWriterConfig(
                    Version.LUCENE_35, new StandardAnalyzer(Version.LUCENE_35)));
            writer.deleteAll();
            // Create the documents
            Document document = null;
            for (int i = 0; i < ids.length; i++) {
                document = new Document();
                document.add(new Field("id", ids[i], Field.Store.YES,
                        Field.Index.NOT_ANALYZED_NO_NORMS));
                document.add(new Field("email", emails[i], Field.Store.YES,
                        Field.Index.NOT_ANALYZED)); // not analyzed
                document.add(new Field("content", contents[i], Field.Store.NO,
                        Field.Index.ANALYZED));
                document.add(new Field("name", names[i], Field.Store.YES,
                        Field.Index.NOT_ANALYZED));
                // Index a numeric value
                document.add(new NumericField("attach", Field.Store.YES, true)
                        .setIntValue(attachs[i]));
                // Index a date
                document.add(new NumericField("date", Field.Store.YES, true)
                        .setLongValue(dates[i].getTime()));
                /*
                 * document.setBoost(float) sets the document boost
                 */
                String et = emails[i].substring(emails[i].lastIndexOf("@") + 1);
                // System.out.println(et);
                if (scores.containsKey(et)) {
                    document.setBoost(scores.get(et));
                } else {
                    document.setBoost(0.5f);
                }
                writer.addDocument(document);
            }
        } catch (IOException e) {
            e.printStackTrace();
        } finally {
            if (writer != null) {
                try {
                    writer.close();
                    writer = null;
                } catch (IOException e) {
                    e.printStackTrace();
                }
            }
        }
    }

    /*
     * Physically remove deleted documents (forceMergeDeletes() rewrites the affected segments)
     */
    public void forceMerge() {
        IndexWriter writer = null;
        try {
            writer = new IndexWriter(directory, new IndexWriterConfig(
                    Version.LUCENE_35, new StandardAnalyzer(Version.LUCENE_35)));
            // Physically remove the deleted documents and empty the recycle bin
            // Before Lucene 3.5 this was done with optimize(), now deprecated because it is very resource-intensive
            writer.forceMergeDeletes();
        } catch (IOException e) {
            e.printStackTrace();
        } finally {
            if (writer != null) {
                try {
                    writer.close();
                    writer = null;
                } catch (IOException e) {
                    e.printStackTrace();
                }
            }
        }
    }

    /*
     * Merge segments by hand
     */
    public void merge() {
        IndexWriter writer = null;
        try {
            writer = new IndexWriter(directory, new IndexWriterConfig(
                    Version.LUCENE_35, new StandardAnalyzer(Version.LUCENE_35)));
            // Merge the index down to at most two segments; deleted documents in those segments are purged
            // Note: calling this by hand is not recommended because it is expensive; Lucene optimizes automatically
            writer.forceMerge(2);
        } catch (IOException e) {
            e.printStackTrace();
        } finally {
            if (writer != null) {
                try {
                    writer.close();
                    writer = null;
                } catch (IOException e) {
                    e.printStackTrace();
                }
            }
        }
    }

    /*
     * Recover deleted documents
     */
    public void undelete() throws Exception {
        // Recover with an IndexReader opened with readOnly = false
        IndexReader reader = IndexReader.open(directory, false);
        reader.undeleteAll();
        reader.close();
    }

    /*
     * Delete documents from the index
     */
    public void deleteIndex() {
        IndexWriter writer = null;
        try {
            writer = new IndexWriter(directory, new IndexWriterConfig(
                    Version.LUCENE_35, new StandardAnalyzer(Version.LUCENE_35)));
            // The argument can be a Query or a Term; a Term matches an exact value
            // The matching document is not removed right away; it is kept in a "recycle bin" and can be recovered
            writer.deleteDocuments(new Term("id", "1"));
        } catch (IOException e) {
            e.printStackTrace();
        } finally {
            if (writer != null) {
                try {
                    writer.close();
                    writer = null;
                } catch (IOException e) {
                    e.printStackTrace();
                }
            }
        }
    }

    /*
     * Update the index
     */
    public void update() {
        IndexWriter writer = null;
        try {
            writer = new IndexWriter(directory, new IndexWriterConfig(
                    Version.LUCENE_35, new StandardAnalyzer(Version.LUCENE_35)));
            /*
             * Lucene has no dedicated update operation; an update is a delete followed by an add
             */
            Document doc = new Document();
            // Delete the document whose id is 1, then add the new document in its place
            writer.updateDocument(new Term("id", "1"), doc);
        } catch (IOException e) {
            e.printStackTrace();
        } finally {
            if (writer != null) {
                try {
                    writer.close();
                    writer = null;
                } catch (IOException e) {
                    e.printStackTrace();
                }
            }
        }
    }

    /*
     * Query document counts
     */
    public void Query() throws Exception {
        IndexReader reader = IndexReader.open(directory);
        // The reader gives easy access to document counts
        System.out.println("Documents in this index: " + reader.numDocs());
        System.out.println("Total documents (including the recycle bin): " + reader.maxDoc());
        reader.close();
    }

    /*
     * Search
     */
    public void Search() {
        try {
            IndexReader reader = IndexReader.open(directory);
            IndexSearcher search = new IndexSearcher(reader);
            // Exact-match search
            TermQuery query = new TermQuery(new Term("content", "like"));
            TopDocs tds = search.search(query, 10);
            for (ScoreDoc sd : tds.scoreDocs) {
                Document doc = search.doc(sd.doc);
                System.out.println(sd.doc + doc.get("name") + "["
                        + doc.get("email") + "," + doc.get("id") + ","
                        + doc.get("attach") + "," + doc.get("date") + "]");
            }
            search.close();
            reader.close();
        } catch (CorruptIndexException e) {
            e.printStackTrace();
        } catch (IOException e) {
            e.printStackTrace();
        }
    }
}
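A minimal driver for the class above (the class name IndexUtilDemo is made up here; the method names come from IndexUtil):

public class IndexUtilDemo {
    public static void main(String[] args) throws Exception {
        IndexUtil util = new IndexUtil();
        util.index();       // build the index for the 6 sample documents
        util.Query();       // documents: 6, total including recycle bin: 6
        util.deleteIndex(); // move id=1 to the recycle bin
        util.Query();       // documents: 5, total including recycle bin: 6
        util.undelete();    // recover it
        util.Query();       // documents: 6 again
        util.Search();      // TermQuery on content:"like"
    }
}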
Original article: http://blog.csdn.net/u010366796/article/details/44808287