1.前言
之前的博客《Lucene全文检索之HelloWorld》已经简单介绍了Lucene的索引生成和检索。本文着重介绍Lucene的索引删除。
2.应用场景:
索引建立完成后,被索引的文件可能因为某些原因已被删除。此时,对应的索引仍然存在。为了不产生“虚假检索结果”,需要将这些失效的索引删除。
3.HelloLucene类(重点关注deleteIndexByQuery方法)
- package com.njupt.zhb;
-
- import java.io.BufferedReader;
- import java.io.File;
- import java.io.FileInputStream;
- import java.io.FileNotFoundException;
- import java.io.IOException;
- import java.io.InputStreamReader;
- import org.apache.lucene.analysis.Analyzer;
- import org.apache.lucene.analysis.standard.StandardAnalyzer;
- import org.apache.lucene.document.Document;
- import org.apache.lucene.document.Field;
- import org.apache.lucene.document.LongField;
- import org.apache.lucene.document.StringField;
- import org.apache.lucene.document.TextField;
- import org.apache.lucene.index.DirectoryReader;
- import org.apache.lucene.index.IndexReader;
- import org.apache.lucene.index.IndexWriter;
- import org.apache.lucene.index.IndexWriterConfig;
- import org.apache.lucene.index.IndexWriterConfig.OpenMode;
- import org.apache.lucene.index.Term;
- import org.apache.lucene.queryparser.classic.ParseException;
- import org.apache.lucene.queryparser.classic.QueryParser;
- import org.apache.lucene.search.IndexSearcher;
- import org.apache.lucene.search.Query;
- import org.apache.lucene.search.ScoreDoc;
- import org.apache.lucene.search.TopDocs;
- import org.apache.lucene.store.Directory;
- import org.apache.lucene.store.FSDirectory;
- import org.apache.lucene.util.Version;
- /**
-  * Minimal Lucene 4.4 example: builds a file-system index from a directory of
-  * text files, searches it, and deletes the documents that match a query.
-  *
-  * <p>Every method that opens a {@link Directory}, {@link IndexWriter} or
-  * {@link IndexReader} closes it in a {@code finally} block, so a failure in
-  * the middle of an operation does not leave the index write lock held or
-  * file handles open.
-  */
- public class HelloLucene {
-
-     /**
-      * Indexes every readable file under {@code docsPath} into the index stored
-      * at {@code indexPath}. The index is created when missing and appended to
-      * otherwise. I/O failures are printed to stderr and not rethrown.
-      *
-      * @param indexPath directory that holds (or will hold) the Lucene index
-      * @param docsPath  file or directory whose contents are indexed
-      */
-     public void index(String indexPath, String docsPath) {
-         try {
-             Directory dir = FSDirectory.open(new File(indexPath));
-             try {
-                 Analyzer analyzer = new StandardAnalyzer(Version.LUCENE_44);
-                 IndexWriterConfig iwc = new IndexWriterConfig(Version.LUCENE_44, analyzer);
-                 iwc.setOpenMode(OpenMode.CREATE_OR_APPEND);
-                 IndexWriter writer = new IndexWriter(dir, iwc);
-                 try {
-                     indexDocs(writer, new File(docsPath));
-                 } finally {
-                     // Always release the write lock, even when indexing fails.
-                     writer.close();
-                 }
-             } finally {
-                 dir.close();
-             }
-         } catch (IOException e) {
-             e.printStackTrace();
-         }
-     }
-
-     /**
-      * Recursively adds {@code file} (or, for a directory, everything below it)
-      * to the index behind {@code writer}. Unreadable files and files that
-      * cannot be opened are skipped silently.
-      *
-      * <p>Each document gets four fields: {@code path} and {@code filename}
-      * (stored, untokenized), {@code modified} (stored long) and
-      * {@code contents} (tokenized from the file decoded as UTF-8, not stored).
-      *
-      * @param writer open index writer that receives the documents
-      * @param file   file or directory to index
-      * @throws IOException if reading the file or writing to the index fails
-      */
-     public void indexDocs(IndexWriter writer, File file) throws IOException {
-         if (!file.canRead()) {
-             return;
-         }
-         if (file.isDirectory()) {
-             String[] files = file.list();
-             // list() returns null when the directory cannot be listed.
-             if (files != null) {
-                 for (String child : files) {
-                     indexDocs(writer, new File(file, child));
-                 }
-             }
-             return;
-         }
-
-         FileInputStream fis;
-         try {
-             fis = new FileInputStream(file);
-         } catch (FileNotFoundException fnfe) {
-             // Best effort: the file vanished or is not openable; skip it.
-             return;
-         }
-         try {
-             Document doc = new Document();
-             doc.add(new StringField("path", file.getPath(), Field.Store.YES));
-             doc.add(new StringField("filename", file.getName(), Field.Store.YES));
-             doc.add(new LongField("modified", file.lastModified(), Field.Store.YES));
-             // The reader is consumed while the document is written, so the
-             // stream must stay open until add/updateDocument has returned.
-             doc.add(new TextField("contents",
-                     new BufferedReader(new InputStreamReader(fis, "UTF-8"))));
-             if (writer.getConfig().getOpenMode() == OpenMode.CREATE) {
-                 System.out.println("adding " + file);
-                 writer.addDocument(doc);
-             } else {
-                 // Replace any earlier document indexed under the same path.
-                 System.out.println("updating " + file);
-                 writer.updateDocument(new Term("path", file.getPath()), doc);
-             }
-         } finally {
-             fis.close();
-         }
-     }
-
-     /**
-      * Runs {@code searchKeyword} against the {@code contents} field and prints
-      * score, file name, path and modification time of the top 10 hits.
-      * I/O and query-syntax failures are printed to stderr and not rethrown.
-      *
-      * @param indexPath     directory that holds the Lucene index
-      * @param searchKeyword query text in Lucene classic query-parser syntax
-      */
-     public void searcher(String indexPath, String searchKeyword) {
-         try {
-             Directory dir = FSDirectory.open(new File(indexPath));
-             try {
-                 IndexReader reader = DirectoryReader.open(dir);
-                 try {
-                     IndexSearcher searcher = new IndexSearcher(reader);
-                     Analyzer analyzer = new StandardAnalyzer(Version.LUCENE_44);
-                     QueryParser parser = new QueryParser(Version.LUCENE_44, "contents", analyzer);
-                     Query query = parser.parse(searchKeyword);
-                     TopDocs tds = searcher.search(query, 10);
-                     for (ScoreDoc sd : tds.scoreDocs) {
-                         Document document = searcher.doc(sd.doc);
-                         System.out.println("score:" + sd.score + "--filename:" + document.get("filename")
-                                 + "--path:" + document.get("path") + "--time" + document.get("modified"));
-                     }
-                 } finally {
-                     // Close the reader even when parsing or searching fails.
-                     reader.close();
-                 }
-             } finally {
-                 dir.close();
-             }
-         } catch (IOException e) {
-             e.printStackTrace();
-         } catch (ParseException e) {
-             e.printStackTrace();
-         }
-     }
-
-     /**
-      * Deletes every document whose {@code contents} field matches
-      * {@code deleteKeyword} and commits the deletion.
-      * I/O and query-syntax failures are printed to stderr and not rethrown.
-      *
-      * @param indexPath     directory that holds the Lucene index
-      * @param deleteKeyword query text in Lucene classic query-parser syntax
-      */
-     public void deleteIndexByQuery(String indexPath, String deleteKeyword) {
-         try {
-             Analyzer analyzer = new StandardAnalyzer(Version.LUCENE_44);
-             QueryParser parser = new QueryParser(Version.LUCENE_44, "contents", analyzer);
-             // Parse before opening the writer: an invalid query must not
-             // acquire the index write lock at all.
-             Query query = parser.parse(deleteKeyword);
-
-             Directory dir = FSDirectory.open(new File(indexPath));
-             try {
-                 IndexWriter writer = new IndexWriter(dir,
-                         new IndexWriterConfig(Version.LUCENE_44, analyzer));
-                 try {
-                     writer.deleteDocuments(query);
-                     writer.commit();
-                 } finally {
-                     // Always release the write lock, even when deletion fails.
-                     writer.close();
-                 }
-             } finally {
-                 dir.close();
-             }
-         } catch (IOException e) {
-             e.printStackTrace();
-         } catch (ParseException e) {
-             e.printStackTrace();
-         }
-     }
- }
4.编写Junit测试类
- package com.njupt.zhb;
-
- import org.junit.Test;
- /**
-  * Manual smoke tests for {@link HelloLucene}. The tests contain no
-  * assertions; results are inspected on the console.
-  */
- public class TestJunit {
-     /** Index directory shared by all three tests. */
-     private static final String INDEX_DIR = "index";
-
-     /** Builds or updates the index from the hard-coded documents directory. */
-     @Test
-     public void TestIndex(){
-         new HelloLucene().index(INDEX_DIR, "D:\\lucene");
-     }
-
-     /** Searches the index for one keyword and prints the hits. */
-     @Test
-     public void TestSearcher(){
-         new HelloLucene().searcher(INDEX_DIR, "南京");
-     }
-
-     /**
-      * Searches for a keyword, deletes the matching documents, then repeats
-      * the same search so both outputs can be compared.
-      */
-     @Test
-     public void TestDeleteIndexByQuery(){
-         final String keyword = "北京";
-         HelloLucene lucene = new HelloLucene();
-
-         System.out.println("未删除前,查询关键字:北京 --结果:");
-         lucene.searcher(INDEX_DIR, keyword);
-
-         lucene.deleteIndexByQuery(INDEX_DIR, keyword);
-
-         System.out.println("删除后,查询关键字:北京 --结果:");
-         lucene.searcher(INDEX_DIR, keyword);
-     }
- }
5.实验结果
5.1运行TestIndex方法
>控制台打印的信息
- updating D:\lucene\lucene1.txt
- updating D:\lucene\lucene2.txt
- updating D:\lucene\lucene3.txt
- updating D:\lucene\北京.txt
- updating D:\lucene\南京.txt
此时的index目录下的截图:
5.2运行TestSearcher方法
>搜索含有关键字“南京”的文档
- score:0.53033006--filename:lucene3.txt--path:D:\lucene\lucene3.txt--time1376828819375
- score:0.48666292--filename:lucene2.txt--path:D:\lucene\lucene2.txt--time1376828783791
- score:0.2155931--filename:北京.txt--path:D:\lucene\北京.txt--time1377784223795
- score:0.1530931--filename:南京.txt--path:D:\lucene\南京.txt--time1377784261486
5.3运行TestDeleteIndexByQuery方法
>控制台打印的信息
- 未删除前,查询关键字:北京 --结果:
- score:0.4847152--filename:lucene2.txt--path:D:\lucene\lucene2.txt--time1376828783791
- score:0.39226472--filename:北京.txt--path:D:\lucene\北京.txt--time1377784223795
- score:0.10348864--filename:lucene3.txt--path:D:\lucene\lucene3.txt--time1376828819375
- score:0.029874597--filename:南京.txt--path:D:\lucene\南京.txt--time1377784261486
- 删除后,查询关键字:北京 --结果:
删除后,再次查询关键字时,无查询结果。
此时,index目录下的文件结构为:
多出了一个_0_1.del文件(该文件记录了被标记为删除的文档;这些文档的数据要到段合并时才会被真正清除)
项目源代码:http://download.csdn.net/detail/nuptboyzhb/6041239
未经允许,不得用于商业目的