lucene创建索引的几种方式(一)

时间：2016-09-27 13:28:55 阅读：216 评论：0 收藏：0 [点我收藏+]

标签：

什么是索引：

索引是为了加快查找而预先建立的数据结构：搜索时根据你输入的值（关键词）到索引中去找，就能快速定位到包含该值的文档。

第一种创建索引的方式：

根据文件来生成索引，如后缀为.txt等的文件

步骤：

第一步：FSDirectory.open(Paths.get(url));根据路径获取存储索引的目录。

FSDirectory：表示对文件系统目录的操作。RAMDirectory ：内存中的目录操作。

Paths为NIO(new io)的一个类；Path 类是 java.io.File 类的升级版，例如 File file = new File("index.html"); 而 Path path = Paths.get("index.html");。由于 Path 类基于字符串创建，因此它引用的资源也有可能不存在。

关于NIO：传统的IO流都是通过字节的移动来处理的，也就是说输入/输出流一次只能处理一个字节，因此面向流的输入/输出系统通常效率不高；为此引进了新IO(new IO)。NIO采用内存映射文件的方式来处理输入/输出：NIO将文件或文件的一段区域映射到内存中，这样就可以像访问内存一样来访问文件了(这种方式模拟了操作系统中虚拟内存的概念)，所以NIO的效率更高。

第二步：new IndexWriter(Directory,IndexWriterConfig)创建索引

第三步：索引指定目录的文件

第四步：将文件写入lucene中的文档(Document)

package com.wp.util;

import java.io.File;
import java.io.FileReader;
import java.nio.file.Paths;

import org.apache.lucene.analysis.Analyzer;
import org.apache.lucene.analysis.standard.StandardAnalyzer;
import org.apache.lucene.document.Document;
import org.apache.lucene.document.Field;
import org.apache.lucene.document.TextField;
import org.apache.lucene.index.IndexWriter;
import org.apache.lucene.index.IndexWriterConfig;
import org.apache.lucene.store.Directory;
import org.apache.lucene.store.FSDirectory;

/**
 * Builds a Lucene index from the regular files found directly inside a data
 * directory.
 *
 * <p>Each file becomes one {@link Document} with three fields: {@code contents}
 * (tokenized from the file body), {@code fileName} and {@code fullPath}
 * (both stored so they can be shown in search results).
 *
 * <p>The class is {@link AutoCloseable}; use it in a try-with-resources
 * statement so the index writer and the underlying directory are always
 * released, even when indexing fails.
 */
public class Indexer implements AutoCloseable {

    /** Directory holding the index files; closed together with the writer. */
    private final Directory dir;

    /** Writer used to add documents to the index. */
    private final IndexWriter writer;

    /**
     * Opens (or creates) the index stored at {@code indexDir} and prepares a
     * writer that uses the standard analyzer.
     *
     * @param indexDir path of the directory in which the index is stored
     * @throws Exception if the directory or the index writer cannot be opened
     */
    public Indexer(String indexDir) throws Exception {
        dir = FSDirectory.open(Paths.get(indexDir));
        try {
            Analyzer analyzer = new StandardAnalyzer();
            IndexWriterConfig iwc = new IndexWriterConfig(analyzer);
            writer = new IndexWriter(dir, iwc);
        } catch (Exception e) {
            // Do not leak the directory handle when the writer cannot be created.
            try {
                dir.close();
            } catch (Exception closeFailure) {
                e.addSuppressed(closeFailure);
            }
            throw e;
        }
    }

    /**
     * Closes the index writer and then the index directory.
     *
     * @throws Exception if closing either resource fails
     */
    @Override
    public void close() throws Exception {
        try {
            writer.close();
        } finally {
            dir.close();
        }
    }

    /**
     * Indexes every regular file located directly inside {@code dataDir}.
     * Sub-directories and other non-regular entries are skipped because they
     * cannot be read as text.
     *
     * @param dataDir path of the directory whose files are indexed
     * @return the value of {@code writer.numDocs()} after indexing; because the
     *         writer is opened with its default configuration this may include
     *         documents that were already in the index before this call
     * @throws IllegalArgumentException if {@code dataDir} is not a readable
     *         directory
     * @throws Exception if reading a file or writing the index fails
     */
    public int index(String dataDir) throws Exception {
        File[] files = new File(dataDir).listFiles();
        if (files == null) {
            // listFiles() returns null when the path is missing, is not a
            // directory, or cannot be read.
            throw new IllegalArgumentException(
                    "Not a readable directory: " + dataDir);
        }
        for (File f : files) {
            if (f.isFile()) {
                indexFile(f);
            }
        }
        return writer.numDocs();
    }

    /**
     * Indexes a single file: logs its canonical path, converts it to a
     * document and adds that document to the index.
     *
     * @param f the file to index
     * @throws Exception if the file cannot be read or the document cannot be
     *         added
     */
    private void indexFile(File f) throws Exception {
        System.out.println("索引文件：" + f.getCanonicalPath());
        Document doc = getDocument(f);
        writer.addDocument(doc);
    }

    /**
     * Creates the Lucene document that represents one file.
     *
     * @param f the file to convert
     * @return a document with the {@code contents}, {@code fileName} and
     *         {@code fullPath} fields
     * @throws Exception if the file cannot be opened or its canonical path
     *         cannot be resolved
     */
    private Document getDocument(File f) throws Exception {
        Document doc = new Document();
        // The reader is consumed when the document is added to the writer.
        // NOTE(review): FileReader decodes with the platform default charset;
        // presumably the data files match it - confirm, or switch to an
        // explicit charset.
        doc.add(new TextField("contents", new FileReader(f)));
        doc.add(new TextField("fileName", f.getName(), Field.Store.YES));
        doc.add(new TextField("fullPath", f.getCanonicalPath(), Field.Store.YES));
        return doc;
    }

    /**
     * Indexes the sample data directory and prints how many documents the
     * index holds and how long the run took.
     *
     * @param args optional overrides: {@code args[0]} is the index directory
     *        and {@code args[1]} is the data directory; the built-in sample
     *        paths are used when they are absent
     */
    public static void main(String[] args) {
        String indexDir = args.length > 0 ? args[0] : "D:\\lucene4";
        String dataDir = args.length > 1 ? args[1] : "D:\\lucene4\\data";
        int numIndexed = 0;
        long start = System.currentTimeMillis();
        // try-with-resources closes the indexer only if it was actually
        // created, so a failing constructor no longer triggers a
        // NullPointerException during cleanup.
        try (Indexer indexer = new Indexer(indexDir)) {
            numIndexed = indexer.index(dataDir);
        } catch (Exception e) {
            e.printStackTrace();
        }
        long end = System.currentTimeMillis();
        System.out.println("索引：" + numIndexed + " 个文件 花费了" + (end - start)
                + " 毫秒");
    }
}

第二种创建索引的方式：

根据字段来生成索引，我用的是数组

第一步：创建索引

第二步：将字段添加到文档中

package com.wp.util;

import java.nio.file.Paths;

import org.apache.lucene.analysis.Analyzer;
import org.apache.lucene.analysis.standard.StandardAnalyzer;
import org.apache.lucene.document.Document;
import org.apache.lucene.document.Field;
import org.apache.lucene.document.StringField;
import org.apache.lucene.document.TextField;
import org.apache.lucene.index.DirectoryReader;
import org.apache.lucene.index.IndexReader;
import org.apache.lucene.index.IndexWriter;
import org.apache.lucene.index.IndexWriterConfig;
import org.apache.lucene.index.Term;
import org.apache.lucene.store.Directory;
import org.apache.lucene.store.FSDirectory;
import org.junit.Before;
import org.junit.Test;

public class IndexIngTest {

    private String ids[] = { "1", "2", "3" };
    private String citys[] = { "qingdao", "nanjing", "shanghai" };
    private String descs[] = { "Qingdao is a beautiful city.",
            "Nanjing is a city of culture.", "Shanghai is a bustling city." };

    private Directory dir;// 目录

    /**
     * 获取IndexWriter实例
     * 
     * @return
     * @throws Exception
     */
    private IndexWriter getWriter() throws Exception {
        Analyzer analyzer = new StandardAnalyzer(); // 标准分词器
        IndexWriterConfig iwc = new IndexWriterConfig(analyzer);
        IndexWriter writer = new IndexWriter(dir, iwc);
        return writer;
    }

    /**
     * 添加文档
     * 
     * @throws Exception
     */
    @Before
    public void setUp() throws Exception {
        dir = FSDirectory.open(Paths.get("D:\\lucene\\luceneIndex"));// 得到luceneIndex目录
        IndexWriter writer = getWriter();// 得到索引
        for (int i = 0; i < ids.length; i++) {
            Document doc = new Document();// 创建文档
            doc.add(new StringField("id", ids[i], Field.Store.YES));// 将id属性存入内存中
            doc.add(new StringField("city", citys[i], Field.Store.YES));
            doc.add(new TextField("desc", descs[i], Field.Store.NO));
            writer.addDocument(doc); // 添加文档
        }
        writer.close();
    }

    /**
     * 测试写了几个文档
     * 
     * @throws Exception
     */
    @Test
    public void testIndexWriter() throws Exception {
        IndexWriter writer = getWriter();
        System.out.println("写入了" + writer.numDocs() + "个文档");
        writer.close();
    }

    /**
     * 测试读取文档
     * 
     * @throws Exception
     */
    @Test
    public void testIndexReader() throws Exception {
        IndexReader reader = DirectoryReader.open(dir);
        System.out.println("最大文档数：" + reader.maxDoc());
        System.out.println("实际文档数：" + reader.numDocs());
        reader.close();
    }

    /**
     * 测试删除 在合并前
     * 
     * @throws Exception
     */
    @Test
    public void testDeleteBeforeMerge() throws Exception {
        IndexWriter writer = getWriter();
        System.out.println("删除前：" + writer.numDocs());
        writer.deleteDocuments(new Term("id", "1"));// term：根据id找到为1的
        writer.commit();
        System.out.println("writer.maxDoc()：" + writer.maxDoc());
        System.out.println("writer.numDocs()：" + writer.numDocs());
        writer.close();
    }

    /**
     * 测试删除 在合并后
     * 
     * @throws Exception
     */
    @Test
    public void testDeleteAfterMerge() throws Exception {
        IndexWriter writer = getWriter();
        System.out.println("删除前：" + writer.numDocs());
        writer.deleteDocuments(new Term("id", "1"));
        writer.forceMergeDeletes(); // 强制删除
        writer.commit();
        System.out.println("writer.maxDoc()：" + writer.maxDoc());
        System.out.println("writer.numDocs()：" + writer.numDocs());
        writer.close();
    }

    /**
     * 测试更新
     * 
     * @throws Exception
     */
    @Test
    public void testUpdate() throws Exception {
        IndexWriter writer = getWriter();
        Document doc = new Document();
        doc.add(new StringField("id", "1", Field.Store.YES));
        doc.add(new StringField("city", "qingdao", Field.Store.YES));
        doc.add(new TextField("desc", "dsss is a city.", Field.Store.NO));
        writer.updateDocument(new Term("id", "1"), doc);
        writer.close();
    }
}

生成的索引文件如下：

关于索引的搜索：

这里有一个要注意的地方：一定要先创建出索引后才能去进行查找，否则会报

org.apache.lucene.index.IndexNotFoundException:

no segments* file found in MMapDirectory@D:\lucene lockFactory=org.apache.lucene.store.NativeFSLockFactory@753f67a9: files: [data, lucene-5.3.1, lucene-5.3.1.zip]

package com.wp.lucene;

import java.nio.file.Paths;

import org.apache.lucene.analysis.Analyzer;
import org.apache.lucene.analysis.standard.StandardAnalyzer;
import org.apache.lucene.document.Document;
import org.apache.lucene.index.DirectoryReader;
import org.apache.lucene.index.IndexReader;
import org.apache.lucene.queryparser.classic.QueryParser;
import org.apache.lucene.search.IndexSearcher;
import org.apache.lucene.search.Query;
import org.apache.lucene.search.ScoreDoc;
import org.apache.lucene.search.TopDocs;
import org.apache.lucene.store.Directory;
import org.apache.lucene.store.FSDirectory;

/**
 * Runs a query against an existing Lucene index and prints the path of every
 * matching file.
 */
public class Searcher {
    /**
     * Searches the {@code contents} field of the index stored in
     * {@code indexDir} and prints the {@code fullPath} of the top ten hits,
     * together with the total hit count and the time the search took.
     *
     * @param indexDir path of the directory that contains the index
     * @param q query string, parsed with the classic query parser
     * @throws Exception if the index cannot be opened (for example because it
     *         has not been created yet), the query cannot be parsed, or the
     *         search fails
     */
    public static void search(String indexDir, String q) throws Exception {
        // try-with-resources releases the reader and the directory even when
        // parsing or searching throws.
        try (Directory dir = FSDirectory.open(Paths.get(indexDir));
                IndexReader reader = DirectoryReader.open(dir)) {
            IndexSearcher is = new IndexSearcher(reader);
            Analyzer analyzer = new StandardAnalyzer();
            // The first argument is the default field searched when the query
            // string does not name a field explicitly.
            QueryParser parser = new QueryParser("contents", analyzer);
            Query query = parser.parse(q);
            long start = System.currentTimeMillis();
            // Collect at most the ten best-scoring documents.
            TopDocs hits = is.search(query, 10);
            long end = System.currentTimeMillis();
            System.out.println("匹配 " + q + " ，总共花费" + (end - start) + "毫秒" + "查询到"
                    + hits.totalHits + "个记录");
            for (ScoreDoc scoreDoc : hits.scoreDocs) {
                // Load the stored fields of the hit by its document id.
                Document doc = is.doc(scoreDoc.doc);
                System.out.println(doc.get("fullPath"));
            }
        }
    }

    /**
     * Runs a sample query. An index must already exist in the index directory
     * before this is executed, otherwise opening the reader fails.
     *
     * <p>NOTE(review): the default index path below differs from the one the
     * sample Indexer writes to ({@code D:\lucene4}); pass the index directory
     * as the first argument if the defaults do not match your setup.
     *
     * @param args optional overrides: {@code args[0]} is the index directory
     *        and {@code args[1]} is the query string
     */
    public static void main(String[] args) {
        String indexDir = args.length > 0 ? args[0] : "D:\\lucene";
        String q = args.length > 1 ? args[1] : "ADD";
        try {
            search(indexDir, q);
        } catch (Exception e) {
            e.printStackTrace();
        }
    }
}

lucene创建索引的几种方式(一)

标签：

原文地址：http://www.cnblogs.com/lirenzhujiu/p/5912243.html

踩

(0)

评论一句话评论（0）

分享档案

更多>

2021年07月29日 (22)
2021年07月28日 (40)
2021年07月27日 (32)
2021年07月26日 (79)
2021年07月23日 (29)
2021年07月22日 (30)
2021年07月21日 (42)
2021年07月20日 (16)
2021年07月19日 (90)
2021年07月16日 (35)

周排行