lucene5.0建立索引并进行查找

时间：2015-04-22 18:46:15 阅读：209 评论：0 收藏：0 [点我收藏+]

标签：lucene

说白了就是两个函数：一个建立索引（写），另一个进行查找（读），所以会涉及到 Java IO 的一些知识。

import java.io.*;
import java.nio.file.Paths;
import java.util.Date;
import org.apache.lucene.analysis.Analyzer;
import org.apache.lucene.analysis.standard.StandardAnalyzer;
import org.apache.lucene.document.Document;
import org.apache.lucene.document.Field;
import org.apache.lucene.document.Field.Store;
import org.apache.lucene.document.LongField;
import org.apache.lucene.document.StringField;
import org.apache.lucene.document.TextField;
import org.apache.lucene.index.*;
import org.apache.lucene.store.Directory;
import org.apache.lucene.store.FSDirectory;
/**
 * Demonstrates the process of creating a Lucene index for the plain-text
 * ({@code .txt}) files found directly inside a data directory.
 *
 * <p>Each matching file becomes one {@link Document} with four fields:
 * {@code path}, {@code content}, {@code fileSize} and {@code filename}.
 */
public class TxtFileIndexer {
    /**
     * Indexes every regular {@code .txt} file in the hard-coded data directory
     * into the hard-coded index directory and prints the elapsed time.
     *
     * @param args unused
     * @throws Exception if the data directory cannot be listed, or if reading
     *                   a file or writing the index fails
     */
    public static void main(String[] args) throws Exception {
        // dataDir is the directory that hosts the text files to be indexed
        File dataDir = new File("G:\\downloads\\LJParser_release\\LJParser_Packet\\训练分类用文本\\交通");
        // All training sample files. listFiles() returns null when the path is
        // not a directory or cannot be read, so fail with a clear message
        // before the index is opened (and locked) for writing.
        File[] dataFiles = dataDir.listFiles();
        if (dataFiles == null) {
            throw new IOException("Cannot list files in data directory " + dataDir.getPath());
        }

        long startTime;
        // try-with-resources releases the analyzer, the index directory and the
        // writer (in reverse order) even when indexing fails part-way through.
        // indexDir is the directory that hosts Lucene's index files.
        try (Analyzer luceneAnalyzer = new StandardAnalyzer();
             Directory indexDir = FSDirectory.open(Paths.get("G:\\luceneout"));
             IndexWriter indexWriter = new IndexWriter(indexDir, new IndexWriterConfig(luceneAnalyzer))) {
            startTime = System.currentTimeMillis();
            for (File dataFile : dataFiles) {
                if (!dataFile.isFile() || !dataFile.getName().endsWith(".txt")) {
                    continue;
                }
                System.out.println("Indexing file " + dataFile.getCanonicalPath());
                // NOTE(review): FileReader decodes with the platform default
                // charset; confirm that matches the encoding of the data files.
                try (Reader txtReader = new FileReader(dataFile)) {
                    // Every file is turned into one Document.
                    Document document = new Document();
                    document.add(new StringField("path", dataFile.getPath(), Store.YES));
                    // Reader-based TextField: the text is tokenized and indexed,
                    // but not stored in the index.
                    document.add(new TextField("content", txtReader));
                    document.add(new LongField("fileSize", dataFile.length(), Store.YES));
                    document.add(new TextField("filename", dataFile.getName(), Store.YES));
                    // The reader is consumed during this call, so it is safe to
                    // close it right afterwards.
                    indexWriter.addDocument(document);
                }
            }
        }
        // Taken after the writer has been closed, so the measurement includes
        // flushing the index to disk, as in the original flow.
        long endTime = System.currentTimeMillis();
        System.out.println("It takes " + (endTime - startTime)
                + " milliseconds to create index for the files in directory "
                + dataDir.getPath());
    }
}

import java.io.*; 
import java.nio.file.Paths;
import java.util.Date; 
import org.apache.lucene.analysis.Analyzer; 
import org.apache.lucene.analysis.standard.StandardAnalyzer; 
import org.apache.lucene.document.Document; 
import org.apache.lucene.document.Field; 
import org.apache.lucene.document.Field.Store;
import org.apache.lucene.document.LongField;
import org.apache.lucene.document.StringField;
import org.apache.lucene.document.TextField;
import org.apache.lucene.index.*; 
import org.apache.lucene.store.Directory;
import org.apache.lucene.store.FSDirectory;
/** 
* This class demonstrate the process of creating index with Lucene 
* for text files 
*/ 
public class TxtFileIndexer { 
     public static void main(String[] args) throws Exception{ 
     //indexDir is the directory that hosts Lucene‘s index files 
     Directory indexDir = FSDirectory.open(Paths.get("G:\\luceneout"));
     //dataDir is the directory that hosts the text files that to be indexed 
     File   dataDir  = new File("G:\\downloads\\LJParser_release\\LJParser_Packet\\训练分类用文本\\交通"); 
     Analyzer luceneAnalyzer = new StandardAnalyzer(); //新建一个分词器实例
     IndexWriterConfig config = new IndexWriterConfig(luceneAnalyzer);
     File[] dataFiles  = dataDir.listFiles(); //所有训练样本文件
     IndexWriter indexWriter = new IndexWriter(indexDir,config);//构造一个索引写入器 
     long startTime = new Date().getTime(); 
     for(int i = 0; i < dataFiles.length; i++){ 
          if(dataFiles[i].isFile() && dataFiles[i].getName().endsWith(".txt")){
               System.out.println("Indexing file " + dataFiles[i].getCanonicalPath()); //返回绝对路径
               Document document = new Document();//每一个文件都变成一个document对象 
               Reader txtReader = new FileReader(dataFiles[i]); 
               Field field1 = new StringField("path",dataFiles[i].getPath(),Store.YES);
               Field field2 = new TextField("content",txtReader);
               Field field3 = new LongField("fileSize", dataFiles[i].length(), Store.YES); 
               Field field4 = new TextField("filename",dataFiles[i].getName(),Store.YES);
               document.add(field1);
               document.add(field2);
               document.add(field3);
               document.add(field4);
               indexWriter.addDocument(document); //写进一个索引
          } 
     } 
     //indexWriter.optimize(); 
     indexWriter.close(); 
     long endTime = new Date().getTime(); 
        
     System.out.println("It takes " + (endTime - startTime) 
         + " milliseconds to create index for the files in directory "
         + dataDir.getPath());        
     } 
}

读取索引并查找

[java] view plain copy print ?

import java.io.File;
import java.nio.file.Paths;
import org.apache.lucene.analysis.standard.StandardAnalyzer;
import org.apache.lucene.document.Document;
import org.apache.lucene.index.DirectoryReader;
import org.apache.lucene.queryparser.classic.QueryParser;
import org.apache.lucene.search.*;
import org.apache.lucene.store.*;
/**
 * Demonstrates the process of searching an existing Lucene index.
 */
public class TxtFileSearcher {
    /**
     * Opens the hard-coded index directory, runs a fixed query against the
     * {@code content} field and prints up to 100 hits.
     *
     * @param args unused
     * @throws Exception if the index cannot be opened or read, or if the query
     *                   string cannot be parsed
     */
    public static void main(String[] args) throws Exception {
        // try-with-resources closes the analyzer, the index reader and the
        // directory (in reverse order) on both normal and exceptional exit.
        try (Directory indexDir = FSDirectory.open(Paths.get("G:\\luceneout")); // holds the index files
             DirectoryReader ireader = DirectoryReader.open(indexDir);          // reads the index files
             StandardAnalyzer analyzer = new StandardAnalyzer()) {
            // Performs the search.
            IndexSearcher searcher = new IndexSearcher(ireader);
            // The string to search for.
            String queryStr = "大数据挖掘";
            // Parse the query string against the "content" field, then collect
            // the top 100 matches.
            QueryParser parser = new QueryParser("content", analyzer);
            Query query = parser.parse(queryStr);
            TopDocs docs = searcher.search(query, 100);
            System.out.print("一共搜索到结果："+docs.totalHits+"条");
            // Print the details of each hit.
            for (ScoreDoc scoreDoc : docs.scoreDocs) {
                System.out.print("序号为:"+scoreDoc.doc);
                System.out.print("评分为:"+scoreDoc.score);
                Document document = searcher.doc(scoreDoc.doc);
                System.out.print("路径为:"+document.get("path"));
                // NOTE(review): Document.get returns null for a field that was
                // not stored. If the index was built with the Reader-based
                // TextField for "content" (which is un-stored), this prints
                // "null"; store the text at index time if it must be shown.
                System.out.print("内容为"+document.get("content"));
                System.out.print("文件大小为"+document.get("fileSize"));
                System.out.print("文件名为"+document.get("filename"));
                System.out.println();
            }
        }
    }
}

import java.io.File; 
import java.nio.file.Paths;

import org.apache.lucene.analysis.standard.StandardAnalyzer;
import org.apache.lucene.document.Document; 
import org.apache.lucene.index.DirectoryReader;  
import org.apache.lucene.queryparser.classic.QueryParser;
import org.apache.lucene.search.*; 
import org.apache.lucene.store.*;
 /** 
 * This class is used to demonstrate the 
 * process of searching on an existing 
 * Lucene index 
 * 
 */ 
 public class TxtFileSearcher { 
	 public static void main(String[] args) throws Exception{ 
		 //存储了索引文件
		 Directory indexDir = FSDirectory.open(Paths.get("G:\\luceneout"));
		 //读取器读取索引文件
		 DirectoryReader ireader = DirectoryReader.open(indexDir);
		 //查找
		 IndexSearcher searcher = new IndexSearcher(ireader);
		 //目的查找字符串
		 String queryStr = "大数据挖掘";
		 //构造一个词法分析器，并将查询结果返回到一个队列
		 QueryParser parser = new QueryParser("content",new StandardAnalyzer());
		 Query query = parser.parse(queryStr);
		 TopDocs docs = searcher.search(query, 100);
		 System.out.print("一共搜索到结果："+docs.totalHits+"条");
		 //输出查询结果信息
		 for(ScoreDoc scoreDoc:docs.scoreDocs){
			 System.out.print("序号为:"+scoreDoc.doc);
			 System.out.print("评分为:"+scoreDoc.score);
			 Document document = searcher.doc(scoreDoc.doc);
			 System.out.print("路径为:"+document.get("path"));
			 System.out.print("内容为"+document.get("content"));
			 System.out.print("文件大小为"+document.get("fileSize"));
			 System.out.print("文件名为"+document.get("filename"));
			 System.out.println();
		 }	 
	 } 
 }

技术分享

运行结果

下面是文件目录

技术分享

两个函数都需要用到分词器：前者是为了配置索引写入器，后者则是为了配置查询解析器来进行查找。

本文出自 “ssh互联与hadoop搭建” 博客，请务必保留此出处http://8492887.blog.51cto.com/8482887/1637187

lucene5.0建立索引并进行查找

标签：lucene

原文地址：http://8492887.blog.51cto.com/8482887/1637187

踩

(0)

评论一句话评论（0）

分享档案

更多>

2021年07月29日 (22)
2021年07月28日 (40)
2021年07月27日 (32)
2021年07月26日 (79)
2021年07月23日 (29)
2021年07月22日 (30)
2021年07月21日 (42)
2021年07月20日 (16)
2021年07月19日 (90)
2021年07月16日 (35)

周排行