Jcseg 是使用 Java 开发的一个开源中文分词器,使用流行的 mmseg 算法实现。它是一款独立的分词组件,并非专门针对 Lucene 开发,但提供了最新版本的 Lucene 和 Solr 分词接口。
Java Code
<span style="font-size:14px;">package com.qiuzhping.lucene;

import java.sql.Connection;
import java.sql.ResultSet;
import java.sql.Statement;

import org.apache.lucene.analysis.Analyzer;
import org.apache.lucene.document.Document;
import org.apache.lucene.document.Field;
import org.apache.lucene.document.StringField;
import org.apache.lucene.document.TextField;
import org.apache.lucene.index.DirectoryReader;
import org.apache.lucene.index.IndexReader;
import org.apache.lucene.index.IndexWriter;
import org.apache.lucene.index.IndexWriterConfig;
import org.apache.lucene.index.IndexWriterConfig.OpenMode;
import org.apache.lucene.queryparser.classic.QueryParser;
import org.apache.lucene.search.IndexSearcher;
import org.apache.lucene.search.Query;
import org.apache.lucene.search.ScoreDoc;
import org.apache.lucene.search.TopDocs;
import org.apache.lucene.store.Directory;
import org.apache.lucene.store.RAMDirectory;
import org.lionsoul.jcseg.analyzer.JcsegAnalyzer5X;
import org.lionsoul.jcseg.core.JcsegTaskConfig;

/**
 * Demo: index rows of a MySQL table into an in-memory Lucene index using the
 * Jcseg Chinese analyzer, then run a keyword query against that index.
 *
 * <p>Jcseg is an open-source Chinese word segmenter written in Java that
 * implements the mmseg algorithm and ships analyzer adapters for Lucene,
 * Solr and Elasticsearch. This program was tested with jcseg 1.9.6 and
 * Lucene 5.2.1. For an introduction to Jcseg see
 * http://www.oschina.net/p/jcseg
 *
 * @author Peter.Qiu
 * @version 2015-07-31
 */
public class LuceneChineseSplit {

    /** Number of rows fetched from the database per query. */
    private static final int PAGE_SIZE = 100000;

    /** Upper bound on the number of pages read (at most PAGE_SIZE * MAX_PAGES rows). */
    private static final int MAX_PAGES = 10;

    /** Number of best-scoring hits printed by the search step. */
    private static final int TOP_HITS = 2;

    /**
     * Builds the in-memory index from the {@code student} table and searches it.
     *
     * @param args unused
     * @throws Exception if the database, the index or the query parser fails
     */
    public static void main(String[] args) throws Exception {
        // try-with-resources guarantees the analyzer and the directory are
        // released even when indexing or searching throws.
        try (JcsegAnalyzer5X analyzer = new JcsegAnalyzer5X(JcsegTaskConfig.COMPLEX_MODE);
             Directory directory = new RAMDirectory()) {

            // Optional: tweak the default segmentation settings.
            JcsegTaskConfig config = analyzer.getTaskConfig();
            // Append synonyms to the token stream; requires jcseg.loadsyn=1
            // in jcseg.properties.
            config.setAppendCJKSyn(true);
            // Append pinyin to the token stream; requires jcseg.loadpinyin=1
            // in jcseg.properties.
            config.setAppendCJKPinyin(true);
            // For further options see org.lionsoul.jcseg.core.JcsegTaskConfig.

            buildIndex(directory, analyzer);
            search(directory, analyzer, "你好");
        }
    }

    /**
     * Reads the {@code student} table page by page and adds one document per
     * row (fields {@code id}, {@code name}, {@code math}) to the index.
     *
     * @param directory index storage to write to
     * @param analyzer  analyzer used to tokenize the {@code name} field
     * @return number of documents added
     * @throws Exception if a database or index operation fails
     */
    private static long buildIndex(Directory directory, Analyzer analyzer) throws Exception {
        IndexWriterConfig iwConfig = new IndexWriterConfig(analyzer);
        iwConfig.setOpenMode(OpenMode.CREATE_OR_APPEND);

        long count = 0;
        try (IndexWriter iwriter = new IndexWriter(directory, iwConfig);
             Connection conn = QueryDataFromDb.getConnection();
             Statement st = conn.createStatement()) {

            for (int page = 0; page < MAX_PAGES; page++) {
                // NOTE(review): the query has no ORDER BY, so the database is
                // not obliged to return stable, non-overlapping pages - confirm
                // whether an explicit ordering should be added.
                String sql = "select * from student limit " + (page * PAGE_SIZE) + "," + PAGE_SIZE;

                int rowsInPage = 0;
                try (ResultSet result = st.executeQuery(sql)) {
                    while (result.next()) {
                        Document document = new Document();
                        document.add(new StringField("id", result.getString("id"), Field.Store.YES));
                        document.add(new TextField("name", result.getString("name"), Field.Store.YES));
                        document.add(new StringField("math", result.getString("math"), Field.Store.YES));
                        iwriter.addDocument(document);
                        rowsInPage++;
                    }
                }
                count += rowsInPage;

                // A short page means the table is exhausted; skip the
                // remaining (empty) queries.
                if (rowsInPage < PAGE_SIZE) {
                    break;
                }
            }

            System.out.println("Total record : " + count);
            iwriter.commit();
        }
        return count;
    }

    /**
     * Parses {@code keyword} against the {@code name} field (terms combined
     * with AND) and prints the {@link #TOP_HITS} best-scoring documents
     * together with the elapsed search time.
     *
     * @param directory index storage to read from
     * @param analyzer  analyzer used to tokenize the keyword
     * @param keyword   query text
     * @throws Exception if the index cannot be read or the keyword cannot be parsed
     */
    private static void search(Directory directory, Analyzer analyzer, String keyword) throws Exception {
        try (IndexReader ireader = DirectoryReader.open(directory)) {
            IndexSearcher isearcher = new IndexSearcher(ireader);

            // Build the Query object with the classic QueryParser.
            QueryParser qp = new QueryParser("name", analyzer);
            qp.setDefaultOperator(QueryParser.AND_OPERATOR);
            Query query = qp.parse(keyword);
            System.out.println("Query = " + query);

            long start = System.currentTimeMillis();
            // Fetch the TOP_HITS most relevant records.
            System.out.println("搜索相似度最高的" + TOP_HITS + "条记录");
            TopDocs topDocs = isearcher.search(query, TOP_HITS);
            System.out.println("命中:" + topDocs.totalHits);
            for (ScoreDoc sd : topDocs.scoreDocs) {
                Document doc = isearcher.doc(sd.doc);
                System.out.println("id:" + doc.get("id"));
                System.out.println("name:" + doc.get("name"));
                System.out.println("math:" + doc.get("math"));
            }
            System.out.println("Spend time:" + (System.currentTimeMillis() - start) + " ms");
        }
    }
}
</span>测试结果:
Total record : 1000000
Query = name:你好
搜索相似度最高的2条记录
命中:1000000
id:1
name:你好
math:38
id:2
name:你好
math:21
Spend time:52 ms
代码片段涉及到的 jar 包:
lucene-analyzers-common-5.2.1.jar
lucene-core-5.2.1.jar
lucene-queryparser-5.2.1.jar
mysql-connector-java-5.1.35.jar
jcseg-analyzer-1.9.6.jar
jcseg-core-1.9.6.jar
Lucene 4.10 + Mysql 5.5 创建数据库表索引(Lucene 学习序列1)
版权声明:本文为博主原创文章,未经博主允许不得转载。
Lucene 5.2.1 + jcseg 1.9.6中文分词索引(Lucene 学习序列2)
原文地址:http://blog.csdn.net/qiuzhping/article/details/47172555