标签:Lucene style class blog code java
分词器
作用:把一段文本切分成一个个关键词。
在什么地方使用到了:在建立索引和搜索时。
原文:An IndexWriter creates and maintains an index.
1,切分:
An
IndexWriter
creates
and
maintains
an
index
.
2,去除停用词和标点符号
IndexWriter
creates
maintains
index
3,转为小写
indexwriter
creates
maintains
index
1 package cn.itcast.e_analyzer; 2 3 import java.io.StringReader; 4 5 import org.apache.lucene.analysis.Analyzer; 6 import org.apache.lucene.analysis.TokenStream; 7 import org.apache.lucene.analysis.cjk.CJKAnalyzer; 8 import org.apache.lucene.analysis.cn.ChineseAnalyzer; 9 import org.apache.lucene.analysis.standard.StandardAnalyzer; 10 import org.apache.lucene.analysis.tokenattributes.TermAttribute; 11 import org.apache.lucene.util.Version; 12 import org.junit.Test; 13 import org.wltea.analyzer.lucene.IKAnalyzer; 14 15 public class TestAnalyzer { 16 17 @Test 18 public void test() throws Exception { 19 String enText = "An IndexWriter creates and maintains an index."; 20 Analyzer analyzer = new StandardAnalyzer(Version.LUCENE_30); 21 testAnalyzer(analyzer, enText); 22 23 String cnText = "传智播客准备Lucene的开发环境"; 24 testAnalyzer(analyzer, cnText); // 单字分词 25 26 testAnalyzer(new ChineseAnalyzer(), cnText); // 单字分词 27 testAnalyzer(new CJKAnalyzer(Version.LUCENE_30), cnText); // 二分法分词 28 testAnalyzer(new IKAnalyzer(), cnText); // 词库分词(重点) 29 } 30 31 /** 32 * 使用指定的分词器对指定的文本进行分词,并打印出分出的词 33 * 34 * @param analyzer 35 * @param text 36 * @throws Exception 37 */ 38 private void testAnalyzer(Analyzer analyzer, String text) throws Exception { 39 System.out.println("当前使用的分词器:" + analyzer.getClass().getSimpleName()); 40 TokenStream tokenStream = analyzer.tokenStream("content", new StringReader(text)); 41 tokenStream.addAttribute(TermAttribute.class); 42 while (tokenStream.incrementToken()) { 43 TermAttribute termAttribute = tokenStream.getAttribute(TermAttribute.class); 44 //分词条件 45 System.out.println(termAttribute.term()); 46 } 47 System.out.println(); 48 } 49 50 }
标签:Lucene style class blog code java
原文地址:http://www.cnblogs.com/friends-wf/p/3796542.html