lucene3.0+版本中文分词测试+搜索结果+创建索引测试

时间：2015-07-10 15:26:45 阅读：166 评论：0 收藏：0 [点我收藏+]
标签：java
import java.io.File;  

import java.io.IOException;  

import java.io.StringReader;  

import org.apache.lucene.analysis.Analyzer;  

import org.apache.lucene.analysis.TokenStream;  

import org.apache.lucene.analysis.tokenattributes.TermAttribute;  

import org.apache.lucene.document.Document;  

import org.apache.lucene.document.Field;  

import org.apache.lucene.document.Field.Index;  

import org.apache.lucene.document.Field.Store;  

import org.apache.lucene.index.CorruptIndexException;  

import org.apache.lucene.index.IndexWriter;  

import org.apache.lucene.index.IndexWriter.MaxFieldLength;  

import org.apache.lucene.queryParser.ParseException;  

import org.apache.lucene.queryParser.QueryParser;  

import org.apache.lucene.search.IndexSearcher;  

import org.apache.lucene.search.Query;  

import org.apache.lucene.search.ScoreDoc;  

import org.apache.lucene.search.TopDocs;  

import org.apache.lucene.search.highlight.Highlighter;  

import org.apache.lucene.search.highlight.InvalidTokenOffsetsException;  

import org.apache.lucene.search.highlight.QueryScorer;  

import org.apache.lucene.search.highlight.SimpleFragmenter;  

import org.apache.lucene.search.highlight.SimpleHTMLFormatter;  

import org.apache.lucene.store.FSDirectory;  

import org.apache.lucene.store.LockObtainFailedException;  

import org.apache.lucene.util.Version;  

import org.wltea.analyzer.lucene.IKAnalyzer;  

public class AnalzyerTest {  

    /** 

     * lucene3.0开始已经抛弃了原来的分词方式，转而使用新的分词方式<br> 

     * 本方法以SmartChineseAnalyzer为例，演示如何分词以及取得分词之后的term 

     * http://blog.csdn.net/yjflinchong/article/details/7906116 

     * @throws Exception 

     */  

    public static void analysis() throws Exception {  

        Analyzer analyzer = new IKAnalyzer();  

        String string = "据外媒报道，菲律宾国防部长加斯明9日称，多种新式战机、船只将于年内陆续交付军方，菲国防实力将得到大幅增强。但加斯明同时强调，此次军备采购与黄岩岛争端无关。";  

        StringReader reader = new StringReader(string);  

        TokenStream ts = analyzer.tokenStream("", reader);  

        TermAttribute termAttribute = ts.getAttribute(TermAttribute.class);  

        while (ts.incrementToken()) {  

            System.out.println(termAttribute.term() + "  ");  

        }  

        System.out.println();  

    }  

    /** 

     * 建索引 

     * 在构造IndexWriter时必须使用Directory作为参数了 

     *  

     * @throws CorruptIndexException 

     * @throws LockObtainFailedException 

     * @throws IOException 

     */  

    private static void build() throws CorruptIndexException, LockObtainFailedException, IOException {  

        String path = "index";  

        IndexWriter writer = new IndexWriter(FSDirectory.open(new File(path)), new IKAnalyzer(), true, MaxFieldLength.LIMITED);  

        Document document = new Document();  

        document.add(new Field("text", "中国人民银行采取了一系列措施防止人民币升值，但是很遗憾，这些措施在今天看来其作用是微乎其微的。难道真的就没有什么别的措施防止人民币再次疯狂升值了吗？", Store.YES, Index.ANALYZED));  

        writer.addDocument(document);  

        writer.optimize();  

        writer.close();  

    }  

    /** 

     *  

     * @param keyword 

     * @throws CorruptIndexException 

     * @throws IOException 

     * @throws ParseException 

     * @throws InvalidTokenOffsetsException 

     */  

    private static void search(String keyword) throws CorruptIndexException, IOException, ParseException, InvalidTokenOffsetsException {  

        Analyzer analyzer = new IKAnalyzer();  

        QueryParser parser = new QueryParser(Version.LUCENE_30, "text", analyzer);  

        IndexSearcher searcher = new IndexSearcher(FSDirectory.open(new File("index")));  

        Query query = parser.parse(keyword);  

        System.out.println(query);  

        TopDocs topDocs = searcher.search(query, 10);  

        ScoreDoc[] scoreDocs = topDocs.scoreDocs;  

        System.out.println("hits:" + topDocs.totalHits);  

        for (ScoreDoc scoreDoc : scoreDocs) {  

            Document doc = searcher.doc(scoreDoc.doc);  

            String text = doc.get("text");  

            System.out.println(highlight(text, query, analyzer));  

        }  

    }  

    /** 

     * 高亮关键词 

     * http://blog.csdn.net/yjflinchong/article/details/7906116 

     * @param content 

     *            需要高亮的内容 

     * @param query 

     *            搜索时使用的Query对象 

     * @param analyzer 

     *            分词器 

     * @return 高亮之后的文本 

     * @throws IOException 

     * @throws InvalidTokenOffsetsException 

     */  

    private static String highlight(String content, Query query, Analyzer analyzer) throws IOException, InvalidTokenOffsetsException {  

        SimpleHTMLFormatter formatter = new SimpleHTMLFormatter("<b>", "</b>");  

        Highlighter highlighter = new Highlighter(formatter, new QueryScorer(query));  

        highlighter.setTextFragmenter(new SimpleFragmenter(100));  

        String resultString = highlighter.getBestFragment(analyzer.tokenStream("", new StringReader(content)), content);  

        return resultString + "...";  

    }  

    public static void main(String[] args) throws Exception {  

        analysis();  

        build();  

        search("人民币 升值");  

    }  

}
lucene3.0+版本中文分词测试+搜索结果+创建索引测试
标签：java
原文地址：http://blog.csdn.net/u013948187/article/details/46829031
踩
(0)
评论一句话评论（0）
分享档案
更多>
2021年07月29日 (22)
2021年07月28日 (40)
2021年07月27日 (32)
2021年07月26日 (79)
2021年07月23日 (29)
2021年07月22日 (30)
2021年07月21日 (42)
2021年07月20日 (16)
2021年07月19日 (90)
2021年07月16日 (35)
周排行