package com.zhishang.lucene; /** * Created by Administrator on 2017/7/8. */ public class HtmlBean { private String title; private String content; private String url; public void setTitle(String title) { this.title = title; } public void setContent(String content) { this.content = content; } public void setUrl(String url) { this.url = url; } public String getTitle() { return title; } public String getContent() { return content; } public String getUrl() { return url; } }
package com.zhishang.lucene;

import net.htmlparser.jericho.Element;
import net.htmlparser.jericho.HTMLElementName;
import net.htmlparser.jericho.Source;
import org.junit.Test;

import java.io.File;
import java.io.IOException;

/**
 * Created by Administrator on 2017/7/8.
 *
 * Helper that turns an HTML file on disk into an {@link HtmlBean}.
 */
public class HtmlBeanUtil {

    /**
     * Parses the given HTML file into an {@link HtmlBean}.
     *
     * The bean's title is the extracted text of the first {@code <title>}
     * element, its content is the extracted text of the whole document, and
     * its url is the absolute path of {@code file}.
     *
     * @param file the HTML file to read
     * @return the populated bean, or {@code null} when the document has no
     *         title element or reading the file raises an {@link IOException}
     *         (the stack trace is printed in that case)
     */
    public static HtmlBean parseHtml(File file) {
        try {
            Source source = new Source(file);
            Element titleElement = source.getFirstElement(HTMLElementName.TITLE);

            boolean titleAvailable = titleElement != null && titleElement.getTextExtractor() != null;
            if (!titleAvailable) {
                return null;
            }

            String titleText = titleElement.getTextExtractor().toString();
            String bodyText = source.getTextExtractor().toString();
            String location = file.getAbsolutePath();

            HtmlBean parsed = new HtmlBean();
            parsed.setTitle(titleText);
            parsed.setContent(bodyText);
            parsed.setUrl(location);
            return parsed;
        } catch (IOException ioFailure) {
            ioFailure.printStackTrace();
            return null;
        }
    }
}
package com.zhishang.lucene; import org.apache.lucene.analysis.Analyzer; import org.apache.lucene.analysis.standard.StandardAnalyzer; import org.apache.lucene.document.Document; import org.apache.lucene.index.DirectoryReader; import org.apache.lucene.index.IndexReader; import org.apache.lucene.queryparser.classic.MultiFieldQueryParser; import org.apache.lucene.queryparser.classic.ParseException; import org.apache.lucene.queryparser.classic.QueryParser; import org.apache.lucene.search.IndexSearcher; import org.apache.lucene.search.Query; import org.apache.lucene.search.ScoreDoc; import org.apache.lucene.search.TopDocs; import org.apache.lucene.search.highlight.Highlighter; import org.apache.lucene.search.highlight.InvalidTokenOffsetsException; import org.apache.lucene.search.highlight.QueryScorer; import org.apache.lucene.search.highlight.SimpleHTMLFormatter; import org.apache.lucene.store.Directory; import org.apache.lucene.store.FSDirectory; import org.apache.lucene.util.Version; import org.junit.Test; import org.wltea.analyzer.lucene.IKAnalyzer; import java.io.File; import java.io.IOException; import java.util.ArrayList; import java.util.Collection; import java.util.List; /** * Created by Administrator on 2017/7/7. 
*/ public class SearchIndex { public List<HtmlBean> search(String keyword){ Directory dir = null; try { dir = FSDirectory.open(new File(CreateIndex.indexDir)); IndexReader reader = DirectoryReader.open(dir); IndexSearcher searcher = new IndexSearcher(reader); Analyzer analyzer = new IKAnalyzer(); MultiFieldQueryParser multiFieldQueryParser = new MultiFieldQueryParser(Version.LUCENE_4_9,new String[]{"title","content"},analyzer); Query query = multiFieldQueryParser.parse(keyword); TopDocs search = searcher.search(query,10); ScoreDoc[] scoreDocs = search.scoreDocs; SimpleHTMLFormatter simpleHTMLFormatter = new SimpleHTMLFormatter("<font color=‘red‘>","</font>"); QueryScorer queryScorer = new QueryScorer(query,"title"); Highlighter highlighter = new Highlighter(simpleHTMLFormatter,queryScorer); List<HtmlBean> htmlBeanList = new ArrayList<HtmlBean>(); for (ScoreDoc scoreDoc:scoreDocs){ Document document = reader.document(scoreDoc.doc); String title = highlighter.getBestFragment(new IKAnalyzer(),"title",document.get("title")); String content = highlighter.getBestFragments(new IKAnalyzer().tokenStream("content",document.get("content")),document.get("content"),3,"..."); String url = document.get("url"); HtmlBean htmlBean = new HtmlBean(); htmlBean.setTitle(title); htmlBean.setContent(content); htmlBean.setUrl(url); htmlBeanList.add(htmlBean); } return htmlBeanList; // System.out.println(search.totalHits); } catch (IOException e) { e.printStackTrace(); } catch (ParseException e) { e.printStackTrace(); } catch (InvalidTokenOffsetsException e) { e.printStackTrace(); } return null; } }
package com.zhishang.lucene; import org.apache.lucene.analysis.Analyzer; import org.apache.lucene.analysis.standard.StandardAnalyzer; import org.apache.lucene.index.IndexWriter; import org.apache.lucene.index.IndexWriterConfig; import org.apache.lucene.store.Directory; import org.apache.lucene.store.FSDirectory; import org.apache.lucene.util.Version; import org.junit.Test; import java.io.File; import java.util.List; /** * Created by Administrator on 2017/7/8. */ public class LuceneBean { @Test public void search(){ SearchIndex searchIndex = new SearchIndex(); List<HtmlBean> htmlBeanList = searchIndex.search("java"); for (HtmlBean bean:htmlBeanList){ System.out.println(bean.getTitle()); System.out.println(bean.getContent()); System.out.println(bean.getUrl()); System.out.println("-----------------------------------------------------"); } } /* 创建索引 */ @Test public void createIndex(){ File file = new File(CreateIndex.indexDir); if (file.exists()){ file.delete(); file.mkdirs(); } CreateIndex createIndex = new CreateIndex(); createIndex.createIndex(); } }
本文出自 “素颜” 博客,请务必保留此出处http://suyanzhu.blog.51cto.com/8050189/1945606
原文地址:http://suyanzhu.blog.51cto.com/8050189/1945606