码迷,mamicode.com
首页 > Web开发 > 详细

Lucene .NET 全文检索

时间:2014-07-12 23:52:43      阅读:509      评论:0      收藏:0      [点我收藏+]

标签:c#   lucene   搜索   .net   全文检索   

近期做项目中有用到过Lucene,那个模块是由一位前端大神负责的,空闲时间我也做了个关于Lucene做全文检索的Demo,记录下来,方便以后学习。
关于Lucene的原理,网上有长篇大论的文章,有兴趣的话可以去阅读,在此我就直奔主题,在代码中分析其原理。

1、创建索引(此处我用的是盘古分词)

注:在后台代码文件的第一行加上 #define notes 这样一行代码,目的是配合下文搜索代码中的 #if !notes … #endif 条件编译指令使用:定义了 notes 之后,该指令块中的内容不会参与编译,可以用来存放说明性的注释,用过之后就很明白了,嘿嘿。

        #region 创建索引 void CreateIndex(object sender, EventArgs e)
        /// <summary>
        /// Builds the Lucene index (or refreshes an existing one) from every story returned by
        /// <c>bllHelper.GetAllStory()</c>, tokenizing text with the PanGu analyzer.
        /// </summary>
        /// <param name="sender">Event source; not used.</param>
        /// <param name="e">Event data; not used.</param>
        private void CreateIndex(object sender, EventArgs e)
        {
            // indexPath is the physical index folder and must be assigned before this runs
            // (the original demo did that through a CreateDirectory() helper).
            FSDirectory directory = FSDirectory.Open(new DirectoryInfo(indexPath), new NativeFSLockFactory());
            try
            {
                // True when the folder already holds an index: append to it instead of recreating it.
                bool isUpdate = IndexReader.IndexExists(directory);

                // Only one writer may hold the index at a time; opening an IndexWriter locks it.
                // A stale lock (e.g. left by a crash while indexing) has to be cleared first.
                if (isUpdate && IndexWriter.IsLocked(directory))
                {
                    IndexWriter.Unlock(directory);
                }

                IndexWriter writer = new IndexWriter(directory, new PanGuAnalyzer(), !isUpdate, Lucene.Net.Index.IndexWriter.MaxFieldLength.UNLIMITED);
                try
                {
                    foreach (Story story in bllHelper.GetAllStory())
                    {
                        string id = story.ID.ToString();

                        // Remove any previous version of this story so re-indexing does not duplicate it.
                        writer.DeleteDocuments(new Term("ID", id));

                        // One document per story. Fields that must be full-text searchable are ANALYZED;
                        // exact-match fields are NOT_ANALYZED. Field does not accept null values, so
                        // columns that came back as NULL are indexed as empty strings.
                        Document document = new Document();
                        document.Add(new Field("ID", id, Field.Store.YES, Field.Index.NOT_ANALYZED));
                        document.Add(new Field("Title", story.Title ?? string.Empty, Field.Store.YES, Field.Index.ANALYZED, Lucene.Net.Documents.Field.TermVector.WITH_POSITIONS_OFFSETS));
                        document.Add(new Field("Author", story.Author ?? string.Empty, Field.Store.YES, Field.Index.NOT_ANALYZED));
                        document.Add(new Field("Content", story.Content ?? string.Empty, Field.Store.YES, Field.Index.ANALYZED, Lucene.Net.Documents.Field.TermVector.WITH_POSITIONS_OFFSETS));
                        document.Add(new Field("URL", story.URL ?? string.Empty, Field.Store.YES, Field.Index.NOT_ANALYZED));
                        writer.AddDocument(document);
                    }
                }
                finally
                {
                    // Always release the write lock, even when loading or indexing a story fails.
                    writer.Close();
                }
            }
            finally
            {
                directory.Close();
            }
        }
        #endregion

2.接下来就是搜索了

        #region 搜索 IEnumerable<Story> Search(string keyWord)
        /// <summary>
        /// Searches the index for stories whose title or content contains the words of
        /// <paramref name="keyWord"/> and returns them with the matches highlighted.
        /// </summary>
        /// <param name="keyWord">Raw text typed by the user; it is tokenized before querying.</param>
        /// <returns>
        /// The matching stories. <c>Title</c> and <c>Content</c> are passed through
        /// <c>CommonHelper.Highlight</c>, so they may contain HTML markup.
        /// </returns>
        private IEnumerable<Story> Search(string keyWord)
        {
            FSDirectory directory = FSDirectory.Open(new DirectoryInfo(indexPath), new NoLockFactory());
            IndexReader reader = null;
            IndexSearcher searcher = null;
            try
            {
                reader = IndexReader.Open(directory, true);
                searcher = new IndexSearcher(reader);

                // Tokenize once, e.g. "北京是首都" -> "北京", "是", "首都", and feed the same words
                // to both the title query and the content query.
                string[] words = CommonHelper.SplitWords(keyWord);
                PhraseQuery queryTitle = new PhraseQuery();
                PhraseQuery queryContent = new PhraseQuery();
                foreach (string word in words)
                {
                    queryTitle.Add(new Term("Title", word));
                    queryContent.Add(new Term("Content", word));
                }

                // Maximum distance allowed between the words; matches further apart are rarely relevant.
                queryTitle.SetSlop(100);
                queryContent.SetSlop(100);

                // Combine both conditions: a story matches when either its title or its content matches.
                BooleanQuery query = new BooleanQuery();
                query.Add(queryTitle, BooleanClause.Occur.SHOULD);
                query.Add(queryContent, BooleanClause.Occur.SHOULD);

#if !notes

                // How Occur values combine:
                // 1. MUST + MUST: logical AND, i.e. the intersection of both result sets.
                // 2. MUST + MUST_NOT: matches the first clause and excludes the second.
                // 3. MUST_NOT + MUST_NOT: meaningless.
                // 4. SHOULD + MUST: behaves like MUST; the SHOULD clause no longer restricts the results.
                // 5. SHOULD + MUST_NOT: equivalent to MUST + MUST_NOT.
                // 6. SHOULD + SHOULD: logical OR.
#endif
                // Container for the hits; it is created with a capacity of 1000 results.
                TopScoreDocCollector collector = TopScoreDocCollector.create(1000, true);
                searcher.Search(query, null, collector);
                ScoreDoc[] docs = collector.TopDocs(0, collector.GetTotalHits()).scoreDocs;

                List<Story> list = new List<Story>();
                foreach (ScoreDoc doc in docs)
                {
                    // doc.doc is the id Lucene assigned internally, not the story's own ID field.
                    Document document = searcher.Doc(doc.doc);
                    Story story = new Story();
                    story.ID = Convert.ToInt32(document.Get("ID"));
                    story.Title = CommonHelper.Highlight(keyWord, document.Get("Title"));
                    story.Author = document.Get("Author");
                    story.Content = CommonHelper.Highlight(keyWord, document.Get("Content"));
                    story.URL = document.Get("URL");
                    list.Add(story);
                }
                return list;
            }
            finally
            {
                // The searcher was built on an existing reader, so the reader is closed explicitly too.
                if (searcher != null)
                {
                    searcher.Close();
                }
                if (reader != null)
                {
                    reader.Close();
                }
                directory.Close();
            }
        }
        #endregion

3.帮助类文件

3.1 BusinessHelper类

 #region 根据ID获取小说 +Story GetStoryById(int id)
        /// <summary>
        /// Loads a single story by its primary key.
        /// </summary>
        /// <param name="id">Value matched against the <c>Id</c> column.</param>
        /// <returns>The matching story, or <c>null</c> when no row has that id.</returns>
        public Story GetStoryById(int id)
        {
            // The hint must be written WITH (NOLOCK): a bare "nolock" after the table name is
            // parsed as a table alias and has no effect on locking.
            const string sql = "SELECT * FROM Story WITH (NOLOCK) WHERE Id = @Id";
            using (SqlDataReader reader = SqlHelper.ExecuteDataReader(sql, new SqlParameter("@Id", id)))
            {
                return reader.Read() ? ToModel(reader) : null;
            }
        }
        #endregion

        #region 获取所有的小说 +IEnumerable<Story> GetAllStory()
        /// <summary>
        /// Loads every row of the <c>Story</c> table.
        /// </summary>
        /// <returns>All stories; an empty sequence when the table has no rows.</returns>
        public IEnumerable<Story> GetAllStory()
        {
            // The hint must be written WITH (NOLOCK): a bare "nolock" after the table name is
            // parsed as a table alias and has no effect on locking.
            const string sql = "SELECT * FROM Story WITH (NOLOCK)";
            List<Story> stories = new List<Story>();
            using (SqlDataReader reader = SqlHelper.ExecuteDataReader(sql))
            {
                while (reader.Read())
                {
                    stories.Add(ToModel(reader));
                }
            }
            return stories;
        }
        #endregion

        #region 把SqlDataReader转换成实体 Story ToModel(SqlDataReader reader)
        /// <summary>
        /// Maps the row the reader is currently positioned on to a <see cref="Story"/>.
        /// </summary>
        /// <param name="reader">Reader positioned on a row exposing Id, Title, Author, Content and URL columns.</param>
        /// <returns>A new story populated from that row; NULL text columns become <c>null</c>.</returns>
        private Story ToModel(SqlDataReader reader)
        {
            return new Story
            {
                ID = (int)ToModelValue(reader, "Id"),
                Title = (string)ToModelValue(reader, "Title"),
                Author = (string)ToModelValue(reader, "Author"),
                Content = (string)ToModelValue(reader, "Content"),
                URL = (string)ToModelValue(reader, "URL")
            };
        }
        #endregion

        /// <summary>
        /// Converts a model value to its database representation.
        /// </summary>
        /// <param name="value">Value taken from a model object; may be <c>null</c>.</param>
        /// <returns><see cref="DBNull.Value"/> when <paramref name="value"/> is <c>null</c>; otherwise the value itself.</returns>
        private object ToDBValue(object value)
        {
            return value ?? DBNull.Value;
        }

        /// <summary>
        /// Reads one column of the current row, translating database NULL to <c>null</c>.
        /// </summary>
        /// <param name="reader">Reader positioned on the row to read.</param>
        /// <param name="columnName">Name of the column to fetch.</param>
        /// <returns><c>null</c> when the column holds database NULL; otherwise the column value.</returns>
        private object ToModelValue(SqlDataReader reader, string columnName)
        {
            int ordinal = reader.GetOrdinal(columnName);
            return reader.IsDBNull(ordinal) ? null : reader[columnName];
        }

3.2 CommonHelper类

        /// <summary>
        /// Splits the user's input into individual words with the PanGu analyzer.
        /// </summary>
        /// <param name="s">Text to tokenize; may be <c>null</c> or empty.</param>
        /// <returns>The words in the order the analyzer produced them; an empty array when there is nothing to split.</returns>
        public static string[] SplitWords(string s)
        {
            List<string> words = new List<string>();

            // Nothing to tokenize; StringReader would throw on null.
            if (string.IsNullOrEmpty(s))
            {
                return words.ToArray();
            }

            Analyzer analyzer = new PanGuAnalyzer();
            TokenStream tokenStream = analyzer.TokenStream("", new StringReader(s));
            try
            {
                Lucene.Net.Analysis.Token token;
                // Next() yields the following token, or null once the input is exhausted.
                while ((token = tokenStream.Next()) != null)
                {
                    words.Add(token.TermText());
                }
            }
            finally
            {
                // Release the stream (and the underlying reader) even if tokenizing fails.
                tokenStream.Close();
            }
            return words.ToArray();
        }


        /// <summary>
        /// Wraps occurrences of the keyword in <paramref name="content"/> with red, bold HTML markup.
        /// </summary>
        /// <param name="keyword">Search text whose words should be highlighted.</param>
        /// <param name="content">Text to highlight.</param>
        /// <returns>
        /// The best highlighted fragment, or <paramref name="content"/> unchanged when the
        /// highlighter yields nothing or fails.
        /// </returns>
        public static string Highlight(string keyword, string content)
        {
            try
            {
                // The formatter supplies the prefix and suffix placed around every highlighted word.
                PanGu.HighLight.SimpleHTMLFormatter formatter =
                    new PanGu.HighLight.SimpleHTMLFormatter("<font color=\"red\"><b>", "</b></font>");

                // The highlighter needs the formatter plus a PanGu Segment; FragmentSize is the
                // number of characters per summary fragment.
                PanGu.HighLight.Highlighter highlighter =
                    new PanGu.HighLight.Highlighter(formatter, new Segment()) { FragmentSize = 5000 };

                string fragment = highlighter.GetBestFragment(keyword, content);
                return string.IsNullOrEmpty(fragment) ? content : fragment;
            }
            catch
            {
                // Highlighting is best-effort: on any failure fall back to the plain text.
                return content;
            }
        }

3.3 SqlHelper 类

        /// <summary>
        /// Connection string for the SqlHelper data-access methods, read from the
        /// <c>connLuceneDB</c> entry of the application's connection-string configuration.
        /// </summary>
        // NOTE(review): the field is public and writable; a missing "connLuceneDB" entry would
        // presumably make this initializer fail when the class is first used — confirm.
        public static string CONNECTIONSTRING = ConfigurationManager.ConnectionStrings["connLuceneDB"].ConnectionString;

        #region 执行查询方法 +static DataTable ExecuteDataTable(string sql)
        /// <summary>
        /// Runs a query and returns all of its rows in a <see cref="DataTable"/>.
        /// </summary>
        /// <param name="sql">SQL text to execute.</param>
        /// <returns>A table filled with the query results.</returns>
        public static DataTable ExecuteDataTable(string sql)
        {
            using (SqlConnection conn = new SqlConnection(SqlHelper.CONNECTIONSTRING))
            {
                conn.Open();
                using (SqlCommand cmd = new SqlCommand(sql, conn))
                using (SqlDataAdapter da = new SqlDataAdapter(cmd))
                {
                    // The adapter is disposable as well, so it is released together with the command.
                    DataTable dt = new DataTable();
                    da.Fill(dt);
                    return dt;
                }
            }
        }
        #endregion

        #region 执行查询方法,返回DataReader对象 +static SqlDataReader ExecuteDataReader(string cmdText,params SqlParameter[] parameters)
        /// <summary>
        /// Runs a query and returns an open <see cref="SqlDataReader"/> over its results.
        /// </summary>
        /// <param name="cmdText">SQL text to execute.</param>
        /// <param name="parameters">Parameters referenced by <paramref name="cmdText"/>; may be omitted.</param>
        /// <returns>
        /// A reader created with <see cref="CommandBehavior.CloseConnection"/>: the caller must dispose it,
        /// which also closes the underlying connection.
        /// </returns>
        public static SqlDataReader ExecuteDataReader(string cmdText,
            params SqlParameter[] parameters)
        {
            SqlConnection conn = new SqlConnection(CONNECTIONSTRING);
            try
            {
                conn.Open();
                using (SqlCommand cmd = conn.CreateCommand())
                {
                    cmd.CommandText = cmdText;
                    // A caller may pass null explicitly; AddRange rejects a null array.
                    if (parameters != null && parameters.Length > 0)
                    {
                        cmd.Parameters.AddRange(parameters);
                    }
                    return cmd.ExecuteReader(CommandBehavior.CloseConnection);
                }
            }
            catch
            {
                // No reader was handed out, so nothing else will ever close this connection.
                conn.Dispose();
                throw;
            }
        }
        #endregion

        #region 执行 增、删、改 的方法 +static void ExecuteNonQuery(string sql, out bool flag)
        /// <summary>
        /// Executes an INSERT, UPDATE or DELETE statement.
        /// </summary>
        /// <param name="sql">SQL statement to execute.</param>
        /// <returns><c>true</c> when at least one row was affected; otherwise <c>false</c>.</returns>
        public static bool ExecuteNonQuery(string sql)
        {
            using (SqlConnection connection = new SqlConnection(SqlHelper.CONNECTIONSTRING))
            using (SqlCommand command = new SqlCommand(sql, connection))
            {
                connection.Open();
                int affectedRows = command.ExecuteNonQuery();
                return affectedRows > 0;
            }
        }
        #endregion

4.小说实体类

    /// <summary>
    /// Entity class describing a story (novel).
    /// </summary>
    public class Story
    {
        /// <summary>
        /// Story identifier.
        /// </summary>
        public int ID { get; set; }
        /// <summary>
        /// Story title.
        /// </summary>
        public string Title { get; set; }
        /// <summary>
        /// Author of the story.
        /// </summary>
        public string Author { get; set; }
        /// <summary>
        /// Text content of the story.
        /// </summary>
        public string Content { get; set; }
        /// <summary>
        /// Address where the story can be read online.
        /// </summary>
        public string URL { get; set; }
    }

5.前台

<form id="form1" runat="server" method="post">
    <%-- Keyword box, search / build-index buttons, and the grid that lists the matching stories.
         Every server control in a grid row carries its own ID. --%>
    <asp:TextBox ID="txtKW" runat="server" Width="291px"></asp:TextBox>
    <asp:Button ID="btnSearch" runat="server" Text="搜索" onclick="btnSearch_Click" />
                    
    <asp:Button ID="btnCreateIndex" runat="server" Text="创建索引"
        onclick="btnCreateIndex_Click"/>
    <asp:GridView ID="gdvShowStory" runat="server" AutoGenerateColumns="False" CellPadding="4"
        ForeColor="#333333" GridLines="None">
        <AlternatingRowStyle BackColor="White" ForeColor="#284775" />
        <Columns>
            <asp:TemplateField HeaderStyle-Width="3%">
                <HeaderTemplate>
                    编号
                </HeaderTemplate>
                <ItemTemplate>
                    <asp:Label ID="Label1" runat="server" Text='<%# Eval("ID") %>'></asp:Label>
                </ItemTemplate>
            </asp:TemplateField>
            <asp:TemplateField HeaderStyle-Width="10%">
                <HeaderTemplate>
                    标题
                </HeaderTemplate>
                <ItemTemplate>
                    <asp:Label ID="Label2" Text='<%# Eval("Title") %>' runat="server"></asp:Label>
                </ItemTemplate>
            </asp:TemplateField>
            <asp:TemplateField HeaderStyle-Width="8%">
                <HeaderTemplate>
                    作者
                </HeaderTemplate>
                <ItemTemplate>
                    <asp:Label ID="Label3" Text='<%# Eval("Author") %>' runat="server"></asp:Label>
                </ItemTemplate>
            </asp:TemplateField>
            <asp:TemplateField HeaderStyle-Width="70%">
                <HeaderTemplate>
                    内容
                </HeaderTemplate>
                <ItemTemplate>
                    <asp:Label ID="Label4" Text='<%# Eval("Content") %>' runat="server"></asp:Label>
                </ItemTemplate>
            </asp:TemplateField>
            <asp:TemplateField HeaderStyle-Width="5%">
                <HeaderTemplate>
                    操作
                </HeaderTemplate>
                <ItemTemplate>
                    <a href='<%#Eval("URL") %>'>在线阅读</a>
                </ItemTemplate>
            </asp:TemplateField>
        </Columns>
        <EditRowStyle BackColor="#999999" />
        <FooterStyle BackColor="#5D7B9D" Font-Bold="True" ForeColor="White" />
        <HeaderStyle BackColor="#5D7B9D" Font-Bold="True" ForeColor="White" />
        <PagerStyle BackColor="#284775" ForeColor="White" HorizontalAlign="Center" />
        <RowStyle BackColor="#F7F6F3" ForeColor="#333333" />
        <SelectedRowStyle BackColor="#E2DED6" Font-Bold="True" ForeColor="#333333" />
        <SortedAscendingCellStyle BackColor="#E9E7E2" />
        <SortedAscendingHeaderStyle BackColor="#506C8C" />
        <SortedDescendingCellStyle BackColor="#FFFDF8" />
        <SortedDescendingHeaderStyle BackColor="#6F8DAE" />
    </asp:GridView>
    </form>

注:需要引入几个类库

bubuko.com,布布扣

 

 

 

 

 

OK,到此为止,一个简单的Demo出来了,看看效果吧:

bubuko.com,布布扣

 

 

 

 

 

 

 

 

 

 

 

(PS:欢迎广大朋友参与讨论关于Lucene的问题,有兴趣的话,可以加偶的扣扣:1686336218,成功的路上有我也有你。)

Lucene .NET 全文检索,布布扣,bubuko.com

Lucene .NET 全文检索

标签:c#   lucene   搜索   .net   全文检索   

原文地址:http://blog.csdn.net/chenyblog/article/details/37697385

(0)
(0)
   
举报
评论 一句话评论(0
登录后才能评论!
© 2014 mamicode.com 版权所有  联系我们:gaon5@hotmail.com
迷上了代码!