用正则表达式去截取网页里文字的方法。参数为读取的网页源代码

时间：2015-06-16 20:50:24 阅读：156 评论：0 收藏：0 [点我收藏+]

标签：

 //抓取文字方法，参数为网页源代码
        /// <summary>
        /// Extracts the text of a web page from its HTML source.
        /// </summary>
        /// <param name="strHtml">The HTML source of the page.</param>
        /// <returns>
        /// The input after the comment, script, style and tag removal helpers have
        /// been applied in that order, with leading and trailing whitespace trimmed.
        /// </returns>
        public string ExtractText(string strHtml)
        {
            // Each stage feeds the next; the order matches the original pipeline.
            string withoutComments = RemoveComment(strHtml);
            string withoutScripts = RemoveScript(withoutComments);
            string withoutStyles = RemoveStyle(withoutScripts);
            string textOnly = RemoveTags(withoutStyles);

            return textOnly.Trim();
        }
        #region 
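        // Helper methods: each takes the page source as its argument and strips one kind of markup with a regular expression. (Worth learning this code by heart.)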
        /// <summary>
        /// Removes HTML comments (<c>&lt;!-- ... --&gt;</c>) from the input.
        /// </summary>
        /// <param name="input">The HTML text to clean.</param>
        /// <returns>The input with every HTML comment replaced by an empty string.</returns>
        private string RemoveComment(string input)
        {
            // Lazy match up to the first "-->" so adjacent comments are removed
            // one by one. Singleline lets '.' cross line breaks for multi-line
            // comments. The earlier pattern "[^-]*" refused any comment whose
            // body contained a '-', leaving such comments in the output.
            return Regex.Replace(input, @"<!--.*?-->", string.Empty, RegexOptions.Singleline);
        }
        /// <summary>
        /// Removes every <c>&lt;style&gt;...&lt;/style&gt;</c> element, including its content.
        /// </summary>
        /// <param name="input">The HTML text to clean.</param>
        /// <returns>The input with all style elements replaced by an empty string.</returns>
        private string RemoveStyle(string input)
        {
            // Case-insensitive tag names; Singleline so the body may span lines.
            const RegexOptions options = RegexOptions.IgnoreCase | RegexOptions.Singleline;
            const string styleElement = @"<style[^>]*?>.*?</style>";

            return Regex.Replace(input, styleElement, string.Empty, options);
        }
        //去掉js方法
        /// <summary>
        /// Removes every <c>&lt;script&gt;...&lt;/script&gt;</c> element, including its content.
        /// </summary>
        /// <param name="input">The HTML text to clean.</param>
        /// <returns>The input with all script elements replaced by an empty string.</returns>
        private string RemoveScript(string input)
        {
            string withoutScripts = Regex.Replace(
                input,
                @"<script[^>]*?>.*?</script>",
                string.Empty,
                RegexOptions.Singleline | RegexOptions.IgnoreCase); // body may span lines; tag case varies

            return withoutScripts;
        }
        // Removes the HTML tags from the text.
        /// <summary>
        /// Strips HTML tags from the input and decodes a few common character entities.
        /// </summary>
        /// <param name="input">The HTML text to clean.</param>
        /// <returns>
        /// The input with <c>&lt;br&gt;</c> variants turned into CRLF line breaks, all
        /// other tags removed, and <c>&amp;nbsp;</c>, <c>&amp;lt;</c>, <c>&amp;gt;</c>
        /// and <c>&amp;amp;</c> decoded.
        /// </returns>
        private string RemoveTags(string input)
        {
            string result = input;

            // Turn line-break tags into real line breaks before the generic tag
            // strip removes them. Accepts <br>, <BR>, <br/> and <br />.
            result = Regex.Replace(result, @"<br\s*/?>", "\r\n", RegexOptions.IgnoreCase);

            // Remove every remaining tag.
            result = Regex.Replace(result, @"<[\s\S]*?>", string.Empty);

            // Decode entities only after the tags are gone, so that text such as
            // "&lt;b&gt;" becomes the literal "<b>" instead of being stripped as a tag.
            // The earlier Replace calls had identical search and replacement strings
            // and therefore did nothing.
            result = result.Replace("&nbsp;", " ");
            result = result.Replace("&lt;", "<");
            result = result.Replace("&gt;", ">");

            // "&amp;" is decoded last so "&amp;lt;" yields "&lt;" rather than "<".
            result = result.Replace("&amp;", "&");

            return result;
        }
        #endregion
        //批量抓取邮箱
        /// <summary>
        /// Click handler that finds every e-mail-like match in <c>respHtml</c> and
        /// shows the matches in <c>textBox2</c>, one per line.
        /// </summary>
        /// <param name="sender">The event source (not used).</param>
        /// <param name="e">The event data (not used).</param>
        private void 转换工具ZToolStripMenuItem_Click(object sender, EventArgs e)
        {
            // Clear the previous output first.
            textBox2.Clear();

            // MatchCollection holds every successful match of the pattern in the input.
            MatchCollection addresses = Regex.Matches(respHtml, @"[a-zA-Z0-9_\-\.]+@\w+(\.\w+)+");

            // Accumulate the matches in a mutable string, one per line.
            StringBuilder lines = new StringBuilder();
            for (int i = 0; i < addresses.Count; i++)
            {
                lines.AppendLine(addresses[i].Value);
            }

            textBox2.Text = lines.ToString();
        }

标签：

原文地址：http://www.cnblogs.com/275147378abc/p/4581580.html

踩

(0)

评论一句话评论（0）

分享档案

更多>

2021年07月29日 (22)
2021年07月28日 (40)
2021年07月27日 (32)
2021年07月26日 (79)
2021年07月23日 (29)
2021年07月22日 (30)
2021年07月21日 (42)
2021年07月20日 (16)
2021年07月19日 (90)
2021年07月16日 (35)

周排行