码迷,mamicode.com
首页 > Web开发 > 详细

正则 挖网站表格复习

时间:2016-11-06 20:03:39      阅读:196      评论:0      收藏:0      [点我收藏+]

标签:foreach   empty   str   mat   express   group   reg   blog   form   

using System;
using System.Collections.Generic;
using System.IO;
using System.Linq;
using System.Net;
using System.Text;
using System.Text.RegularExpressions;
using System.Web;
using System.Web.UI;
using System.Web.UI.WebControls;
using System.Xml;
namespace WebApplication19
{
    /// <summary>
    /// Selects which kind of table cell is extracted from a row.
    /// The member names are kept lowercase on purpose: the name returned by
    /// <c>ToString()</c> is inserted verbatim into the cell regex as the HTML tag name.
    /// </summary>
    public enum SearchRange
    {
        /// <summary>Header cells (<c>&lt;th&gt;</c> elements).</summary>
        th = 0,

        /// <summary>Content cells (<c>&lt;td&gt;</c> elements).</summary>
        td = 1,
    }
    /// <summary>
    /// Web Forms page that, when loaded, downloads a page from srh.bankofchina.com and
    /// uses regular expressions to split the first table inside the
    /// <c>BOC_main publish</c> div into rows and cells.
    /// </summary>
    public partial class WebForm1 : System.Web.UI.Page
    {
        // Page to scrape; the query string is hard-coded.
        private const string SourceUrl = "http://srh.bankofchina.com/search/whpj/search.jsp?erectDate=2001-11-01&nothing=2016-11-04&pjname=1316&page=4";

        // Inner content of a <div> whose class is "BOC_main publish", up to the next </div>.
        private static readonly Regex DivRegex = new Regex(@"(?<=<div (.*)?class=""BOC_main publish""[^>]*?>)([\s\S]*?)(?=</div>)");

        // Inner content of a <table ...> element, up to the next </table>.
        private static readonly Regex TableRegex = new Regex(@"(?<=<table (.*)?[^>]*?>)([\s\S]*?)(?=</table>)");

        // Inner content of each <tr ...> element, up to the next </tr>.
        private static readonly Regex RowRegex = new Regex(@"(?<=<tr(.*)?[^>]*?>)([\s\S]*?)(?=</tr>)");

        /// <summary>
        /// Value returned by <see cref="getHtml"/>. Nothing in this part of the class assigns it.
        /// </summary>
        public string MKT;

        /// <summary>
        /// Downloads the source page, extracts the table rows and splits the first row into
        /// header cells and the remaining rows into content cells.
        /// </summary>
        /// <returns>The current value of <see cref="MKT"/>, whether or not the download succeeded.</returns>
        /// <remarks>
        /// Failures are logged through <see cref="System.Diagnostics.Trace"/> and never propagate,
        /// so a download problem does not break page load.
        /// </remarks>
        private string getHtml()
        {
            try
            {
                string content;

                // WebClient, the response stream and the reader are all disposable.
                using (WebClient wc = new WebClient())
                using (Stream stream = wc.OpenRead(SourceUrl))
                using (StreamReader sr = new StreamReader(stream, Encoding.UTF8))
                {
                    content = sr.ReadToEnd();
                }

                List<string> trList = ExtractRows(content);
                if (trList.Count == 0)
                {
                    // No div/table/row matched: there is no header row to index into.
                    return MKT;
                }

                // trList[0] is the header row; every later row is a content row.
                // NOTE(review): the parsed cells are not consumed anywhere yet.
                List<string> thList = GET_TH_OR_TD_LIST(SearchRange.th, trList[0]);
                List<List<string>> tdsList = new List<List<string>>(trList.Count - 1);
                for (int i = 1; i < trList.Count; i++)
                {
                    tdsList.Add(GET_TH_OR_TD_LIST(SearchRange.td, trList[i]));
                }
            }
            catch (Exception ex)
            {
                // Best effort: keep the page alive, but leave a trace instead of hiding the failure.
                System.Diagnostics.Trace.TraceError("WebForm1.getHtml failed: {0}", ex);
            }

            return MKT;
        }

        /// <summary>
        /// Narrows <paramref name="html"/> to the first matching div, then to the first table
        /// inside it, and returns the inner content of every row of that table.
        /// </summary>
        /// <param name="html">Full page markup; may be null or empty.</param>
        /// <returns>Row contents in document order; empty when nothing matches.</returns>
        private static List<string> ExtractRows(string html)
        {
            List<string> rows = new List<string>();
            if (string.IsNullOrEmpty(html))
            {
                return rows;
            }

            string divContent = FirstMatchOrEmpty(DivRegex, html);
            string tableContent = FirstMatchOrEmpty(TableRegex, divContent);

            foreach (Match match in RowRegex.Matches(tableContent))
            {
                rows.Add(match.Value);
            }

            return rows;
        }

        /// <summary>Returns the text of the first match of <paramref name="regex"/>, or an empty string.</summary>
        private static string FirstMatchOrEmpty(Regex regex, string input)
        {
            Match match = regex.Match(input);
            return match.Success ? match.Value : string.Empty;
        }

        /// <summary>
        /// Extracts the inner content of every header or content cell in one table row.
        /// </summary>
        /// <param name="range">Cell kind to look for; its name is used as the tag name in the regex.</param>
        /// <param name="row">Inner markup of a single row; null or empty yields an empty list.</param>
        /// <returns>Cell contents in document order.</returns>
        private List<string> GET_TH_OR_TD_LIST(SearchRange range, string row)
        {
            List<string> contentList = new List<string>();
            if (string.IsNullOrEmpty(row))
            {
                return contentList;
            }

            string tag = range.ToString();
            string tdPatern = $@"(?<=(<{tag}[^>]*?>))(?<tdCell>[\s\S]*?)(?=</{tag}>)";

            foreach (Match match in Regex.Matches(row, tdPatern))
            {
                contentList.Add(match.Groups["tdCell"].Value);
            }

            return contentList;
        }

        /// <summary>Runs the scrape once per page load.</summary>
        protected void Page_Load(object sender, EventArgs e)
        {
            getHtml();
        }
    }
}

 

正则 挖网站表格复习

标签:foreach   empty   str   mat   express   group   reg   blog   form   

原文地址:http://www.cnblogs.com/kexb/p/6035938.html

(0)
(0)
   
举报
评论 一句话评论(0)
登录后才能评论!
© 2014 mamicode.com 版权所有  联系我们:gaon5@hotmail.com
迷上了代码!