码迷,mamicode.com
首页 > Web开发 > 详细

HtmlAgility 抓取网页上的数据

时间:2020-03-06 17:36:20      阅读:126      评论:0      收藏:0      [点我收藏+]

标签:ESS   数据   oid   private   file   pdo   lag   ace   eai   

 /// <summary>
    /// 民政部mca
    /// http://www.mca.gov.cn/article/sj/xzqh/1980/
    /// https://github.com/zzzprojects/html-agility-pack
    /// https://github.com/linezero/HtmlAgilityPack
    /// </summary>
    public partial class Form1 : Form
    {

        // Scraping settings. button1_Click overwrites all of these from the McaData() row
        // matching the selected year, so the initializers below are only starting values.

        // 1-based position of the table cell that holds the area code
        // (button1_Click counts cells starting at 1).
        int codecell = 2;
        // 1-based position of the table cell that holds the area name.
        int namecell = 3;
        // Year currently being scraped; set from comboBox1.Text in button1_Click.
        // (The identifier is spelled "yearnmae" in the code.)
        int yearnmae = 2019;
        // XPath fragment for the tables to scan; button1_Click prefixes it with "//".
        string tableNo = "table";
        // XPath, evaluated relative to each matched table node, that selects row nodes.
        string trNo = "tr";
        // XPath, evaluated relative to each row node, that selects cell nodes.
        string tdthNo = "th|td";
        /// <summary>
        /// Builds the per-year lookup table used by the form: one row per year from
        /// 2019 down to 1980, giving the page URL to fetch and how to read that page.
        /// </summary>
        /// <returns>
        /// A new <see cref="DataTable"/> with the columns year, website, codecell,
        /// namecell, tableNo, trNo and tdthNo.
        /// </returns>
        DataTable McaData()
        {
            DataTable table = new DataTable();

            table.Columns.Add("year", typeof(int));        // year of the listing
            table.Columns.Add("website", typeof(string));  // URL of the page for that year
            table.Columns.Add("codecell", typeof(int));    // which table column holds the area code
            table.Columns.Add("namecell", typeof(int));    // which table column holds the area name
            table.Columns.Add("tableNo", typeof(string));  // table selector
            table.Columns.Add("trNo", typeof(string));     // row selector
            table.Columns.Add("tdthNo", typeof(string));   // cell selector

            // One entry per year, in column order:
            // year, website, codecell, namecell, tableNo, trNo, tdthNo.
            object[][] entries = new object[][]
            {
                new object[] { 2019, "http://www.mca.gov.cn/article/sj/xzqh/1980/2019/202002281436.html", 2, 3, "table", "tr", "th|td" },
                new object[] { 2018, "http://www.mca.gov.cn/article/sj/xzqh/1980/201903/201903011447.html", 2, 3, "table", "tr", "th|td" },
                new object[] { 2017, "http://www.mca.gov.cn/article/sj/xzqh/1980/201803/201803131454.html", 2, 3, "table", "tr", "th|td" },
                // The original note on this row says the TBODY tags on this page are all upper case.
                new object[] { 2016, "http://www.mca.gov.cn/article/sj/xzqh/1980/201705/201705311652.html", 2, 3, "table//tbody", "tr", "th|td" },
                new object[] { 2015, "http://www.mca.gov.cn/article/sj/tjbz/a/2015/201706011127.html", 2, 3, "table", "tr", "th|td" },
                new object[] { 2014, "http://files2.mca.gov.cn/cws/201502/20150225163817214.html", 2, 3, "table", "tr", "th|td" },
                new object[] { 2013, "http://files2.mca.gov.cn/cws/201404/20140404125552372.htm", 2, 3, "table", "tr", "th|td" },
                new object[] { 2012, "http://www.mca.gov.cn/article/sj/tjbz/a/201713/201707271556.html", 2, 3, "table", "tr", "th|td" },
                new object[] { 2011, "http://www.mca.gov.cn/article/sj/tjbz/a/201713/201707271552.html", 2, 3, "table", "tr", "th|td" },
                new object[] { 2010, "http://www.mca.gov.cn/article/sj/tjbz/a/201713/201708220946.html", 2, 3, "table", "tr", "th|td" },
                new object[] { 2009, "http://www.mca.gov.cn/article/sj/tjbz/a/201713/201708220943.html", 2, 3, "table", "tr", "th|td" },
                new object[] { 2008, "http://www.mca.gov.cn/article/sj/tjbz/a/201713/201708220941.html", 2, 3, "table", "tr", "th|td" },
                new object[] { 2007, "http://www.mca.gov.cn/article/sj/tjbz/a/201713/201708220939.html", 2, 3, "table", "tr", "th|td" },
                new object[] { 2006, "http://www.mca.gov.cn/article/sj/tjbz/a/201713/201708220936.html", 2, 3, "table", "tr", "th|td" },
                new object[] { 2005, "http://www.mca.gov.cn/article/sj/tjbz/a/201713/201708220935.html", 2, 3, "table", "tr", "th|td" },
                new object[] { 2004, "http://www.mca.gov.cn/article/sj/tjbz/a/201713/201708220930.html", 2, 3, "table", "tr", "th|td" },
                new object[] { 2003, "http://www.mca.gov.cn/article/sj/tjbz/a/201713/201708220928.html", 2, 3, "table", "tr", "th|td" },
                new object[] { 2002, "http://www.mca.gov.cn/article/sj/tjbz/a/201713/201708220927.html", 2, 3, "table", "tr", "th|td" },
                new object[] { 2001, "http://www.mca.gov.cn/article/sj/tjbz/a/201713/201708220925.html", 2, 3, "table", "tr", "th|td" },
                new object[] { 2000, "http://www.mca.gov.cn/article/sj/tjbz/a/201713/201708220923.html", 2, 3, "table", "tr", "th|td" },
                new object[] { 1999, "http://www.mca.gov.cn/article/sj/tjbz/a/201713/201708220921.html", 2, 3, "table", "tr", "th|td" },
                new object[] { 1998, "http://www.mca.gov.cn/article/sj/tjbz/a/201713/201708220918.html", 2, 3, "table", "tr", "th|td" },
                new object[] { 1997, "http://www.mca.gov.cn/article/sj/tjbz/a/201713/201708220916.html", 2, 3, "table", "tr", "th|td" },
                new object[] { 1996, "http://www.mca.gov.cn/article/sj/tjbz/a/201713/201708220914.html", 2, 3, "table", "tr", "th|td" },
                new object[] { 1995, "http://www.mca.gov.cn/article/sj/tjbz/a/201713/201708220913.html", 2, 3, "table", "tr", "th|td" },
                new object[] { 1994, "http://www.mca.gov.cn/article/sj/tjbz/a/201713/201708220911.html", 2, 3, "table", "tr", "th|td" },
                new object[] { 1993, "http://www.mca.gov.cn/article/sj/tjbz/a/201713/201708041023.html", 2, 3, "table", "tr", "th|td" },
                new object[] { 1992, "http://www.mca.gov.cn/article/sj/tjbz/a/201713/201708220910.html", 2, 3, "table", "tr", "th|td" },
                new object[] { 1991, "http://www.mca.gov.cn/article/sj/tjbz/a/201713/201708041020.html", 2, 3, "table", "tr", "th|td" },
                new object[] { 1990, "http://www.mca.gov.cn/article/sj/tjbz/a/201713/201708041018.html", 2, 3, "table", "tr", "th|td" },
                new object[] { 1989, "http://www.mca.gov.cn/article/sj/tjbz/a/201713/201708041017.html", 2, 3, "table", "tr", "th|td" },
                new object[] { 1988, "http://www.mca.gov.cn/article/sj/tjbz/a/201713/201708220903.html", 2, 3, "table", "tr", "th|td" },
                new object[] { 1987, "http://www.mca.gov.cn/article/sj/xzqh/1980/1980/201911180950.html", 2, 3, "table", "tr", "th|td" },
                new object[] { 1986, "http://www.mca.gov.cn/article/sj/tjbz/a/201713/201708220859.html", 2, 3, "table", "tr", "th|td" },
                new object[] { 1985, "http://www.mca.gov.cn/article/sj/tjbz/a/201713/201708220858.html", 2, 3, "table", "tr", "th|td" },
                // From 1984 back to 1980 the code and name sit one column earlier.
                new object[] { 1984, "http://www.mca.gov.cn/article/sj/tjbz/a/201713/201708220856.html", 1, 2, "table", "tr", "th|td" },
                new object[] { 1983, "http://www.mca.gov.cn/article/sj/tjbz/a/201713/201708160821.html", 1, 2, "table", "tr", "th|td" },
                new object[] { 1982, "http://www.mca.gov.cn/article/sj/xzqh/1980/1980/201911180942.html", 1, 2, "table", "tr", "th|td" },
                new object[] { 1981, "http://www.mca.gov.cn/article/sj/tjbz/a/201713/201708041004.html", 1, 2, "table", "tr", "th|td" },
                new object[] { 1980, "http://www.mca.gov.cn/article/sj/tjbz/a/201713/201708040959.html", 1, 2, "table", "tr", "th|td" },
            };

            foreach (object[] entry in entries)
            {
                table.Rows.Add(entry);
            }

            return table;
        }

        /// <summary>
        /// Creates the form. The only work done here is the call to
        /// InitializeComponent, which is defined outside this excerpt.
        /// </summary>
        public Form1()
        {
            this.InitializeComponent();
        }
        /// <summary>
        /// Load handler for the form: binds comboBox1 to the table returned by
        /// McaData(), showing the "year" column and using the "website" column
        /// as the selected value.
        /// </summary>
        /// <param name="sender">Event source (not used).</param>
        /// <param name="e">Event data (not used).</param>
        private void Form1_Load(object sender, EventArgs e)
        {
            DataTable yearPages = McaData();

            // Same assignment order as before: data source first, then the member names.
            comboBox1.DataSource = yearPages;
            comboBox1.DisplayMember = "year";
            comboBox1.ValueMember = "website";
        }
        /// <summary>
        /// Scrapes the page selected in comboBox1: downloads it, shows its lower-cased
        /// HTML in richTextBox1, extracts the area code and area name from each table
        /// row, and binds the resulting list to the grid and navigator.
        /// Geovin Du 涂聚文
        /// </summary>
        /// <remarks>
        /// Any failure (download, parsing, unexpected page layout) is reported to the
        /// user in a message box instead of being silently discarded.
        /// </remarks>
        /// <param name="sender">Event source (not used).</param>
        /// <param name="e">Event data (not used).</param>
        private void button1_Click(object sender, EventArgs e)
        {
            try
            {
                // SelectedValue is null when no item is selected; there is nothing to fetch then.
                object selectedValue = this.comboBox1.SelectedValue;
                if (selectedValue == null)
                {
                    return;
                }
                string website = selectedValue.ToString();

                HtmlAgilityPack.HtmlWeb webClient = new HtmlAgilityPack.HtmlWeb();
                HtmlAgilityPack.HtmlDocument doc = webClient.Load(website);
                this.richTextBox1.Text = doc.Text.ToLower();

                // Pick up the scraping settings recorded for the selected year.
                yearnmae = int.Parse(this.comboBox1.Text);
                foreach (DataRow setting in McaData().Select("year=" + yearnmae))
                {
                    codecell = int.Parse(setting["codecell"].ToString());
                    namecell = int.Parse(setting["namecell"].ToString());
                    tableNo = setting["tableNo"].ToString();
                    trNo = setting["trNo"].ToString();
                    tdthNo = setting["tdthNo"].ToString();
                }

                List<AreaInfo> list = new List<AreaInfo>();

                // SelectNodes returns null (not an empty collection) when nothing matches,
                // so every result is checked before it is enumerated.
                HtmlAgilityPack.HtmlNodeCollection tables = doc.DocumentNode.SelectNodes("//" + tableNo);
                if (tables != null)
                {
                    foreach (HtmlNode table in tables)
                    {
                        HtmlAgilityPack.HtmlNodeCollection rows = table.SelectNodes(trNo);
                        if (rows == null)
                        {
                            continue;
                        }

                        foreach (HtmlNode row in rows)
                        {
                            HtmlAgilityPack.HtmlNodeCollection cells = row.SelectNodes(tdthNo);
                            if (cells == null)
                            {
                                continue;
                            }

                            AreaInfo info = new AreaInfo();
                            int cell = 1; // cell positions are 1-based, matching codecell/namecell
                            foreach (HtmlNode cellNode in cells)
                            {
                                if (cell == codecell)
                                {
                                    info.AreaCode = cellNode.InnerText.Trim().Replace(" ", "");
                                }
                                if (cell == namecell)
                                {
                                    info.AreaName = cellNode.InnerText.Trim().Replace(" ", "");
                                }
                                cell++;
                            }

                            // Rows without a code are not kept.
                            if (!string.IsNullOrEmpty(info.AreaCode))
                            {
                                list.Add(info);
                            }
                        }
                    }
                }

                this.bindingSource1.DataSource = list;
                this.bindingNavigator1.BindingSource = this.bindingSource1;
                this.dataGridView1.DataSource = this.bindingSource1;
            }
            catch (Exception ex)
            {
                // Previously the message was computed and thrown away; surface it instead.
                MessageBox.Show(ex.Message);
            }
        }

  

HtmlAgility 抓取网页上的数据

标签:ESS   数据   oid   private   file   pdo   lag   ace   eai   

原文地址:https://www.cnblogs.com/geovindu/p/12427358.html

(0)
(0)
   
举报
评论 一句话评论(0)
登录后才能评论!
© 2014 mamicode.com 版权所有  联系我们:gaon5@hotmail.com
迷上了代码!