码迷,mamicode.com
首页 > 其他好文 > 详细

CSDN爬虫

时间:2015-02-26 16:42:42      阅读:154      评论:0      收藏:0      [点我收藏+]

标签:

仅做技术交流。

技术分享

using System;
using System.Collections.Generic;
using System.ComponentModel;
using System.Data;
using System.Drawing;
using System.Text;
using System.Windows.Forms;
using System.IO;
using System.Text.RegularExpressions;
using DotNet.Utilities;
using System.Xml;
using System.Net;

namespace CSDNEpt
{
    /// <summary>
    /// Main window of the CSDN blog exporter. Given a blog URL typed into
    /// <c>textBox1</c>, it reads the category panel of the blog, walks every
    /// category listing page, downloads each article and stores it as one XML
    /// file under the <c>article</c> folder next to the executable. Progress
    /// messages are pushed to the top of <c>listBox1</c>.
    /// </summary>
    public partial class Form1 : Form
    {
        public Form1()
        {
            InitializeComponent();
        }

        /// <summary>Matches the <c>div</c> with id <c>panel_Category</c> up to its first closing <c>div</c> tag.</summary>
        public const string Category = "<div\\s*id=\"panel_Category\"\\s*class=\"panel\">[\\w\\W]*?</div>";
        /// <summary>Matches the digits that precede <c>" onclick=</c> inside a category list item (used as the category id).</summary>
        public const string CategoryId = "(?<=<li(.*)\\s*<a(.*))\\d+(?=\"\\sonclick=(.*)\\s*</li>)";
        /// <summary>Matches the link text of a category list item.</summary>
        public const string CategoryName="(?<=<li(.*)\\s*<a(.*)\">)(.*)(?=</a><span>(.*)\\s*</li>)";
        /// <summary>Matches the number in parentheses that follows a category link; may match an empty string.</summary>
        public const string ArticleCount = "(?<=<li(.*)\\s*<a(.*)</a><span>\\()\\d*(?=\\)</span>\\s*</li>)";
        /// <summary>Matches the title text inside the <c>article_title</c> block of an article page.</summary>
        public const string ArticleName = "(?<=<div\\s*class=\"article_title\">[\\w\\W]*\">\\s*)(.*)(?=\\s*</a></span>\\s*(.*)\\s*</div>)";
        /// <summary>Matches the content of the <c>link_postdate</c> span.</summary>
        public const string PostDate = "(?<=<span\\s*class=\"link_postdate\">)(.*)(?=</span>)";
        /// <summary>Matches the read counter inside the <c>link_view</c> span.</summary>
        public const string ReadCount = "(?<=<span\\s*class=\"link_view\"\\s*title=\"阅读次数\">)(.*)(?=人阅读</span>)";
        /// <summary>Matches the content of the <c>article_content</c> div up to the first closing <c>div</c> tag.</summary>
        public const string ArticleContent = "(?<=<div\\s*id=\"article_content\"\\s*class=\"article_content\">)[\\w\\W]*?(?=</div>)";
        /// <summary>Matches the digits after <c>details/</c> in a <c>link_title</c> span (used as the article id).</summary>
        public const string ArticleId = "(?<=<span\\s*class=\"link_title\">(.*)details/)\\d*(?=\">[\\w\\W]*?</a></span>)";
        /// <summary>Matches the remainder of the class attribute of the <c>ico</c> span; tells whether the article is original.</summary>
        public const string IsOriginal="(?<=<span\\s*class=\"ico\\s*)(.*)(?=\"></span>)";

        // Page size used to turn a category's article count into a number of
        // listing pages (the value the original code hard-coded).
        private const int ArticlesPerPage = 20;

        /// <summary>
        /// Returns every match of <paramref name="regexStr"/> in
        /// <paramref name="matchStr"/>, in order, using case-insensitive matching.
        /// </summary>
        /// <param name="regexStr">Regular expression pattern.</param>
        /// <param name="matchStr">Text to search; <c>null</c> yields an empty list.</param>
        /// <returns>The matched substrings; empty when nothing matches.</returns>
        public List<string> MatchStr(string regexStr,string matchStr)
        {
            List<string> lt = new List<string>();
            if (matchStr == null)
            {
                // A failed download may leave no HTML at all; treat it as "no matches".
                return lt;
            }
            Regex reg = new Regex(regexStr, RegexOptions.IgnoreCase);
            for (Match m = reg.Match(matchStr); m.Success; m = m.NextMatch())
            {
                lt.Add(m.Value);
            }
            return lt;
        }

        /// <summary>
        /// Click handler that starts the export for the blog URL in <c>textBox1</c>.
        /// </summary>
        private void button1_Click(object sender, EventArgs e)
        {
            // The export calls Application.DoEvents(), which processes pending
            // input; keep the triggering control disabled so a second click
            // cannot start a nested export while this one is still running.
            Control trigger = sender as Control;
            if (trigger != null)
            {
                trigger.Enabled = false;
            }
            try
            {
                ExportBlog();
            }
            finally
            {
                if (trigger != null && !trigger.IsDisposed)
                {
                    trigger.Enabled = true;
                }
            }
        }

        /// <summary>
        /// Downloads the blog landing page, reads its categories and exports
        /// the articles of every category.
        /// </summary>
        private void ExportBlog()
        {
            string blogUrl = textBox1.Text.Trim();
            // Sub-URLs are built as base + "/article/..."; drop a trailing
            // slash so they do not end up with "//".
            string baseUrl = blogUrl.TrimEnd('/');
            if (baseUrl.Length == 0)
            {
                listBox1.Items.Insert(0, "请输入博客地址!");
                return;
            }

            string path = Path.Combine(Application.StartupPath, "article");
            // CreateDirectory does nothing when the folder already exists.
            Directory.CreateDirectory(path);

            HttpHelper http = new HttpHelper();
            HttpItem item = new HttpItem();
            item.URL = blogUrl;
            item.Referer = blogUrl;
            // NOTE(review): presumably makes the helper use the IE proxy settings — confirm against HttpHelper.
            item.ProxyIp = "ieproxy";
            item.Encoding = Encoding.GetEncoding("utf-8");
            string html = http.GetHtml(item).Html;

            List<string> categoryPanels = MatchStr(Category, html);
            if (categoryPanels.Count == 0)
            {
                // Wrong URL, failed request or changed page layout: nothing to export.
                listBox1.Items.Insert(0, "未找到文章分类,请检查博客地址!");
                return;
            }

            string categoryHtml = categoryPanels[0];
            List<string> categoryIds = MatchStr(CategoryId, categoryHtml);
            List<string> categoryNames = MatchStr(CategoryName, categoryHtml);
            List<string> articleCounts = MatchStr(ArticleCount, categoryHtml);

            // Walk every category and export the articles it lists.
            for (int i = 0; i < categoryIds.Count; i++)
            {
                // The three lists come from independent patterns and are not
                // guaranteed to have the same length; fall back to the id.
                string categoryName = i < categoryNames.Count ? categoryNames[i] : categoryIds[i];
                listBox1.Items.Insert(0, "正在获取【" + categoryName + "】分类...");

                // The count pattern may match an empty string; TryParse leaves 0 then.
                int count = 0;
                if (i < articleCounts.Count)
                {
                    int.TryParse(articleCounts[i], out count);
                }
                int pages = (count + ArticlesPerPage - 1) / ArticlesPerPage;

                List<string> articleIds = new List<string>();
                for (int k = 1; k <= pages; k++)
                {
                    item.URL = baseUrl + "/article/category/" + categoryIds[i] + "/" + k;
                    articleIds.AddRange(MatchStr(ArticleId, http.GetHtml(item).Html));
                }

                foreach (string articleId in articleIds)
                {
                    // The id pattern uses \d* and can therefore yield an empty id.
                    if (articleId.Length == 0)
                    {
                        continue;
                    }
                    SaveArticle(http, item, baseUrl, articleId, path);
                    // Let the window repaint so the progress list stays current.
                    Application.DoEvents();
                }

                listBox1.Items.Insert(0, "【" + categoryName + "】分类获取完毕!");
            }
        }

        /// <summary>
        /// Downloads one article, extracts its fields and writes them to
        /// <c>&lt;outputDir&gt;\&lt;title&gt;.xml</c>. An article whose title
        /// cannot be extracted is reported in the list box and skipped.
        /// </summary>
        private void SaveArticle(HttpHelper http, HttpItem item, string baseUrl, string articleId, string outputDir)
        {
            string articleUrl = baseUrl + "/article/details/" + articleId;
            item.URL = articleUrl;
            string articleHtml = http.GetHtml(item).Html;

            string articleName_txt = FirstMatch(ArticleName, articleHtml).Trim().Replace("\r", "");
            if (articleName_txt.Length == 0)
            {
                listBox1.Items.Insert(0, "【" + articleUrl + "】文章解析失败,已跳过!");
                return;
            }
            string postDate_txt = FirstMatch(PostDate, articleHtml).Trim();
            string readCount_txt = FirstMatch(ReadCount, articleHtml).Trim();
            string articleContent_txt = FirstMatch(ArticleContent, articleHtml);
            string isOriginal_txt = FirstMatch(IsOriginal, articleHtml).Trim();

            listBox1.Items.Insert(0, "正在抓取【" + articleName_txt + "】文章...");

            // Build the XML document that stores the article.
            // The gb2312 declaration is kept so the output format is unchanged.
            XmlDocument xml = new XmlDocument();
            XmlDeclaration xmldecl = xml.CreateXmlDeclaration("1.0", "gb2312", null);
            XmlElement root = xml.CreateElement("Article");
            AppendTextElement(xml, root, "Name", articleName_txt);
            AppendTextElement(xml, root, "URL", articleUrl);
            AppendTextElement(xml, root, "IsOriginal", isOriginal_txt == "ico_type_Original" ? "Y" : "N");
            AppendTextElement(xml, root, "PostDate", postDate_txt);
            AppendTextElement(xml, root, "ReadCount", readCount_txt);
            AppendTextElement(xml, root, "ArticleContent", articleContent_txt);
            xml.AppendChild(xmldecl);
            xml.AppendChild(root);
            xml.Save(Path.Combine(outputDir, ToSafeFileName(articleName_txt) + ".xml"));

            listBox1.Items.Insert(0, "【" + articleName_txt + "】文章抓取成功!");
        }

        /// <summary>Returns the first match of the pattern in the text, or an empty string when there is none.</summary>
        private string FirstMatch(string regexStr, string matchStr)
        {
            List<string> matches = MatchStr(regexStr, matchStr);
            return matches.Count > 0 ? matches[0] : "";
        }

        /// <summary>Creates a child element with the given name and text and appends it to <paramref name="parent"/>.</summary>
        private static void AppendTextElement(XmlDocument xml, XmlElement parent, string elementName, string text)
        {
            XmlElement element = xml.CreateElement(elementName);
            element.InnerText = text;
            parent.AppendChild(element);
        }

        /// <summary>Replaces every character that is not allowed in a file name with an underscore.</summary>
        private static string ToSafeFileName(string name)
        {
            char[] invalid = Path.GetInvalidFileNameChars();
            StringBuilder safe = new StringBuilder(name.Length);
            foreach (char c in name)
            {
                safe.Append(Array.IndexOf(invalid, c) >= 0 ? '_' : c);
            }
            return safe.ToString();
        }

    }
}


CSDN爬虫

标签:

原文地址:http://blog.csdn.net/hutao1101175783/article/details/43953457

(0)
(0)
   
举报
评论 一句话评论(0)
登录后才能评论!
© 2014 mamicode.com 版权所有  联系我们:gaon5@hotmail.com
迷上了代码!