标签:
仅做技术交流。
using System;
using System.Collections.Generic;
using System.ComponentModel;
using System.Data;
using System.Drawing;
using System.Text;
using System.Windows.Forms;
using System.IO;
using System.Text.RegularExpressions;
using DotNet.Utilities;
using System.Xml;
using System.Net;

namespace CSDNEpt
{
    /// <summary>
    /// Main form of the exporter. Given a blog home URL typed into <c>textBox1</c>, it walks
    /// every category listed on that page, downloads each article of each category and stores
    /// one XML file per article in an "article" folder next to the executable.
    /// Progress messages are inserted at the top of <c>listBox1</c>.
    /// </summary>
    public partial class Form1 : Form
    {
        public Form1()
        {
            InitializeComponent();
        }

        // ---- Regular expressions used to pick values out of the downloaded HTML ----

        /// <summary>The whole <c>div id="panel_Category"</c> block of the home page.</summary>
        public const string Category = "<div\\s*id=\"panel_Category\"\\s*class=\"panel\">[\\w\\W]*?</div>";

        /// <summary>Numeric category id taken from a link inside a category list item.</summary>
        public const string CategoryId = "(?<=<li(.*)\\s*<a(.*))\\d+(?=\"\\sonclick=(.*)\\s*</li>)";

        /// <summary>Link text (category name) of a category list item.</summary>
        public const string CategoryName = "(?<=<li(.*)\\s*<a(.*)\">)(.*)(?=</a><span>(.*)\\s*</li>)";

        /// <summary>Digits between the parentheses in the span that follows a category link.</summary>
        public const string ArticleCount = "(?<=<li(.*)\\s*<a(.*)</a><span>\\()\\d*(?=\\)</span>\\s*</li>)";

        /// <summary>Article title inside the <c>div class="article_title"</c> element.</summary>
        public const string ArticleName = "(?<=<div\\s*class=\"article_title\">[\\w\\W]*\">\\s*)(.*)(?=\\s*</a></span>\\s*(.*)\\s*</div>)";

        /// <summary>Content of the <c>span class="link_postdate"</c> element.</summary>
        public const string PostDate = "(?<=<span\\s*class=\"link_postdate\">)(.*)(?=</span>)";

        /// <summary>Content of the <c>span class="link_view"</c> element, without its trailing label.</summary>
        public const string ReadCount = "(?<=<span\\s*class=\"link_view\"\\s*title=\"阅读次数\">)(.*)(?=人阅读</span>)";

        /// <summary>Inner HTML of <c>div id="article_content"</c>, up to the first closing <c>div</c>.</summary>
        public const string ArticleContent = "(?<=<div\\s*id=\"article_content\"\\s*class=\"article_content\">)[\\w\\W]*?(?=</div>)";

        /// <summary>Digits following <c>details/</c> in a <c>span class="link_title"</c> link.</summary>
        public const string ArticleId = "(?<=<span\\s*class=\"link_title\">(.*)details/)\\d*(?=\">[\\w\\W]*?</a></span>)";

        /// <summary>
        /// Remainder of the class attribute of a <c>span class="ico ..."</c> element; compared
        /// against "ico_type_Original" to decide whether the article is original.
        /// </summary>
        public const string IsOriginal = "(?<=<span\\s*class=\"ico\\s*)(.*)(?=\"></span>)";

        /// <summary>Number of articles assumed per category listing page.</summary>
        private const int ArticlesPerPage = 20;

        /// <summary>
        /// Returns every match of <paramref name="regexStr"/> in <paramref name="matchStr"/>,
        /// in order of appearance, matching case-insensitively.
        /// </summary>
        /// <param name="regexStr">Regular expression pattern.</param>
        /// <param name="matchStr">Text to search; null or empty yields an empty list.</param>
        /// <returns>The matched substrings; never null.</returns>
        public List<string> MatchStr(string regexStr, string matchStr)
        {
            List<string> matches = new List<string>();

            // A failed download may leave us without any HTML; treat that as "no matches"
            // instead of letting Regex.Match throw on a null input.
            if (string.IsNullOrEmpty(matchStr))
            {
                return matches;
            }

            Regex reg = new Regex(regexStr, RegexOptions.IgnoreCase);
            for (Match m = reg.Match(matchStr); m.Success; m = m.NextMatch())
            {
                matches.Add(m.ToString());
            }
            return matches;
        }

        /// <summary>
        /// Starts the export. The triggering control is disabled for the duration of the run
        /// because the export pumps the message loop via Application.DoEvents, which would
        /// otherwise let a second click start a nested export.
        /// </summary>
        private void button1_Click(object sender, EventArgs e)
        {
            Control trigger = sender as Control;
            if (trigger != null)
            {
                trigger.Enabled = false;
            }

            try
            {
                ExportBlog();
            }
            finally
            {
                if (trigger != null)
                {
                    trigger.Enabled = true;
                }
            }
        }

        /// <summary>Downloads the home page, then exports every category found on it.</summary>
        private void ExportBlog()
        {
            string homeUrl = textBox1.Text.Trim();
            if (homeUrl.Length == 0)
            {
                listBox1.Items.Insert(0, "请输入博客地址!");
                return;
            }

            // Sub-URLs are built as baseUrl + "/article/...", so drop any trailing slash
            // to avoid producing "//article/...".
            string baseUrl = homeUrl.TrimEnd('/');

            string outputDir = Path.Combine(Application.StartupPath, "article");
            Directory.CreateDirectory(outputDir); // no-op when the folder already exists

            HttpHelper http = new HttpHelper();
            HttpItem item = new HttpItem();
            item.URL = homeUrl;
            item.Referer = homeUrl;
            item.ProxyIp = "ieproxy";
            item.Encoding = Encoding.GetEncoding("utf-8");

            string html = http.GetHtml(item).Html;

            string categoryHtml = FirstMatchOrEmpty(Category, html);
            if (categoryHtml.Length == 0)
            {
                listBox1.Items.Insert(0, "未找到文章分类,抓取终止。");
                return;
            }

            List<string> categoryIds = MatchStr(CategoryId, categoryHtml);
            List<string> categoryNames = MatchStr(CategoryName, categoryHtml);
            List<string> articleCounts = MatchStr(ArticleCount, categoryHtml);

            // The three lists come from independent regexes; only indexes present in all of
            // them can be paired up safely.
            int categoryTotal = Math.Min(categoryIds.Count, Math.Min(categoryNames.Count, articleCounts.Count));

            // Loop over every category and export the articles it contains.
            for (int i = 0; i < categoryTotal; i++)
            {
                listBox1.Items.Insert(0, "正在获取【" + categoryNames[i] + "】分类...");

                List<string> articleIds = CollectArticleIds(http, item, baseUrl, categoryIds[i], articleCounts[i]);
                foreach (string articleId in articleIds)
                {
                    ExportArticle(http, item, baseUrl, articleId, outputDir);
                    Application.DoEvents(); // keep the UI responsive between articles
                }

                listBox1.Items.Insert(0, "【" + categoryNames[i] + "】分类获取完毕!");
            }
        }

        /// <summary>
        /// Walks the listing pages of one category and returns the article ids found on them.
        /// </summary>
        private List<string> CollectArticleIds(HttpHelper http, HttpItem item, string baseUrl, string categoryId, string articleCountText)
        {
            List<string> articleIds = new List<string>();

            // The count pattern uses \d*, so the captured text can be empty; treat anything
            // unparsable as "no articles" rather than throwing.
            int count;
            if (!int.TryParse(articleCountText, out count) || count <= 0)
            {
                return articleIds;
            }

            int pages = (count + ArticlesPerPage - 1) / ArticlesPerPage; // ceiling division
            for (int page = 1; page <= pages; page++)
            {
                item.URL = baseUrl + "/article/category/" + categoryId + "/" + page;
                string pageHtml = http.GetHtml(item).Html;

                foreach (string id in MatchStr(ArticleId, pageHtml))
                {
                    // The id pattern uses \d* and may yield an empty match; such an id
                    // cannot be turned into a details URL.
                    if (id.Length != 0)
                    {
                        articleIds.Add(id);
                    }
                }
            }
            return articleIds;
        }

        /// <summary>
        /// Downloads one article page, extracts its fields and saves them as
        /// "&lt;title&gt;.xml" inside <paramref name="outputDir"/>. Fields whose pattern does
        /// not match are written as empty elements.
        /// </summary>
        private void ExportArticle(HttpHelper http, HttpItem item, string baseUrl, string articleId, string outputDir)
        {
            string articleUrl = baseUrl + "/article/details/" + articleId;
            item.URL = articleUrl;
            string articleHtml = http.GetHtml(item).Html;

            string articleNameText = FirstMatchOrEmpty(ArticleName, articleHtml).Trim().Replace("\r", "");
            string postDateText = FirstMatchOrEmpty(PostDate, articleHtml).Trim();
            string readCountText = FirstMatchOrEmpty(ReadCount, articleHtml).Trim();
            string articleContentText = FirstMatchOrEmpty(ArticleContent, articleHtml);
            string isOriginalText = FirstMatchOrEmpty(IsOriginal, articleHtml).Trim();

            // When the title could not be extracted, fall back to the article id so that the
            // progress message and the file name are still meaningful.
            string displayName = articleNameText.Length == 0 ? articleId : articleNameText;
            listBox1.Items.Insert(0, "正在抓取【" + displayName + "】文章...");

            // Build the XML document that stores the article.
            XmlDocument xml = new XmlDocument();
            XmlDeclaration xmldecl = xml.CreateXmlDeclaration("1.0", "gb2312", null);
            XmlElement root = xml.CreateElement("Article");

            AppendTextElement(xml, root, "Name", articleNameText);
            AppendTextElement(xml, root, "URL", articleUrl);
            AppendTextElement(xml, root, "IsOriginal", isOriginalText == "ico_type_Original" ? "Y" : "N");
            AppendTextElement(xml, root, "PostDate", postDateText);
            AppendTextElement(xml, root, "ReadCount", readCountText);
            AppendTextElement(xml, root, "ArticleContent", articleContentText);

            xml.AppendChild(xmldecl);
            xml.AppendChild(root);

            // Titles routinely contain characters such as ':' or '?' that are not allowed in
            // file names, so sanitize before saving.
            string fileName = ToSafeFileName(displayName, articleId) + ".xml";
            xml.Save(Path.Combine(outputDir, fileName));

            listBox1.Items.Insert(0, "【" + displayName + "】文章抓取成功!");
        }

        /// <summary>Returns the first match of the pattern in the text, or "" when there is none.</summary>
        private string FirstMatchOrEmpty(string regexStr, string text)
        {
            List<string> matches = MatchStr(regexStr, text);
            return matches.Count == 0 ? string.Empty : matches[0];
        }

        /// <summary>Creates a child element holding the given text and appends it to the parent.</summary>
        private static void AppendTextElement(XmlDocument xml, XmlElement parent, string elementName, string text)
        {
            XmlElement element = xml.CreateElement(elementName);
            element.InnerText = text;
            parent.AppendChild(element);
        }

        /// <summary>
        /// Replaces every character that is invalid in a file name with '_'. Returns
        /// <paramref name="fallback"/> when nothing usable remains.
        /// </summary>
        private static string ToSafeFileName(string name, string fallback)
        {
            char[] invalidChars = Path.GetInvalidFileNameChars();
            StringBuilder safe = new StringBuilder(name.Length);
            foreach (char c in name)
            {
                safe.Append(Array.IndexOf(invalidChars, c) >= 0 ? '_' : c);
            }

            string result = safe.ToString().Trim();
            return result.Length == 0 ? fallback : result;
        }
    }
}
标签:
原文地址:http://blog.csdn.net/hutao1101175783/article/details/43953457