标签:style blog class c code ext
原文:简单爬虫-抓取博客园文章列表
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76 |
/// <summary>
/// Downloads an article-list page and scrapes the entries found in its
/// <c>post_item_body</c> blocks into <see cref="CnblogsModel"/> objects.
/// </summary>
public class HttpCnblogs
{
    // NOTE(review): the published listing used `request` without ever creating it;
    // the line holding the URL was lost. The cnblogs home page is assumed here,
    // based on the article title -- confirm against the intended target page.
    private const string ListUrl = "http://www.cnblogs.com/";

    private const RegexOptions Options =
        RegexOptions.IgnoreCase | RegexOptions.Singleline | RegexOptions.IgnorePatternWhitespace;

    // One <div class="post_item_body"> block per article; group 1 is its inner HTML
    // up to the first closing </div>.
    private static readonly Regex BodyRegex =
        new Regex(@"<div\sclass=""post_item_body"">([\s\S].*?)</div>", Options);

    // Anchor tags: article title, author name, comment link, view link.
    private static readonly Regex AnchorRegex =
        new Regex("<a[^>]*?>(.*?)</a>", Options);

    // Summary paragraph of the article.
    private static readonly Regex SummaryRegex =
        new Regex(@"<p\sclass=""post_item_summary"">(.*?)</p>", Options);

    // Pulls the number out of a counter label, e.g. "comments(10)" -> 10.
    private static readonly Regex NumberRegex =
        new Regex(@"\d+", Options);

    // Publication time in the form yyyy-MM-dd HH:mm.
    private static readonly Regex TimeRegex =
        new Regex(@"\d{4}-\d{2}-\d{2}\s\d{2}:\d{2}", Options);

    /// <summary>
    /// Issues an HTTP GET for the list page and parses the returned HTML.
    /// </summary>
    /// <returns>The parsed articles; empty when no entry is recognised.</returns>
    /// <exception cref="System.Net.WebException">The request failed.</exception>
    public static List<CnblogsModel> HttpGetHtml()
    {
        HttpWebRequest request = (HttpWebRequest)WebRequest.Create(ListUrl);
        request.Method = "GET";
        request.Accept = "text/html,application/xhtml+xml,application/xml;q=0.9,*/*;q=0.8";
        request.UserAgent = " Mozilla/5.0 (Windows NT 6.1; WOW64; rv:28.0) Gecko/20100101 Firefox/28.0";

        string articleContent;

        // Dispose response, stream and reader so the underlying connection is released.
        using (HttpWebResponse response = (HttpWebResponse)request.GetResponse())
        using (Stream stream = response.GetResponseStream())
        using (StreamReader reader = new StreamReader(stream))
        {
            articleContent = reader.ReadToEnd();
        }

        return ParseArticleList(articleContent);
    }

    /// <summary>
    /// Extracts every article entry from the HTML of a list page.
    /// </summary>
    /// <param name="html">Raw page HTML; <c>null</c> or empty yields an empty list.</param>
    /// <returns>
    /// One model per <c>post_item_body</c> block that contains at least four anchors.
    /// Blocks with fewer anchors are skipped instead of throwing.
    /// </returns>
    public static List<CnblogsModel> ParseArticleList(string html)
    {
        List<CnblogsModel> articles = new List<CnblogsModel>();
        if (string.IsNullOrEmpty(html))
        {
            return articles;
        }

        foreach (Match bodyMatch in BodyRegex.Matches(html))
        {
            string body = bodyMatch.Groups[1].Value;
            MatchCollection anchors = AnchorRegex.Matches(body);
            int anchorCount = anchors.Count;

            // Title, author, comment link and view link are all required.
            if (anchorCount < 4)
            {
                continue;
            }

            CnblogsModel model = new CnblogsModel();
            model.ArticleTitle = anchors[0].Groups[1].Value;

            // With five anchors the author is the third one, otherwise the second.
            // Presumably the extra anchor is a link inside the summary -- confirm.
            model.ArticleAutor = anchorCount == 5
                ? anchors[2].Groups[1].Value
                : anchors[1].Groups[1].Value;

            // The last two anchors carry the comment and view counters.
            model.ArticleComment = ExtractNumber(anchors[anchorCount - 2].Groups[1].Value);
            model.ArticleView = ExtractNumber(anchors[anchorCount - 1].Groups[1].Value);
            model.ArticleTime = TimeRegex.Match(body).Value;

            Match summary = SummaryRegex.Match(body);
            model.ArticleContent = summary.Success ? summary.Groups[1].Value : string.Empty;

            articles.Add(model);
        }

        return articles;
    }

    // Returns the first run of digits in text as an int, or 0 when there is none
    // or it does not fit in an Int32.
    private static int ExtractNumber(string text)
    {
        int value;
        return int.TryParse(NumberRegex.Match(text).Value, out value) ? value : 0;
    }
}

/// <summary>
/// One entry of the scraped article list.
/// </summary>
public class CnblogsModel
{
    /// <summary>
    /// Article title.
    /// </summary>
    public String ArticleTitle { get; set; }

    /// <summary>
    /// Article summary text.
    /// </summary>
    public String ArticleContent { get; set; }

    /// <summary>
    /// Article author.
    /// </summary>
    public String ArticleAutor { get; set; }

    /// <summary>
    /// Publication time, kept as the matched text (yyyy-MM-dd HH:mm).
    /// </summary>
    public String ArticleTime { get; set; }

    /// <summary>
    /// Number of comments.
    /// </summary>
    public Int32 ArticleComment { get; set; }

    /// <summary>
    /// Number of views.
    /// </summary>
    public Int32 ArticleView { get; set; }
}
写得不好,还请见谅,准备一下面试去了。
标签:style blog class c code ext
原文地址:http://www.cnblogs.com/lonelyxmas/p/3738838.html