标签:
HtmlAgilityPack 组件用于解析 HTML 字符串,典型的应用场景之一是网页爬虫。
示例程序
using Common.Tools;
using Datebase.Entity;
using HtmlAgilityPack;
using Http.Extension;
using ServiceStack.Orm.Extension.Imples;
using ServiceStack.Orm.Extension.Interface;
using ServiceStack.OrmLite;
using System;
using System.Collections.Generic;
using System.Configuration;
using System.Linq;
using System.Text;
using System.Text.RegularExpressions;
using System.Threading.Tasks;

namespace WebSpider
{
    /// <summary>
    /// Console crawler that downloads a "hot singers" ranking page, follows the link of
    /// every listed singer, parses the detail page with HtmlAgilityPack and inserts one
    /// <c>T_Singer</c> row per singer through <see cref="dbClient"/>.
    /// </summary>
    class Program
    {
        /// <summary>
        /// ORM client built from the "mssql" connection string of the application configuration.
        /// NOTE(review): this single instance is used from several tasks at once; whether
        /// <c>OrmClient</c> is safe for concurrent use cannot be determined from this file.
        /// </summary>
        public static IOrmClient dbClient = new OrmClient(ConfigurationManager.ConnectionStrings["mssql"].ConnectionString, SqlServerDialect.Provider);

        /// <summary>
        /// Entry point: starts the crawl tasks, blocks until all of them have finished,
        /// prints a completion message and waits for a key press.
        /// </summary>
        /// <param name="args">Command-line arguments (unused).</param>
        static void Main(string[] args)
        {
            List<Task> tasks = FetchSinger();
            Task.WaitAll(tasks.ToArray());
            Console.WriteLine("歌手信息抓取完毕!");
            Console.ReadLine();
        }

        /// <summary>
        /// Downloads the ranking page and starts one task for the top-10 list plus one task
        /// per ranking table. Each task calls <see cref="SingerDetail"/> for every singer
        /// link it finds.
        /// </summary>
        /// <returns>
        /// The started tasks; empty when the ranking page could not be downloaded or
        /// contains none of the expected nodes.
        /// </returns>
        private static List<Task> FetchSinger()
        {
            List<Task> tasks = new List<Task>();

            HttpResult result = HttpCore.Send(new HttpItem()
            {
                URL = "http://mp3.sogou.com/static_new/topsinger_remen.html",
                Method = MethodType.GET
            });

            // Nothing to parse; LoadHtml would throw on a null string.
            if (string.IsNullOrEmpty(result.Html))
            {
                return tasks;
            }

            HtmlDocument document = new HtmlDocument();
            document.LoadHtml(result.Html);
            var rootNode = document.DocumentNode;

            // Singers ranked 1 to 10.
            var top10Nodes = rootNode.SelectNodes("//div[@id='right2']/ul[@class='singerlist2']/li/a");
            if (top10Nodes != null)
            {
                Task t = new Task(nodes =>
                {
                    var singerNodes = nodes as HtmlNodeCollection;
                    if (singerNodes == null)
                    {
                        return;
                    }

                    foreach (var hrefNode in singerNodes)
                    {
                        // Link to the singer's detail page.
                        var link = hrefNode.GetAttributeValue("href", "");

                        // Rank label of the singer, e.g. "Top1".
                        var noNode = hrefNode.SelectSingleNode("./strong[@class='singertop10']");
                        if (noNode == null || string.IsNullOrEmpty(link))
                        {
                            continue;
                        }

                        int sNo = ParseSerialNumber(noNode.InnerText.Replace("Top", "").Trim());
                        SingerDetail(sNo, link);
                    }
                }, top10Nodes);
                t.Start();
                tasks.Add(t);
            }

            // Singers ranked 11 to 100.
            var tbNodes = rootNode.SelectNodes("//table[@class='indextable']");

            // SelectNodes returns null (not an empty collection) when nothing matches.
            if (tbNodes != null)
            {
                // One task per ranking table.
                foreach (var e in tbNodes)
                {
                    Task t = new Task(p =>
                    {
                        var tbNode = p as HtmlNode;
                        if (tbNode == null)
                        {
                            return;
                        }

                        var hrefNodes = tbNode.SelectNodes("./tbody/tr/td/a");
                        if (hrefNodes == null)
                        {
                            return;
                        }

                        foreach (var href in hrefNodes)
                        {
                            // The rank is read from the node two siblings before the link's
                            // parent; any step of that walk may be missing.
                            var sNo = -1;
                            var cellNode = href.ParentNode;
                            var previous = cellNode != null ? cellNode.PreviousSibling : null;
                            var trNode = previous != null ? previous.PreviousSibling : null;
                            if (trNode != null)
                            {
                                sNo = ParseSerialNumber(trNode.InnerText.Trim().TrimEnd('.'));
                            }

                            var link = href.GetAttributeValue("href", "");
                            if (!string.IsNullOrEmpty(link))
                            {
                                SingerDetail(sNo, link);
                            }
                        }
                    }, e);
                    t.Start();
                    tasks.Add(t);
                }
            }

            return tasks;
        }

        /// <summary>
        /// Parses a rank label into an integer.
        /// </summary>
        /// <param name="text">Text expected to contain only the rank digits.</param>
        /// <returns>The parsed rank, or -1 when <paramref name="text"/> is not a valid integer.</returns>
        private static int ParseSerialNumber(string text)
        {
            int value;

            // int.TryParse writes 0 to the out argument on failure, so the -1 sentinel
            // has to be returned explicitly.
            return int.TryParse(text, out value) ? value : -1;
        }

        /// <summary>
        /// Downloads a singer's detail page, extracts the personal data and inserts a
        /// <c>T_Singer</c> record. Does nothing when the page body is empty.
        /// </summary>
        /// <param name="sNo">Rank of the singer (-1 when it could not be determined).</param>
        /// <param name="link">URL of the singer's detail page.</param>
        private static void SingerDetail(int sNo, string link)
        {
            var linkResult = HttpCore.Send(new HttpItem() { URL = link, Method = MethodType.GET });
            if (string.IsNullOrEmpty(linkResult.Html))
            {
                return;
            }

            T_Singer user = new T_Singer();
            user.ID = Utility.GenerateId();
            user.SerialNumber = sNo;
            user.IsApprove = true;
            user.CreateBy = "admin";
            user.CreateDate = DateTime.Now;
            user.ModifyBy = "admin";
            user.ModifyDate = DateTime.Now;

            HtmlDocument linkDoc = new HtmlDocument();
            linkDoc.LoadHtml(linkResult.Html);

            // Name / nickname (the same text is stored in both fields).
            var name = linkDoc.DocumentNode.SelectSingleNode("//div[@class='song_tit']");
            if (name != null)
            {
                user.RealName = user.NickName = name.InnerText.Trim().Replace("<br>", System.Environment.NewLine);
            }

            // All <li> elements that carry the personal information.
            var lis = linkDoc.DocumentNode.SelectNodes("//ul[@class='song_detail']/li");

            // Nationality.
            user.Nationality = Search(lis, "国籍");

            // Birthplace.
            user.Birthplace = Search(lis, "出生地");

            // Birthday: pick the "yyyy年M月d日" fragment and rebuild it as "yyyy-M-d".
            var temp = Search(lis, "出生日期");
            var match = Regex.Match(temp, @"\d{0,4}年\d{1,2}月\d{1,2}日");
            var bir = string.Empty;

            // Regex.Match never returns null; Success is the real "found" signal.
            if (match.Success)
            {
                var birArr = match.Value.Split(new string[] { "年", "月", "日" }, StringSplitOptions.RemoveEmptyEntries);

                // The pattern yields at most three parts (year, month, day).
                bir = string.Join("-", birArr);
            }

            // Birthday is left untouched when the text is not a parsable date.
            DateTime bDay;
            if (DateTime.TryParse(bir, out bDay))
            {
                user.Birthday = bDay;
            }

            // Constellation.
            user.Constellation = Search(lis, "星座");

            // Introduction: prefer the long description, fall back to the short one.
            var selfDescNode = linkDoc.GetElementbyId("desc_long");
            selfDescNode = selfDescNode ?? linkDoc.GetElementbyId("desc_short");
            if (selfDescNode != null)
            {
                user.BriefIntroduction = selfDescNode.InnerText.Replace("<br>", "").Trim();
            }

            dbClient.Insert(user);
        }

        /// <summary>
        /// Finds the first node whose first child's text starts with <paramref name="key"/>
        /// and that has a direct <c>span</c> child, and returns that span's text.
        /// </summary>
        /// <param name="nodes">Candidate nodes; may be null.</param>
        /// <param name="key">Label the first child's trimmed text must start with.</param>
        /// <returns>
        /// The trimmed span text with "&lt;br&gt;" replaced by the platform newline, or an
        /// empty string when no node matches.
        /// </returns>
        private static string Search(HtmlNodeCollection nodes, string key)
        {
            if (nodes == null)
            {
                return string.Empty;
            }

            foreach (var node in nodes)
            {
                // A node without children has no label to compare.
                var labelNode = node.FirstChild;
                if (labelNode == null || !labelNode.InnerText.Trim().StartsWith(key, StringComparison.Ordinal))
                {
                    continue;
                }

                var spanNode = node.SelectSingleNode("./span");
                if (spanNode != null)
                {
                    return spanNode.InnerText.Trim().Replace("<br>", System.Environment.NewLine);
                }
            }

            return string.Empty;
        }
    }
}
标签:
原文地址:http://www.cnblogs.com/Jabben/p/5720431.html