码迷,mamicode.com
首页 > Web开发 > 详细

HtmlAgilityPack组件

时间:2016-07-30 11:49:25      阅读:215      评论:0      收藏:0      [点我收藏+]

标签:

HtmlAgilityPack 组件用于解析 HTML 字符串,一个典型的应用场景是网页爬虫。

示例程序

技术分享
using Common.Tools;
using Datebase.Entity;
using HtmlAgilityPack;
using Http.Extension;
using ServiceStack.Orm.Extension.Imples;
using ServiceStack.Orm.Extension.Interface;
using ServiceStack.OrmLite;
using System;
using System.Collections.Generic;
using System.Configuration;
using System.Linq;
using System.Text;
using System.Text.RegularExpressions;
using System.Threading.Tasks;

namespace WebSpider
{
    class Program
    {
        /// <summary>
        /// ORM client shared by the whole program, created once from the "mssql" connection
        /// string in the application configuration using the SQL Server dialect provider.
        /// </summary>
        // NOTE(review): SingerDetail calls Insert on this single instance from several tasks at
        // once — confirm that OrmClient is safe for concurrent use.
        public static IOrmClient dbClient = new OrmClient(ConfigurationManager.ConnectionStrings["mssql"].ConnectionString, SqlServerDialect.Provider);
        /// <summary>
        /// Program entry point: starts the singer-scraping tasks, waits for all of them to
        /// complete, prints a completion message and then waits for a line of console input.
        /// </summary>
        /// <param name="args">Command-line arguments (not used).</param>
        static void Main(string[] args)
        {
            // FetchSinger hands back tasks that are already running; block until every one ends.
            Task[] pending = FetchSinger().ToArray();
            Task.WaitAll(pending);

            Console.WriteLine("歌手信息抓取完毕!");

            // Keep the console window open until the user presses Enter.
            Console.ReadLine();
        }

        /// <summary>
        /// Web crawler entry point: downloads the "hot singers" ranking page and starts
        /// background tasks that visit the detail page of every singer found on it.
        /// </summary>
        /// <remarks>
        /// One task processes the links matched for ranks 1-10; one further task is started for
        /// each ranking table matched for ranks 11-100. All returned tasks are already started.
        /// </remarks>
        /// <returns>
        /// The started tasks. The list is empty when the ranking page has no body or when none
        /// of the XPath queries matched.
        /// </returns>
        private static List<Task> FetchSinger()
        {
            List<Task> tasks = new List<Task>();
            HttpResult result = HttpCore.Send(new HttpItem()
            {
                URL = "http://mp3.sogou.com/static_new/topsinger_remen.html",
                Method = MethodType.GET
            });
            // Nothing to parse without a response body (same guard SingerDetail applies).
            if (string.IsNullOrEmpty(result.Html))
            {
                return tasks;
            }

            HtmlDocument document = new HtmlDocument();
            document.LoadHtml(result.Html);
            var rootNode = document.DocumentNode;

            // Singers ranked 1 to 10.
            var top10Nodes = rootNode.SelectNodes("//div[@id='right2']/ul[@class='singerlist2']/li/a");
            if (top10Nodes != null)
            {
                Task t = new Task(nodes =>
                {
                    var singerNodes = nodes as HtmlNodeCollection;
                    if (singerNodes == null)
                    {
                        return;
                    }
                    foreach (var hrefNode in singerNodes)
                    {
                        // Link to the singer's detail page.
                        var link = hrefNode.GetAttributeValue("href", "");
                        // Element holding the rank label, e.g. "Top1".
                        var noNode = hrefNode.SelectSingleNode("./strong[@class='singertop10']");
                        // Entries without a rank label or without a link are skipped.
                        if (noNode == null || string.IsNullOrEmpty(link))
                        {
                            continue;
                        }
                        int sNo = ParseSerialNumber(noNode.InnerText.Replace("Top", "").Trim());
                        SingerDetail(sNo, link);
                    }
                }, top10Nodes);
                t.Start();
                tasks.Add(t);
            }

            // Singers ranked 11 to 100: one task per matched table.
            var tbNodes = rootNode.SelectNodes("//table[@class='indextable']");
            // SelectNodes yields null rather than an empty collection when nothing matches.
            if (tbNodes == null)
            {
                return tasks;
            }
            foreach (var e in tbNodes)
            {
                Task t = new Task(p =>
                {
                    var tbNode = p as HtmlNode;
                    if (tbNode == null)
                    {
                        return;
                    }
                    var hrefNodes = tbNode.SelectNodes("./tbody/tr/td/a");
                    if (hrefNodes == null)
                    {
                        return;
                    }
                    foreach (var href in hrefNodes)
                    {
                        // The rank is read from the node two siblings before the link's parent
                        // cell; it stays -1 when that node is missing or not numeric.
                        var sNo = -1;
                        var cell = href.ParentNode;
                        var previous = cell != null ? cell.PreviousSibling : null;
                        var rankNode = previous != null ? previous.PreviousSibling : null;
                        if (rankNode != null)
                        {
                            sNo = ParseSerialNumber(rankNode.InnerText.Trim().TrimEnd('.'));
                        }
                        var link = href.GetAttributeValue("href", "");
                        if (!string.IsNullOrEmpty(link))
                        {
                            SingerDetail(sNo, link);
                        }
                    }
                }, e);
                t.Start();
                tasks.Add(t);
            }
            return tasks;
        }

        /// <summary>
        /// Converts a rank label to a number.
        /// </summary>
        /// <param name="text">Label text with any prefix/suffix already stripped.</param>
        /// <returns>The parsed rank, or -1 when the text is not a valid integer.</returns>
        private static int ParseSerialNumber(string text)
        {
            // int.TryParse writes 0 to its out argument on failure, so the -1 sentinel has to
            // be applied explicitly instead of relying on the variable's initial value.
            int number;
            return int.TryParse(text, out number) ? number : -1;
        }

        /// <summary>
        /// Downloads a singer's detail page, extracts the profile fields from it and inserts
        /// the resulting record through <c>dbClient</c>.
        /// </summary>
        /// <remarks>
        /// Nothing is inserted when the link is empty or the page comes back without a body.
        /// The birthday is only set when a complete "year年month月day日" date is found.
        /// </remarks>
        /// <param name="sNo">Rank (serial number) of the singer on the ranking page.</param>
        /// <param name="link">URL of the singer's detail page.</param>
        private static void SingerDetail(int sNo, string link)
        {
            if (string.IsNullOrEmpty(link))
            {
                return;
            }
            var linkResult = HttpCore.Send(new HttpItem()
            {
                URL = link,
                Method = MethodType.GET
            });
            if (string.IsNullOrEmpty(linkResult.Html))
            {
                return;
            }

            // Read the clock once so the creation and modification stamps are identical.
            DateTime now = DateTime.Now;
            T_Singer user = new T_Singer();
            user.ID = Utility.GenerateId();
            user.SerialNumber = sNo;
            user.IsApprove = true;
            user.CreateBy = "admin";
            user.CreateDate = now;
            user.ModifyBy = "admin";
            user.ModifyDate = now;

            HtmlDocument linkDoc = new HtmlDocument();
            linkDoc.LoadHtml(linkResult.Html);

            // Name / nickname (the page offers a single title, used for both).
            var name = linkDoc.DocumentNode.SelectSingleNode("//div[@class='song_tit']");
            if (name != null)
            {
                user.RealName = user.NickName = name.InnerText.Trim().Replace("<br>", System.Environment.NewLine);
            }

            // All <li> elements carrying the personal details.
            var lis = linkDoc.DocumentNode.SelectNodes("//ul[@class='song_detail']/li");
            // Nationality.
            user.Nationality = Search(lis, "国籍");
            // Birthplace.
            user.Birthplace = Search(lis, "出生地");

            // Date of birth, written on the page as e.g. "1990年1月2日".
            var match = Regex.Match(Search(lis, "出生日期"), @"\d{0,4}年\d{1,2}月\d{1,2}日");
            if (match.Success)
            {
                var parts = match.Value.Split(new string[] { "年", "月", "日" }, StringSplitOptions.RemoveEmptyEntries);
                // The pattern tolerates a missing year; require all three parts so that a bare
                // month/day is never parsed into a date in some default year.
                DateTime bDay;
                if (parts.Length == 3 &&
                    DateTime.TryParse(
                        string.Join("-", parts),
                        System.Globalization.CultureInfo.InvariantCulture,
                        System.Globalization.DateTimeStyles.None,
                        out bDay))
                {
                    user.Birthday = bDay;
                }
            }

            // Constellation (star sign).
            user.Constellation = Search(lis, "星座");

            // Biography: prefer the long description, fall back to the short one.
            var selfDescNode = linkDoc.GetElementbyId("desc_long") ?? linkDoc.GetElementbyId("desc_short");
            if (selfDescNode != null)
            {
                user.BriefIntroduction = selfDescNode.InnerText.Replace("<br>", "").Trim();
            }

            // NOTE(review): this runs on several tasks concurrently against the shared
            // dbClient — confirm the client tolerates parallel inserts.
            dbClient.Insert(user);
        }

        /// <summary>
        /// Finds the first node whose first child's text starts with <paramref name="key"/> and
        /// that has a direct <c>span</c> child, and returns that span's text.
        /// </summary>
        /// <param name="nodes">Nodes to scan; may be null.</param>
        /// <param name="key">Label prefix to look for, e.g. "国籍".</param>
        /// <returns>
        /// The trimmed span text with "&lt;br&gt;" replaced by a newline, or an empty string when
        /// <paramref name="nodes"/> is null or no node qualifies.
        /// </returns>
        private static string Search(HtmlNodeCollection nodes, string key)
        {
            if (nodes == null)
            {
                return string.Empty;
            }

            foreach (var node in nodes)
            {
                // A node without children has no label to compare; skip it instead of crashing.
                var label = node.FirstChild;
                if (label == null)
                {
                    continue;
                }
                // Ordinal comparison: the key is a fixed label, not culture-dependent text.
                if (!label.InnerText.Trim().StartsWith(key, StringComparison.Ordinal))
                {
                    continue;
                }
                // A matching label without a span does not end the search; later nodes are tried.
                var spanNode = node.SelectSingleNode("./span");
                if (spanNode != null)
                {
                    return spanNode.InnerText.Trim().Replace("<br>", System.Environment.NewLine);
                }
            }
            return string.Empty;
        }
    }
}
View Code

 

HtmlAgilityPack组件

标签:

原文地址:http://www.cnblogs.com/Jabben/p/5720431.html

(0)
(0)
   
举报
评论 一句话评论(0)
登录后才能评论!
© 2014 mamicode.com 版权所有  联系我们:gaon5@hotmail.com
迷上了代码!