码迷,mamicode.com
首页 > Web开发 > 详细

.net使用abot爬虫简单例子

时间:2017-07-11 17:49:50      阅读:1078      评论:0      收藏:0      [点我收藏+]

标签:exce   second   为我   tps   page   seconds   query   div   自己   

Abot 是一个开源的 .NET 爬虫框架,速度快,易于使用和扩展。项目的地址是 https://code.google.com/p/abot/ (Google Code 已停止服务,项目现托管于 GitHub:https://github.com/sjdirect/abot)

爬取的html解析,我们使用AngleSharp,项目的地址:https://github.com/AngleSharp/AngleSharp

首先我们需要配置abot

 // The URL to crawl; this example uses JD.com's product category listing page.
 private static readonly Uri FeedUrl = new Uri("https://www.jd.com/allSort.aspx");

 

 /// <summary>
 /// Builds a <c>PoliteWebCrawler</c> from a hand-written <c>CrawlConfiguration</c>,
 /// registers the crawl-decision delegates and subscribes the page event handlers.
 /// </summary>
 /// <returns>The configured crawler; the caller starts it by calling <c>Crawl</c>.</returns>
 public static IWebCrawler GetManuallyConfiguredWebCrawler()
        {
            // Crawl settings. Consult the Abot source/documentation for the exact meaning of each one.
            CrawlConfiguration config = new CrawlConfiguration();
            config.MaxConcurrentThreads = System.Environment.ProcessorCount;
            config.MaxPagesToCrawl = 1000;
            config.IsExternalPageCrawlingEnabled = false;
            config.IsUriRecrawlingEnabled = false;
            config.IsExternalPageLinksCrawlingEnabled = false;
            config.IsRespectRobotsDotTextEnabled = false;
            config.DownloadableContentTypes = "text/html, text/plain";
            config.MinCrawlDelayPerDomainMilliSeconds = 1000;
            // NOTE(review): 0 presumably means "no limit" for the two settings below - confirm against Abot docs.
            config.CrawlTimeoutSeconds = 0;
            config.MaxPagesToCrawlPerDomain = 0;

            // NOTE(review): the null arguments presumably make Abot fall back to its default
            // implementations of each collaborator - confirm for the Abot version in use.
            var crawler = new PoliteWebCrawler(config, null, null, null, null, null, null, null, null);

            // Decision delegates consulted before a page is crawled, before its content is
            // downloaded, and before its links are followed.
            crawler.ShouldCrawlPage(ShouldCrawlPage);
            crawler.ShouldDownloadPageContent(ShouldDownloadPageContent);
            crawler.ShouldCrawlPageLinks(ShouldCrawlPageLinks);

            // The crawler exposes four events; this sample only handles the first two.
            crawler.PageCrawlStartingAsync += crawler_ProcessPageCrawlStarting;        // a single page is about to be crawled
            crawler.PageCrawlCompletedAsync += crawler_ProcessPageCrawlCompletedAsync; // a single page has finished crawling
            // Not subscribed in this sample (the handlers are not defined here):
            //   crawler.PageCrawlDisallowedAsync += crawler_PageCrawlDisallowed;           // a page was not allowed to be crawled
            //   crawler.PageLinksCrawlDisallowedAsync += crawler_PageLinksCrawlDisallowed; // a page's links were not allowed to be crawled

            return crawler;
        }

爬虫中主要有4个事件:页面爬取开始、页面爬取完成、页面不允许爬取、页面中的链接不允许爬取。

以下是示例

 //单个页面爬取开始 
        /// <summary>
        /// Handler for the crawler's page-crawl-starting event (a single page is about to be crawled).
        /// </summary>
        /// <param name="sender">The object that raised the event.</param>
        /// <param name="e">Event arguments carrying the page that is about to be crawled.</param>
        public static void crawler_ProcessPageCrawlStarting(object sender, PageCrawlStartingArgs e)
        {
            // Placeholder: the upcoming page is read but nothing is done with it in this sample.
            var upcomingPage = e.PageToCrawl;
        }
        /// <summary>
        /// Handler for the crawler's page-crawl-completed event. When the completed page is
        /// <c>FeedUrl</c>, the three-level category markup is flattened to text and appended
        /// to the file "fake.txt".
        /// </summary>
        /// <param name="sender">The object that raised the event.</param>
        /// <param name="e">Event arguments carrying the crawled page.</param>
        public static void crawler_ProcessPageCrawlCompletedAsync(object sender, PageCrawlCompletedArgs e)
        {
            if (e.CrawledPage.Uri != FeedUrl)
            {
                return;
            }

            // Parse the downloaded HTML with AngleSharp. QuerySelector returns null when nothing
            // matches, so every lookup is checked instead of being dereferenced blindly.
            var document = e.CrawledPage.AngleSharpHtmlDocument;
            var container = document == null ? null : document.QuerySelector(".category-items");
            if (container == null)
            {
                return;
            }

            StringBuilder sb = new StringBuilder();
            foreach (var col in container.Children)
            {
                foreach (var category in col.QuerySelectorAll(".category-item"))
                {
                    var firstLevel = category.QuerySelector(".item-title span");
                    var secondLevelList = category.QuerySelector(".items");
                    if (firstLevel == null || secondLevelList == null)
                    {
                        continue;
                    }

                    // First-level category name on a line of its own.
                    sb.Append("\r\n" + firstLevel.Text() + "\r\n");

                    foreach (var second in secondLevelList.Children)
                    {
                        var secondLevel = second.QuerySelector("dt a");
                        if (secondLevel == null)
                        {
                            continue;
                        }

                        // Second-level name, a tab, then the comma-separated third-level names.
                        sb.Append(secondLevel.Text() + "\t");

                        var thirdLevelList = second.QuerySelector("dd");
                        if (thirdLevelList == null)
                        {
                            continue;
                        }

                        // Write the separator *between* items rather than trimming a trailing comma
                        // afterwards; trimming removed the tab when there were no third-level items.
                        bool isFirst = true;
                        foreach (var third in thirdLevelList.Children)
                        {
                            if (!isFirst)
                            {
                                sb.Append(",");
                            }

                            sb.Append(third.Text());
                            isFirst = false;
                        }
                    }
                }
            }

            // Relative path: under IIS Express the file lands in
            // C:\Program Files (x86)\IIS Express, which may require running Visual Studio as administrator.
            System.IO.File.AppendAllText("fake.txt", sb.ToString());
        }

        #region Crawl decision delegates

        /// <summary>
        /// Synchronous delegate registered with the crawler to decide whether a page should be crawled.
        /// </summary>
        /// <param name="pageToCrawl">The candidate page.</param>
        /// <param name="crawlContext">The current crawl context (not used here).</param>
        /// <returns>
        /// A decision that allows retries, the root page and <c>FeedUrl</c>; any other page is
        /// disallowed with the reason "Not match uri".
        /// </returns>
        public static CrawlDecision ShouldCrawlPage(PageToCrawl pageToCrawl, CrawlContext crawlContext)
        {
            // Only the root URL / the URL we explicitly asked for is crawled.
            if (pageToCrawl.IsRetry || pageToCrawl.IsRoot || FeedUrl == pageToCrawl.Uri)
            {
                return new CrawlDecision() { Allow = true };
            }
            else
            {
                // Allow = false means the page is not crawled.
                return new CrawlDecision { Allow = false, Reason = "Not match uri" };
            }
        }

        /// <summary>
        /// Synchronous delegate registered with the crawler to decide whether a page's content should be downloaded.
        /// </summary>
        /// <param name="pageToCrawl">The candidate page.</param>
        /// <param name="crawlContext">The current crawl context (not used here).</param>
        /// <returns>
        /// A decision that allows the root page, retries and <c>FeedUrl</c>; any other page is
        /// disallowed with the reason "Not match uri".
        /// </returns>
        private static CrawlDecision ShouldDownloadPageContent(PageToCrawl pageToCrawl, CrawlContext crawlContext)
        {
            if (pageToCrawl.IsRoot || pageToCrawl.IsRetry || FeedUrl == pageToCrawl.Uri)
            {
                return new CrawlDecision { Allow = true };
            }

            return new CrawlDecision { Allow = false, Reason = "Not match uri" };
        }

        /// <summary>
        /// Synchronous delegate registered with the crawler to decide whether the links found on a crawled page should be followed.
        /// </summary>
        /// <param name="crawledPage">The page that has just been crawled.</param>
        /// <param name="crawlContext">The current crawl context (not used here).</param>
        /// <returns>
        /// A decision that disallows external pages, allows the root page, retries and
        /// <c>FeedUrl</c>, and disallows everything else.
        /// </returns>
        private static CrawlDecision ShouldCrawlPageLinks(CrawledPage crawledPage, CrawlContext crawlContext)
        {
            if (!crawledPage.IsInternal)
            {
                return new CrawlDecision { Allow = false, Reason = "We dont crawl links of external pages" };
            }

            if (crawledPage.IsRoot || crawledPage.IsRetry || crawledPage.Uri == FeedUrl)
            {
                return new CrawlDecision { Allow = true };
            }
            else
            {
                return new CrawlDecision { Allow = false, Reason = "We only crawl links of pagination pages" };
            }
        }

        #endregion

 接下来就是测试

        /// <summary>
        /// Test action: builds the crawler, crawls <c>FeedUrl</c>, writes the result's
        /// <c>ErrorException</c> to the response and returns <c>View()</c>.
        /// </summary>
        /// <returns>The result of <c>View()</c>.</returns>
        public ActionResult Index()
        {
            IWebCrawler webCrawler = GetManuallyConfiguredWebCrawler();
            var crawlResult = webCrawler.Crawl(FeedUrl);

            // Writes the exception captured by the crawl, if any, to the response.
            Response.Write(crawlResult.ErrorException);
            return View();
        }

 

.net使用abot爬虫简单例子

标签:exce   second   为我   tps   page   seconds   query   div   自己   

原文地址:http://www.cnblogs.com/yuanxinSix/p/7151375.html

(0)
(0)
   
举报
评论 一句话评论(0
登录后才能评论!
© 2014 mamicode.com 版权所有  联系我们:gaon5@hotmail.com
迷上了代码!