码迷,mamicode.com
首页 > 其他好文 > 详细

Soufun_News

时间:2014-12-24 09:58:16      阅读:129      评论:0      收藏:0      [点我收藏+]

标签:

using AnfleCrawler.Common;
using System;
using System.Collections.Generic;
using System.ComponentModel;
using System.IO;
using System.Linq;
using System.Net;
using System.Text;
using System.Threading.Tasks;

namespace AnfleCrawler.DataAnalyzer
{
    /// <summary>
    /// Analyzer for the Soufun Shanghai news site (news.sh.soufun.com).
    /// Depth 0 pages are category listing pages from which article links are
    /// collected; depth 1 pages are the articles themselves, which are parsed
    /// and handed to the repository's <c>SaveNews</c>.
    /// </summary>
    internal class Soufun_News : AnalyzerBase
    {
        /// <summary>
        /// News categories to crawl. The numeric value of each member is
        /// substituted into the listing URL pattern built in <see cref="Init"/>.
        /// </summary>
        private enum Kind
        {
            // "市场" = Market
            [Description("市场")]
            Market = 32,
            // "政策" = Policy
            [Description("政策")]
            Policy = 35,
            // "公司" = Company
            [Description("公司")]
            Company = 736,
        }

        /// <summary>
        /// Tag names whose nodes are removed from the article body before it is stored.
        /// </summary>
        private static readonly string[] FilterTags = new string[] { "script", "iframe" };

        /// <summary>
        /// Seeds the crawler with the listing-page URL pattern (depth 0) and
        /// then runs the base initialization.
        /// </summary>
        /// <param name="crawler">Crawler that receives the seed URL pattern.</param>
        public override void Init(PageCrawler crawler)
        {
            // Produces ".../more/[32,35,736]/[1-50].html".
            // NOTE(review): presumably StringPatternGenerator expands the bracketed
            // parts into every category id and page number 1..50 - confirm.
            string kindIds = string.Join(",", Enum.GetValues(typeof(Kind)).Cast<int>());
            string exp = string.Format("http://news.sh.soufun.com/more/[{0}]/[1-50].html", kindIds);
            crawler.PushUrl(new StringPatternGenerator(exp), 0);
            base.Init(crawler);
        }

        /// <summary>
        /// Processes one landed page according to its crawl depth:
        /// depth 0 queues the "_all" variant of every article link found on a
        /// listing page; depth 1 extracts category, title, body, publish date
        /// and source from an article page and saves them.
        /// </summary>
        /// <param name="current">The page being analyzed.</param>
        protected override void AnalyzeInternal(PageLandEntity current)
        {
            var lander = Crawler.Lander;
            // Late-bound because SaveNews is resolved at runtime on the repository.
            dynamic repository = Repository;
            var pHandler = CreateContentHandler(current);
            switch (current.Depth)
            {
                case 0:
                    {
                        var dom = lander.GetDocument(pHandler);
                        foreach (var node in QueryNodes(dom.DocumentNode, ".contenttext"))
                        {
                            var linkNode = QueryNode(node, "a.link_01");
                            string url = GetHref(linkNode, current.Url).OriginalString;
                            // Ordinal char search; "_all" is inserted right before the
                            // last '.' (e.g. "123.html" -> "123_all.html").
                            int i = url.LastIndexOf('.');
                            if (i < 0)
                            {
                                // No '.' to anchor the suffix on: Insert(-1, ...) would
                                // throw and abort the remaining links of this page.
                                Crawler.OutWrite("跳过链接 {0}", url);
                                continue;
                            }
                            Crawler.PushUrl(new Uri(url.Insert(i, "_all")), 1);
                        }
                    }
                    break;
                case 1:
                    {
                        var dom = lander.GetDocument(pHandler);
                        var hackNode = QueryNode(dom.DocumentNode, "#newxq_B01_26");
                        // The category name is taken from the last link inside #newxq_B01_26.
                        string kind = QueryNodes(hackNode, "a").Last().InnerText;
                        string title = QueryNode(dom.DocumentNode, "h1").InnerText;
                        var contentNode = QueryNode(dom.DocumentNode, "#news_body");
                        foreach (string tag in FilterTags)
                        {
                            // Snapshot with ToArray() so nodes can be removed while iterating.
                            foreach (var node in QueryNodes(contentNode, tag, false).ToArray())
                            {
                                node.Remove();
                            }
                        }
                        // First span is parsed as the publish date, the optional second is the source.
                        var set = QueryNodes(dom.DocumentNode, "#newxq_B01_27 span").Take(2).ToArray();
                        string source = null;
                        DateTime publishDate = default(DateTime);
                        if (set.Length > 0)
                        {
                            // On parse failure publishDate stays DateTime.MinValue, as before.
                            DateTime.TryParse(set[0].InnerText, out publishDate);
                        }
                        if (set.Length == 2)
                        {
                            source = set[1].InnerText;
                        }
                        repository.SaveNews(current.Url, kind, source, title, contentNode.InnerHtml, publishDate);
                        Crawler.OutWrite("保存新闻 {0}", title);
                    }
                    break;
            }
        }
    }
}

 

 

        /// <summary>
        /// Inserts or updates the news row for a page. The row key is derived
        /// from the page URL via <c>CryptoManaged.MD5Hash</c>; when no row with
        /// that key exists a new one is added with <c>SiteID</c> set to the
        /// URL's authority. All remaining fields are overwritten on every call.
        /// </summary>
        /// <param name="pageUrl">URL of the article page; its original string is hashed into the row key.</param>
        /// <param name="kind">Category text stored in <c>Kind</c>.</param>
        /// <param name="source">Source text stored in <c>Source</c>.</param>
        /// <param name="title">Article title.</param>
        /// <param name="content">Article body stored in <c>Content</c>.</param>
        /// <param name="publishDate">Publish date stored in <c>PublishDate</c>.</param>
        public void SaveNews(Uri pageUrl, string kind, string source, string title, string content, DateTime publishDate)
        {
            Guid key = CryptoManaged.MD5Hash(pageUrl.OriginalString);
            using (var context = Create())
            {
                var entity = context.News
                    .Where(n => n.RowID == key)
                    .SingleOrDefault();
                if (entity == null)
                {
                    // First time this URL is seen: create the row, then register it.
                    entity = new News();
                    entity.RowID = key;
                    entity.SiteID = pageUrl.Authority;
                    context.News.Add(entity);
                }

                // Refresh the mutable fields whether the row is new or existing.
                entity.Kind = kind;
                entity.Source = source;
                entity.Title = title;
                entity.Content = content;
                entity.PublishDate = publishDate;
                context._SaveChanges();
            }
        }

 

Soufun_News

标签:

原文地址:http://www.cnblogs.com/Googler/p/4181664.html

(0)
(0)
   
举报
评论 一句话评论(0)
登录后才能评论!
© 2014 mamicode.com 版权所有  联系我们:gaon5@hotmail.com
迷上了代码!