标签:
using AnfleCrawler.Common;
using System;
using System.Collections.Generic;
using System.Linq;
using System.Text;
using System.Threading.Tasks;

namespace AnfleCrawler.DataAnalyzer
{
    /// <summary>
    /// Analyzer for Mytophome listing pages. For every row of the listing table it
    /// makes sure the referenced estate ("houses") record exists in the repository
    /// and then stores the row as a <c>HouselistingEntity</c>.
    /// </summary>
    internal class Mytophome : AnalyzerBase
    {
        // A row is read through index 4 at minimum (index 5 when a building-name
        // column is present), so anything shorter cannot be parsed.
        private const int MinListingColumns = 5;

        // Rows with exactly this many cells carry an extra building-name cell at index 2.
        private const int ColumnsWithBuildingName = 6;

        /// <summary>
        /// Processes a landed page. Only depth 0 (the listing page) is handled: the
        /// pager is registered, then each <c>.deD_ctt li</c> row is converted into a
        /// saved house listing.
        /// </summary>
        /// <param name="current">The page being analyzed.</param>
        protected override void AnalyzeInternal(PageLandEntity current)
        {
            var lander = Crawler.Lander;
            var pHandler = CreateContentHandler(current);
            switch (current.Depth)
            {
                case 0:
                    {
                        var dom = lander.GetDocument(pHandler);

                        // The pager has no id of its own, so tag the parent of the
                        // "nobr" element and hand that selector to the paging helper.
                        var nextNode = QueryNode(dom.DocumentNode, "nobr").ParentNode;
                        nextNode.SetAttributeValue("id", PagingHack);
                        DoPerPaging(current, dom.DocumentNode, string.Format("#{0}", PagingHack));

                        foreach (var node in QueryNodes(dom.DocumentNode, ".deD_ctt li"))
                        {
                            var cells = QueryNodes(node, "span").ToArray();
                            if (cells.Length < MinListingColumns)
                            {
                                // Too few cells to index safely; skip this row
                                // instead of letting an IndexOutOfRangeException
                                // abort the whole page.
                                Crawler.OutWrite("跳过列数不足的成交记录 {0}", current.Url);
                                continue;
                            }

                            var hUrl = GetHref(QueryNode(cells[1], "a"), current.Url);
                            var query = System.Web.HttpUtility.ParseQueryString(hUrl.Query);
                            string shid = query["estateId"];
                            if (string.IsNullOrEmpty(shid))
                            {
                                // Without an estate id the detail URL below would be
                                // ".../wiki//detail.html", which identifies nothing.
                                Crawler.OutWrite("跳过缺少 estateId 的成交记录 {0}", hUrl);
                                continue;
                            }
                            hUrl = new Uri(string.Format("http://{0}/wiki/{1}/detail.html", hUrl.Authority, shid));

                            Guid housesID;
                            try
                            {
                                CheckHouses(hUrl, out housesID);
                            }
                            catch (HtmlNodeMissingException ex)
                            {
                                App.LogError(ex, "OrgUrl={0} HousesUrl={1}", shid, hUrl);
                                continue;
                            }

                            var vals = cells.Select(p => p.InnerText.HtmlTrim()).ToArray();

                            // The last cell is read as the transaction date; it stays
                            // null when the text does not parse as a date.
                            DateTime? transactionDate = null;
                            DateTime dump;
                            if (DateTime.TryParse(vals.Last(), out dump))
                            {
                                transactionDate = dump;
                            }

                            Repository.SaveHouselisting(BuildListing(housesID, transactionDate, vals));
                            Crawler.OutWrite("保存小区出售记录 {0}", housesID);
                        }
                    }
                    break;
            }
        }

        /// <summary>
        /// Builds the listing entity from the trimmed cell texts of one row.
        /// </summary>
        /// <param name="housesID">Key of the estate the listing belongs to.</param>
        /// <param name="transactionDate">Parsed transaction date, or null.</param>
        /// <param name="vals">Trimmed cell texts; must hold at least five entries.</param>
        /// <returns>The populated listing, not yet saved.</returns>
        private static HouselistingEntity BuildListing(Guid housesID, DateTime? transactionDate, string[] vals)
        {
            // In the six-cell layout the building name occupies index 2 and pushes
            // area / price / unit price one position to the right.
            bool hasBuildingName = vals.Length == ColumnsWithBuildingName;
            int first = hasBuildingName ? 3 : 2;

            var listing = new HouselistingEntity()
            {
                HousesID = housesID,
                TransactionDate = transactionDate,
                Area = string.Format("{0}平方", vals[first]),
                SoldPriceOrRent = string.Format("{0}万", vals[first + 1]),
                UnitPriceOrLease = string.Format("{0}元/平方", vals[first + 2]),
            };
            if (hasBuildingName)
            {
                // Assigned only when present so the entity keeps its own default otherwise.
                listing.BuildingName = vals[2];
            }
            return listing;
        }

        /// <summary>
        /// Ensures the estate behind <paramref name="housesUrl"/> is stored in the
        /// repository, crawling its detail page only when it has not been saved yet.
        /// </summary>
        /// <param name="housesUrl">Estate detail page URL; its original string is hashed into the key.</param>
        /// <param name="housesID">Receives the key generated from the URL.</param>
        /// <remarks>
        /// The caller catches <c>HtmlNodeMissingException</c> around this call, so the
        /// page lookups here are presumably able to raise it.
        /// </remarks>
        private void CheckHouses(Uri housesUrl, out Guid housesID)
        {
            housesID = GenHashKey(housesUrl.OriginalString);
            var bo = Crawler.Repository.LoadHouses(housesID);
            if (!string.IsNullOrEmpty(bo.SiteID))
            {
                // Already saved by an earlier row: return before fetching, so the
                // detail page is not downloaded again for every listing of the estate.
                return;
            }

            var pHandler = CreateContentHandler(new PageLandEntity()
            {
                Url = housesUrl,
                Depth = DataDepth.Houses
            });
            pHandler.AjaxBlocks.Add(HACK);
            var dom = Crawler.Lander.GetDocument(pHandler);
            var attrs = new AttributeFiller();
            attrs.Append(QueryTexts(dom.DocumentNode, ".xxjs_rbar_ct li"));

            bo.SiteID = "Mytophome.com";
            bo.PageUrl = housesUrl.OriginalString;
            bo.CityName = Crawler.Config.CityName;

            // Pairs of site label and entity field name handed to the attribute filler.
            attrs.FillEntity(bo, new Dictionary<string, string>()
            {
                {"楼盘名称", "小区名称"},
                {"楼盘地址", "小区地址"},
                {"发展商", "开发商"},
                {"物管公司", "物业公司"},
                {"物管电话", "物业办公电话"},
            });
            MapMark(bo);
            Crawler.Repository.Save(bo);
            Crawler.OutWrite("保存楼盘 {0}", bo.小区名称);
        }
    }
}
标签:
原文地址:http://www.cnblogs.com/Googler/p/4272703.html