标签:
using AnfleCrawler.Common;
using System;
using System.Collections.Generic;
using System.Linq;
using System.Text;
using System.Threading.Tasks;

namespace AnfleCrawler.DataAnalyzer
{
    /// <summary>
    /// Analyzer for qy.58.com company pages. Depth 0 pages are treated as
    /// company listings whose links are queued at depth 1; depth 1 pages are
    /// treated as company detail pages that are parsed into a
    /// <c>CompanyEntity</c> and saved through the repository.
    /// </summary>
    internal class Qy58 : AnalyzerBase
    {
        /// <summary>
        /// Initializes the base analyzer and queues the seed listing URL at depth 0.
        /// </summary>
        /// <param name="crawler">Crawler that receives the seed URL.</param>
        public override void Init(PageCrawler crawler)
        {
            base.Init(crawler);
            var url = new Uri("http://qy.58.com/caohejing/pn1/?PGTID=14177711280840.45006677554920316&ClickID=1");
            // Example of a detail page URL:
            // http://qy.58.com/19583455460359/?PGTID=14177659184690.5166369006238447&ClickID=4
            crawler.PushUrl(url, 0);
        }

        /// <summary>
        /// Processes a landed page according to its depth: depth 0 queues the
        /// company links found on the page, depth 1 extracts the company fields
        /// and saves them. Other depths are ignored.
        /// </summary>
        /// <param name="current">The page being analyzed (URL and depth).</param>
        protected override void AnalyzeInternal(PageLandEntity current)
        {
            var lander = Crawler.Lander;
            var pHandler = CreateContentHandler(current);
            switch (current.Depth)
            {
                case 0:
                    {
                        pHandler.AjaxBlocks.Add(HACK);
                        var dom = lander.GetDocument(pHandler);
                        // Paging is delegated to the base helper using the ".next" selector.
                        DoPerPaging(current, dom.DocumentNode, ".next");
                        foreach (var node in QueryNodes(dom.DocumentNode, ".compList a"))
                        {
                            var url = GetHref(node, current.Url);
                            Crawler.PushUrl(url, 1);
                        }
                    }
                    break;
                case 1:
                    {
                        var dom = lander.GetDocument(pHandler);
                        var attr = new AttributeFiller();
                        attr.Append("Name:{0}", QueryTexts(dom.DocumentNode, ".compT").First());
                        // The first header cell is skipped, as in the original code
                        // (presumably it is not a data field - TODO confirm against the page markup).
                        foreach (var th in QueryNodes(dom.DocumentNode, ".basicMsg table th").Skip(1))
                        {
                            string sTh = th.InnerText, sTd;
                            switch (sTh)
                            {
                                case "联系电话":
                                case "邮箱":
                                    {
                                        // These values are read from an <img> in the adjacent cell,
                                        // so the image is downloaded and passed through OCR.
                                        var iNode = QueryNode(th.NextSibling, "img");
                                        byte[] imgRaw;
                                        // Dispose the WebClient as soon as the download completes.
                                        using (var client = new System.Net.WebClient())
                                        {
                                            imgRaw = client.DownloadData(GetHref(iNode, current.Url, attrName: "src"));
                                        }
                                        // The stream must stay open for the lifetime of the Bitmap,
                                        // so it is disposed only after the bitmap is.
                                        using (var stream = new System.IO.MemoryStream(imgRaw))
                                        using (var img = new System.Drawing.Bitmap(stream))
                                        {
                                            sTd = OCR(img);
                                        }
                                    }
                                    break;
                                case "公司地址":
                                    sTd = QueryTexts(th.NextSibling, "span").First();
                                    break;
                                default:
                                    sTd = th.NextSibling.InnerText.HtmlTrim();
                                    break;
                            }
                            attr.Append("{0}:{1}", sTh, sTd);
                        }

                        var bo = new CompanyEntity();
                        bo.City = "上海";
                        bo.GroupName = "漕河泾企业";
                        bo.PageUrl = current.Url.OriginalString;
                        bo.UpdateDate = DateTime.Now;
                        // Map the page's field labels to CompanyEntity property names.
                        attr.FillEntity(bo, new Dictionary<string, string>()
                        {
                            {"公司性质", "Nature"},
                            {"公司行业", "Industry"},
                            {"公司规模", "Scale"},
                            {"联系人", "ContactPerson"},
                            {"企业网址", "Website"},
                            {"联系电话", "Tel"},
                            {"邮箱", "Email"},
                            {"公司地址", "Address"},
                        });
                        Repository.SaveCompany(bo);
                        Crawler.OutWrite("保存企业 {0}", bo.Name);
                    }
                    break;
            }
        }
    }
}
标签:
原文地址:http://www.cnblogs.com/Googler/p/4211492.html