// Tags:
using System;
using System.Collections.Generic;
using System.IO;
using System.Linq;
using System.Net;
using System.Text;
using GearUp.Crawler.Entities;
using HtmlAgilityPack;
using System.Threading.Tasks;
using System.Threading.Tasks.Dataflow;
using System.Text.RegularExpressions;
using System.Collections.Concurrent;
using System.Threading;

namespace GearUp.Crawler
{
    /// <summary>
    /// Crawls pages starting from a target URL using a TPL Dataflow pipeline:
    /// pages are downloaded, broadcast to a link extractor (whose output is fed
    /// back into the downloader) and to a writer that parses and saves entries.
    /// </summary>
    public class Crawler
    {
        /// <summary>Per-download timeout, in seconds.</summary>
        private const int DownloadTimeout = 10;

        // Set of URLs already handed to the downloader. Static, so it is shared
        // by every Crawler instance in the process.
        private static readonly ConcurrentDictionary<string, bool> urls = new ConcurrentDictionary<string, bool>();

        // Serializes console writes so that colour changes made by parallel
        // pipeline stages cannot interleave.
        private static readonly object consoleLock = new object();

        private readonly ILoreBookItemRepository repository;
        private readonly ILorebookItemParser parser;
        private readonly LinkManager linkManager;

        // Domain of the crawl's start URL; only links on this domain are followed.
        private string linkDomain;

        /// <summary>
        /// Creates a crawler that saves parsed items to <paramref name="repository"/>.
        /// </summary>
        /// <param name="repository">Destination for parsed items.</param>
        /// <param name="parser">Parser used to turn an HTML node into an item.</param>
        /// <param name="linkManager">Helper used to fully qualify discovered links.</param>
        public Crawler(ILoreBookItemRepository repository, ILorebookItemParser parser, LinkManager linkManager)
        {
            this.repository = repository;
            this.parser = parser;
            this.linkManager = linkManager;
        }

        /// <summary>
        /// Fire-and-forget entry point kept for existing callers. It cannot be
        /// awaited; prefer <see cref="StartCrawlAsync"/> in new code.
        /// </summary>
        /// <param name="targetUrl">URL at which the crawl starts.</param>
        public async void StartCrawl(string targetUrl)
        {
            await StartCrawlAsync(targetUrl);
        }

        /// <summary>
        /// Builds the dataflow pipeline, starts crawling at <paramref name="targetUrl"/>,
        /// blocks the calling thread until the user presses Esc, then cancels the
        /// pipeline and waits for every block to finish.
        /// </summary>
        /// <param name="targetUrl">URL at which the crawl starts.</param>
        /// <returns>A task that completes once all pipeline blocks have stopped.</returns>
        public async Task StartCrawlAsync(string targetUrl)
        {
            using (var cts = new CancellationTokenSource())
            {
                var ct = cts.Token;
                linkDomain = LinkManager.LinkDomain(targetUrl);

                // Every block receives the token. A block created without it never
                // completes after Cancel(), because the pipeline is cyclic and no
                // block is ever explicitly completed.
                var downloaderOptions = new ExecutionDataflowBlockOptions
                {
                    MaxMessagesPerTask = 3,
                    MaxDegreeOfParallelism = 4,
                    BoundedCapacity = 10,
                    CancellationToken = ct
                };
                var downloader = new TransformBlock<string, PageAndUrl>(
                    async (url) => await DownloadUrl(url),
                    downloaderOptions);

                var pipelineOptions = new ExecutionDataflowBlockOptions
                {
                    MaxMessagesPerTask = 2,
                    CancellationToken = ct
                };
                var linkParser = new TransformManyBlock<PageAndUrl, string>(
                    page => ExtactLinksFromPage(page),
                    pipelineOptions);

                var writerOptions = new ExecutionDataflowBlockOptions
                {
                    MaxDegreeOfParallelism = 4,
                    CancellationToken = ct
                };
                var writer = new ActionBlock<PageAndUrl>(
                    async page => await SaveEntry(page),
                    writerOptions);

                var contentBroadcaster = new BroadcastBlock<PageAndUrl>(
                    p => p,
                    new ExecutionDataflowBlockOptions() { CancellationToken = ct });

                // Flow setup: downloader -> broadcaster -> { linkParser, writer },
                // with linkParser feeding discovered URLs back into the downloader.
                downloader.LinkTo(contentBroadcaster);
                contentBroadcaster.LinkTo(linkParser);
                contentBroadcaster.LinkTo(writer);
                linkParser.LinkTo(downloader);

                // Kick off the TPL dataflow here.
                downloader.Post(targetUrl);

                WriteToConsole("Crawling...", ConsoleColor.Green);
                PromptUser("Press <Esc> to Stop:", ConsoleColor.White, ConsoleKey.Escape);
                cts.Cancel();
                WriteToConsole("Stopping...", ConsoleColor.Green);

                try
                {
                    await Task.WhenAll(
                        downloader.Completion,
                        contentBroadcaster.Completion,
                        linkParser.Completion,
                        writer.Completion);
                }
                catch (OperationCanceledException)
                {
                    // Expected: cancelled blocks finish in the Canceled state.
                    // Genuine block faults are not caught here and still propagate.
                }
            }
        }

        /// <summary>
        /// Collects the links found in the page's article body that belong to the
        /// same domain as the crawl's start URL.
        /// (The misspelt method name is kept for compatibility with existing callers.)
        /// </summary>
        /// <param name="page">Downloaded page; may be null.</param>
        /// <returns>Fully qualified same-domain links; empty when <paramref name="page"/> is null.</returns>
        public IEnumerable<string> ExtactLinksFromPage(PageAndUrl page)
        {
            if (page == null)
            {
                return Enumerable.Empty<string>();
            }

            var discoveredLinks = new List<string>();
            var document = new LorebookDocument(page.Html);

            foreach (var link in document.LinksInArticleBodyDiv())
            {
                var fullUrl = linkManager.FullyQualifyLink(page.Url, link);
                if (linkDomain.Equals(LinkManager.LinkDomain(fullUrl)))
                {
                    discoveredLinks.Add(fullUrl);
                }
            }

            WriteToConsole(" {0} --> {1} links", ConsoleColor.Gray, page.Url, discoveredLinks.Count);
            return discoveredLinks;
        }

        /// <summary>
        /// Parses the document's official lorebook entry node into an item.
        /// </summary>
        /// <param name="document">Document built from the page HTML.</param>
        /// <param name="url">URL the document was downloaded from.</param>
        /// <returns>Whatever the configured parser returns for the entry node.</returns>
        public LorebookItem ExtractLoreBookItem(LorebookDocument document, string url)
        {
            WriteToConsole("Parsing: {0}", ConsoleColor.Cyan, url);
            var itemDetails = document.OfficialLorebookEntry();
            return parser.ParseHtmlNode(itemDetails, url);
        }

        /// <summary>
        /// Downloads <paramref name="url"/> unless it has already been requested.
        /// </summary>
        /// <param name="url">Address to download.</param>
        /// <returns>
        /// The page and its URL, or null when the URL was already seen, the
        /// download timed out, or an error occurred (errors are logged to the console).
        /// </returns>
        public async Task<PageAndUrl> DownloadUrl(string url)
        {
            try
            {
                // TryAdd is a single atomic check-and-insert. A separate
                // ContainsKey + TryAdd lets two parallel workers both pass the
                // check and download the same URL twice.
                if (!urls.TryAdd(url, true))
                {
                    return null;
                }

                using (var client = new WebClient())
                {
                    WriteToConsole("Fetching: {0}", ConsoleColor.DarkGreen, url);

                    var download = client.DownloadStringTaskAsync(url);
                    var timeout = Task.Delay(DownloadTimeout * 1000);
                    var finished = await Task.WhenAny(download, timeout);

                    if (finished == timeout)
                    {
                        client.CancelAsync();
                        WriteToConsole("Cancel: [{0}]", ConsoleColor.Gray, url);
                        return null;
                    }

                    // The task has already completed; awaiting it (rather than
                    // reading .Result) surfaces a WebException directly instead
                    // of wrapping it in an AggregateException.
                    string result = await download;
                    WriteToConsole("Downloaded: {0}", ConsoleColor.White, url);
                    return new PageAndUrl() { Url = url, Html = result };
                }
            }
            catch (WebException ex)
            {
                WriteToConsole("Error: [{0}]\r\n\t{1}", ConsoleColor.Red, url, ex.Message);
            }
            catch (AggregateException ex)
            {
                foreach (var exc in ex.Flatten().InnerExceptions)
                {
                    WriteToConsole("Error: [{0}]\r\n\t{1}", ConsoleColor.Red, url, exc.Message);
                }
            }
            catch (Exception ex)
            {
                WriteToConsole("Unexpected error: {0}", ConsoleColor.Red, ex.Message);
            }

            return null;
        }

        /// <summary>
        /// Parses the page and, when an item is produced, saves it to the repository.
        /// </summary>
        /// <param name="page">Downloaded page; null pages are ignored.</param>
        public async Task SaveEntry(PageAndUrl page)
        {
            if (page == null)
            {
                return;
            }

            var document = new LorebookDocument(page.Html);
            var item = ExtractLoreBookItem(document, page.Url);
            if (item != null)
            {
                await repository.Save(page.Url, item);
            }
        }

        /// <summary>
        /// Writes a formatted line to the console in the given colour.
        /// </summary>
        /// <param name="format">Composite format string.</param>
        /// <param name="color">Foreground colour used for this line.</param>
        /// <param name="texts">Format arguments.</param>
        private static void WriteToConsole(string format, ConsoleColor color, params object[] texts)
        {
            // Colour change, write and reset must happen as one unit: this method
            // is called concurrently from the parallel pipeline blocks.
            lock (consoleLock)
            {
                Console.ForegroundColor = color;
                Console.WriteLine(format, texts);
                Console.ResetColor();
            }
        }

        /// <summary>
        /// Shows <paramref name="message"/> and blocks until the user responds.
        /// </summary>
        /// <param name="message">Prompt text.</param>
        /// <param name="color">Colour of the prompt text.</param>
        /// <param name="key">
        /// Key to wait for; when null, waits for a full line of input instead.
        /// </param>
        private void PromptUser(string message, ConsoleColor color, ConsoleKey? key = null)
        {
            WriteToConsole(message, color);

            if (key == null)
            {
                Console.ReadLine();
                return;
            }

            ConsoleKeyInfo entry;
            do
            {
                // intercept: true keeps the pressed key from being echoed.
                entry = Console.ReadKey(true);
            }
            while (key != entry.Key);
        }
    }
}
// Tags:
// Original article: http://www.cnblogs.com/zeroone/p/4418338.html