标签:
using System;
using System.Collections.Generic;
using System.IO;
using System.Linq;
using System.Net;
using System.Text;
using System.Text.RegularExpressions;
using System.Threading.Tasks;

namespace MeiZi
{
    /// <summary>
    /// Console entry point: constructing <see cref="GetMeiziPic"/> runs the whole crawl.
    /// </summary>
    public class Program
    {
        static void Main(string[] args)
        {
            new GetMeiziPic();
        }
    }

    /// <summary>
    /// Crawls the listing pages of www.sepaidui.com, follows every article link found in an
    /// h2 heading and saves the Sina-hosted images of each article into an "Images" folder
    /// under the current directory.
    /// </summary>
    public class GetMeiziPic
    {
        /// <summary>Directory the downloaded images are written to.</summary>
        private readonly string _path;

        // Regular expression for image tags; group 1 captures the value of the src attribute.
        // Accepts double-quoted, single-quoted and unquoted attribute values.
        private const string ImgRegex = @"<img[^>]*?src\s*=\s*[""']?([^'"" >]+?)[ '""][^>]*?>";

        // Matches an anchor that is the only child of an h2 heading (one article link).
        private const string LinkRegex = @"<h2><a\s+[^>]*?>[^<>]*?<\/a></h2>";

        // Listing page address; the page number is appended to it.
        private const string ListPageUrl = "http://www.sepaidui.com/?sort=4&page=";

        // Image id that is explicitly skipped even though it is hosted on sinaimg.
        private const string ExcludedImageId = "60d02b59tw1eq6g7srmiwj20pv03mdg8";

        /// <summary>
        /// Prepares the output directory and immediately crawls listing pages 1 through 9,
        /// reporting progress on the console.
        /// </summary>
        public GetMeiziPic()
        {
            _path = DealDir(Path.Combine(Environment.CurrentDirectory, "Images"));
            Console.WriteLine("=============== 开始采集 ===============");
            for (var i = 1; i < 10; i++)
            {
                Console.WriteLine("===============正在下载第{0}页数据===============", i);
                DoFetchStep1(i);
            }
            Console.WriteLine("=============== 采集完成 ===============");
        }

        /// <summary>Creates <paramref name="path"/> if it does not exist yet.</summary>
        /// <param name="path">Directory to ensure.</param>
        /// <returns>The same <paramref name="path"/>, for convenient chaining.</returns>
        private string DealDir(string path)
        {
            if (!Directory.Exists(path))
            {
                Directory.CreateDirectory(path);
            }
            return path;
        }

        /// <summary>
        /// Step 1: downloads one listing page and processes the article links it contains.
        /// </summary>
        /// <param name="pageNum">Number of the listing page to download.</param>
        private void DoFetchStep1(int pageNum)
        {
            var html = DownloadHtml(ListPageUrl + pageNum);
            if (html != null)
            {
                FetchLinksFromSource1(html);
            }
        }

        /// <summary>
        /// Extracts the article links from a listing page and crawls each of them.
        /// </summary>
        /// <param name="htmlSource">HTML of a listing page.</param>
        private void FetchLinksFromSource1(string htmlSource)
        {
            var matchesLink = Regex.Matches(htmlSource, LinkRegex, RegexOptions.IgnoreCase | RegexOptions.Singleline);
            foreach (Match m in matchesLink)
            {
                // The link target is taken from the first double-quoted attribute of the anchor.
                var parts = m.Groups[0].Value.Split('"');
                if (parts.Length < 2)
                {
                    continue; // anchor without a double-quoted attribute: nothing to follow
                }

                // Only absolute http/https addresses can be requested through HttpWebRequest.
                Uri pageUri;
                if (!Uri.TryCreate(parts[1], UriKind.Absolute, out pageUri)
                    || (pageUri.Scheme != Uri.UriSchemeHttp && pageUri.Scheme != Uri.UriSchemeHttps))
                {
                    continue;
                }

                DoFetchStep2(parts[1]);
            }
        }

        /// <summary>
        /// Step 2: downloads one article page and saves the images it references.
        /// </summary>
        /// <param name="href">Absolute address of the article page.</param>
        private void DoFetchStep2(string href)
        {
            var html = DownloadHtml(href);
            if (html != null)
            {
                FetchLinksFromSource2(html);
            }
        }

        /// <summary>
        /// Downloads <paramref name="url"/> and returns the response body as text.
        /// </summary>
        /// <param name="url">Absolute http/https address to request.</param>
        /// <returns>
        /// The response body, or <c>null</c> when the status is not 200 OK, there is no
        /// response stream, or the request fails (the failure message is written to the console).
        /// </returns>
        private static string DownloadHtml(string url)
        {
            try
            {
                var request = (HttpWebRequest)WebRequest.Create(url);
                request.Credentials = CredentialCache.DefaultCredentials;

                // Dispose the response on every path so its connection is released;
                // leaked responses can exhaust the per-host connection pool.
                using (var response = (HttpWebResponse)request.GetResponse())
                {
                    if (response.StatusCode != HttpStatusCode.OK)
                    {
                        return null;
                    }

                    using (var stream = response.GetResponseStream())
                    {
                        if (stream == null)
                        {
                            return null;
                        }

                        using (var reader = new StreamReader(stream))
                        {
                            return reader.ReadToEnd();
                        }
                    }
                }
            }
            catch (WebException ex)
            {
                // One unreachable or failing page must not abort the whole crawl.
                Console.WriteLine(ex.Message);
                return null;
            }
            catch (IOException ex)
            {
                // The connection broke while the body was being read.
                Console.WriteLine(ex.Message);
                return null;
            }
        }

        /// <summary>
        /// Finds the image tags in an article page and downloads the accepted ones
        /// into the output directory under random file names.
        /// </summary>
        /// <param name="htmlSource">HTML of an article page.</param>
        private void FetchLinksFromSource2(string htmlSource)
        {
            var matchesImgSrc = Regex.Matches(htmlSource, ImgRegex, RegexOptions.IgnoreCase | RegexOptions.Singleline);

            // One client serves all sequential downloads of this page.
            using (var myWebClient = new WebClient())
            {
                foreach (Match m in matchesImgSrc)
                {
                    var href = m.Groups[1].Value;

                    // Only pick images from the Sina photo album host, and skip the excluded image.
                    if (!href.Contains("sinaimg") || !CheckIsUrlFormat(href) || href.Contains(ExcludedImageId))
                    {
                        continue;
                    }

                    Console.WriteLine(href);
                    try
                    {
                        var imageUri = new Uri(href);
                        // Take the extension from the URL path only, so a query string or
                        // fragment never ends up in the local file name.
                        var fileName = Path.GetRandomFileName() + Path.GetExtension(imageUri.AbsolutePath);
                        myWebClient.DownloadFile(imageUri, Path.Combine(_path, fileName));
                    }
                    catch (Exception ex)
                    {
                        // Deliberate best effort: report the failure and go on with the next image.
                        Console.WriteLine(ex.Message);
                    }
                }
            }
        }

        // Loose check that a string contains something shaped like an http address.
        private readonly Regex _isUrlFormat = new Regex(@"http://?([\w-]+\.)+[\w-]+(/[\w- ./?%&=]*)?");

        /// <summary>Tells whether <paramref name="value"/> contains an http-style address.</summary>
        /// <param name="value">Text to test.</param>
        /// <returns><c>true</c> when the URL pattern matches anywhere in <paramref name="value"/>.</returns>
        private bool CheckIsUrlFormat(string value)
        {
            return _isUrlFormat.IsMatch(value);
        }
    }
}
标签:
原文地址:http://www.cnblogs.com/talentzemin/p/4355035.html