码迷,mamicode.com
首页 > Web开发 > 详细

dotNet使用HttpWebRequest模拟浏览器

时间:2015-07-30 21:20:43      阅读:221      评论:0      收藏:0      [点我收藏+]

标签:

在编写网络爬虫时,HttpWebRequest几乎可以完成绝大多数网站的抓取。为了更好地使用这一技术,我将常用的几个功能进行了封装,以方便调用。这个类已经在多个项目中得到使用,主要解决了Cookies相关的一些问题;如果有其它方面的问题可以提出来,我会进一步完善。

目前HttpHelper包含了以下几个方面:

  • GetHttpContent:通过Get或Post来获取网页的Html
  • SetCookie:根据response中头部的set-cookie对cookie进行设置,能识别httponly
  • GetAllCookies:将CookieContainer转换为键值对,方便存储和跨程序间调用
  • ConvertToCookieContainer:将键值对转换回CookieContainer供程序调用
  • BuildPostData:通过一个需要post的html构建出postdata

代码如下:

using System;
using System.Collections;
using System.Collections.Generic;
using System.Collections.Specialized;
using System.Globalization;
using System.IO;
using System.IO.Compression;
using System.Linq;
using System.Net;
using System.Net.Security;
using System.Security.Cryptography.X509Certificates;
using System.Text;
using System.Text.RegularExpressions;
using HtmlAgilityPack;

 15 namespace TNIdea.Common.Helper
 16 {
 17     public class HttpHelper
 18     {
 19         public const string CharsetReg = @"(meta.*?charset=""?(?<Charset>[^\s""‘>]+)""?)|(xml.*?encoding=""?(?<Charset>[^\s"">]+)""?)";
 20 
 21         /// <summary>
 22         /// 获取网页的内容
 23         /// </summary>
 24         /// <param name="url">Url</param>
 25         /// <param name="postData">Post的信息</param>
 26         /// <param name="cookies">Cookies</param>
 27         /// <param name="userAgent">浏览器标识</param>
 28         /// <param name="referer">来源页</param>
 29         /// <param name="cookiesDomain">Cookies的Domian参数,配合cookies使用;为空则取url的Host</param>
 30         /// <param name="encode">编码方式,用于解析html</param>
 31         /// <returns></returns>
 32         public static string GetHttpContent(string url, string postData = null, CookieContainer cookies = null, string userAgent = "", string referer = "", string cookiesDomain = "", Encoding encode = null)
 33         {
 34             try
 35             {
 36                 HttpWebResponse httpResponse = null;
 37                 if (!string.IsNullOrWhiteSpace(postData))
 38                     httpResponse = CreatePostHttpResponse(url, postData, cookies: cookies, userAgent: userAgent, referer: referer);
 39                 else
 40                     httpResponse = CreateGetHttpResponse(url, cookies: cookies, userAgent: userAgent, referer: referer);
 41 
 42                 #region 根据Html头判断
 43                 string Content = null;
 44                 //缓冲区长度
 45                 const int N_CacheLength = 10000;
 46                 //头部预读取缓冲区,字节形式
 47                 var bytes = new List<byte>();
 48                 int count = 0;
 49                 //头部预读取缓冲区,字符串
 50                 String cache = string.Empty;
 51 
 52                 //创建流对象并解码
 53                 Stream ResponseStream;
 54                 switch (httpResponse.ContentEncoding.ToUpperInvariant())
 55                 {
 56                     case "GZIP":
 57                         ResponseStream = new GZipStream(
 58                             httpResponse.GetResponseStream(), CompressionMode.Decompress);
 59                         break;
 60                     case "DEFLATE":
 61                         ResponseStream = new DeflateStream(
 62                             httpResponse.GetResponseStream(), CompressionMode.Decompress);
 63                         break;
 64                     default:
 65                         ResponseStream = httpResponse.GetResponseStream();
 66                         break;
 67                 }
 68 
 69                 try
 70                 {
 71                     while (
 72                         !(cache.EndsWith("</head>", StringComparison.OrdinalIgnoreCase)
 73                           || count >= N_CacheLength))
 74                     {
 75                         var b = (byte)ResponseStream.ReadByte();
 76                         if (b < 0) //end of stream
 77                         {
 78                             break;
 79                         }
 80                         bytes.Add(b);
 81 
 82                         count++;
 83                         cache += (char)b;
 84                     }
 85 
 86 
 87                     if (encode == null)
 88                     {
 89                         try
 90                         {
 91                             if (httpResponse.CharacterSet == "ISO-8859-1" || httpResponse.CharacterSet == "zh-cn")
 92                             {
 93                                 Match match = Regex.Match(cache, CharsetReg, RegexOptions.IgnoreCase | RegexOptions.Multiline);
 94                                 if (match.Success)
 95                                 {
 96                                     try
 97                                     {
 98                                         string charset = match.Groups["Charset"].Value;
 99                                         encode = Encoding.GetEncoding(charset);
100                                     }
101                                     catch { }
102                                 }
103                                 else
104                                     encode = Encoding.GetEncoding("GB2312");
105                             }
106                             else
107                                 encode = Encoding.GetEncoding(httpResponse.CharacterSet);
108                         }
109                         catch { }
110                     }
111 
112                     //缓冲字节重新编码,然后再把流读完
113                     var Reader = new StreamReader(ResponseStream, encode);
114                     Content = encode.GetString(bytes.ToArray(), 0, count) + Reader.ReadToEnd();
115                     Reader.Close();
116                 }
117                 catch (Exception ex)
118                 {
119                     return ex.ToString();
120                 }
121                 finally
122                 {
123                     httpResponse.Close();
124                 }
125                 #endregion 根据Html头判断
126 
127                 //获取返回的Cookies,支持httponly
128                 if (string.IsNullOrWhiteSpace(cookiesDomain))
129                     cookiesDomain = httpResponse.ResponseUri.Host;
130 
131                 cookies = new CookieContainer();
132                 CookieCollection httpHeaderCookies = SetCookie(httpResponse, cookiesDomain);
133                 cookies.Add(httpHeaderCookies ?? httpResponse.Cookies);
134 
135                 return Content;
136             }
137             catch
138             {
139                 return string.Empty;
140             }
141         }
142 
143 
144         /// <summary>
145         /// 创建GET方式的HTTP请求 
146         /// </summary>
147         /// <param name="url"></param>
148         /// <param name="timeout"></param>
149         /// <param name="userAgent"></param>
150         /// <param name="cookies"></param>
151         /// <param name="referer"></param>
152         /// <returns></returns>
153         public static HttpWebResponse CreateGetHttpResponse(string url, int timeout = 60000, string userAgent = "", CookieContainer cookies = null, string referer = "")
154         {
155             HttpWebRequest request = null;
156             if (url.StartsWith("https", StringComparison.OrdinalIgnoreCase))
157             {
158                 //对服务端证书进行有效性校验(非第三方权威机构颁发的证书,如自己生成的,不进行验证,这里返回true)
159                 ServicePointManager.ServerCertificateValidationCallback = new RemoteCertificateValidationCallback(CheckValidationResult);
160                 request = WebRequest.Create(url) as HttpWebRequest;
161                 //request.ProtocolVersion = HttpVersion.Version10;    //http版本,默认是1.1,这里设置为1.0
162             }
163             else
164             {
165                 request = WebRequest.Create(url) as HttpWebRequest;
166             }
167 
168             request.Referer = referer;
169             request.Method = "GET";
170 
171             //设置代理UserAgent和超时
172             if (string.IsNullOrWhiteSpace(userAgent))
173                 userAgent = "Mozilla/5.0 (Windows NT 6.3; WOW64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/34.0.1847.116 Safari/537.36";
174 
175             request.UserAgent = userAgent;
176             request.Timeout = timeout;
177             request.KeepAlive = true;
178             request.AllowAutoRedirect = true;
179 
180             if (cookies == null)
181                 cookies = new CookieContainer();
182             request.CookieContainer = cookies;
183 
184             return request.GetResponse() as HttpWebResponse;
185         }
186 
187         /// <summary>
188         /// 创建POST方式的HTTP请求
189         /// </summary>
190         /// <param name="url"></param>
191         /// <param name="postData"></param>
192         /// <param name="timeout"></param>
193         /// <param name="userAgent"></param>
194         /// <param name="cookies"></param>
195         /// <param name="referer"></param>
196         /// <returns></returns>
197         public static HttpWebResponse CreatePostHttpResponse(string url, string postData, int timeout = 60000, string userAgent = "", CookieContainer cookies = null, string referer = "")
198         {
199             HttpWebRequest request = null;
200             //如果是发送HTTPS请求  
201             if (url.StartsWith("https", StringComparison.OrdinalIgnoreCase))
202             {
203                 ServicePointManager.ServerCertificateValidationCallback = new RemoteCertificateValidationCallback(CheckValidationResult);
204                 request = WebRequest.Create(url) as HttpWebRequest;
205                 //request.ProtocolVersion = HttpVersion.Version10;
206             }
207             else
208             {
209                 request = WebRequest.Create(url) as HttpWebRequest;
210             }
211             request.Referer = referer;
212             request.Method = "POST";
213             request.ContentType = "application/x-www-form-urlencoded";
214 
215             //设置代理UserAgent和超时
216             if (string.IsNullOrWhiteSpace(userAgent))
217                 request.UserAgent = "Mozilla/5.0 (Windows NT 6.3; WOW64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/36.0.1985.125 Safari/537.36";
218             else
219                 request.UserAgent = userAgent;
220             request.Timeout = timeout;
221             request.KeepAlive = true;
222             request.AllowAutoRedirect = true;
223 
224             if (cookies == null)
225                 cookies = new CookieContainer();
226             request.CookieContainer = cookies;
227 
228             //发送POST数据  
229             if (!string.IsNullOrWhiteSpace(postData))
230             {
231                 byte[] data = Encoding.UTF8.GetBytes(postData);
232                 request.ContentLength = data.Length;
233                 using (Stream stream = request.GetRequestStream())
234                 {
235                     stream.Write(data, 0, data.Length);
236                 }
237             }
238             //string[] values = request.Headers.GetValues("Content-Type");
239             return request.GetResponse() as HttpWebResponse;
240         }
241 
242         /// <summary>
243         /// 验证证书
244         /// </summary>
245         /// <param name="sender"></param>
246         /// <param name="certificate"></param>
247         /// <param name="chain"></param>
248         /// <param name="errors"></param>
249         /// <returns>是否验证通过</returns>
250         private static bool CheckValidationResult(object sender, X509Certificate certificate, X509Chain chain, SslPolicyErrors errors)
251         {
252             if (errors == SslPolicyErrors.None)
253                 return true;
254             return false;
255         }
256 
257         /// <summary>
258         /// 根据response中头部的set-cookie对request中的cookie进行设置
259         /// </summary>
260         /// <param name="setCookie">The set cookie.</param>
261         /// <param name="defaultDomain">The default domain.</param>
262         /// <returns></returns>
263         private static CookieCollection SetCookie(HttpWebResponse response, string defaultDomain)
264         {
265             try
266             {
267                 string[] setCookie = response.Headers.GetValues("Set-Cookie");
268 
269                 // there is bug in it,the datetime in "set-cookie" will be sepreated in two pieces.
270                 List<string> a = new List<string>(setCookie);
271                 for (int i = setCookie.Length - 1; i > 0; i--)
272                 {
273                     if (a[i].Substring(a[i].Length - 3) == "GMT")
274                     {
275                         a[i - 1] = a[i - 1] + ", " + a[i];
276                         a.RemoveAt(i);
277                         i--;
278                     }
279                 }
280                 setCookie = a.ToArray<string>();
281                 CookieCollection cookies = new CookieCollection();
282                 foreach (string str in setCookie)
283                 {
284                     NameValueCollection hs = new NameValueCollection();
285                     foreach (string i in str.Split(;))
286                     {
287                         int index = i.IndexOf("=");
288                         if (index > 0)
289                             hs.Add(i.Substring(0, index).Trim(), i.Substring(index + 1).Trim());
290                         else
291                             switch (i)
292                             {
293                                 case "HttpOnly":
294                                     hs.Add("HttpOnly", "True");
295                                     break;
296                                 case "Secure":
297                                     hs.Add("Secure", "True");
298                                     break;
299                             }
300                     }
301                     Cookie ck = new Cookie();
302                     foreach (string Key in hs.AllKeys)
303                     {
304                         switch (Key.ToLower().Trim())
305                         {
306                             case "path":
307                                 ck.Path = hs[Key];
308                                 break;
309                             case "expires":
310                                 ck.Expires = DateTime.Parse(hs[Key]);
311                                 break;
312                             case "domain":
313                                 ck.Domain = hs[Key];
314                                 break;
315                             case "httpOnly":
316                                 ck.HttpOnly = true;
317                                 break;
318                             case "secure":
319                                 ck.Secure = true;
320                                 break;
321                             default:
322                                 ck.Name = Key;
323                                 ck.Value = hs[Key];
324                                 break;
325                         }
326                     }
327                     if (ck.Domain == "") ck.Domain = defaultDomain;
328                     if (ck.Name != "") cookies.Add(ck);
329                 }
330                 return cookies;
331             }
332             catch
333             {
334                 return null;
335             }
336         }
337 
338         /// <summary>
339         /// 遍历CookieContainer
340         /// </summary>
341         /// <param name="cookieContainer"></param>
342         /// <returns>List of cookie</returns>
343         public static Dictionary<string, string> GetAllCookies(CookieContainer cookieContainer)
344         {
345             Dictionary<string, string> cookies = new Dictionary<string, string>();
346 
347             Hashtable table = (Hashtable)cookieContainer.GetType().InvokeMember("m_domainTable",
348                 System.Reflection.BindingFlags.NonPublic | System.Reflection.BindingFlags.GetField |
349                 System.Reflection.BindingFlags.Instance, null, cookieContainer, new object[] { });
350 
351             foreach (string pathList in table.Keys)
352             {
353                 StringBuilder _cookie = new StringBuilder();
354                 SortedList cookieColList = (SortedList)table[pathList].GetType().InvokeMember("m_list",
355                     System.Reflection.BindingFlags.NonPublic | System.Reflection.BindingFlags.GetField
356                     | System.Reflection.BindingFlags.Instance, null, table[pathList], new object[] { });
357                 foreach (CookieCollection colCookies in cookieColList.Values)
358                     foreach (Cookie c in colCookies)
359                         _cookie.Append(c.Name + "=" + c.Value + ";");
360 
361                 cookies.Add(pathList, _cookie.ToString().TrimEnd(;));
362             }
363             return cookies;
364         }
365 
366         /// <summary>
367         /// convert cookies string to CookieContainer
368         /// </summary>
369         /// <param name="cookies"></param>
370         /// <returns></returns>
371         public static CookieContainer ConvertToCookieContainer(Dictionary<string, string> cookies)
372         {
373             CookieContainer cookieContainer = new CookieContainer();
374 
375             foreach (var cookie in cookies)
376             {
377                 string[] strEachCookParts = cookie.Value.Split(;);
378                 int intEachCookPartsCount = strEachCookParts.Length;
379 
380                 foreach (string strCNameAndCValue in strEachCookParts)
381                 {
382                     if (!string.IsNullOrEmpty(strCNameAndCValue))
383                     {
384                         Cookie cookTemp = new Cookie();
385                         int firstEqual = strCNameAndCValue.IndexOf("=");
386                         string firstName = strCNameAndCValue.Substring(0, firstEqual);
387                         string allValue = strCNameAndCValue.Substring(firstEqual + 1, strCNameAndCValue.Length - (firstEqual + 1));
388                         cookTemp.Name = firstName;
389                         cookTemp.Value = allValue;
390                         cookTemp.Path = "/";
391                         cookTemp.Domain = cookie.Key;
392                         cookieContainer.Add(cookTemp);
393                     }
394                 }
395             }
396             return cookieContainer;
397         }
398 
399         public static string BuildPostData(string htmlContent)
400         {
401             HtmlDocument htmlDoc = new HtmlDocument();
402             htmlDoc.LoadHtml(htmlContent);
403             //Get the form node collection.
404             HtmlNode htmlNode = htmlDoc.DocumentNode.SelectSingleNode("//form");
405             HtmlNodeCollection htmlInputs = htmlNode.SelectNodes("//input");
406 
407             StringBuilder postData = new StringBuilder();
408 
409             foreach (HtmlNode input in htmlInputs)
410             {
411                 if(input.Attributes["value"] != null)
412                     postData.Append(input.Attributes["name"].Value + "=" + input.Attributes["value"].Value + "&");
413             }
414             return postData.ToString().TrimEnd(&);
415         }
416     }
417 }

部分网站需要登录的问题我已经着手通过另一个项目来解决(imitate-login),目前还有许多网页使用了JavaScript或各种基于JS的框架来对网页进行数据加载,如何来模拟执行JavaScript暂时还没找到比较优美的解决方案,如果大家有什么好的方案可以发给我,谢谢!

 未经授权,拒绝任何全文及摘要转载!

dotNet使用HttpWebRequest模拟浏览器

标签:

原文地址:http://www.cnblogs.com/NewIdea/p/http-helper-at-csharp.html

(0)
(0)
   
举报
评论 一句话评论(0)
登录后才能评论!
© 2014 mamicode.com 版权所有  联系我们:gaon5@hotmail.com
迷上了代码!