码迷,mamicode.com
首页 > Web开发 > 详细

ASP.NET抓取网页内容

时间:2014-11-05 19:13:58      阅读:249      评论:0      收藏:0      [点我收藏+]

标签:blog   http   io   ar   os   使用   sp   strong   文件   

原文:ASP.NET抓取网页内容

一、ASP.NET 使用HttpWebRequest抓取网页内容

 

这种方式抓取某些页面会失败

不过,有时候我们会发现,这个程序在抓取某些页面时,获取不到所需的内容,有时候甚至返回404的错误提示页,这是什么原因呢?

其实,很多人都忽略了一个问题,那就是服务器默认的浏览器问题。有的服务器默认的浏览器是手机浏览器,那么,当我抓取这个服务器下的网页时,就相当于用手机浏览器来打开网页,而如果要抓取的目标网页没有相应的手机网页的话,就会返回意想不到的结果,有的返回404错误提示页,有的返回403错误提示页,有的甚至跳转到其他网页去了。

 

如何解决这个问题?

要解决这个问题,其实很简单,我们只需要在程序里指定使用的浏览器即可,也就是设置UserAgent的参数值。

 

完整代码:

 
C# 代码   复制
bubuko.com,布布扣
/// <summary>
/// Method 1 (recommended): downloads a page with <see cref="HttpWebRequest"/> and returns its source text.
/// </summary>
/// <remarks>
/// The request identifies itself with an Internet Explorer User-Agent string, accepts any
/// content type, follows redirects and sends the requested URL as its own Referer.
/// The body is decoded with <see cref="Encoding.Default"/>; when the stream starts with a
/// byte order mark, the reader switches to the encoding the mark announces.
/// </remarks>
/// <param name="url">Address of the page to download.</param>
/// <returns>The page source as text.</returns>
public static string GetHtmlSource2(string url)
{
    HttpWebRequest request = (HttpWebRequest)WebRequest.Create(url);
    request.Accept = "*/*"; // accept any content type
    // Some servers answer with 403/404 or a redirect unless a browser User-Agent is sent.
    request.UserAgent = "Mozilla/4.0 (compatible; MSIE 6.0; Windows NT 5.2; .NET CLR 1.1.4322)";
    request.AllowAutoRedirect = true; // follow 302 redirects
    // Assign request.CookieContainer here if the target site requires cookies.
    request.Referer = url;

    // Dispose the response, stream and reader even when reading fails, so the
    // underlying connection is always released.
    using (HttpWebResponse response = (HttpWebResponse)request.GetResponse())
    using (Stream stream = response.GetResponseStream())
    using (StreamReader reader = new StreamReader(stream, Encoding.Default))
    {
        return reader.ReadToEnd();
    }
}
bubuko.com,布布扣

 

二、ASP.NET 使用 WebResponse 抓取网页内容

 
C# 代码   复制
bubuko.com,布布扣
/// <summary>
/// Method 2: downloads the resource at <paramref name="Url"/> with <see cref="WebRequest"/>
/// and returns its body decoded as UTF-8.
/// </summary>
/// <param name="Url">Address of the resource to download.</param>
/// <returns>
/// The response body as text, or <c>null</c> when the request itself fails
/// (for example on a timeout or an error status).
/// </returns>
public static string GetHttpData2(string Url)
{
    WebRequest request = WebRequest.Create(Url);
    request.Timeout = 50000; // milliseconds

    WebResponse response;
    try
    {
        response = request.GetResponse();
    }
    catch (WebException e)
    {
        // An error status still carries a response object; release it before giving up.
        if (e.Response != null)
        {
            e.Response.Close();
        }
        return null;
    }
    catch (Exception)
    {
        // Best effort: any other failure to obtain a response also yields null.
        return null;
    }

    // Read outside the try/catch so that read errors still reach the caller, and
    // dispose the response and reader even when reading throws.
    using (response)
    using (StreamReader reader = new StreamReader(response.GetResponseStream(), Encoding.GetEncoding("utf-8")))
    {
        return reader.ReadToEnd();
    }
}
bubuko.com,布布扣

 

三、ASP.NET 使用 WebClient 抓取网页内容

 
C# 代码   复制
bubuko.com,布布扣
/// <summary>
/// Method 3: downloads a page with <see cref="WebClient"/> and decodes it to text.
/// </summary>
/// <param name="url">Address of the page to download.</param>
/// <param name="charSets">
/// Optional: the name of the page's encoding, passed as a single element. When it is
/// omitted, <c>null</c> or <c>""</c>, the encoding is taken from the page's
/// <c>&lt;meta ... charset=...&gt;</c> declaration, falling back to UTF-8.
/// </param>
/// <returns>The decoded page text, or <c>""</c> when the download or decoding fails.</returns>
public static string getHtml(string url, params string[] charSets)
{
    try
    {
        // A null array arrives when the caller passes an explicit null; treat it as "not specified".
        string charSet = null;
        if (charSets != null && charSets.Length == 1)
        {
            charSet = charSets[0];
        }

        byte[] pageBytes;
        using (WebClient client = new WebClient())
        {
            // Credentials used to authenticate the request. If the server needs a
            // user name and password, assign a NetworkCredential instead; if it needs
            // cookies, add a "Cookie" entry to client.Headers.
            client.Credentials = CredentialCache.DefaultCredentials;
            pageBytes = client.DownloadData(url);
        }

        Encoding encoding = string.IsNullOrEmpty(charSet)
            ? DetectPageEncoding(pageBytes)
            : Encoding.GetEncoding(charSet);
        return encoding.GetString(pageBytes);
    }
    catch (Exception)
    {
        // Best effort: every failure (bad URL, network error, unknown caller-supplied
        // charset) is reported to the caller as an empty string.
        return "";
    }
}

// Picks the encoding named by the page's <meta ... charset=...> declaration, or UTF-8 when none is usable.
private static Encoding DetectPageEncoding(byte[] pageBytes)
{
    // A preliminary decode is enough to locate the declaration; the real decode happens in the caller.
    string preliminary = Encoding.Default.GetString(pageBytes);

    // Capture only the charset token itself, not the quotes or markup that follow it.
    Match declaration = Regex.Match(
        preliminary,
        @"<meta[^>]*?charset\s*=\s*[""']?\s*([\w-]+)",
        RegexOptions.IgnoreCase);
    if (!declaration.Success)
    {
        return Encoding.UTF8;
    }

    try
    {
        return Encoding.GetEncoding(declaration.Groups[1].Value);
    }
    catch (ArgumentException)
    {
        // The page declares a charset this runtime does not know.
        return Encoding.UTF8;
    }
}

ASP.NET抓取网页内容

标签:blog   http   io   ar   os   使用   sp   strong   文件   

原文地址:http://www.cnblogs.com/lonelyxmas/p/4076749.html

(0)
(0)
   
举报
评论 一句话评论(0
登录后才能评论!
© 2014 mamicode.com 版权所有  联系我们:gaon5@hotmail.com
迷上了代码!