标签:
/// <summary>
/// Matches the charset name declared by an HTML meta tag. Handles both the HTML5 form
/// (&lt;meta charset="utf-8"&gt;) and the http-equiv form
/// (&lt;meta http-equiv="Content-Type" content="text/html; charset=utf-8"&gt;), with
/// double quotes, single quotes or no quotes. Group 1 is the charset name.
/// </summary>
private static readonly Regex MetaCharsetPattern = new Regex(
    @"<meta\b[^>]*?\bcharset\s*=\s*[""']?\s*([A-Za-z0-9_.:\-]+)",
    RegexOptions.IgnoreCase | RegexOptions.CultureInvariant);

/// <summary>
/// Downloads the resource at <paramref name="url"/> and returns its body decoded as text.
/// </summary>
/// <remarks>
/// The body is first decoded with the charset reported by the HTTP response. A reported
/// "ISO-8859-1" is treated as "gb2312", and a missing or unsupported charset falls back to
/// <see cref="Encoding.Default"/>. If the decoded markup contains a meta tag declaring a
/// different, supported charset, the page's own declaration wins and the bytes are decoded
/// again with that encoding.
/// </remarks>
/// <param name="url">Address of the page to download.</param>
/// <param name="encode">
/// On return, the encoding that was used to decode the returned text.
/// The incoming value is not read.
/// </param>
/// <returns>The decoded response body.</returns>
public static string GetDataFromUrl(string url, ref Encoding encode)
{
    HttpWebRequest request = (HttpWebRequest)HttpWebRequest.Create(url);

    // Configure the HTTP request.
    request.AllowAutoRedirect = true;
    request.AllowWriteStreamBuffering = true;
    request.Referer = "";
    request.Timeout = 10 * 1000;
    request.UserAgent = "";

    string characterSet;
    byte[] body;

    // The using block guarantees the response (and its connection) is released
    // even when reading the body throws.
    using (HttpWebResponse response = (HttpWebResponse)request.GetResponse())
    {
        characterSet = response.CharacterSet;
        body = ReadResponseBytes(response);
    }

    // Pick the initial encoding from the HTTP response.
    if (string.IsNullOrEmpty(characterSet))
    {
        // CharacterSet may be null or empty when the response carries no usable Content-Type.
        characterSet = string.Empty;
        encode = Encoding.Default;
    }
    else
    {
        if (characterSet == "ISO-8859-1")
        {
            characterSet = "gb2312";
        }

        // An unsupported charset name must not abort the download; use the default instead.
        encode = ResolveEncodingOrNull(characterSet) ?? Encoding.Default;
    }

    string text = DecodeText(body, encode);

    // If the page declares a charset that differs from the HTTP one, the page's
    // declaration takes precedence and the buffered bytes are decoded again.
    string declaredCharSet = ExtractMetaCharset(text);
    if (declaredCharSet.Length > 0
        && !string.Equals(declaredCharSet, characterSet, StringComparison.OrdinalIgnoreCase))
    {
        Encoding declaredEncoding = ResolveEncodingOrNull(declaredCharSet);
        if (declaredEncoding != null)
        {
            encode = declaredEncoding;
            text = DecodeText(body, encode);
        }
    }

    return text;
}

/// <summary>Reads the whole response body into a byte array.</summary>
private static byte[] ReadResponseBytes(HttpWebResponse response)
{
    using (Stream receiveStream = response.GetResponseStream())
    using (MemoryStream buffered = new MemoryStream())
    {
        byte[] chunk = new byte[8192];
        int count;
        while ((count = receiveStream.Read(chunk, 0, chunk.Length)) > 0)
        {
            buffered.Write(chunk, 0, count);
        }

        return buffered.ToArray();
    }
}

/// <summary>Decodes <paramref name="bytes"/> to a string using <paramref name="encoding"/>.</summary>
private static string DecodeText(byte[] bytes, Encoding encoding)
{
    // A StreamReader is used (as in the original code) rather than Encoding.GetString
    // so that byte-order-mark handling stays the same.
    using (StreamReader reader = new StreamReader(new MemoryStream(bytes), encoding))
    {
        return reader.ReadToEnd();
    }
}

/// <summary>Returns the encoding registered under <paramref name="name"/>, or null if it is not supported.</summary>
private static Encoding ResolveEncodingOrNull(string name)
{
    try
    {
        return Encoding.GetEncoding(name);
    }
    catch (ArgumentException)
    {
        return null;
    }
    catch (NotSupportedException)
    {
        return null;
    }
}

/// <summary>Returns the charset name declared by the first matching meta tag, or an empty string if none is found.</summary>
private static string ExtractMetaCharset(string html)
{
    Match match = MetaCharsetPattern.Match(html);
    return match.Success ? match.Groups[1].Value : string.Empty;
}
标签:
原文地址:http://www.cnblogs.com/muxueyuan/p/4522292.html