标签:stun htm sha ram mat ret meta charset returns
请求网页时,不同网页的编码格式不同,无法固定使用一种编码去解码,否则会出现乱码;目前也没有发现通用的编码格式。暂时使用如下方法:请求后先把响应流读取为字节数组,先用比较普遍的 utf-8 解码;如果页面 meta 标签中声明的 charset 不是 utf-8,再按其声明的编码(例如 gb2312)重新解码一次。
/// <summary>
/// Issues a request to the given URL and returns the response body stream.
/// </summary>
/// <param name="Url">The address to request.</param>
/// <returns>
/// The response stream when the status code is <see cref="HttpStatusCode.OK"/>;
/// otherwise <c>null</c>. The caller owns the returned stream and must dispose it,
/// which also releases the underlying response.
/// </returns>
public static Stream GetWebUrlStream(string Url)
{
    HttpWebRequest request = (HttpWebRequest)WebRequest.Create(Url);
    HttpWebResponse response = (HttpWebResponse)request.GetResponse();

    if (response.StatusCode != HttpStatusCode.OK)
    {
        // No stream is handed to the caller on this path, so nobody else can
        // release the response; close it here to avoid leaking the connection.
        response.Close();
        return null;
    }

    return response.GetResponseStream();
}
/// <summary>
/// Downloads the page named by the "url" request value and returns its HTML as JSON,
/// decoding the bytes as UTF-8 first and re-decoding with the charset declared in the
/// page's meta tag when that charset is not a UTF variant.
/// </summary>
/// <returns>
/// A JSON result of the form <c>{ html = "..." }</c>; <c>html</c> is empty when no
/// response stream was available.
/// </returns>
public ActionResult GetBaiduUrlInfo()
{
    // NOTE(review): the URL comes straight from the request and is fetched server-side.
    // Consider validating it against an allow-list to avoid server-side request forgery.
    string url = RequestUtils.GetString("url");

    byte[] byHtml;
    using (Stream stream = HttpWebRequestUntil.GetWebUrlStream(url))
    {
        // The helper returns null when the status code is not OK.
        if (stream == null)
        {
            return Json(new { html = "" }, JsonRequestBehavior.AllowGet);
        }

        // Buffer the whole body so it can be decoded more than once.
        using (MemoryStream buffer = new MemoryStream())
        {
            stream.CopyTo(buffer);
            byHtml = buffer.ToArray();
        }
    }

    // UTF-8 is the most common encoding, so try it first.
    string strHtml = Encoding.UTF8.GetString(byHtml, 0, byHtml.Length);

    // Even with the wrong encoding, ASCII letters and digits decode correctly,
    // so the declared charset can still be located in the text.
    string strCharSet =
        Regex.Match(strHtml, @"<meta.*?charset=""?([a-z0-9-]+)\b", RegexOptions.IgnoreCase)
            .Groups[1].Value;

    // If a charset was declared and it is not a UTF variant, decode again with it.
    if (strCharSet != "" && strCharSet.IndexOf("utf", StringComparison.OrdinalIgnoreCase) == -1)
    {
        try
        {
            strHtml = Encoding.GetEncoding(strCharSet).GetString(byHtml, 0, byHtml.Length);
        }
        catch (ArgumentException)
        {
            // Unknown or invalid charset name: keep the UTF-8 decoding as a best effort.
        }
        catch (NotSupportedException)
        {
            // Charset not supported on this platform: keep the UTF-8 decoding.
        }
    }

    return Json(new { html = strHtml }, JsonRequestBehavior.AllowGet);
}
--谨记铭心
标签:stun htm sha ram mat ret meta charset returns
原文地址:http://www.cnblogs.com/xinloverong/p/6744967.html