标签:stun htm sha ram mat ret meta charset returns
请求网页时,不同网页使用的编码格式各不相同,如果固定使用一种编码去解码就会出现乱码,而且也没有发现一种通用的编码格式。暂时采用如下方法:请求后先把响应数据流读成字节数组;因为 utf-8 最为普遍,所以先用 utf-8 解码;然后从页面的 meta 标签中匹配出 charset,如果声明的不是 utf-8(例如 gb2312),再用该编码重新解码一次。
/// <summary>
/// Sends a request to the given address and returns the response body stream.
/// </summary>
/// <param name="Url">Address to request; it must produce an <see cref="HttpWebRequest"/> (http/https).</param>
/// <returns>
/// The response body stream when the status code is 200 OK; otherwise <c>null</c>.
/// The caller owns the returned stream and must dispose it, which also releases
/// the underlying connection.
/// </returns>
public static Stream GetWebUrlStream(string Url)
{
    HttpWebRequest request = (HttpWebRequest)WebRequest.Create(Url);
    HttpWebResponse response = (HttpWebResponse)request.GetResponse();

    if (response.StatusCode != HttpStatusCode.OK)
    {
        // Nothing is handed back to the caller on this path, so the response
        // must be closed here or its connection is never released.
        response.Close();
        return null;
    }

    return response.GetResponseStream();
}
/// <summary>
/// Downloads the page named by the "url" request value and returns its HTML,
/// decoded with the charset the page declares (UTF-8 by default), as JSON.
/// </summary>
/// <returns>
/// A JSON object with a single <c>html</c> property. The property is an empty
/// string when no response stream could be obtained.
/// </returns>
public ActionResult GetBaiduUrlInfo()
{
    // NOTE(review): the URL comes straight from the request and is fetched
    // server-side without validation. This allows server-side request forgery;
    // consider restricting the allowed schemes/hosts before fetching.
    string url = RequestUtils.GetString("url");

    byte[] byHtml;
    using (Stream stream = HttpWebRequestUntil.GetWebUrlStream(url))
    {
        if (stream == null)
        {
            // No body to decode; answer with empty HTML instead of failing
            // with a NullReferenceException.
            return Json(new { html = "" }, JsonRequestBehavior.AllowGet);
        }

        // Buffer the whole body so it can be decoded more than once.
        using (MemoryStream buffer = new MemoryStream())
        {
            stream.CopyTo(buffer);
            byHtml = buffer.ToArray();
        }
    }

    string strHtml = DecodeHtml(byHtml);
    return Json(new { html = strHtml }, JsonRequestBehavior.AllowGet);
}

/// <summary>
/// Decodes raw page bytes as UTF-8, then re-decodes them with the charset named
/// in the page's meta tag when that charset is present and is not a UTF variant.
/// </summary>
/// <param name="byHtml">Raw bytes of the downloaded page.</param>
/// <returns>The decoded HTML text.</returns>
private static string DecodeHtml(byte[] byHtml)
{
    // UTF-8 is the most common encoding, so try it first. Even if it is the
    // wrong one, ASCII letters and digits still decode correctly, which is
    // enough to locate the charset declaration.
    string strHtml = Encoding.UTF8.GetString(byHtml, 0, byHtml.Length);

    string strCharSet = Regex.Match(
            strHtml,
            @"<meta.*?charset=""?([a-z0-9-]+)\b",
            RegexOptions.IgnoreCase)
        .Groups[1].Value;

    // No declaration found, or the page is already some UTF flavour: keep the
    // UTF-8 result.
    if (strCharSet.Length == 0
        || strCharSet.IndexOf("utf", StringComparison.OrdinalIgnoreCase) >= 0)
    {
        return strHtml;
    }

    try
    {
        return Encoding.GetEncoding(strCharSet).GetString(byHtml, 0, byHtml.Length);
    }
    catch (ArgumentException)
    {
        // Unknown or invalid charset name: best effort, keep the UTF-8 text.
        return strHtml;
    }
    catch (NotSupportedException)
    {
        // Charset not available on this platform: keep the UTF-8 text.
        return strHtml;
    }
}
--谨记铭心
标签:stun htm sha ram mat ret meta charset returns
原文地址:http://www.cnblogs.com/xinloverong/p/6744967.html