#region HTML parsing
/// <summary>
/// Returns every element with the given tag name found in the HTML, each as its
/// complete outer markup (opening tag, content and closing tag).
/// </summary>
/// <param name="as_Html">HTML text to search. Null or empty yields an empty array.</param>
/// <param name="tags">
/// Tag name to look for, e.g. <c>div</c>. The value is inserted into the regular
/// expression verbatim, so a regex fragment such as <c>h[1-6]</c> is honoured.
/// </param>
/// <returns>The matched elements in document order; never null.</returns>
/// <remarks>
/// Matching is lazy and regex based: an element ends at the first closing tag of
/// the same name, so nested elements of the same name are not balanced.
/// </remarks>
public static string[] RegexHtmlToFormat(string as_Html, string tags)
{
    List<string> list = new List<string>();
    if (string.IsNullOrEmpty(as_Html))
    {
        return list.ToArray();
    }

    // (?=[\s/>]) requires the tag name to end right there, so "a" no longer
    // matches "<abbr>" and drags unrelated markup into the result.
    Regex regex = new Regex(
        "<" + tags + "(?=[\\s/>])[^>]*?>[\\s\\S]*?</" + tags + "\\s*>",
        RegexOptions.IgnoreCase);

    // Matches() already returns an empty collection when nothing matches, so a
    // preceding IsMatch() would only scan the input a second time.
    foreach (Match match in regex.Matches(as_Html))
    {
        list.Add(match.Value);
    }
    return list.ToArray();
}

/// <summary>
/// Extracts the <c>src</c> value of every <c>&lt;img&gt;</c> tag in the HTML.
/// </summary>
/// <param name="sHtmlText">HTML text to search. Null or empty yields an empty array.</param>
/// <returns>
/// One entry per matched <c>&lt;img&gt;</c> tag, in document order. An entry is an
/// empty string when the tag has a <c>src</c> attribute with no value.
/// </returns>
public static string[] GetHtmlImageUrlList(string sHtmlText)
{
    if (string.IsNullOrEmpty(sHtmlText))
    {
        return new string[0];
    }

    // The value may be wrapped in ASCII double or single quotes, or be unquoted;
    // the quote characters themselves are excluded from the captured URL.
    MatchCollection matches = Regex.Matches(
        sHtmlText,
        @"<img\b[^<>]*?\bsrc\s*=\s*[""']?\s*(?<imgUrl>[^\s""'<>]*)[^<>]*?/?\s*>",
        RegexOptions.IgnoreCase);

    string[] sUrlList = new string[matches.Count];
    int i = 0;
    foreach (Match match in matches)
    {
        sUrlList[i++] = match.Groups["imgUrl"].Value;
    }
    return sUrlList;
}

/// <summary>
/// Returns every <c>&lt;div&gt;</c> whose opening tag is exactly
/// <c>&lt;div class="className"&gt;</c> (single or double quoted), each as its
/// complete outer markup up to the first following <c>&lt;/div&gt;</c>.
/// </summary>
/// <param name="as_Html">HTML text to search. Null or empty yields an empty array.</param>
/// <param name="className">
/// Value of the <c>class</c> attribute. Inserted into the regular expression verbatim.
/// </param>
/// <returns>The matched blocks in document order; never null.</returns>
/// <remarks>
/// The opening tag must carry <c>class</c> as its only attribute. Nested
/// <c>&lt;div&gt;</c> elements are not balanced: the match stops at the first
/// closing tag.
/// </remarks>
public static string[] RegexHtmlDiv(string as_Html, string className)
{
    List<string> list = new List<string>();
    if (string.IsNullOrEmpty(as_Html))
    {
        return list.ToArray();
    }

    // ([\"']) ... \1 accepts either ASCII quote and requires the same one to close.
    // [\s\S]*? matches any character including newlines without the per-character
    // capture and backtracking cost of (.|\n)*?.
    Regex regex = new Regex(
        "<div class=([\"'])" + className + "\\1>[\\s\\S]*?</div>",
        RegexOptions.IgnoreCase);

    foreach (Match match in regex.Matches(as_Html))
    {
        list.Add(match.Value);
    }
    return list.ToArray();
}

/// <summary>
/// Sample parser: removes all tab and space characters from the HTML, then
/// concatenates every <c>&lt;tr&gt;...&lt;/tr&gt;</c> row that is found.
/// </summary>
/// <param name="as_Html">HTML text to search. Null or empty yields an empty string.</param>
/// <returns>The matched rows joined without a separator, or an empty string.</returns>
public static string RegexHTMLList(string as_Html)
{
    if (string.IsNullOrEmpty(as_Html))
    {
        return "";
    }

    as_Html = as_Html.Replace("\t", "");
    as_Html = as_Html.Replace(" ", "");

    // NOTE(review): RegexOptions.Multiline only changes ^ and $; '.' still stops at
    // a newline, so a row spanning several lines is skipped. Kept as-is to preserve
    // existing results - switch to RegexOptions.Singleline if multi-line rows
    // should match.
    Regex regex = new Regex("<tr>(?<CompanyName>.*?)</tr>", RegexOptions.IgnoreCase | RegexOptions.Multiline);

    System.Text.StringBuilder rows = new System.Text.StringBuilder();
    foreach (Match match in regex.Matches(as_Html))
    {
        rows.Append(match.Value);
    }
    return rows.ToString();
}

/// <summary>
/// Gets the value of an attribute from the first tag of the given name that carries it.
/// </summary>
/// <param name="str">Text to search. Null or empty yields an empty string.</param>
/// <param name="title">Tag name, e.g. <c>a</c>. Inserted into the regular expression verbatim.</param>
/// <param name="attrib">Attribute name, e.g. <c>href</c>. Inserted into the regular expression verbatim.</param>
/// <returns>
/// The attribute value without surrounding quotes, or an empty string when no tag matches.
/// </returns>
public static string GetTitleContent(string str, string title, string attrib)
{
    if (string.IsNullOrEmpty(str))
    {
        return "";
    }

    // Group 1 captures the optional opening quote (ASCII " or '); \1 requires the
    // same character, or nothing, after the value.
    string pattern = "<" + title + "[^>]*?" + attrib + "=([\"']?)(?<url>[^\"'\\s>]+)\\1[^>]*>";
    Match titleMatch = Regex.Match(str, pattern, RegexOptions.IgnoreCase);

    // Groups[...] of a failed match has an empty Value, so no Success check is needed.
    return titleMatch.Groups["url"].Value;
}

/// <summary>
/// Parses the attributes of an HTML element into name/value pairs.
/// </summary>
/// <param name="HtmlElement">Markup of the element, e.g. an opening tag. Null or empty yields an empty table.</param>
/// <returns>
/// A table mapping attribute name to attribute value (both strings). When a name
/// occurs more than once the last occurrence wins.
/// </returns>
/// <remarks>
/// Three forms are recognised: <c>name="value"</c>, <c>name='value'</c> and unquoted
/// <c>name=value</c>. Quoted values must be non-empty. An unquoted value is only
/// recognised when followed by whitespace, <c>&gt;</c> or the end of the input.
/// </remarks>
public static System.Collections.Hashtable getAttrs(string HtmlElement)
{
    System.Collections.Hashtable ht = new System.Collections.Hashtable();
    if (string.IsNullOrEmpty(HtmlElement))
    {
        return ht;
    }

    // [^\s=]+ keeps '=' and whitespace out of the attribute name; the unquoted
    // branch takes a run of characters rather than a single one.
    const string pattern =
        "(?<name>[^\\s=]+)\\s*=\\s*\"(?<value>[^\"]+)\"" +
        "|(?<name>[^\\s=]+)\\s*=\\s*'(?<value>[^']+)'" +
        "|(?<name>\\w+)=(?<value>[^\\s\"'>]+)(?=[\\s>]|$)";

    foreach (Match m in Regex.Matches(HtmlElement, pattern))
    {
        ht[m.Groups["name"].Value] = m.Groups["value"].Value;
    }
    return ht;
}
#endregion