码迷,mamicode.com
首页 > Web开发 > 详细

用正则表达式抓取网页中的ul 和 li标签中最终的值!

时间:2014-06-29 00:49:28      阅读:369      评论:0      收藏:0      [点我收藏+]

标签:style   blog   http   color   get   width   

            获取你要抓取的页面

            const string URL = "http://www.hn3ddf.gov.cn/price/GetList.html?pageno=1";
            string htmlStr = null;
            for (int i = 0; i < 10; i++)
            {
                try
                {
                    System.Net.HttpWebRequest request = (System.Net.HttpWebRequest)System.Net.WebRequest.Create(URL);
                    request.Headers.Set("Pragma", "no-cache");
                    request.Timeout = 10000 + (i * 5000);
                    System.Net.HttpWebResponse response = (System.Net.HttpWebResponse)request.GetResponse();
                    System.IO.Stream streamReceive = response.GetResponseStream();
                    System.IO.StreamReader streamReader = new System.IO.StreamReader(streamReceive, Encoding.GetEncoding("utf-8"));
                    htmlStr = streamReader.ReadToEnd();
                    break;
                }
                catch (Exception e)
                {
                    //----------------抓取异常!!
                }
            }

//用正则表达式匹配页面中所有带有指定 style 属性的 ul 标签(每个 ul 对应一行价格数据)

           MatchCollection priceList = Regex.Matches(htmlStr, @"<ul style=""font-size:12px;width:320px; margin:0; padding:0;"">(.*?)</ul>", RegexOptions.Singleline);
            StringBuilder resultStr = new StringBuilder();
            for (int i = 0; i < priceList.Count; i++)
            {
                try
                {
                      //<ul style="font-size:12px;width:320px; margin:0; padding:0;">
                      //  <li style="color:#555555; float:left; display:block; width:140px; height:22px; line-height:22px;" align="center">铔嬮浮閰嶅悎楗叉枡</li>
                      //  <li align="center" style="color:#555555; float:left; display:block; width:100px; height:22px; line-height:22px;">2.83鍏?鍗冨厠</li>
                      //  <li style="color:#555555; float:left; display:block; width:50px;text-align:center; height:22px; line-height:22px;">05-21</li>
                      //</ul>

                    //List<string> list = new List<string>();   //放结果的泛型集合
                    //string splitStr = "</li>";
                    //string[] strArray = priceList[i].Value.Split(splitStr.ToArray());    //一组一组的li标签
                    //foreach (string item in strArray)
                    //{
                    //    int first = item.IndexOf(‘>‘);
                    //    int last = item.IndexOf("</li>");
                    //    list.Add(item.Substring(first, last - first));
                    //    //list.add(item.substring(item.indexof(">")));
                    //}
                    //MatchCollection items = Regex.Matches(htmlStr, @"<li.*(?=>)(.|\n)*?</li>");

                    resultStr.Append("<tr>");

                     //<li style="color:#555555; float:left; display:block; width:140px; height:22px; line-height:22px;" align="center">蛋鸡配合饲料</li>

                    //<ul style="font-size:12px;width:320px; margin:0; padding:0;">
                    //    <li style="color:#555555; float:left; display:block; width:140px; height:22px; line-height:22px;" align="center">蛋鸡配合饲料</li>
                    //    <li align="center" style="color:#555555; float:left; display:block; width:100px; height:22px; line-height:22px;">2.83元/千克</li>
                    //    <li style="color:#555555; float:left; display:block; width:50px;text-align:center; height:22px; line-height:22px;">05-21</li>
                    //</ul>
                    string priceItem = priceList[i].Value;
                    //string name = Regex.Match(priceItem, @"<li style=""color:#555555; float:left; display:block; width:140px; height:22px; line-height:22px;"" align=""center"">(.*?)</li>").Value;
//配备<开头的在抓取的网页中的li标签中的所有属性进行配备为真的一行结果包含:样式和值
                    Match TitleMatch = Regex.Match(priceItem, @"<li style=""color:#555555; float:left; display:block; width:140px; height:22px; line-height:22px;"" align=""center"">([^<]*)</li>", RegexOptions.IgnoreCase | RegexOptions.Multiline);
       //取上面一行中的只有属性的值Value.Groups[1],1 代表Regex.Match方法得到的Groups的索引是从1开始的,而不是从0开始的
                    string name = TitleMatch.Groups[1].Value;

                    //"color:#555555; float:left; display:block; width:140px; height:22px; line-height:22px;" align="center">铔嬮浮閰嶅悎楗叉枡
                    //name = name.Substring(10, name.Length - 15);
                    //name = name.Substring(113, name.Length - 118);

                    //string price = Regex.Match(priceItem, @"<li align=""center"" style=""color:#555555; float:left; display:block; width:100px; height:22px; line-height:22px;"">(.*?)</li>").Value;
                    //price = price.Substring(13, price.Length - 18);
                    //price = price.Substring(115, price.Length -120);
                    Match priceMatch = Regex.Match(priceItem, @"<li align=""center"" style=""color:#555555; float:left; display:block; width:100px; height:22px; line-height:22px;"">([^<]*)</li>", RegexOptions.IgnoreCase | RegexOptions.Multiline);
                    string price = priceMatch.Groups[1].Value;
//                    string weeks = Regex.Match(priceItem, @"<li style=""color:#555555; float:left; display:block; width:50px;text-align:center; height:22px; line-height:22px;"">(.*?)</li>
//").Value;
//                    //weeks = weeks.Substring(9, weeks.Length - 16);
//                    weeks = weeks.Substring(116, weeks.Length - 122);

                    Match weeksMatch = Regex.Match(priceItem, @"<li style=""color:#555555; float:left; display:block; width:50px;text-align:center; height:22px; line-height:22px;"">([^<]*)</li>", RegexOptions.IgnoreCase | RegexOptions.Multiline);
                    string weeks = weeksMatch.Groups[1].Value;
                    resultStr.Append("<td width=\"195\" height=\"25\" align=\"left\">" + name + "</td><td width=\"70\" height=\"25\" align=\"center\" style=\"text-align:right;\">" + price + "</td><td height=\"25\" align=\"center\" style=\"color:#55a8ea;\">" + weeks + "</td>");
                    resultStr.Append("</tr>");
                    #region 原来的
                    //resultStr.Append("<tr>");
                    //string priceItem = priceList[i].Value;
                    //string name = Regex.Match(priceItem, "width=125>.*?</td>").Value;
                    //name = name.Substring(10, name.Length - 15);
                    //string price = Regex.Match(priceItem, "<td width=50.*?</td>").Value;
                    //price = price.Substring(13, price.Length - 18);
                    //string weeks = Regex.Match(priceItem, "class=en>.*?</font>").Value;
                    //weeks = weeks.Substring(9, weeks.Length - 16);
                    //resultStr.Append("<td width=\"195\" height=\"25\" align=\"left\">" + name + "</td><td width=\"70\" height=\"25\" align=\"center\">" + price + "</td><td height=\"25\" align=\"center\" style=\"color:#55a8ea;\">" + weeks + "</td>");
                    //resultStr.Append("</tr>");
                    #endregion
                }
                catch (Exception ex)
                {
                    //Common.Log4netUtil.Log().Error("获取跨域数据错误." + ex.Message);
                }
            }

            return resultStr.ToString();
 
 
bubuko.com,布布扣

用正则表达式抓取网页中的ul 和 li标签中最终的值!,布布扣,bubuko.com

用正则表达式抓取网页中的ul 和 li标签中最终的值!

标签:style   blog   http   color   get   width   

原文地址:http://www.cnblogs.com/qiankundai/p/3794014.html

(0)
(0)
   
举报
评论 一句话评论(0)
登录后才能评论!
© 2014 mamicode.com 版权所有  联系我们:gaon5@hotmail.com
迷上了代码!