码迷,mamicode.com
首页 > 其他好文 > 详细

数据抓取Fizzler

时间:2015-04-23 15:42:13      阅读:337      评论:0      收藏:0      [点我收藏+]

标签:fizzler

  Fizzler插件下载地址

需要引用:

using Fizzler;
using Fizzler.Systems.HtmlAgilityPack;
using HtmlAgilityPack;


 public class FizzlerHelper
    {
        /// <summary>
        /// Downloads the page at <paramref name="url"/> and returns the nodes that match a CSS selector.
        /// </summary>
        /// <param name="url">Address of the page to download.</param>
        /// <param name="cssLoad">CSS selector handed to Fizzler's <c>QuerySelectorAll</c>.</param>
        /// <returns>The matching nodes; an empty sequence when the page could not be downloaded.</returns>
        public static IEnumerable<HtmlNode> GetUrlInfo(string url, string cssLoad)
        {
            // HttpGet yields null on any failure; GetHtmlInfo maps that to an empty result
            // instead of letting HtmlDocument.LoadHtml throw on a null argument.
            string html = HttpGet<string>(url);
            return GetHtmlInfo(html, cssLoad);
        }

        /// <summary>
        /// Parses an HTML string and returns the nodes that match a CSS selector.
        /// </summary>
        /// <param name="html">HTML markup to parse; <c>null</c> is treated as "no content".</param>
        /// <param name="cssLoad">CSS selector handed to Fizzler's <c>QuerySelectorAll</c>.</param>
        /// <returns>The matching nodes; an empty sequence when <paramref name="html"/> is <c>null</c>.</returns>
        public static IEnumerable<HtmlNode> GetHtmlInfo(string html, string cssLoad)
        {
            if (html == null)
            {
                return new HtmlNode[0];
            }

            HtmlAgilityPack.HtmlDocument htmlDoc = CreateDocument();
            htmlDoc.LoadHtml(html);
            return htmlDoc.DocumentNode.QuerySelectorAll(cssLoad); // cssLoad is the selector path
        }

        /// <summary>
        /// Creates an HtmlDocument configured with the parser options shared by both entry points.
        /// </summary>
        private static HtmlAgilityPack.HtmlDocument CreateDocument()
        {
            return new HtmlAgilityPack.HtmlDocument
            {
                OptionAddDebuggingAttributes = false,
                OptionAutoCloseOnEnd = true,
                OptionFixNestedTags = true,
                OptionReadEncoding = true
            };
        }

        #region GET request
        /// <summary>
        /// Issues an HTTP GET and converts the response body to <typeparamref name="T"/>.
        /// </summary>
        /// <typeparam name="T">Target type; the body is converted with <c>Convert.ChangeType</c>.</typeparam>
        /// <param name="url">Address to request.</param>
        /// <returns>
        /// The converted response body, or <c>default(T)</c> when the request, the read,
        /// or the conversion fails.
        /// </returns>
        public static T HttpGet<T>(string url)
        {
            try
            {
                HttpWebRequest request = (HttpWebRequest)WebRequest.Create(url);
                request.Method = "GET";

                string retString;
                using (HttpWebResponse response = (HttpWebResponse)request.GetResponse())
                using (Stream stream = response.GetResponseStream())
                // The body is always decoded as UTF-8, regardless of the charset the server declares.
                using (StreamReader streamReader = new StreamReader(stream, Encoding.UTF8))
                {
                    retString = streamReader.ReadToEnd();
                }

                return (T)Convert.ChangeType(retString, typeof(T));
            }
            catch (Exception)
            {
                // Deliberate best-effort: callers rely on default(T) to signal any failure.
                return default(T);
            }
        }
        #endregion
    }

下面的示例实现了数据抓取(以透明售房网为例)。Fizzler 主要通过 HTML 中的标签和样式(CSS 选择器)来定位并获取数据,从而省去了编写复杂正则表达式的麻烦。

using System;
using System.Collections;
using System.Collections.Generic;
using System.Linq;
using System.Web;
using System.Web.Services;
using System.Web.UI;
using System.Web.UI.WebControls;
using Fizzler;
using Fizzler.Systems.HtmlAgilityPack;
using HtmlAgilityPack;
using Newtonsoft.Json;
using DataCollectionCommon;
using System.Text;
using DataCollectionDAL;
using DataCollectionModel;
using System.Text.RegularExpressions;


namespace DataCollectionDemo
{
    /// <summary>
    /// Demo page that scrapes a table from a remote page and re-renders it as an HTML table.
    /// </summary>
    public partial class WebForm1 : System.Web.UI.Page
    {
        // Kept so the public surface is unchanged; this class only calls FizzlerHelper's static methods.
        public static FizzlerHelper fizzlerHelper = new FizzlerHelper();

        // HTML produced on the first (non-postback) load; presumably rendered by the page markup — confirm.
        public string resultHtml = string.Empty;

        // Matches empty <span class="..."></span> elements and captures the class attribute value.
        // Built once instead of once per table cell.
        private static readonly Regex SpanClassPattern = new Regex(
            @"<span class=""(?<url>.+?)""></span>", RegexOptions.Singleline);

        // Maps each recognised span class name to the character it stands for.
        private static readonly Dictionary<string, string> SpanClassToChar = new Dictionary<string, string>
        {
            { "numbdor", "." },
            { "numbzero", "0" },
            { "numbone", "1" },
            { "numbtwo", "2" },
            { "numbthree", "3" },
            { "numbfour", "4" },
            { "numbfive", "5" },
            { "numbsix", "6" },
            { "numbseven", "7" },
            { "numbeight", "8" },
            { "numbnine", "9" }
        };

        /// <summary>
        /// On the initial request, scrapes the daily page and stores the generated table in resultHtml.
        /// </summary>
        protected void Page_Load(object sender, EventArgs e)
        {
            if (!IsPostBack)
            {
                // Hangzhou
                resultHtml = StartDataCollection("http://www.tmsf.com/daily.htm");
            }
        }


        /// <summary>
        /// Scrapes the rows matched by "div.datanowin table tr" from the given page and
        /// rebuilds them as an HTML table string.
        /// </summary>
        /// <param name="url">Address of the page to scrape.</param>
        /// <returns>An HTML table containing a fixed header row followed by the scraped rows.</returns>
        [WebMethod]
        public static string StartDataCollection(string url)
        {
            // NOTE(review): url is supplied by the caller of this [WebMethod] and is fetched
            // server-side as-is; consider restricting it to known hosts.
            StringBuilder temp_table = new StringBuilder();
            temp_table.Append("<table width=\"100%\" border=\"0\" cellspacing=\"0\" cellpadding=\"0\">");
            temp_table.Append("<tr><td>楼盘名称</td><td>城区</td><td>签约套数</td><td>预定套数</td><td>签约面积</td><td>签约均价</td></tr>");

            List<HtmlNode> list_tr = FizzlerHelper.GetUrlInfo(url, "div.datanowin table tr").ToList();
            if (list_tr.Count > 0)
            {
                // Drop the source table's own header row; a fixed header was emitted above.
                list_tr.RemoveAt(0);
            }

            // One output row per scraped <tr>.
            foreach (HtmlNode node_tr in list_tr)
            {
                // Re-parse the row's inner markup to enumerate its <td> cells.
                List<HtmlNode> list_td = FizzlerHelper.GetHtmlInfo(node_tr.InnerHtml, "td").ToList();
                temp_table.Append("<tr>");

                // One output cell per scraped <td>.
                foreach (HtmlNode node_td in list_td)
                {
                    // Values encoded as empty styled spans are decoded first,
                    // then the cell's plain text is appended after them.
                    MatchCollection mcc_temp = SpanClassPattern.Matches(node_td.InnerHtml);
                    string values = GetValueBySpanClass(mcc_temp);
                    temp_table.AppendFormat("<td>{0}{1}</td>", values, node_td.InnerText);
                }

                temp_table.Append("</tr>");
            }

            temp_table.Append("</table>");
            return temp_table.ToString();
        }


        /// <summary>
        /// Decodes a sequence of span class names into the characters they represent.
        /// </summary>
        /// <param name="mcc_span">Regex matches whose "url" group holds a span class name.</param>
        /// <returns>The decoded characters in match order; unrecognised class names contribute nothing.</returns>
        private static string GetValueBySpanClass(MatchCollection mcc_span)
        {
            StringBuilder decoded = new StringBuilder(mcc_span.Count);
            foreach (Match match in mcc_span)
            {
                string character;
                if (SpanClassToChar.TryGetValue(match.Groups["url"].Value, out character))
                {
                    decoded.Append(character);
                }
            }
            return decoded.ToString();
        }
    }
}


数据抓取Fizzler

标签:fizzler

原文地址:http://blog.csdn.net/jayzai/article/details/45222091

(0)
(0)
   
举报
评论 一句话评论(0)
登录后才能评论!
© 2014 mamicode.com 版权所有  联系我们:gaon5@hotmail.com
迷上了代码!