码迷,mamicode.com
首页 > Web开发 > 详细

爬虫爬取网页数据

时间:2019-04-21 20:07:14      阅读:160      评论:0      收藏:0      [点我收藏+]

标签:tps   zip   html   div   sel   @class   add   manage   setting   

 

/// <summary>
/// Scrapes the Fang.com second-hand-house listing page and stores each
/// listing (title, spec, address) into the database via <c>Add</c>.
/// </summary>
/// <param name="args">Unused command-line arguments.</param>
public static void Main(string[] args)
{
    // Listing page to scrape.
    string url = "https://ly.esf.fang.com/house-a010204-b012374/";
    string data = GetWebContent(url);

    var htmlDoc = new HtmlDocument();
    htmlDoc.LoadHtml(data);

    // BUG FIX: the original XPath used full-width quotes (‘ ’), which are not
    // valid XPath string delimiters, so SelectNodes never matched anything.
    // Plain apostrophes are required.
    HtmlNodeCollection htmlNodes = htmlDoc.DocumentNode.SelectNodes("//div[@class='shop_list shop_list_4']/dl[@id]");
    if (htmlNodes == null)
    {
        return; // download failed or page layout changed — nothing to store
    }

    // Re-parse only the listing fragments so the XPaths below are evaluated
    // against the <dl> items alone. StringBuilder avoids O(n^2) concatenation.
    var fragments = new StringBuilder();
    foreach (var item in htmlNodes)
    {
        fragments.Append(item.OuterHtml);
    }
    var fangJi = new HtmlDocument();
    fangJi.LoadHtml(fragments.ToString());

    var fangMing = fangJi.DocumentNode.SelectNodes("//dd/h4[@class='clearfix']/a/span"); // listing title
    var guiGe = fangJi.DocumentNode.SelectNodes("//dd/p[@class='tel_shop']");           // spec (rooms/size)
    var diZhi = fangJi.DocumentNode.SelectNodes("//dd/p[@class='add_shop']/span");      // address
    if (fangMing == null || guiGe == null || diZhi == null)
    {
        return; // one of the expected sections is missing — bail out
    }

    // BUG FIX: the original joined the texts with '*' and then called
    // fm.Split(*), which does not compile (the char literal was lost when the
    // post was published). Iterating the three node lists in lock-step makes
    // the round-trip through joined strings unnecessary.
    int count = Math.Min(fangMing.Count, Math.Min(guiGe.Count, diZhi.Count));
    for (int i = 0; i < count; i++)
    {
        Add(fangMing[i].InnerText, guiGe[i].InnerText.Trim(), diZhi[i].InnerText);
    }
}

/// <summary>
/// Inserts one scraped listing (title, spec, address) into the CountPC table.
/// </summary>
/// <param name="da1">Listing title.</param>
/// <param name="da2">Listing spec (rooms / size / floor).</param>
/// <param name="da3">Listing address.</param>
public static void Add(string da1, string da2, string da3)
{
    // SECURITY FIX: the original concatenated the values directly into the SQL
    // text (and used full-width quotes that break the statement). Parameterized
    // SQL prevents injection and handles quotes/special characters in the data.
    // 'using' ensures the connection and command are disposed even on error.
    using (SqlConnection conn = new SqlConnection(ConfigurationManager.AppSettings["constring"]))
    using (SqlCommand cmd = new SqlCommand("insert into CountPC values(@da1, @da2, @da3)", conn))
    {
        cmd.Parameters.AddWithValue("@da1", da1);
        cmd.Parameters.AddWithValue("@da2", da2);
        cmd.Parameters.AddWithValue("@da3", da3);
        conn.Open();
        cmd.ExecuteNonQuery();
    }
}

/// <summary>
/// Downloads the page at <paramref name="Url"/> and decodes it as GB2312
/// (the encoding declared by the target site). Returns "" on any failure.
/// </summary>
/// <param name="Url">Absolute URL to fetch.</param>
/// <returns>The decoded page body, or an empty string if the request failed.</returns>
public static string GetWebContent(string Url)
{
    string strResult = "";
    try
    {
        HttpWebRequest request = (HttpWebRequest)WebRequest.Create(Url);
        request.Timeout = 30000; // 30 s timeout
        request.Headers.Set("Pragma", "no-cache");
        // FIX: the original always wrapped the response in a GZipStream, which
        // throws whenever the server replies uncompressed. Let the HTTP stack
        // negotiate and transparently decompress gzip/deflate instead.
        request.AutomaticDecompression = DecompressionMethods.GZip | DecompressionMethods.Deflate;

        // FIX: dispose the response, stream and reader — the original leaked
        // all three, exhausting the connection pool under repeated calls.
        using (HttpWebResponse response = (HttpWebResponse)request.GetResponse())
        using (Stream streamReceive = response.GetResponseStream())
        using (StreamReader read = new StreamReader(streamReceive, Encoding.GetEncoding("GB2312")))
        {
            strResult = read.ReadToEnd();
        }
    }
    catch (Exception ex)
    {
        // FIX: the original bare catch {} swallowed every error silently.
        // Keep the best-effort contract (return "") but report the failure.
        Console.WriteLine("GetWebContent failed for " + Url + ": " + ex.Message);
    }
    return strResult;
}

 


}

爬虫爬取网页数据

标签:tps   zip   html   div   sel   @class   add   manage   setting   

原文地址:https://www.cnblogs.com/wxc-love/p/10746404.html

(0)
(0)
   
举报
评论 一句话评论(0)
登录后才能评论!
© 2014 mamicode.com 版权所有  联系我们:gaon5@hotmail.com
迷上了代码!