码迷,mamicode.com
首页 > Web开发 > 详细

爬取房天下整个网站房产数据。。。

时间:2017-10-29 17:43:23      阅读:464      评论:0      收藏:0      [点我收藏+]

标签:activity   run   线程   bdd   web   poll   return   list   数据   

以前爬取的数据量都比较少,所以这次写一个爬取全站数据的爬虫:用 Redis 对 URL 进行去重,用 MySQL 存储清洗后的房产数据,并采用线程池进行调度,实现多线程爬取。

下面的代码用于获取房天下所有地区二手房和新房的URL,为后续爬取提供起始URL:

 1 import requests
 2 from lxml import etree
 3 
 4 
 5 class Ftx_newhouse_Secondhandhouse(object):
 6 
 7     headers = {
 8         User-Agent: Mozilla/5.0 (Windows NT 6.1) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/61.0.3163.100 Safari/537.36,
 9         Cookie: global_cookie=5n55ylc24xzrdp58gka2fm0mx2lj4mqfqak; Integrateactivity=notincludemc; vh_newhouse=3_1499483589_17454%5B%3A%7C%40%7C%3A%5D9af16b0d610e2cdd596b0d5a35400fbd; newhouse_user_guid=925B3734-6802-3162-165C-B593DAA860F1; recentViewlpNew_newhouse=3_1502607112_9948%5B%3A%7C%40%7C%3A%5D54e263288e4374965795dfe7c94c7fd3; city=heyuan; polling_imei=232d98985399f89e; token=59c66a51681142018630f1745e1e739f; Captcha=6E6B7334505855746454384A743161514A46696B346D577833476C613647745662647355494E7570596D4C52612B564F45473832462B59674B5A6E504C63386A34614767326774426455773D; __utmt_t0=1; __utmt_t1=1; __utmt_t2=1; sfut=33A48A581B218095B1D7CE492BDDCA86292F2A06B82634CBDD1201D2545F42EE4B54A2BC1247390DE02741E7CA2C9A911EA425B693C59EC2D62EDD7A4D70012C0F8DEE007CB20A5E2A74C8A9B17D4A8E3A7698ADDEAEC479D29D9DC82BC746FB; passport=usertype=1&userid=100371905&username=huangsonghui&password=&isvalid=1&validation=; agent_validation=a=0; __utma=147393320.331855580.1499000907.1504415980.1508935988.27; __utmb=147393320.49.10.1508935988; __utmc=147393320; __utmz=147393320.1508935988.27.21.utmcsr=baidu|utmccn=(organic)|utmcmd=organic; unique_cookie=U_35b7j0utahefmuagw4fol4w8y1bj971iz3h*14
10 
11     }
12 
13     def __init__(self):
14         self.url = http://newhouse.fang.com/house/s/
15         self.s = requests.session()
16 
17 
18     def Newhouse_ftx(self):
19         try:
20             response = self.s.post(self.url,headers=self.headers,verify=False)
21         except Exception as e:
22             print(error:,e)
23         response.encoding = gb2312
24         urls = etree.HTML(response.text)
25         xf_adress = urls.xpath(//div[@class="city20141104"]/div[3]/a/text()|
26                             //div[@class="city20141104"]/div[4]/a/text()|
27                             //div[@class="city20141104"]/div[5]/a/text()
28                             )
29         xf_url = urls.xpath(//div[@class="city20141104"]/div[3]/a/@href|
30                             //div[@class="city20141104"]/div[4]/a/@href|
31                             //div[@class="city20141104"]/div[5]/a/@href
32                             )
33 
34         return (dict(zip(xf_adress,xf_url)))
35 
36     def Secondhandhouse_ftx(self):
37         self.url = http://esf.sh.fang.com/newsecond/esfcities.aspx
38         try:
39             html  = requests.get(self.url,headers=self.headers,timeout=4)
40         except Exception as e:
41             print(error:,e)
42         html.encoding = gb2312
43         Secondhandhouse_urls = etree.HTML(html.text)
44         xf_url = Secondhandhouse_urls.xpath(//div[@class="onCont"]/ul/li/a/text())
45         xf_adress  = Secondhandhouse_urls.xpath(//div[@class="onCont"]/ul/li/a/@href)
46         dictx = dict(zip(xf_url,xf_adress))
47         return dictx




下面是爬取房产数据的代码:

 1 import requests,redis,pymysql
 2 from mywed.fangtianxia.url import Ftx_newhouse_Secondhandhouse
 3 from lxml import etree
 4 from concurrent.futures import ThreadPoolExecutor
 5 import re,os,time
 6 from mywed.fangtianxia.logs import log_run
 7 
 8 Secondhandhouse_urls_set = {http://esf.hbjs.fang.com}
 9 dr = Ftx_newhouse_Secondhandhouse()
10 w = dr.Secondhandhouse_ftx()
11 for i in w.values():
12     Secondhandhouse_urls_set.add(i)
13 print(Secondhandhouse_urls_set)
14 
15 
16 
class Secondhandhouse(object):
    """Crawl the paginated second-hand listing pages of one fang.com city site."""

    # NOTE(review): the Cookie is a captured browser session (it carries token
    # and passport fields). It is kept verbatim so requests look the same as
    # before, but it should be loaded from configuration, not from source.
    headers = {
        'User-Agent': 'Mozilla/5.0 (Windows NT 6.1) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/61.0.3163.100 Safari/537.36',
        'Cookie': 'global_cookie=5n55ylc24xzrdp58gka2fm0mx2lj4mqfqak; Integrateactivity=notincludemc; vh_newhouse=3_1499483589_17454%5B%3A%7C%40%7C%3A%5D9af16b0d610e2cdd596b0d5a35400fbd; newhouse_user_guid=925B3734-6802-3162-165C-B593DAA860F1; recentViewlpNew_newhouse=3_1502607112_9948%5B%3A%7C%40%7C%3A%5D54e263288e4374965795dfe7c94c7fd3; city=heyuan; polling_imei=232d98985399f89e; token=59c66a51681142018630f1745e1e739f; Captcha=6E6B7334505855746454384A743161514A46696B346D577833476C613647745662647355494E7570596D4C52612B564F45473832462B59674B5A6E504C63386A34614767326774426455773D; __utmt_t0=1; __utmt_t1=1; __utmt_t2=1; sfut=33A48A581B218095B1D7CE492BDDCA86292F2A06B82634CBDD1201D2545F42EE4B54A2BC1247390DE02741E7CA2C9A911EA425B693C59EC2D62EDD7A4D70012C0F8DEE007CB20A5E2A74C8A9B17D4A8E3A7698ADDEAEC479D29D9DC82BC746FB; passport=usertype=1&userid=100371905&username=huangsonghui&password=&isvalid=1&validation=; agent_validation=a=0; __utma=147393320.331855580.1499000907.1504415980.1508935988.27; __utmb=147393320.49.10.1508935988; __utmc=147393320; __utmz=147393320.1508935988.27.21.utmcsr=baidu|utmccn=(organic)|utmcmd=organic; unique_cookie=U_35b7j0utahefmuagw4fol4w8y1bj971iz3h*14',
    }

    # Page indexes 0..101 are tried per site (same range as the original loop).
    MAX_PAGES = 102
    # Upper bound on downloads of one page when the server answers non-200.
    MAX_RETRIES = 3

    @staticmethod
    def _page_url(url, num):
        """Return the URL of listing page *num* under the site root *url*."""
        # Strip a trailing slash so the result never contains '//house'.
        return url.rstrip('/') + '/house/i3' + str(num)

    @staticmethod
    def _at(values, index):
        """Return ``values[index]``, or ``None`` when the list is too short."""
        return values[index] if index < len(values) else None

    def _download(self, page_url):
        """Return the decoded HTML of *page_url*, or ``None`` on failure.

        A non-200 answer is retried up to ``MAX_RETRIES`` times; a request
        exception is logged through ``log_run`` and ends the attempt.
        """
        for attempt in range(self.MAX_RETRIES):
            try:
                reponse = requests.get(page_url, headers=self.headers, timeout=3)
            except Exception as e:
                log_run.File_enter_error(e)
                return None
            if reponse.status_code == 200:
                reponse.encoding = 'gbk'
                return reponse.text
            print('restart donwing ......')
            if attempt + 1 < self.MAX_RETRIES:
                # Brief pause so a struggling server is not hit back-to-back.
                time.sleep(1)
        return None

    def _parse_page(self, select):
        """Return one record (dict of raw xpath results) per listing on the page."""
        size = [node.xpath('./p/text()')
                for node in select.xpath('//div[@class="area alignR"]')]
        average_price = ['/'.join(node.xpath('./text()'))
                         for node in select.xpath('//p[@class="danjia alignR mt5"]')]
        sum_price = [''.join(node.xpath('./span/text()'))
                     for node in select.xpath('//p[@class="mt5 alignR"]')]

        records = []
        listings = select.xpath('//dd[@class="info rel floatr"]')
        for index, item in enumerate(listings):
            records.append({
                'title': item.xpath('./p[1]/a/@title'),
                'content': item.xpath('./p[2]/text()'),
                'name': item.xpath('./p[3]/a/span/text()'),
                'adress': item.xpath('./p[3]/span/text()'),
                # NOTE(review): the size/price nodes are selected page-wide and
                # matched to listings by position; this assumes the page has
                # one of each per listing, in the same order -- TODO confirm.
                'size': self._at(size, index),
                'average_price': self._at(average_price, index),
                'sum_price': self._at(sum_price, index),
            })
        return records

    def get_newhouse_data(self, url):
        """Crawl the listing pages of the city site rooted at *url*.

        Pages are requested in order until a page has no "next page" link or
        ``MAX_PAGES`` pages were tried. The title of every listing is printed.

        Args:
            url: root URL of a second-hand site, e.g. ``http://esf.hbjs.fang.com``.

        Returns:
            A list with one dict per listing (see ``_parse_page``). Pages that
            could not be downloaded or parsed are skipped.
        """
        records = []
        for num in range(self.MAX_PAGES):
            text = self._download(self._page_url(url, num))
            if not text or not text.strip():
                continue
            try:
                select = etree.HTML(str(text))
            except Exception as e:
                log_run.File_enter_error(e)
                continue
            if select is None:
                continue

            try:
                page_records = self._parse_page(select)
            except Exception as e:
                log_run.File_enter_error(e)
                page_records = []
            for record in page_records:
                print(record['title'])
            records.extend(page_records)

            # Check for the next-page link only AFTER parsing, so the last
            # page (which has no such link) is still collected.
            if not select.xpath('//a[@id="PageControl1_hlk_next"]/text()'):
                break
        return records
64 
if __name__ == "__main__":
    t = Secondhandhouse()
    # get_newhouse_data() appends '/house/i3<N>' to its argument, so pass the
    # site root rather than the URL of an individual listing page.
    t.get_newhouse_data('http://esf.fang.com')
    # Multi-threaded crawl over every city site (left disabled, as in the
    # original):
    # s = t.get_newhouse_data
    # pool = ThreadPoolExecutor(30)
    # f = pool.map(s, Secondhandhouse_urls_set)

 



 

爬取房天下整个网站房产数据。。。

标签:activity   run   线程   bdd   web   poll   return   list   数据   

原文地址:http://www.cnblogs.com/Huangsh2017Come-on/p/7750417.html

(0)
(0)
   
举报
评论 一句话评论(0
登录后才能评论!
© 2014 mamicode.com 版权所有  联系我们:gaon5@hotmail.com
迷上了代码!