码迷,mamicode.com
首页 > 其他好文 > 详细

爬取链家任意城市二手房数据(天津)

时间:2019-08-16 21:12:50      阅读:95      评论:0      收藏:0      [点我收藏+]

标签:城市   一个   rds   res   nis   式表   work   dex   发布   

  1 #!/usr/bin/env python
  2 # -*- coding: utf-8 -*-
  3 # @Time    : 2019-08-16 12:40
  4 # @Author  : Anthony
  5 # @Email   : ianghont7@163.com
  6 # @File    : 爬取链家任意城市二手房数据.py
  7 
  8 
  9 import requests
 10 from lxml import etree
 11 import time
 12 import xlrd
 13 import os
 14 import xlwt
 15 from xlutils.copy import copy
 16 
 17 # 伪装请求
 18 headers = {
 19     User-Agent: Mozilla/5.0 (Windows NT 10.0; WOW64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/47.0.2526.106 BIDUBrowser/8.7 Safari/537.36
 20 }
 21 
 22 xlsInfo = {}
 23 
 24 def catchHouseDetail(url):
 25     # 通过requests模块模拟get请求
 26     page_text = requests.get(url, headers=headers, stream=True)
 27 
 28     # 将互联网上获取的页面数据加载到etree对象中
 29     tree = etree.HTML(page_text.text)
 30 
 31     # 定位页面标签位置装入一个list中
 32     li_list = tree.xpath(//div[@class="leftContent"]/ul/li)
 33     all_house_list = []
 34     # 遍历列表中每一个字段
 35     for li in li_list:
 36         info = []
 37         # info = {}
 38         # info["房屋标题"] = li.xpath(‘.//div[@class="info clear"]/div[@class="title"]/a/text()‘)[0]
 39         # info["小区名称"] = li.xpath(‘.//div[@class="address"]/div[@class="houseInfo"]/text()‘)[0].split(‘|‘)[1]
 40         # info[‘建筑面积‘] = li.xpath(‘.//div[@class="address"]/div[@class="houseInfo"]/text()‘)[0].split(‘|‘)[2]
 41         # info[‘房屋朝向‘] = li.xpath(‘.//div[@class="address"]/div[@class="houseInfo"]/text()‘)[0].split(‘|‘)[3]
 42         # info[‘装修情况‘] = li.xpath(‘.//div[@class="address"]/div[@class="houseInfo"]/text()‘)[0].split(‘|‘)[4]
 43         # info[‘所在楼层‘] = li.xpath(‘.//div[@class="flood"]/div[@class="positionInfo"]/text()‘)[0].split(‘ ‘)[0]
 44         # info[‘所在区域‘] = li.xpath(‘.//div[@class="flood"]/div[@class="positionInfo"]/a/text()‘)[0]
 45         # info[‘总价‘] = li.xpath(‘.//div[@class="priceInfo"]/div[@class="totalPrice"]/span/text()‘)[0] + ‘万‘
 46         # info[‘每平米售价‘] = li.xpath(‘.//div[@class="priceInfo"]/div[@class="unitPrice"]/span/text()‘)[0]
 47         # info[‘房屋关注人数‘] = li.xpath(‘.//div[@class="followInfo"]/text()‘)[0].split(‘/‘)[0]
 48         # info[‘房屋发布时间‘] = li.xpath(‘.//div[@class="followInfo"]/text()‘)[0].split(‘/‘)[1]
 49 
 50         #房屋标题
 51         houseTitle = li.xpath(.//div[@class="info clear"]/div[@class="title"]/a/text())[0]
 52         #小区名称
 53         houseName = li.xpath(.//div[@class="address"]/div[@class="houseInfo"]/text())[0].split(|)[1]
 54         #建筑面积
 55         houseArea = li.xpath(.//div[@class="address"]/div[@class="houseInfo"]/text())[0].split(|)[2]
 56         #房屋朝向
 57         houseTowards = li.xpath(.//div[@class="address"]/div[@class="houseInfo"]/text())[0].split(|)[3]
 58         #装修情况
 59         houseFinish = li.xpath(.//div[@class="address"]/div[@class="houseInfo"]/text())[0].split(|)[4]
 60         #所在楼层
 61         houseFloor = li.xpath(.//div[@class="flood"]/div[@class="positionInfo"]/text())[0].split( )[0]
 62         #所在区域
 63         houseSite = li.xpath(.//div[@class="flood"]/div[@class="positionInfo"]/a/text())[0]
 64         #总价
 65         housePrices = li.xpath(.//div[@class="priceInfo"]/div[@class="totalPrice"]/span/text())[0] + 
 66         #每平米售价
 67         houseSquarePrices = li.xpath(.//div[@class="priceInfo"]/div[@class="unitPrice"]/span/text())[0]
 68         #房屋关注人数
 69         houseFollowers = li.xpath(.//div[@class="followInfo"]/text())[0].split(/)[0]
 70         #房屋发布时间
 71         houseTime = li.xpath(.//div[@class="followInfo"]/text())[0].split(/)[1]
 72         info.append(houseTitle)
 73         info.append(houseName)
 74         info.append(houseArea)
 75         info.append(houseTowards)
 76         info.append(houseFinish)
 77         info.append(houseFloor)
 78         info.append(houseSite)
 79         info.append(housePrices)
 80         info.append(houseSquarePrices)
 81         info.append(houseFollowers)
 82         info.append(houseTime)
 83         all_house_list.append(info)
 84     if if_xls_exits() == True:
 85         write_excel_xls_append(xlsInfo["xlsName"],all_house_list)
 86 
 87 
 88 #获取数据写入xls表格中
 89 def write_excel_xls(path, sheet_name, value):
 90     index = len(value)  # 获取需要写入数据的行数
 91     workbook = xlwt.Workbook()  # 新建一个工作簿
 92     sheet = workbook.add_sheet(sheet_name)  # 在工作簿中新建一个表格
 93     for i in range(0, index):
 94         for j in range(0, len(value[i])):
 95             sheet.write(i, j, value[i][j])  # 像表格中写入数据(对应的行和列)
 96     workbook.save(path)  # 保存工作簿
 97     print("xls格式表格写入数据成功!")
 98 
 99 
100 
def write_excel_xls_append(path, value):
    """Append the rows in *value* below the existing rows of an .xls file.

    The workbook at *path* is read with xlrd, copied into a writable xlwt
    workbook via ``xlutils.copy.copy``, extended, and saved back to *path*.

    Args:
        path: Path of an existing .xls workbook.
        value: Sequence of rows; each row is a sequence of cell values.
    """
    workbook = xlrd.open_workbook(path)
    # Rows already present in the first sheet; new data starts right after.
    rows_old = workbook.sheet_by_index(0).nrows

    # xlrd workbooks are read-only, so convert to a writable xlwt copy.
    new_workbook = copy(workbook)
    new_worksheet = new_workbook.get_sheet(0)
    for row_idx, row in enumerate(value, start=rows_old):
        for col_idx, cell in enumerate(row):
            new_worksheet.write(row_idx, col_idx, cell)
    new_workbook.save(path)
    print("xls格式表格【追加】写入数据成功!")
114 
115 
116 
117 
def if_xls_exits():
    """Ensure the output workbook exists and record its name in ``xlsInfo``.

    If the workbook is not present in the current directory it is created
    with a single header row via ``write_excel_xls``.

    Returns:
        True, after ``xlsInfo["xlsName"]`` has been set to the file name.
    """
    book_name_xls = '天津链家二手房信息表.xls'
    sheet_name_xls = '房屋信息'
    value_title = [["房屋标题", "房屋户型", "建筑面积", "房屋朝向", "装修情况", "所在楼层", "所在区域", "总价", "每平米售价", "房屋关注人数", "房屋发布时间"], ]

    # Create the file once when missing instead of looping and re-checking,
    # which could spin forever if the file never became visible.
    if not os.path.exists('./%s' % book_name_xls):
        write_excel_xls(book_name_xls, sheet_name_xls, value_title)

    xlsInfo["xlsName"] = book_name_xls
    return True
129 
130 
131 
def catch():
    """Scrape listing pages 1 through 999, pausing 3 seconds between pages.

    A failure on one page is reported and the crawl moves on to the next.
    """
    for page_number in range(1, 1000):
        page = 'https://tj.lianjia.com/ershoufang/pg{}/'.format(page_number)
        try:
            catchHouseDetail(page)
        except Exception as exc:
            # Keep the crawl best-effort, but say what went wrong. Catching
            # Exception (not a bare except) lets Ctrl-C still stop the run.
            print("failed to scrape {}: {!r}".format(page, exc))
        # Throttle requests between pages.
        time.sleep(3)
140 
141 
# Run the crawl only when executed as a script, not when imported.
if __name__ == '__main__':
    catch()

效果图:

技术图片

 

爬取链家任意城市二手房数据(天津)

标签:城市   一个   rds   res   nis   式表   work   dex   发布   

原文地址:https://www.cnblogs.com/ipyanthony/p/11365962.html

(0)
(0)
   
举报
评论 一句话评论(0)
登录后才能评论!
© 2014 mamicode.com 版权所有  联系我们:gaon5@hotmail.com
迷上了代码!