假期十二

时间：2020-02-12 23:57:47 阅读：127 评论：0 收藏：0 [点我收藏+]

标签：函数 http 链接地址 header webkit 计算 string ext spl

爬虫爬取

from bs4 import BeautifulSoup

import requests

import xlwt

def getHouseList(url):

house = []

headers = {

‘User-Agent‘:

‘Mozilla/5.0 (Windows NT 6.1; WOW64) AppleWebKit/537.1 (KHTML, like Gecko) Chrome/21.0.1180.71 Safari/537.1 LBBROWSER‘

}

# get从网页获取信息

res = requests.get(url, headers=headers)

# 解析内容

soup = BeautifulSoup(res.content, ‘html.parser‘)

# 房源title

housename_divs = soup.find_all(‘div‘, class_=‘title‘)

for housename_div in housename_divs:

housename_as = housename_div.find_all(‘a‘)

for housename_a in housename_as:

housename = []

# 标题

housename.append(housename_a.get_text())

# 超链接

housename.append(housename_a.get(‘href‘))

house.append(housename)

huseinfo_divs = soup.find_all(‘div‘, class_=‘houseInfo‘)

for i in range(len(huseinfo_divs)):

info = huseinfo_divs[i].get_text()

infos = info.split(‘|‘)

# 小区名称

house[i].append(infos[0])

# 户型

house[i].append(infos[1])

# 平米

house[i].append(infos[2])

# 查询总价

house_prices = soup.find_all(‘div‘, class_=‘totalPrice‘)

for i in range(len(house_prices)):

# 价格

price = house_prices[i].get_text()

house[i].append(price)

return house

# 爬取房屋详细信息：所在区域、套内面积

def houseinfo(url):

headers = {

‘User-Agent‘:

‘Mozilla/5.0 (Windows NT 6.1; WOW64) AppleWebKit/537.1 (KHTML, like Gecko) Chrome/21.0.1180.71 Safari/537.1 LBBROWSER‘

}

res = requests.get(url, headers=headers)

soup = BeautifulSoup(res.content, ‘html.parser‘)

msg = []

# 所在区域

areainfos = soup.find_all(‘span‘, class_=‘info‘)

for areainfo in areainfos:

# 只需要获取第一个a标签的内容即可

area = areainfo.find(‘a‘)

if (not area):

continue

hrefStr = area[‘href‘]

if (hrefStr.startswith(‘javascript‘)):

continue

msg.append(area.get_text())

break

# 根据房屋户型计算套内面积

infolist = soup.find_all(‘div‘, id=‘infoList‘)

num = []

for info in infolist:

cols = info.find_all(‘div‘, class_=‘col‘)

for i in cols:

pingmi = i.get_text()

try:

a = float(pingmi[:-2])

num.append(a)

except ValueError:

continue

msg.append(sum(num))

return msg

# 将房源信息写入excel文件

def writeExcel(excelPath, houses):

workbook = xlwt.Workbook()

# 获取第一个sheet页

sheet = workbook.add_sheet(‘git‘)

row0 = [‘标题‘, ‘链接地址‘, ‘户型‘, ‘面积‘, ‘朝向‘, ‘总价‘, ‘所属区域‘, ‘套内面积‘]

for i in range(0, len(row0)):

sheet.write(0, i, row0[i])

for i in range(0, len(houses)):

house = houses[i]

print(house)

for j in range(0, len(house)):

sheet.write(i + 1, j, house[j])

workbook.save(excelPath)

# 主函数

def main():

data = []

for i in range(1, 5):

print(‘-----分隔符‘, i, ‘-------‘)

if i == 1:

url = ‘https://sjz.lianjia.com/ershoufang/l2rs%E5%92%8C%E5%B9%B3%E4%B8%96%E5%AE%B6/‘

else:

url = ‘https://sjz.lianjia.com/ershoufang/pg‘ + str(i) + ‘l2rs%E5%92%8C%E5%B9%B3%E4%B8%96%E5%AE%B6/‘

houses = getHouseList(url)

for house in houses:

link = house[1]

if (not link or not link.startswith(‘http‘)):

continue

mianji = houseinfo(link)

# 将套内面积、所在区域增加到房源信息

house.extend(mianji)

data.extend(houses)

writeExcel(‘d:/house.xls‘, data)

假期十二

标签：函数 http 链接地址 header webkit 计算 string ext spl

原文地址：https://www.cnblogs.com/jbwen/p/12301762.html

踩

(0)

评论一句话评论（0）

分享档案

更多>

2021年07月29日 (22)
2021年07月28日 (40)
2021年07月27日 (32)
2021年07月26日 (79)
2021年07月23日 (29)
2021年07月22日 (30)
2021年07月21日 (42)
2021年07月20日 (16)
2021年07月19日 (90)
2021年07月16日 (35)

周排行