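Scraping Ganji (赶集) rentals with Scrapy: the spider below walks city → district → listing pages and saves each detail page as raw HTML, and a separate etl() function then parses the saved pages into structured records.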
#!/usr/bin/env python
# -*- coding:utf-8 -*-
# Author : zhibo.wang
# E-mail : d_1206@qq.com
# Date   : 18/04/24 09:22:58
# Desc   : 赶集租房 (Ganji rentals)

import copy
import hashlib
import socket
import time

import scrapy
from bs4 import BeautifulSoup

from house_spider.agent import get_agent
from utils.time_convert import get_time_stamp
from utils.conmongodb import mongo_con_keepalive


class GanJiSpider(scrapy.Spider):
    name = 'ganji_rent_spider'
    allowed_domains = ['ganji.com']
    custom_settings = {
        "DOWNLOAD_DELAY": 0.4,
    }

    def __init__(self):
        # seed one start URL per city from the para_city_list collection
        self.db = mongo_con_keepalive()
        self.start_data = [{"cityname": i.get("cityname"),
                            "city_code": i.get("ganji_code"),
                            "url": "http://{0}.ganji.com/fang1/m1/".format(i.get("ganji_code"))}
                           for i in self.db.get_collection("para_city_list").find({}, {"_id": 0})]
        self.efs = self.db.get_collection('pathdir_dict')
        self.ganji_header = get_agent('ganji_header')
        # one output directory per crawl batch, registered in pathdir_dict
        self.path_dir = "realstate/house/ganji/rent/{0}/".format(get_time_stamp())
        self.efs.insert_one({'pathdir': self.path_dir, 'website': 'ganji_rent', 'flag': False})
        super(GanJiSpider, self).__init__()

    def start_requests(self):
        reqs = []
        for start_ in self.start_data:
            hd = copy.deepcopy(self.ganji_header[0])
            url = start_.get("url")
            hd['Host'] = url.split("/")[2]
            data = {"cityname": start_.get("cityname"),
                    "city_code": start_.get("city_code"),
                    "url": url, "host": hd['Host']}
            reqs.append(scrapy.Request(url, headers=hd, callback=self.parse_district, meta=data))
        return reqs

    def parse_district(self, response):
        # fan out from the city page to its district listing pages
        hd = copy.deepcopy(self.ganji_header[0])
        host = response.request.meta.get("host")
        cityname = response.request.meta.get("cityname")
        city_code = response.request.meta.get("city_code")
        hd['Host'] = host
        hd["Referer"] = response.request.meta.get("url")
        soup = BeautifulSoup(response.text, 'html.parser')
        district_list = soup.find("div", class_="thr-list").find_all("li", class_="item")
        for district_ in district_list:
            url = "http://{0}.ganji.com{1}".format(city_code, district_.find("a").get("href"))
            data = {"cityname": cityname, "city_code": city_code, "url": url,
                    "host": hd['Host'], "page": True}
            yield scrapy.Request(url, headers=hd, callback=self.parse_page, meta=data)

    def parse_page(self, response):
        # handle one listing page: queue every detail page, then paginate
        hd = copy.deepcopy(self.ganji_header[0])
        host = response.request.meta.get("host")
        cityname = response.request.meta.get("cityname")
        hd['Host'] = host
        hd["Referer"] = response.request.meta.get("url")
        soup = BeautifulSoup(response.text, 'html.parser')
        city_code = response.request.meta.get("city_code")
        try:
            house_list = soup.find("div", class_="f-list js-tips-list").find_all(
                "div", class_="f-list-item ershoufang-list")
            # collect the detail-page URL of every listing on this page
            house_datas = ["http://{0}.ganji.com{1}".format(city_code, i.get("href"))
                           for i in house_list]
            for house_data in house_datas:
                yield scrapy.Request(house_data, headers=hd, callback=self.parse_item)
        except Exception:
            print("no rental listings")
        if response.request.meta.get("page"):
            # only the first page of a district computes the pagination fan-out
            try:
                all_count = int(soup.find("p", class_="m-result f-fr")
                                .find("span", class_="num").get_text().replace("套", ""))
                end_page = int(all_count / 50)  # 50 listings per page
                if all_count % 50 != 0:
                    end_page += 1
                page_urls = [response.request.meta.get("url").replace("m1/", "m1o{0}/".format(i))
                             for i in range(2, end_page + 1)]
                for page_url in page_urls:
                    data = {"cityname": cityname, "city_code": city_code, "url": page_url,
                            "host": hd['Host'], "page": False}
                    yield scrapy.Request(page_url, headers=hd, callback=self.parse_page, meta=data)
            except Exception as e:
                print("no pagination: ", e)

    def parse_item(self, response):
        # no parsing here: persist the raw page, tagged with its source URL
        fname = '%s_%s_%s_%s.html' % (socket.gethostname(),
                                      response.url.split('//')[-1].split('/')[0].replace('.', '-'),
                                      hashlib.md5(response.url.encode()).hexdigest(),
                                      str(time.time()).split('.')[0])
        doc = response.text + '\n<!--%s-->' % response.url
        return {'path': self.path_dir + fname, 'data': doc}
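parse_item stores nothing itself; it hands back a {'path': ..., 'data': ...} dict, so some item pipeline has to write the file. The post does not show that pipeline; a minimal sketch of what it could look like (the class name and everything in it are assumptions, wired in via ITEM_PIPELINES):

import os

class RawHtmlPipeline(object):
    # hypothetical pipeline: writes each {'path', 'data'} item returned by
    # parse_item() to the local filesystem, creating the batch directory
    def process_item(self, item, spider):
        os.makedirs(os.path.dirname(item['path']), exist_ok=True)
        with open(item['path'], 'w', encoding='utf-8') as f:
            f.write(item['data'])
        return item

The saved pages are then parsed offline by the etl() function below.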
import json

from bs4 import BeautifulSoup


def etl(html):
    # parse one saved Ganji detail page into a flat record; returns None on failure
    soup = BeautifulSoup(html, "html.parser")
    try:
        card_top = soup.find("div", class_="card-top")
        name = card_top.find("p", class_="card-title").find("i").get_text()
        card_pay = card_top.find("ul", class_="card-pay f-clear")
        price_num = card_pay.find("span", class_="num").get_text()
        price_rmb = card_pay.find("span", class_="rmb").get_text()
        price_month = card_pay.find("span", class_="month").get_text()
        # the er-list block holds label/value pairs such as 户型 / 面积 / 朝向
        er_ = card_top.find("ul", class_="er-list f-clear")
        item_list = er_.find_all("li", class_="item f-fl")
        info_dict = {i.find("span", class_="t").get_text().strip().replace(":", ""):
                     i.find("span", class_="content").get_text().strip()
                      .replace(";", "").replace("\xa0 ", "")
                     for i in item_list}
        # map the Chinese labels onto the output field names
        field_mapping = {
            "户型": "huxing",
            "面积": "buildingArea",
            "朝向": "orientation",
            "楼层": "floor",
            "电梯情况": "is_Elevator",
            "装修情况": "renovation",
            "入住时间": "in_house_date",
            "看房时间": "see_house_date",
        }
        _info = {v: info_dict.get(k) for k, v in field_mapping.items()}
        address_data = card_top.find("ul", class_="er-list-two f-clear")
        add = [i.get_text().replace("\n", "")
               for i in address_data.find_all("li", class_="er-item f-fl")]
        xiaoqu_name, address = None, None
        for i in add:
            if "小区名称:" in i:
                xiaoqu_name = i.replace("小区名称:", "").split(" ")[0].strip()
            elif "所在地址:" in i:
                address = i.replace("所在地址:", "")
        others = soup.find("div", class_="f-main-left f-fl f-w970")
        house_peizhi = ",".join(
            [i.get_text() for i in others.find(
                "div", {"class": "f-group", "id": "js-house-peizhi"}).find_all("p", class_="text")])
        describe = others.find(
            "div", {"class": "f-group", "id": "js-house-describe"}).find(
            "div", class_="describe").get_text().strip().replace("\r", "")
        try:
            # the map widget carries Baidu (BD-09) coordinates in its data-ref attribute
            lo = json.loads(others.find("div", class_="col-sub map-wrap").find(
                "div", class_="map-content js-map-tab js-so-map-tab").get("data-ref"))
            lnglat = lo.get("lnglat")
            lng_b, lat_b = [float(i.replace("b", "")) for i in lnglat.split(",")]
        except Exception:
            lng_b, lat_b = None, None
        if lng_b:
            # coordinate converters; a sketch follows after this function
            lng_a, lat_a = bd09togcj02(lng_b, lat_b)   # AMap (GCJ-02)
            lng_g, lat_g = gcj02towgs84(lng_a, lat_a)  # GPS (WGS-84)
        else:
            lng_a, lat_a = None, None
            lng_g, lat_g = None, None
        urlPath = soup.find("div", class_="f-crumbs f-w1190").get_text().strip().replace("\n", "")
        try:
            cityname = urlPath.split(">")[0].replace("赶集", "")
        except Exception:
            cityname = None
        end_json = {
            "cityname": cityname,
            "lng_a": lng_a,  # AMap (GCJ-02)
            "lat_a": lat_a,
            "lng_b": lng_b,  # Baidu (BD-09)
            "lat_b": lat_b,
            "lng_g": lng_g,  # GPS (WGS-84)
            "lat_g": lat_g,
            "gps_s": "b",    # source coordinate system: Baidu
            "urlPath": urlPath,
            "name": name,
            "price": price_num,
            "price_rmb": price_rmb,
            "pricetype": price_month,
            "peitao": house_peizhi,
            "describe": describe,
            "projname": xiaoqu_name,
            "address": address,
        }
        end_data = end_json.copy()
        end_data.update(_info)
    except Exception:
        end_data = None
    return end_data
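etl() calls two converters, bd09togcj02 and gcj02towgs84, that the post neither defines nor imports. Ganji's map widget exposes Baidu BD-09 coordinates (hence "gps_s": "b"), and the record additionally stores GCJ-02 (AMap) and WGS-84 (GPS) versions. Below is a sketch using the widely circulated open-source formulas (as in the coordtransform / eviltransform projects); the author's actual helpers may differ:

import math

PI = 3.1415926535897932384626
X_PI = PI * 3000.0 / 180.0
A = 6378245.0                  # Krasovsky ellipsoid semi-major axis
EE = 0.00669342162296594323    # eccentricity squared


def bd09togcj02(bd_lng, bd_lat):
    # Baidu BD-09 -> GCJ-02 (AMap)
    x = bd_lng - 0.0065
    y = bd_lat - 0.006
    z = math.sqrt(x * x + y * y) - 0.00002 * math.sin(y * X_PI)
    theta = math.atan2(y, x) - 0.000003 * math.cos(x * X_PI)
    return z * math.cos(theta), z * math.sin(theta)


def gcj02towgs84(lng, lat):
    # GCJ-02 (AMap) -> WGS-84 (GPS): subtract the estimated offset
    dlat = _transformlat(lng - 105.0, lat - 35.0)
    dlng = _transformlng(lng - 105.0, lat - 35.0)
    radlat = lat / 180.0 * PI
    magic = 1 - EE * math.sin(radlat) ** 2
    sqrtmagic = math.sqrt(magic)
    dlat = (dlat * 180.0) / ((A * (1 - EE)) / (magic * sqrtmagic) * PI)
    dlng = (dlng * 180.0) / (A / sqrtmagic * math.cos(radlat) * PI)
    return lng - dlng, lat - dlat


def _transformlat(lng, lat):
    ret = (-100.0 + 2.0 * lng + 3.0 * lat + 0.2 * lat * lat +
           0.1 * lng * lat + 0.2 * math.sqrt(abs(lng)))
    ret += (20.0 * math.sin(6.0 * lng * PI) + 20.0 * math.sin(2.0 * lng * PI)) * 2.0 / 3.0
    ret += (20.0 * math.sin(lat * PI) + 40.0 * math.sin(lat / 3.0 * PI)) * 2.0 / 3.0
    ret += (160.0 * math.sin(lat / 12.0 * PI) + 320 * math.sin(lat * PI / 30.0)) * 2.0 / 3.0
    return ret


def _transformlng(lng, lat):
    ret = (300.0 + lng + 2.0 * lat + 0.1 * lng * lng +
           0.1 * lng * lat + 0.1 * math.sqrt(abs(lng)))
    ret += (20.0 * math.sin(6.0 * lng * PI) + 20.0 * math.sin(2.0 * lng * PI)) * 2.0 / 3.0
    ret += (20.0 * math.sin(lng * PI) + 40.0 * math.sin(lng / 3.0 * PI)) * 2.0 / 3.0
    ret += (150.0 * math.sin(lng / 12.0 * PI) + 300.0 * math.sin(lng / 30.0 * PI)) * 2.0 / 3.0
    return ret

A record produced this way, after downstream enrichment with crawlTime, recordBatchNo, and province, looks like this: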
{
"crawlTime" : "2018-04-23 17:46:14",
"recordBatchNo" : "17",
"province" : "广东省",
"cityname" : "云浮 ",
"lng_a" : 112.03291232909547,
"lat_a" : 22.931326338916666,
"lng_b" : 112.039476574,
"lat_b" : 22.9370670779,
"lng_g" : 112.02761893741206,
"lat_g" : 22.934005392048782,
"gps_s" : "b",
"urlPath" : "云浮赶集 > 云浮房产 > 云浮租房 > 云城租房",
"name" : "富临花园交通便利物业很好",
"price" : "700",
"price_rmb" : "¥",
"pricetype" : "/月",
"peitao" : "电视,空调,热水器,洗衣机,冰箱,床,沙发,衣柜,暖气,宽带网,可做饭,独立阳台,独卫",
"describe" : "147家电齐全 有空调 洗衣机 电视机(富临花园) 热水器,带一个阳台,视野无遮挡。",
"projname" : "富临花园",
"address" : null,
"huxing" : "1室1厅1卫",
"buildingArea" : "整租51㎡",
"orientation" : "南北向",
"floor" : "低层/共20层",
"is_Elevator" : "有电梯",
"renovation" : "精装修",
"in_house_date" : "2018-04-09",
"see_house_date" : "周六/日"
}
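Nothing in the post connects the spider's output directory to etl(); a hypothetical driver (the collection name ganji_rent_data and the glob pattern are assumptions) could look like:

import glob

from utils.conmongodb import mongo_con_keepalive

db = mongo_con_keepalive()
for path in glob.glob("realstate/house/ganji/rent/*/*.html"):
    with open(path, encoding="utf-8") as f:
        record = etl(f.read())
    if record:  # etl() returns None when a page fails to parse
        db.get_collection("ganji_rent_data").insert_one(record)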
Original post: https://www.cnblogs.com/dockers/p/9238454.html