标签:return lag md5 group ide iges oat utils author
#!/usr/bin/env python
# -*- coding:utf-8 -*-
# Author : zhibo.wang
# E-mail : d_1206@qq.com
# Date   : 18/04/24 09:22:58
# Desc   : Ganji rental-listing spider

import copy
import hashlib
import socket
import time

import scrapy
from bs4 import BeautifulSoup

from house_spider.agent import get_agent
from utils.conmongodb import mongo_con_keepalive
from utils.time_convert import get_time_stamp


class GanJiSpider(scrapy.Spider):
    """Crawl ganji.com rental listings city by city.

    The crawl walks three levels: the per-city rental landing page
    (``parse_district``), the per-district list pages (``parse_page``) and
    finally each listing's detail page (``parse_item``), whose raw HTML is
    emitted together with a storage path.
    """

    name = 'ganji_rent_spider'
    allowed_domains = ['ganji.com']
    custom_settings = {
        "DOWNLOAD_DELAY": 0.4,
    }

    def __init__(self, *args, **kwargs):
        """Load the city list and register this run's output directory.

        ``*args`` / ``**kwargs`` are forwarded to ``scrapy.Spider`` so that
        spider arguments supplied by Scrapy do not break construction.
        """
        self.db = mongo_con_keepalive()
        # One start entry per city found in the ``para_city_list`` collection.
        self.start_data = [
            {
                "cityname": i.get("cityname"),
                "city_code": i.get("ganji_code"),
                "url": "http://{0}.ganji.com/fang1/m1/".format(i.get("ganji_code")),
            }
            for i in self.db.get_collection("para_city_list").find({}, {"_id": 0})
        ]
        self.efs = self.db.get_collection('pathdir_dict')
        self.ganji_header = get_agent('ganji_header')
        self.path_dir = "realstate/house/ganji/rent/{0}/".format(get_time_stamp())
        # Record the output directory of this run, flagged as not yet processed.
        self.efs.insert_one({'pathdir': self.path_dir, 'website': 'ganji_rent', 'flag': False})
        super(GanJiSpider, self).__init__(*args, **kwargs)

    def start_requests(self):
        """Return one request per city, pointing at its rental landing page."""
        reqs = []
        for start_ in self.start_data:
            # Deep copy so per-request header tweaks never leak into the template.
            hd = copy.deepcopy(self.ganji_header[0])
            url = start_.get("url")
            hd['Host'] = url.split("/")[2]
            data = {
                "cityname": start_.get("cityname"),
                "city_code": start_.get("city_code"),
                "url": url,
                "host": hd['Host'],
            }
            reqs.append(scrapy.Request(url, headers=hd, callback=self.parse_district, meta=data))
        return reqs

    def parse_district(self, response):
        """Yield one list-page request per district linked from a city page."""
        meta = response.request.meta
        hd = copy.deepcopy(self.ganji_header[0])
        cityname = meta.get("cityname")
        city_code = meta.get("city_code")
        hd['Host'] = meta.get("host")
        hd["Referer"] = meta.get("url")

        soup = BeautifulSoup(response.text, 'html.parser')
        district_box = soup.find("div", class_="thr-list")
        if district_box is None:
            # Page layout differs from what is expected; nothing to follow.
            self.logger.warning("district list not found: %s", response.url)
            return

        for district_ in district_box.find_all("li", class_="item"):
            link = district_.find("a")
            if link is None:
                continue
            url = "http://{0}.ganji.com{1}".format(city_code, link.get("href"))
            data = {
                "cityname": cityname,
                "city_code": city_code,
                "url": url,
                "host": hd['Host'],
                # Only the first page of a district expands pagination.
                "page": True,
            }
            yield scrapy.Request(url, headers=hd, callback=self.parse_page, meta=data)

    def parse_page(self, response):
        """Handle a listing index page.

        Yields a detail-page request for every listing found and, when the
        request was flagged with ``page=True``, requests for further pages.
        """
        meta = response.request.meta
        hd = copy.deepcopy(self.ganji_header[0])
        cityname = meta.get("cityname")
        city_code = meta.get("city_code")
        hd['Host'] = meta.get("host")
        hd["Referer"] = meta.get("url")
        soup = BeautifulSoup(response.text, 'html.parser')

        # Parse first, yield afterwards: a ``yield`` must not sit inside a
        # catch-all handler, which would also intercept generator shutdown.
        try:
            house_list = soup.find("div", class_="f-list js-tips-list").find_all(
                "div", class_="f-list-item ershoufang-list")
            # Detail-page address of every listing on this page.
            house_urls = [
                "http://{0}.ganji.com{1}".format(city_code, i.get("href"))
                for i in house_list
            ]
        except AttributeError:
            print("没有租房数据")
            house_urls = []
        for house_url in house_urls:
            yield scrapy.Request(house_url, headers=hd, callback=self.parse_item)

        if not meta.get("page"):
            return

        try:
            all_count = int(
                soup.find("p", class_="m-result f-fr")
                .find("span", class_="num")
                .get_text()
                .replace("套", "")
            )
            # 50 listings per page, rounded up.
            end_page, remainder = divmod(all_count, 50)
            if remainder:
                end_page += 1
            # NOTE(review): the upper bound is half the computed page count,
            # as in the original code; confirm that this is intended.
            page_urls = [
                meta.get("url").replace("m1/", "m1o{0}/".format(i))
                for i in range(2, int(end_page / 2))
            ]
        except (AttributeError, ValueError) as e:
            print("没有分页: ", e)
            return

        for page_url in page_urls:
            data = {
                "cityname": cityname,
                "city_code": city_code,
                "url": page_url,
                "host": hd['Host'],
                "page": False,
            }
            yield scrapy.Request(page_url, headers=hd, callback=self.parse_page, meta=data)

    def parse_item(self, response):
        """Return the raw detail-page HTML together with its storage path.

        The file name combines the local host name, the page's host (dots
        replaced by dashes), the MD5 of the URL and the current epoch second.
        """
        fname = '%s_%s_%s_%s.html' % (
            socket.gethostname(),
            response.url.split('//')[-1].split('/')[0].replace('.', '-'),
            hashlib.md5(response.url.encode()).hexdigest(),
            str(time.time()).split('.')[0],
        )
        # Append the source URL as a trailing HTML comment.
        doc = response.text + '\n<!--%s-->' % response.url
        return {'path': self.path_dir + fname, 'data': doc}
def etl(html):
    """Extract structured fields from one Ganji rental detail page.

    Args:
        html: Raw HTML text of a listing detail page.

    Returns:
        A dict holding the listing's title, price parts, facilities,
        description, community name, address, coordinates and the mapped
        attribute fields (``huxing``, ``buildingArea``, ...). Attributes that
        are absent from the page are ``None``. Returns ``None`` when a
        required page section is missing or parsing fails in any other way.
    """
    # Local import: ``json`` is not among the imports visible in this file.
    import json

    # Page label -> output field name.
    field_mapping = {
        "户型": "huxing",
        "面积": "buildingArea",
        "朝向": "orientation",
        "楼层": "floor",
        "电梯情况": "is_Elevator",
        "装修情况": "renovation",
        "入住时间": "in_house_date",
        "看房时间": "see_house_date",
    }

    soup = BeautifulSoup(html, "html.parser")
    try:
        card_top = soup.find("div", class_="card-top")
        name = card_top.find("p", class_="card-title").find("i").get_text()

        card_pay = card_top.find("ul", class_="card-pay f-clear")
        price_num = card_pay.find("span", class_="num").get_text()
        price_rmb = card_pay.find("span", class_="rmb").get_text()
        price_month = card_pay.find("span", class_="month").get_text()

        # Label/value pairs of the attribute list.
        er_ = card_top.find("ul", class_="er-list f-clear")
        info_dict = {}
        for li in er_.find_all("li", class_="item f-fl"):
            label = li.find("span", class_="t").get_text().strip().replace(":", "")
            value = (
                li.find("span", class_="content")
                .get_text()
                .strip()
                .replace(";", "")
                .replace("\xa0 ", "")
            )
            info_dict[label] = value
        _info = {field: info_dict.get(label) for label, field in field_mapping.items()}

        address_data = card_top.find("ul", class_="er-list-two f-clear")
        xiaoqu_name, address = None, None
        for li in address_data.find_all("li", class_="er-item f-fl"):
            text = li.get_text().replace("\n", "")
            # Parse the entry that actually carries the label rather than a
            # fixed position in the list.
            if "小区名称:" in text:
                xiaoqu_name = text.replace("小区名称:", "").split(" ")[0].strip()
            elif "所在地址:" in text:
                address = text.replace("所在地址:", "")

        others = soup.find("div", class_="f-main-left f-fl f-w970")
        house_peizhi = ",".join(
            p.get_text()
            for p in others.find(
                "div", {"class": "f-group", "id": "js-house-peizhi"}
            ).find_all("p", class_="text")
        )
        describe = (
            others.find("div", {"class": "f-group", "id": "js-house-describe"})
            .find("div", class_="describe")
            .get_text()
            .strip()
            .replace("\r", "")
        )

        # Coordinates are optional: a missing map block or a malformed value
        # yields ``None`` coordinates instead of failing the whole record.
        try:
            lo = json.loads(
                others.find("div", class_="col-sub map-wrap")
                .find("div", class_="map-content js-map-tab js-so-map-tab")
                .get("data-ref")
            )
            lnglat = lo.get("lnglat")
            lng_b, lat_b = [float(i.replace("b", "")) for i in lnglat.split(",")]
        except (AttributeError, TypeError, ValueError):
            lng_b, lat_b = None, None

        if lng_b is not None:
            # NOTE(review): bd09togcj02 / gcj02towgs84 are not defined in the
            # visible part of this file; presumably provided by a coordinate
            # conversion helper module - confirm they are in scope.
            lng_a, lat_a = bd09togcj02(lng_b, lat_b)  # AMap (GCJ-02)
            lng_g, lat_g = gcj02towgs84(lng_a, lat_a)  # GPS (WGS-84)
        else:
            lng_a, lat_a = None, None
            lng_g, lat_g = None, None

        urlPath = soup.find("div", class_="f-crumbs f-w1190").get_text().strip().replace("\n", "")
        # The first breadcrumb segment carries the city name plus the site name.
        cityname = urlPath.split(">")[0].replace("赶集", "")

        end_data = {
            "cityname": cityname,
            "lng_a": lng_a,  # AMap
            "lat_a": lat_a,
            "lng_b": lng_b,  # Baidu
            "lat_b": lat_b,
            "lng_g": lng_g,  # GPS
            "lat_g": lat_g,
            "gps_s": "b",  # presumably marks Baidu as the coordinate source - verify
            "urlPath": urlPath,
            "name": name,
            "price": price_num,
            "price_rmb": price_rmb,
            "pricetype": price_month,
            "peitao": house_peizhi,
            "describe": describe,
            "projname": xiaoqu_name,
            "address": address,
        }
        end_data.update(_info)
    except Exception:
        # Best-effort extraction: any missing section or parse error means the
        # page is skipped rather than aborting the caller.
        end_data = None
    return end_data
{ "crawlTime" : "2018-04-23 17:46:14", "recordBatchNo" : "17", "province" : "广东省", "cityname" : "云浮 ", "lng_a" : 112.03291232909547, "lat_a" : 22.931326338916666, "lng_b" : 112.039476574, "lat_b" : 22.9370670779, "lng_g" : 112.02761893741206, "lat_g" : 22.934005392048782, "gps_s" : "b", "urlPath" : "云浮赶集 > 云浮房产 > 云浮租房 > 云城租房", "name" : "富临花园交通便利物业很好", "price" : "700", "price_rmb" : "¥", "pricetype" : "/月", "peitao" : "电视,空调,热水器,洗衣机,冰箱,床,沙发,衣柜,暖气,宽带网,可做饭,独立阳台,独卫", "describe" : "147家电齐全 有空调 洗衣机 电视机(富临花园) 热水器,带一个阳台,视野无遮挡。", "projname" : "富临花园", "address" : null, "huxing" : "1室1厅1卫", "buildingArea" : "整租51㎡", "orientation" : "南北向", "floor" : "低层/共20层", "is_Elevator" : "有电梯", "renovation" : "精装修", "in_house_date" : "2018-04-09", "see_house_date" : "周六/日" }
标签:return lag md5 group ide iges oat utils author
原文地址:https://www.cnblogs.com/dockers/p/9238454.html