标签:间隔 hex dig split href util lin website 文件名
#!/root/.pyenv/shims/python3.6
# -*- coding:utf-8 -*-
# Author  : zhibo.wang
# E-mail  : d_1206@qq.com
# Date    : 18/04/12 16:11:28
# Desc    : Meituan hotel crawler
"""Crawl Meituan hotel search results city by city and archive the raw pages.

For every city document read from MongoDB the crawler loads the city page,
extracts the hotel-type filters from it, pages through the hotel search API
for each filter and uploads every JSON page to OSS.
"""

import hashlib
import json
import random
import socket
import time

import requests
from bs4 import BeautifulSoup

from data_utils.time_convert import get_time_stamp
from data_utils.conmongodb import mongo_con_keepalive
from data_utils.ali_oss import OSS2


class Crawl:
    """Meituan hotel crawler.

    All configuration lives in class attributes. ``time_stamp``, ``path_dir``
    and ``data_url`` are computed once, when the class body is executed, so
    every page of one run lands under the same ``path_dir`` prefix.
    """

    is_proxy = True
    # Proxy endpoint used when ``is_proxy`` is true. The credentials shown
    # here are placeholders and have to be replaced with real ones.
    proxyMeta = "http://xxxx:xxxx@proxy.abuyun.com:9020"
    proxies = {
        "http": proxyMeta,
        "https": proxyMeta,
    }
    start_url = "http://hotel.meituan.com/"
    time_stamp = get_time_stamp()
    path_dir = "hotel/meituan/{0}/".format(time_stamp)
    time_local = time.localtime(int(time_stamp))
    date = time.strftime("%Y%m%d", time_local)
    # Search API template. "cityId=cityId", "poi_attr_20022=poi_attr_20022"
    # and "offset=0" are placeholders substituted with str.replace() in
    # start(); only the two date fields are filled in here.
    data_url = (
        "https://ihotel.meituan.com/hbsearch/HotelSearch"
        "?utm_medium=pc"
        "&version_name=999.9"
        "&cateId=20"
        "&attr_28=129"
        "&uuid="
        "&cityId=cityId"
        "&offset=0"
        "&limit=20"
        "&startDay={0}"
        "&endDay={1}"
        "&q="
        "&sort=defaults"
        "&poi_attr_20022=poi_attr_20022"
    ).format(date, date)
    page_size = 20  # paging step; must match "limit=" in data_url
    params_citys = "params_citys"
    website = "hotel_meituan"
    timeout = 20  # request timeout passed to requests.get
    # Pause choices between two requests, picked at random before each one.
    if is_proxy:
        wait_time = [0.16, 0.17]
    else:
        wait_time = [1, 1.1, 1.2, 1.3]
    headers = {
        "Host": "hotel.meituan.com",
        "Connection": "keep-alive",
        "Upgrade-Insecure-Requests": "1",
        "Cache-Control": "max-age=0",
        "User-Agent": "Mozilla/5.0 (X11; Linux x86_64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/61.0.3163.100 Safari/537.36",
        "Accept": "text/html,application/xhtml+xml,application/xml;q=0.9,image/webp,image/apng,*/*;q=0.8",
        "Accept-Encoding": "gzip, deflate",
        "Accept-Language": "zh-CN,zh;q=0.8,en-US;q=0.5,en;q=0.3",
        "Content-Type": "text/html",
    }

    def __init__(self):
        """Open the MongoDB and OSS handles and register this run's path.

        A document with this run's ``path_dir`` and ``website`` and
        ``flag`` set to False is inserted into the ``pathdir_dict``
        collection.
        """
        self.db = mongo_con_keepalive()
        self.db.get_collection('pathdir_dict').insert_one(
            {'pathdir': self.path_dir, 'website': self.website, 'flag': False}
        )
        self.oss = OSS2()
        super().__init__()

    def req(self, url, headers, pattern=True, num=3):
        """Fetch ``url`` and return the parsed body.

        Args:
            url: Address to request.
            headers: HTTP headers sent with the request.
            pattern: When true the body is parsed as HTML and a
                BeautifulSoup tree is returned; otherwise the body is
                decoded as JSON.
            num: Maximum number of attempts. Only non-200 responses are
                retried.

        Returns:
            The parsed body, or None when every attempt returned a non-200
            status or when the request or the parsing raised.
        """
        request_kwargs = {"headers": headers, "timeout": self.timeout}
        if self.is_proxy:
            request_kwargs["proxies"] = self.proxies

        for _ in range(num):
            # Throttle before every attempt, retries included.
            time.sleep(random.choice(self.wait_time))
            try:
                r = requests.get(url, **request_kwargs)
                if r.status_code != 200:
                    continue
                r.encoding = 'utf-8'
                if pattern:
                    return BeautifulSoup(r.text, "html.parser")
                return r.json()
            except Exception as e:
                # Best effort: one failing URL must not stop the crawl, so
                # the error is reported and the caller receives None.
                print("fun req error: ", e)
                return None
        return None

    def get_hotel_type_code(self, city_data):
        """Return the hotel-type filters listed on a city page.

        Args:
            city_data: City document; its ``meituan_code`` value is
                appended to ``start_url`` to build the city page address.

        Returns:
            A list of ``{"name": ..., "poi_attr": ...}`` dicts, one per
            link in the third ``search-row-content`` row, or None when the
            page could not be fetched or has fewer than three such rows.
        """
        city_url = "{0}{1}/".format(self.start_url, city_data.get("meituan_code"))
        # Hand out a copy so the class-level header dict is never shared.
        soup = self.req(city_url, dict(self.headers), pattern=True)
        if not soup:
            return None

        rows = soup.find_all("div", class_="search-row-content")
        if len(rows) < 3:
            # Unexpected layout: report "no types" instead of raising.
            return None

        end_data = []
        for link in rows[2].find_all("a"):
            href = link.get("href")
            parts = href.split("/") if href else []
            if len(parts) < 2:
                # Link without a usable href; there is no code to extract.
                continue
            end_data.append({
                "name": link.get_text().strip(),
                # The code is the second-to-last path segment, "c" removed.
                "poi_attr": parts[-2].replace("c", ""),
            })
        return end_data

    def create_filename(self, url):
        """Build the object name under which the page for ``url`` is stored.

        The name joins the local host name, the URL's host with dots
        replaced by dashes, the MD5 hex digest of the URL and the current
        Unix time in whole seconds, followed by ``.html``.
        """
        url_host = url.split('//')[-1].split('/')[0].replace('.', '-')
        url_digest = hashlib.md5(url.encode()).hexdigest()
        seconds = str(time.time()).split('.')[0]
        return '%s_%s_%s_%s.html' % (socket.gethostname(), url_host, url_digest, seconds)

    def get_data_totalcount(self, tot_url, headers):
        """Return ``data.totalcount`` from the search response for ``tot_url``.

        Returns None when the request failed or the response does not have
        the expected nested dict structure.
        """
        data = self.req(tot_url, headers, pattern=False)
        if not isinstance(data, dict):
            return None
        payload = data.get("data")
        if not isinstance(payload, dict):
            return None
        return payload.get("totalcount")

    def start(self):
        """Crawl every city in the ``params_citys`` collection.

        Cities without a ``meituan_code`` and cities whose hotel types
        cannot be read are skipped. Every result page that is fetched
        successfully is tagged with the city name and hotel type and
        uploaded through ``self.oss``.
        """
        city_datas = self.db.get_collection(self.params_citys).find({})
        for city_data in city_datas:
            meituan_code = city_data.get("meituan_code")
            if not meituan_code:
                continue

            hotel_type_codes = self.get_hotel_type_code(city_data)
            if not hotel_type_codes:
                continue

            cityname = city_data.get("cityname")
            # Work on a copy: writing into self.headers would leak the API
            # headers into the city-page request of every following city.
            headers = dict(self.headers)
            headers["Content-Type"] = "application/json, text/plain, */*"
            headers["Host"] = "ihotel.meituan.com"
            headers["Origin"] = "http://hotel.meituan.com"
            headers["Referer"] = "{0}{1}/".format(self.start_url, meituan_code)

            for hotel_code in hotel_type_codes:
                hotel_type_name = hotel_code.get("name")
                hotel_type_code = hotel_code.get("poi_attr")
                tot_url = self.data_url.replace(
                    "cityId=cityId",
                    "cityId={0}".format(city_data.get("meituan_id")),
                ).replace(
                    "poi_attr_20022=poi_attr_20022",
                    "poi_attr_20022={0}".format(hotel_type_code),
                )

                totalcount = self.get_data_totalcount(tot_url, headers)
                if not totalcount:
                    continue

                # Offsets 0, page_size, ... strictly below totalcount, so a
                # total that is a multiple of the page size does not cause
                # a request for an empty trailing page.
                for offset in range(0, totalcount, self.page_size):
                    url_ = tot_url.replace("offset=0", "offset={0}".format(offset))
                    data = self.req(url_, headers, pattern=False)
                    if not data:
                        continue
                    file_ = "{0}{1}".format(self.path_dir, self.create_filename(url_))
                    data["cityname"] = cityname
                    data["hotel_type_name"] = hotel_type_name
                    data["hotel_type_code"] = hotel_type_code
                    self.oss.uploadfiledata(file_, json.dumps(data))


if __name__ == "__main__":
    C = Crawl()
    C.start()
标签:间隔 hex dig split href util lin website 文件名
原文地址:https://www.cnblogs.com/dockers/p/9238473.html