#!/root/.pyenv/shims/python3.6
# -*- coding:utf-8 -*-
# Author : zhibo.wang
# E-mail : d_1206@qq.com
# Date : 18/04/12 16:11:28
# Desc : Meituan hotel crawler
import time
import hashlib
import socket
import random
import json
import requests
from bs4 import BeautifulSoup
from data_utils.time_convert import get_time_stamp
from data_utils.conmongodb import mongo_con_keepalive
from data_utils.ali_oss import OSS2
class Crawl:
    is_proxy = True  # whether to route requests through the proxy
    proxyMeta = "http://xxxx:xxxx@proxy.abuyun.com:9020"
    proxies = {
        "http": proxyMeta,
        "https": proxyMeta,
    }
    start_url = "http://hotel.meituan.com/"
    time_stamp = get_time_stamp()
    path_dir = "hotel/meituan/{0}/".format(time_stamp)
    time_local = time.localtime(int(time_stamp))
    date = time.strftime("%Y%m%d", time_local)
    # Search API template. "cityId=cityId", "offset=0" and
    # "poi_attr_20022=poi_attr_20022" are placeholders that get replaced
    # per city / page / hotel type in start().
    data_url = ("https://ihotel.meituan.com/hbsearch/HotelSearch"
                "?utm_medium=pc"
                "&version_name=999.9"
                "&cateId=20"
                "&attr_28=129"
                "&uuid="
                "&cityId=cityId"
                "&offset=0"
                "&limit=20"
                "&startDay={0}"
                "&endDay={1}"
                "&q="
                "&sort=defaults"
                "&poi_attr_20022=poi_attr_20022").format(date, date)
    params_citys = "params_citys"
    website = "hotel_meituan"
    timeout = 20  # request timeout in seconds
    if is_proxy:
        wait_time = [0.16, 0.17]
    else:
        wait_time = [1, 1.1, 1.2, 1.3]  # delay between requests (seconds)
    headers = {
        "Host": "hotel.meituan.com",
        "Connection": "keep-alive",
        "Upgrade-Insecure-Requests": "1",
        "Cache-Control": "max-age=0",
        "User-Agent": "Mozilla/5.0 (X11; Linux x86_64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/61.0.3163.100 Safari/537.36",
        "Accept": "text/html,application/xhtml+xml,application/xml;q=0.9,image/webp,image/apng,*/*;q=0.8",
        "Accept-Encoding": "gzip, deflate",
        "Accept-Language": "zh-CN,zh;q=0.8,en-US;q=0.5,en;q=0.3",
        "Content-Type": "text/html",
    }

    def __init__(self):
        self.db = mongo_con_keepalive()
        # Record this crawl batch's OSS path so downstream jobs can find it.
        self.db.get_collection('pathdir_dict').insert_one(
            {'pathdir': self.path_dir, 'website': self.website, 'flag': False})
        self.oss = OSS2()
        super(Crawl, self).__init__()

    def req(self, url, headers, pattern=True, num=3):
        # Fetch a URL, retrying up to `num` times on non-200 responses.
        # Returns a BeautifulSoup document when pattern is True, parsed
        # JSON otherwise, or None on failure.
        time.sleep(random.choice(self.wait_time))
        soup = None
        if not num:
            return soup
        try:
            if self.is_proxy:
                r = requests.get(url, headers=headers, timeout=self.timeout, proxies=self.proxies)
            else:
                r = requests.get(url, headers=headers, timeout=self.timeout)
            if r.status_code == 200:
                r.encoding = 'utf-8'
                if pattern:
                    soup = BeautifulSoup(r.text, "html.parser")
                else:
                    soup = r.json()
            else:
                num -= 1
                return self.req(url, headers, pattern, num)
        except Exception as e:
            print("fun req error: ", e)
        return soup

    def get_hotel_type_code(self, city_data):
        # Scrape the city page's filter bar for hotel-type names and their
        # poi_attr codes (parsed out of hrefs like .../c123/).
        city_url = "{0}{1}/".format(self.start_url, city_data.get("meituan_code"))
        headers = self.headers
        soup = self.req(city_url, headers, pattern=True)
        end_data = None
        if soup:
            # The third "search-row-content" block holds the hotel-type filter.
            txt = soup.find_all("div", class_="search-row-content")[2]
            end_data = [{"name": i.get_text().strip(),
                         "poi_attr": i.get("href").split("/")[-2].replace("c", "")}
                        for i in txt.find_all("a")]
        return end_data

    def create_filename(self, url):
        # Build a unique filename: hostname_domain_md5(url)_timestamp.html
        fname = '%s_%s_%s_%s.html' % (socket.gethostname(),
                                      url.split('//')[-1].split('/')[0].replace('.', '-'),
                                      hashlib.md5(url.encode()).hexdigest(),
                                      str(time.time()).split('.')[0])
        return fname

    def get_data_totalcount(self, tot_url, headers):
        # Ask the search API how many records match, to drive pagination.
        data = self.req(tot_url, headers, pattern=False)
        count = None
        if data:
            count = data.get("data").get("totalcount")
        return count

    def start(self):
        city_datas = self.db.get_collection(self.params_citys).find({})
        for city_data in city_datas:
            cityname = city_data.get("cityname")
            if city_data.get("meituan_code"):
                referer = "{0}{1}/".format(self.start_url, city_data.get("meituan_code"))
                # Hotel-type codes for this city
                hotel_type_codes = self.get_hotel_type_code(city_data)
                if hotel_type_codes:
                    # Copy the class-level headers so the API-specific fields
                    # below don't leak into later HTML requests.
                    headers = dict(self.headers)
                    headers["Content-Type"] = "application/json, text/plain, */*"
                    headers["Host"] = "ihotel.meituan.com"
                    headers["Origin"] = "http://hotel.meituan.com"
                    headers["Referer"] = referer
                    for hotel_code in hotel_type_codes:
                        hotel_type_name = hotel_code.get("name")
                        hotel_type_code = hotel_code.get("poi_attr")
                        tot_url = self.data_url.replace(
                            "cityId=cityId",
                            "cityId={0}".format(city_data.get("meituan_id"))
                        ).replace(
                            "poi_attr_20022=poi_attr_20022",
                            "poi_attr_20022={0}".format(hotel_code.get("poi_attr")))
                        # Total record count for this city / hotel type
                        totalcount = self.get_data_totalcount(tot_url, headers)
                        if totalcount:
                            # One URL per page of 20 records; e.g. totalcount=95
                            # yields offsets 0, 20, 40, 60, 80.
                            all_url = [tot_url.replace("offset=0", "offset={0}".format(c))
                                       for c in range(0, totalcount + 1, 20)]
                            for url_ in all_url:
                                data = self.req(url_, headers, pattern=False)
                                if data:
                                    file_ = "{0}{1}".format(self.path_dir, self.create_filename(url_))
                                    data["cityname"] = cityname
                                    data["hotel_type_name"] = hotel_type_name
                                    data["hotel_type_code"] = hotel_type_code
                                    # Upload the annotated JSON page to OSS.
                                    self.oss.uploadfiledata(file_, json.dumps(data))

if __name__ == "__main__":
    C = Crawl()
    C.start()
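
The three data_utils imports at the top are the author's internal helpers and are not shown in the post. Below is a minimal sketch of drop-in stand-ins, inferred purely from how the crawler calls them; the connection string, database name, endpoint, bucket name, and credentials are all placeholders, and get_time_stamp is assumed to return a Unix-timestamp string (the crawler passes it through int() for time.localtime()).

# Hypothetical stand-ins for the unshown data_utils package; names and
# behavior are inferred from the calls above, not from the real library.
import time

import oss2  # Aliyun OSS SDK
from pymongo import MongoClient


def get_time_stamp():
    # Assumed: current Unix timestamp as a string.
    return str(int(time.time()))


def mongo_con_keepalive():
    # Assumed: returns a pymongo Database; the crawler only calls
    # .get_collection() on it. Host and database name are placeholders.
    client = MongoClient("mongodb://localhost:27017")
    return client["crawler"]


class OSS2:
    # Thin wrapper over Aliyun OSS: uploadfiledata(key, data) uploads a
    # string as an object under the given key. Credentials, endpoint and
    # bucket name below are placeholders.
    def __init__(self):
        auth = oss2.Auth("<access-key-id>", "<access-key-secret>")
        self.bucket = oss2.Bucket(auth, "oss-cn-hangzhou.aliyuncs.com", "<bucket-name>")

    def uploadfiledata(self, filename, data):
        self.bucket.put_object(filename, data)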
Original post: https://www.cnblogs.com/dockers/p/9238473.html