码迷,mamicode.com
首页 > 其他好文 > 详细

榛果 美团 登录 爬虫 requests session

时间:2018-08-22 14:55:41      阅读:591      评论:0      收藏:0      [点我收藏+]

标签:ima   post   cep   min   div   pymysql   auth   nsx   wow   

美团旗下所有产品的登录都采用重定向的方式来解决登录问题，

即利用 session 会话来解决登录问题。

当然也可以每次都模拟它的 cookie 来进行登录。

 

我用的代理是阿布云代理，你们也可以选择别的代理。

这次是爬取的美团旗下的榛果民宿

  1 import requests
  2 from urllib.parse import urlencode
  3 import json
  4 import time, datetime
  5 import logging
  6 from lxml import etree
  7 import pymysql
  8 from pymysql.err import IntegrityError
  9 
 10 proxies_ = {
 11     http: @http-dyn.abuyun.com:9020,
 12     https: @http-dyn.abuyun.com:9020,
 13 }
 14 headers = {
 15     User-Agent: Mozilla/5.0 (Windows NT 10.0; WOW64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/61.0.3163.100 Safari/537.36 OPR/48.0.2685.52
 16 }
 17 session = requests.Session()
 18 
 19 
 20 def session_get(url, header=headers, tab=12):
 21     if tab == 0:
 22         return False
 23     try:
 24         response = session.get(url, headers=header, proxies=proxies_)
 25         time.sleep(2)
 26         return response if response.status_code == 200 else session_get(url, header, tab - 1)
 27     except Exception as e:
 28         if tab == 1:
 29             logging.exception(e)
 30         return session_get(url, header, tab - 1)
 31 
 32 
 33 def session_post(url, header=headers, data=None, tab=12):
 34     if tab == 0:
 35         return False
 36     try:
 37         response = session.post(url, headers=header, data=data, proxies=proxies_)
 38         time.sleep(2)
 39         return response if response.status_code == 200 else session_post(url, header, data, tab - 1)
 40     except Exception as e:
 41         if tab == 1:
 42             logging.exception(e)
 43         return session_post(url, header, data, tab - 1)
 44 
 45 
 46 def get_node_text(node, xpath):
 47     """
 48     通过节点和xpath来获取节点需要的内容
 49     :param node:
 50     :param xpath:
 51     :return:
 52     """
 53     try:
 54         if xpath == "string(.)": return node.xpath(string(.)).strip()
 55         if len(node.xpath(xpath)) > 0:
 56             return node.xpath(xpath)[0].strip() if isinstance(node.xpath(xpath)[0], str) else node.xpath(xpath)[0]
 57         return ""
 58     except:
 59         logging.exception(获取xpath %s 出错 % (xpath))
 60         return None
 61 
 62 
 63 def get_youjia_tpp_conn():
 64     """
 65     获取井队数据库连接
 66     :return:
 67     """
 68     return pymysql.connect(host=host, user=user, passwd=passwd, db=db, port=3306,
 69                            charset=utf8)
 70 
 71 
 72 def storage_database_text(data_json, t_name, l_name="youjia_tpp"):
 73     """
 74     非json类型数据存储数据库
 75     :param data_json:
 76     :param t_name:
 77     :param l_name:
 78     :return:
 79     """
 80     now_time = str(datetime.datetime.now().strftime(%Y-%m-%d %H:%M:%S))
 81     data_list = []
 82     insert_sql = "INSERT INTO " + l_name + "." + t_name + " ("
 83     update_sql = "UPDATE " + l_name + "." + t_name + " SET "
 84     for key in data_json:
 85         update_sql += str(key) + "=%s , "
 86         if str(key) == "id":
 87             id_key = data_json[key]
 88         insert_sql += str(key) + ","
 89     update_sql += "modify_time = ‘" + str(now_time) + "‘ where id = ‘" + str(id_key) + ""
 90     insert_sql = insert_sql[:-1]
 91     insert_sql += ")VALUES("
 92     for key in data_json:
 93         insert_sql += "%s,"
 94         data_list.append(str(data_json[key]))
 95     insert_sql = insert_sql[:-1]
 96     insert_sql += ");"
 97     # print(update_sql)
 98     # print(insert_sql)
 99     with get_youjia_tpp_conn() as conn:
100         try:
101             print("storage_database_text  insert_sql : ", t_name)
102             conn.execute(insert_sql, tuple(data_list))
103         except IntegrityError:
104             print("storage_database_text  update_sql : ", t_name)
105             conn.execute(update_sql, tuple(data_list))
106         except Exception as msg:
107             logging.exception(msg)
108 
109 
def storage_database_json(id_, data_json, j_name, t_name, l_name="youjia_tpp"):
    """Store a JSON string in one column, inserting first and updating on conflict.

    An INSERT of ``(id, j_name)`` is attempted. When it raises
    ``IntegrityError`` the column is overwritten with an UPDATE keyed on
    ``id``, which also sets ``modify_time`` to the current local time.
    Any other failure is logged and swallowed.

    :param id_: value for the ``id`` column.
    :param data_json: the JSON text to store.
    :param j_name: name of the column that receives ``data_json``.
    :param t_name: table name.
    :param l_name: database (schema) name.
    :return: None
    """
    now_time = datetime.datetime.now().strftime("%Y-%m-%d %H:%M:%S")
    table = l_name + "." + t_name
    insert_sql = "INSERT INTO " + table + " (`id`,`" + j_name + "`)VALUES(%s,%s);"
    update_sql = "update " + table + " set `" + j_name + "`=%s , modify_time=%s where id = %s;"

    conn = get_youjia_tpp_conn()
    try:
        # An explicit cursor and commit are used instead of relying on what
        # ``with connection`` yields, which differs between PyMySQL releases.
        with conn.cursor() as cursor:
            try:
                print("storage_database_json  insert_sql : ", t_name)
                cursor.execute(insert_sql, (id_, data_json))
            except IntegrityError:
                print("storage_database_json  update_sql : ", t_name)
                cursor.execute(update_sql, (data_json, now_time, id_))
        conn.commit()
    except Exception as msg:
        logging.exception(msg)
    finally:
        conn.close()
133 
134 
def pre_login():
    """Fetch the Meituan passport login page for the "phoenix" service.

    :return: the page HTML as text, or ``None`` when the page could not be
        fetched after the retries performed by ``session_get``.
    """
    param = {
        "service": "phoenix",
        "continue": "https://www.zhenguo.com/auth/authenticated/?continue=/help/trust/",
    }
    url = "https://passport.meituan.com/account/unitivelogin?" + urlencode(param)
    response = session_get(url=url, header=headers, tab=5)
    # session_get returns False (not a response) once its retries run out,
    # so test the value itself before touching response attributes.
    if not response:
        print("预登陆出错")
        return None
    print("pre_login 成功")
    return response.text
152 
153 
def parse_param(html):
    """Extract the hidden login-form fields from the pre-login page.

    :param html: HTML text of the login page.
    :return: the tuple ``(csrf, uuid, need_captcha, origin, fingerprint)``,
        where ``need_captcha`` is the captcha field's ``style`` attribute with
        ``"display:"`` removed; ``None`` when any field is missing or the
        page cannot be parsed.
    """
    try:
        tree = etree.HTML(html)
        csrf = tree.xpath('//input[@name="csrf"]/@value')[0]
        origin = tree.xpath('//input[@name="origin"]/@value')[0]
        fingerprint = tree.xpath('//input[@name="fingerprint"]/@value')[0]
        uuid = tree.xpath('//i[@class="form-uuid"]/text()')[0]
        captcha_style = tree.xpath(
            '//div[@class="form-field J-form-field-captcha form-field--captcha"]/@style')[0]
        need_captcha = captcha_style.replace("display:", "")
        return (csrf, uuid, need_captcha, origin, fingerprint)
    except Exception:
        # Keep the original user-facing message, but also record the cause.
        logging.exception("parse_param failed")
        print("解析csrf,uuid,need_captcha出错")
        return None
166 
167 
def formal_login(username, password, param, captcha_path="C:/Users/admin/Desktop/image/zg.jpg"):
    """Submit the login form, asking the operator to type the captcha.

    The captcha image is always downloaded and saved to *captcha_path*; the
    ``need_captcha`` entry of *param* is not consulted, and the operator is
    prompted on stdin for the text.

    :param username: account name, posted as the ``email`` field.
    :param password: account password.
    :param param: the tuple returned by ``parse_param``:
        ``(csrf, uuid, need_captcha, origin, fingerprint)``.
    :param captcha_path: file the captcha image is written to.
    :return: the HTML text of the login response, or ``None`` on failure.
    """
    if not param:
        print("登录出错")
        return None
    csrf = param[0]
    uuid = param[1]
    origin, fingerprint = param[3], param[4]

    captcha_url = "https://passport.meituan.com/account/captcha?" + urlencode({"uuid": uuid})
    print(captcha_url)
    image_resp = session_get(captcha_url)
    # session_get returns False once its retries run out.
    if not image_resp:
        print("登录出错")
        return None
    with open(captcha_path, "wb") as file:
        file.write(image_resp.content)
    captcha = input("需要验证码:")

    url_param = {
        "uuid": uuid,
        "service": "phoenix",
        "continue": "https://www.zhenguo.com/auth/authenticated/?continue=/help/trust/",
    }
    postdata = {
        "email": username,
        "password": password,
        "captcha": captcha,
        "origin": origin,
        "fingerprint": fingerprint,
        "csrf": csrf,
    }
    url = "https://passport.meituan.com/account/unitivelogin?" + urlencode(url_param)
    response = session_post(url, data=postdata, header=headers)
    if not response:
        print("登录出错")
        return None
    print("登陆成功!")
    return response.text
208 
209 
def parse_token(html):
    """Extract the redirect form fields from the login response page.

    :param html: HTML text returned by ``formal_login``.
    :return: a dict with the keys ``action_url``, ``token``, ``expire``,
        ``isdialog``, ``autologin`` and ``csrf``; ``None`` when any field is
        missing or the page cannot be parsed (the failure is logged).
    """
    try:
        tree = etree.HTML(html)
        return {
            "action_url": tree.xpath('//form[@class="J-form mainbox__content"]/@action')[0],
            "token": tree.xpath('//input[@name="token"]/@value')[0],
            "expire": tree.xpath('//input[@name="expire"]/@value')[0],
            "isdialog": tree.xpath('//input[@name="isdialog"]/@value')[0],
            "autologin": tree.xpath('//input[@name="autologin"]/@value')[0],
            "csrf": tree.xpath('//*[@id="csrf"]/text()')[0],
        }
    except Exception:
        logging.exception("解析token出错")
        return None
227 
228 
def redirect_login(token_json):
    """Post the token form to its action URL to finish the redirect login.

    Side effect: sets ``x-csrf-token`` on the module-level ``headers`` dict,
    so later requests made with those headers carry it too.

    :param token_json: the dict returned by ``parse_token``, i.e.
        ``{"action_url", "token", "expire", "isdialog", "autologin", "csrf"}``.
    :return: None
    """
    postdata = {
        "token": token_json["token"],
        "expire": token_json["expire"],
        "isdialog": token_json["isdialog"],
        "autologin": token_json["autologin"],
        "logintype": "normal",
    }
    headers["x-csrf-token"] = token_json["csrf"]
    trust_response = session_post(token_json["action_url"], data=postdata, header=headers)
    # session_post returns False once its retries run out; report success
    # only when a 200 response actually came back.
    if trust_response:
        print("重定向成功!!")
    else:
        print("重定向出错")
251 
252 
def test():
    """Manual smoke check: fetch a profile page with the logged-in session.

    Waits five seconds, requests the page and prints the status code and
    body, or an error message when the request failed.

    :return: None
    """
    time.sleep(5)
    url = "http://maoyan.com/profile"
    response = session_get(url, header=headers)
    # session_get returns False once its retries run out.
    if not response:
        print("测试出错")
        return
    print(response.status_code)
    print(response.text)
263 
264 
def crawl_order(account_id, token, page_no=1, page_size=20):
    """Crawl the host's orders page by page and store each order.

    For every page the orders HTML page is fetched to read its CSRF token,
    then the order-search API is queried. Each order is stored twice in
    ``zhenguo_order``: its raw JSON in the ``order`` column, and a flat
    ``id`` / ``account_id`` record. Paging continues while a page comes back
    full (``page_size`` entries).

    Side effect: sets ``x-csrf-token``, ``Accept`` and ``Content-Type`` on the
    module-level ``headers`` dict.

    :param account_id: identifier stored alongside each order.
    :param token: login token; not used by this function.
    :param page_no: first page to fetch (1-based).
    :param page_size: number of orders requested per page.
    :return: None
    """
    orders_url = "https://www.zhenguo.com/host/orders/"
    query_order_by_type_url = "https://www.zhenguo.com/gw/order/api/v1/orderSearch/queryOrderByType"

    # Iterating replaces the original recursive call, which passed
    # ``page_no + 1`` in the ``token`` position and so restarted at page 1.
    while True:
        response = session_get(orders_url, header=headers)
        if not response:
            logging.error("crawl_order: could not load %s", orders_url)
            return
        print(response.status_code)
        html = etree.HTML(response.text)
        csrf = html.xpath('//meta[@name="csrf-token"]/@content')[0]
        headers["x-csrf-token"] = csrf
        print(csrf)

        order_by_type = {"pageNow": page_no, "pageSize": page_size, "orderStatusType": 9}
        headers["Accept"] = "application/json"
        headers["Content-Type"] = "application/json"
        query_response = session_post(query_order_by_type_url, data=json.dumps(order_by_type),
                                      header=headers)
        if not query_response:
            logging.error("crawl_order: order query failed for page %s", page_no)
            return
        query_list = query_response.json()["data"]["list"]
        print(len(query_list))
        for order_json in query_list:
            order_id = order_json["orderId"]
            storage_database_json(order_id, json.dumps(order_json), "order", "zhenguo_order")
            storage_database_text({"id": order_id, "account_id": account_id}, "zhenguo_order")

        # A short page means there is nothing further to fetch.
        if len(query_list) != page_size:
            return
        page_no += 1
288 
289 
def house_detail(list_json):
    """Fetch a listing's detail page and add the parsed fields to *list_json*.

    Fields written when the page loads: ``room_type``, ``house_wear``,
    ``room_area``, ``unsubscribe``; plus ``room_count``, ``comment_count``,
    ``rep_rate``, ``rep_length`` for the host-statistics rows that are
    present, and ``less_day`` / ``more_day`` when the booking-limit text has
    at least two parts.

    :param list_json: dict describing the listing; must contain ``"id"``.
        It is modified in place.
    :return: the same ``list_json`` dict (unchanged when the page could not
        be fetched).
    """
    import re  # local import: only this function needs it

    room_id = list_json["id"]
    room_url = "https://www.zhenguo.com/housing/%s" % room_id
    room_response = session_get(room_url)
    if not room_response:
        return list_json

    html = etree.HTML(room_response.text)
    # Common prefix of every XPath used below.
    base = '//*[@id="J-layout"]/div[2]/div/div[2]/div/div[2]'

    list_json["room_type"] = get_node_text(html, base + "/section[1]/div[2]/span[1]/text()")
    list_json["house_wear"] = get_node_text(html, base + "/section[1]/div[2]/span[2]/text()")
    list_json["room_area"] = get_node_text(html, base + "/section[1]/div[2]/span[3]/text()")

    # Row label on the page -> key stored in list_json.
    stat_keys = {
        "房源": "room_count",
        "评价": "comment_count",
        "咨询回复率": "rep_rate",
        "咨询回复时长": "rep_length",
    }
    for node in html.xpath(base + "/section[2]/ul/li"):
        label = get_node_text(node, "./div[1]/text()")
        node_detail = get_node_text(node, "./div[2]/text()")
        if label in stat_keys:
            list_json[stat_keys[label]] = node_detail

    # get_node_text yields None on error and "" on no match; normalise both.
    reserve_text = get_node_text(html, base + "/section[8]/ul[1]/li[2]/text()") or ""
    # NOTE(review): the published source splits on an empty string (which
    # always raises ValueError) and removes an empty string from each part;
    # the real characters were lost in publication. Splitting on whitespace
    # and ASCII/full-width punctuation is an assumption -- confirm the
    # separator, and any unit suffix to strip, against a live page.
    reserve = [part for part in re.split(r"[\s,，、;；]+", reserve_text) if part]
    if len(reserve) > 1:
        list_json["less_day"] = reserve[0].replace("最少预订", "").strip()
        list_json["more_day"] = reserve[1].replace("最多预订", "").strip()

    list_json["unsubscribe"] = get_node_text(html, base + "/section[8]/ul[2]/li/text()")
    return list_json
337 
338 
def crawl_room(account_id, token):
    """Crawl the host's listing page and store every listing with its comments.

    For each listing card the title, price, state and product id are read,
    the detail page is parsed via ``house_detail``, the comments API response
    is stored as JSON in the ``comment`` column of ``zhenguo_room_info``, and
    the flat listing record is stored in the same table.

    :param account_id: identifier stored alongside each listing.
    :param token: login token; not used by this function.
    :return: None
    """
    comment_url = "https://www.zhenguo.com/gw/ugc/api/v1/product/comments?productId=%s&pageNow=1&pageSize=100"
    room_list_url = "https://www.zhenguo.com/house/list/"
    room_response = session_get(url=room_list_url, header=headers)
    # Without a page there is nothing to iterate; the original fell through
    # to the loop with ``html`` undefined.
    if not room_response:
        logging.error("crawl_room: could not load %s", room_list_url)
        return

    html = etree.HTML(room_response.text)
    for node in html.xpath('//div[@class="houseCard__block"]'):
        title = get_node_text(node, './div[@class="houseCard__titleLine"]/text()')  # title
        # NOTE(review): the published source removed a character from the
        # price here, but that character was lost (presumably a currency
        # sign -- confirm), so the raw text is kept.
        price = get_node_text(
            node,
            './div[@class="houseCard__addLine clearfix"]'
            '/span[1]/span[@class="houseCard__price"]/text()')  # price
        state = get_node_text(
            node,
            './div[@class="houseCard__bottomLine clearfix"]/'
            'div[1]/span[@class="houseCard__verifyStatus-5"]/text()')  # state
        room_id = get_node_text(
            node,
            './div[@class="houseCard__bottomLine clearfix"]'
            '/div[1]/@data-product-id')  # listing id
        print(account_id, title, price, state, room_id)
        list_json = {"account_id": account_id, "title": title,
                     "price": price, "state": state, "id": room_id, "room_id": room_id}

        house_json = house_detail(list_json)
        response = session_get(url=comment_url % room_id)
        if response:
            print(response.text)
            storage_database_json(room_id, json.dumps(response.json()), "comment", "zhenguo_room_info",
                                  l_name="youjia_tpp")
        storage_database_text(house_json, "zhenguo_room_info")
365 
366 
def crawl_room_list(account_id, token):
    """Fetch the online listings through the mobile API and print their quotas.

    Requests are sent with a mobile-app User-Agent and the login token as a
    ``token`` cookie. Each listing and the ``data`` part of its quota
    response are printed; nothing is stored.

    :param account_id: not used by this function.
    :param token: login token sent in the ``Cookie`` header.
    :return: None
    """
    app_header = {"User-Agent": "Mozilla/5.0 (Linux; Android 5.1.1; vivo X7 Build/LMY47V) AppleWebKit/537.36 "
                                "(KHTML, like Gecko) Version/4.0 Chrome/39.0.0.0 Mobile Safari/537.36 TitansX/11.6.12 "
                                "KNB/1.2.0 android/5.1.1 phoenix/com.meituan.phoenix/2.6.0 com.meituan.phoenix/2.6.0",
                  "Cookie": "token=" + token}
    list_url = "https://iphx.meituan.com/ds/product/online/list"
    list_resp = session_get(url=list_url, header=app_header)
    if not list_resp:
        return

    for room_json in list_resp.json()["data"]["list"]:
        room_id = room_json["productId"]
        product_quota_url = "https://iphx.meituan.com/api/product/api/v1/product/getProductQuota/" + str(room_id)
        product_quota_resp = session_get(url=product_quota_url, header=app_header)
        print(room_json)
        # session_get returns False once its retries run out; skip the quota
        # for this listing instead of failing on ``False.json()``.
        if product_quota_resp:
            print(product_quota_resp.json()["data"])
382 
383 
384 
def crawl(account_id, token):
    """Run the crawl steps once the login session has been established.

    Only the mobile-API listing crawl is enabled; the web listing crawl and
    the order crawl are left switched off, as in the original.

    :param account_id: identifier passed through to the crawl step.
    :param token: login token passed through to the crawl step.
    :return: None
    """
    crawl_room_list(account_id, token)  # mobile-API listing info

    # crawl_room(account_id, token)  # web listing crawl
    # crawl_order(account_id, token)  # order crawl
394 
395 
def login(username, password):
    """Run the full login flow and return the session token.

    Steps: load the login page, parse its hidden fields, submit the
    credentials (with an operator-typed captcha), parse the token form from
    the response and post it to complete the redirect.

    :param username: account name.
    :param password: account password.
    :return: the ``token`` value parsed from the login response.
    :raises RuntimeError: when no token form could be parsed from the login
        response, i.e. some earlier stage failed.
    """
    html_pre_login = pre_login()
    param = parse_param(html_pre_login)
    print("param: ", param)
    html_login = formal_login(username, password, param)
    token_json = parse_token(html_login)
    print("token_json: ", token_json)
    # Fail with a clear message instead of a TypeError on None further down.
    if not token_json:
        raise RuntimeError("login failed: no token form found in the login response")
    redirect_login(token_json)
    return token_json["token"]
406 
407 
if __name__ == "__main__":
    # NOTE(review): placeholder credentials as published (both are the string
    # "username") -- replace with a real account before running.
    username = "username"
    password = "username"
    token = login(username, password)
    crawl(1, token)

 

榛果 美团 登录 爬虫 requests session

标签:ima   post   cep   min   div   pymysql   auth   nsx   wow   

原文地址:https://www.cnblogs.com/bianzhiwei/p/9517282.html

(0)
(0)
   
举报
评论 一句话评论(0
登录后才能评论!
© 2014 mamicode.com 版权所有  联系我们:gaon5@hotmail.com
迷上了代码!