
Lvmama (驴妈妈) Travel Crawler
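
This post walks through a two-stage crawler for Lvmama travel products: an overview-page script collects product detail links from each destination channel's listing page into a gly link table, and a detail-page script pulls the unprocessed links back out, parses the product fields with regexes, and writes the results to an lvmama table.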

Date: 2019-03-14 13:20:21


Scraping links from the overview pages

import requests
import re
import pymysql
import hashlib
import datetime


class Demo(object):
    def __init__(self):
        # MySQL connection settings
        self.host = '127.0.0.1'
        self.db = 'app_mark'
        self.user = 'root'
        self.passwd = '123456'
        self.charset = 'utf8mb4'
        self.headers = {
            'User-Agent': 'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/72.0.3626.121 Safari/537.36',
        }
        self.url = 'http://www.lvmama.com/'
        # Channel listing pages, one per destination
        self.channel_link = [
            'http://s.lvmama.com/group/H13K110000?keyword=%E6%99%AE%E5%90%89%E5%B2%9B&k=0#list',  # islands
            'http://s.lvmama.com/route/H13K310000?keyword=%E6%96%B0%E5%8A%A0%E5%9D%A1&k=0#list',  # Southeast Asia
            'http://s.lvmama.com/route/H13K310000?keyword=%E9%A6%99%E6%B8%AF&k=0#list',  # Hong Kong/Macau/Taiwan
            'http://s.lvmama.com/group/H13K110000?keyword=%E8%BF%AA%E6%8B%9C&k=0#list',  # Dubai
            'http://s.lvmama.com/group/C262H13K110000?keyword=%E4%BF%84%E7%BD%97%E6%96%AF&tabType=group#list',  # Russia
            'http://s.lvmama.com/group/H13K110000Y4?keyword=%E8%B6%8A%E5%8D%97#list#list',  # Vietnam
            'http://s.lvmama.com/group/C265H13K110000?keyword=%E6%B3%95%E5%9B%BD&tabType=group#list%22',  # France/Switzerland/Italy/Germany
            'http://s.lvmama.com/group/H13K110000?keyword=%E5%B7%B4%E5%8E%98%E5%B2%9B&k=0#list',  # Bali
            'http://s.lvmama.com/route/H13K310000?keyword=%E6%97%A5%E6%9C%AC&k=0#list',  # Japan
            'http://s.lvmama.com/route/H13K310000?keyword=%E6%AC%A7%E6%B4%B2&k=0#list',  # Europe & America
            'http://s.lvmama.com/route/H13K440100?keyword=%E6%96%B0%E5%8A%A0%E5%9D%A1&k=0#list',  # Singapore
            'http://s.lvmama.com/route/H13K310000?keyword=%E9%A6%99%E6%B8%AF&k=0#list',  # Hong Kong
            'http://s.lvmama.com/route/H13K310000?keyword=%E6%BE%B3%E6%B4%B2&k=0#list',  # Australia
            'http://s.lvmama.com/route/H13K310000?keyword=%E6%B3%B0%E5%9B%BD&k=0#list',  # Thailand
            'http://s.lvmama.com/route/H13K440300?keyword=%E4%B8%89%E4%BA%9A&k=0#list',  # Sanya
            'http://s.lvmama.com/route/H13K440300P2?keyword=%E4%B8%89%E4%BA%9A&tabType=route350',  # Sanya p2
            'http://s.lvmama.com/route/H13K440300P3?keyword=%E4%B8%89%E4%BA%9A&tabType=route350',  # Sanya p3
            'http://s.lvmama.com/route/H13K440300P4?keyword=%E4%B8%89%E4%BA%9A&tabType=route350',  # Sanya p4
            'http://s.lvmama.com/route/H13K440300?keyword=%E5%8E%A6%E9%97%A8&k=0#list',  # Xiamen
            'http://s.lvmama.com/route/H13K440300?keyword=%E5%B9%BF%E4%B8%9C&k=0#list',  # Guangdong
            'http://s.lvmama.com/route/H13K440300?keyword=%E4%BA%91%E5%8D%97&k=0#list',  # Yunnan
            'http://s.lvmama.com/route/H13K440300?keyword=%E4%B8%8A%E6%B5%B7&k=0#list',  # Shanghai
            'http://s.lvmama.com/route/H13K440300?keyword=%E8%A5%BF%E5%AE%89&k=0#list',  # Xi'an
            'http://s.lvmama.com/route/H13K440300?keyword=%E6%88%90%E9%83%BD&k=0#list',  # Chengdu
            'http://s.lvmama.com/route/H13K440300?keyword=%E5%90%89%E6%9E%97&k=0#list',  # Jilin
            'http://s.lvmama.com/route/H13K440300?keyword=%E8%A5%BF%E5%8C%97&k=0#list',  # Northwest China
            'http://s.lvmama.com/scenictour/K110000?keyword=%E5%8C%97%E4%BA%AC&k=0#list',  # Beijing
            'http://s.lvmama.com/scenictour/K110000?keyword=%E5%B1%B1%E4%B8%9C&k=0#list',  # Shandong
            'http://s.lvmama.com/scenictour/K110000?keyword=%E5%B1%B1%E8%A5%BF&k=0#list',  # Shanxi
            'http://s.lvmama.com/scenictour/K110000?keyword=%E6%B2%B3%E5%8C%97&k=0#list',  # Hebei
            'http://s.lvmama.com/scenictour/K110000?keyword=%E8%BE%BD%E5%AE%81&k=0#list',  # Liaoning
            ]
        # Channel names, index-aligned with channel_link
        self.channel_name = [
            '海岛',
            '东南亚',
            '中国港澳台',
            '迪拜',
            '俄罗斯',
            '越南',
            '法瑞意德',
            '巴厘岛',
            '日本',
            '欧洲',
            '新加坡',
            '香港',
            '澳洲',
            '泰国',
            '三亚',
            '三亚p2',
            '三亚p3',
            '三亚p4',
            '厦门',
            '广东',
            '云南',
            '上海',
            '西安',
            '成都',
            '吉林',
            '西北',
            '北京',
            '山东',
            '山西',
            '河北',
            '辽宁',
        ]

    def get_html(self, url):
        # Fetch a page and decode it with the detected encoding
        response = requests.get(url, headers=self.headers)
        response.encoding = response.apparent_encoding
        html = response.text
        return html

    def get_data(self):
        # Home-page scraping (kept for reference)
        # html = self.get_html(self.url)
        # datas = re.findall('<li data-mmurl=.*?<div class="footLink">', html, re.S)[0]
        # lis = re.findall('(<li data-mmurl=.*?</li>)', datas, re.S)
        # for li in lis:
        #     # detail_url = re.findall('<li data-mmurl="(.*?)"', li, re.S)  # app link of the detail page
        #     detail_url = re.findall('href="(.*?)"', li, re.S)[0]  # web link of the detail page
        #     self.save_data(detail_url)
        # print(datas)

        # Channel scraping: extract detail-page links with regexes
        for index, channel in enumerate(self.channel_link):
            html = self.get_html(channel)
            divs = re.findall('<div class="product-left".*<div class="paging orangestyle"', html, re.S)[0]
            divs = re.findall('<div class="product-section">.*?</div>', divs, re.S)
            for div in divs:
                print(self.channel_name[index])
                url = re.findall('<a href="(.*?)"', div, re.S)[0]
                self.save_data(url)

    def save_data(self, url):
        # Insert the link into the gly table; the MD5 of the URL serves as the dedup key
        print(url)
        hkey = hashlib.md5(url.encode(encoding='utf-8')).hexdigest()
        sitename = '驴妈妈旅游'
        lasttime = datetime.datetime.now().strftime('%Y-%m-%d %H:%M:%S')
        tag = 0  # 0 = not yet parsed by the detail-page crawler
        list_sql = [url, hkey, tag, sitename, lasttime]
        con = pymysql.connect(host=self.host, db=self.db, user=self.user, passwd=self.passwd, charset=self.charset)
        cur = con.cursor()
        sql = 'insert into gly(link, hkey, tag, sitename, lasttime) values (%s, %s, %s, %s, %s)'
        try:
            cur.execute(sql, list_sql)
            print('insert success')
        except Exception as e:
            con.rollback()
            print('error~', e)
        else:
            con.commit()
        cur.close()
        con.close()


if __name__ == '__main__':
    demo = Demo()
    demo.get_data()
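
The script assumes the gly table already exists. Below is a minimal sketch of a compatible schema; the column types are assumptions inferred from the INSERT in save_data(), and the unique index on hkey (the MD5 of the link) is one way to make a re-crawled URL fail the insert and hit the rollback branch instead of creating a duplicate row:

import pymysql

# Hypothetical DDL inferred from the INSERT in save_data(); all column types are assumptions.
DDL = """
create table if not exists gly (
    link     varchar(512) not null,    -- detail-page URL
    hkey     char(32)     not null,    -- md5(link), the dedup key
    tag      varchar(2)   default '0', -- '0' = not yet parsed, '1' = handed to the detail crawler
    sitename varchar(64),              -- e.g. '驴妈妈旅游'
    lasttime datetime,                 -- when the link was collected
    unique key uk_hkey (hkey)          -- a duplicate URL fails the insert and triggers the rollback
) default charset=utf8mb4
"""

con = pymysql.connect(host='127.0.0.1', db='app_mark', user='root', passwd='123456', charset='utf8mb4')
with con.cursor() as cur:
    cur.execute(DDL)
con.commit()
con.close()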


Parsing fields from the detail pages

import pymysql
import re
import requests
from multiprocessing.dummy import Pool as ThreadPool
import datetime


class XLY(object):
    def __init__(self):
        # MySQL connection settings
        self.host = '127.0.0.1'
        self.db = 'app_mark'
        self.user = 'root'
        self.passwd = '123456'
        self.charset = 'utf8mb4'
        self.headers = {
            'User-Agent': 'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/72.0.3626.121 Safari/537.36'
        }
        self.start = datetime.datetime.now()

    def get_data(self):
        # Fetch unprocessed links (tag = "0") from the gly table, then mark them processed
        con = pymysql.connect(host=self.host, db=self.db, user=self.user, passwd=self.passwd, charset=self.charset)
        cur = con.cursor()
        sql = 'select link from gly where tag = "0" and sitename = "驴妈妈旅游"'
        after_sql = 'update gly set tag = "1" where tag = "0" and sitename = "驴妈妈旅游"'
        try:
            cur.execute(sql)
            results = cur.fetchall()
            cur.execute(after_sql)
        except Exception as e:
            con.rollback()
            results = None
            print('error~', e)
        else:
            con.commit()
        cur.close()
        con.close()
        return results

    def parse_data(self, url):
        # Extract each field with regexes
        print(url)
        url = url[0]  # fetchall() yields one-element tuples
        # Product id: last path segment with the query string stripped
        id = url.split('/')[-1]
        id = re.sub(r'\?.*', '', id)
        response = requests.get(url, headers=self.headers)
        html = response.text
        if 'scenic' not in url and 'hotel' not in url:
            # Skip hotel and scenic-spot pages
            # Title: try each known page layout in turn
            title = re.findall('<h.*?tit">(.*?)</h.*?>', html, re.S)
            if title:
                title = title[0]
                title = re.sub(r'\n|\r|&nbsp;|自营|<[\s\S]*?>', '', title)
                title = title.strip()
            else:
                title = re.findall('<p class="nchtitle">(.*?)</p>', html, re.S)
                if title:
                    title = title[0]
                    title = re.sub(r'\n|\r|&nbsp;|自营|<[\s\S]*?>', '', title)
                    title = title.strip()
                else:
                    title = None
            # Price: try each known layout in turn
            price = re.findall(r'<dfn.*?>(\d+)</dfn>', html, re.S)
            if price:
                price = price[0]
            else:
                price = re.findall(r'<span class="product_price">.*?(\d+).*?</span>', html, re.S)
                if price:
                    price = price[0]
                else:
                    price = re.findall(r'¥<em>(\d+)</em>', html, re.S)
                    if price:
                        price = price[0]
                    else:
                        price = re.findall(r'<span class="product-price-value">.*?(\d+).*?</span>', html, re.S)
                        if price:
                            price = price[0]
                        else:
                            price = None
            # Praise rate: try each known layout in turn
            praise = re.findall(r'<p class="product_top_dp">[\s\S]*?<span>([\s\S]*?)</span>[\s\S]*?</p>', html, re.S)
            if praise:
                praise = praise[0]
                praise = re.sub(r'<.*?>', '', praise)
                praise = praise.strip()
            else:
                praise = re.findall(r'<a href="#pro_comment".*?<span>([\s\S]*?)</span>', html, re.S)
                if praise:
                    praise = praise[0]
                else:
                    praise = re.findall(r'<span class="c_f60">([\s\S]*?)</span>', html, re.S)
                    if praise:
                        praise = praise[0]
                        praise = praise.strip()
                    else:
                        praise = re.findall(r'<p class="product_top_dp">[\s\S]*?<span>([\s\S]*?)<small>%</small>[\s\S]*?</span>', html, re.S)
                        if praise:
                            praise = praise[0]
                            praise = praise.strip()
                        else:
                            praise = re.findall(r'<span class="val">([\s\S]*?)</span>', html, re.S)
                            if praise:
                                praise = praise[0]
            if praise:
                if '%' in praise:
                    praise = re.sub('%', '', praise)
                praise = float(praise)
                if praise > 100:
                    # A value above 100 means the regex matched the wrong element
                    praise = None
                    print('praise rate parse error')
            else:
                praise = None
            # Departure city
            starting_city = re.findall(r'<dl class="info-city">[\s\S]*?出发城市[\s\S]*?<ii>([\s\S]*?)</ii></dd>', html, re.S)
            target_city = re.findall(r'<dt>目的地[\s\S]*?<dd>([\s\S]*?)</dd>', html, re.S)
            if starting_city:
                starting_city = starting_city[0]
                starting_city = re.sub(r'<.*?>', '', starting_city)
                # Destination
                target_city = target_city[0]
                target_city = re.sub(r'<.*?>', '', target_city)
                # Number of days
                days_spent = re.findall(r'<dt>出游天数[\s\S]*?<dd>([\s\S]*?)</dd>', html, re.S)[0]
                days_spent = re.sub(r'<.*?>', '', days_spent)
            else:
                starting_city = target_city = days_spent = None
            # Product type: try each known layout in turn
            type_ = re.findall(r'<i class="t-category">([\s\S]*?)</i>', html, re.S)
            if type_:
                type_ = type_[0]
            else:
                type_ = re.findall(r'<span class="product_top_type product_type_zyx">([\s\S]*?)</span>', html, re.S)
                if type_:
                    type_ = type_[0]
                else:
                    type_ = re.findall(r'<span class="dpn_group">([\s\S]*?)</span>', html, re.S)
                    if type_:
                        type_ = type_[0]
                    else:
                        type_ = None
            list_data = [id, title, price, praise, starting_city, target_city, days_spent, type_, url]
            self.save_data(list_data)

    def save_data(self, list_data):
        # Write one parsed record to the lvmama table
        con = pymysql.connect(host=self.host, db=self.db, user=self.user, passwd=self.passwd, charset=self.charset)
        cur = con.cursor()
        sql = 'insert into lvmama(id_num, title, price, praise, starting_city, target_city, days_spent, type_, link) values (%s, %s, %s, %s, %s, %s, %s, %s, %s)'
        try:
            cur.execute(sql, list_data)
            print('insert success')
        except Exception as e:
            con.rollback()
            print('error~', e)
        else:
            con.commit()
        cur.close()
        con.close()


if __name__ == '__main__':
    xly = XLY()
    urls = xly.get_data()
    if urls:
        # Parse detail pages on a 20-worker thread pool
        pool = ThreadPool(20)
        pool.map(xly.parse_data, urls)
        pool.close()
        pool.join()
    end = datetime.datetime.now()
    print('elapsed:', end - xly.start)
    # Single-threaded alternative, kept for debugging:
    # for url in urls:
    #     xly.parse_data(url)
    #     # break
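
For symmetry, here is a sketch of the lvmama result table that save_data() targets. As above, the column names come straight from the INSERT statement, while every type (and the sample values in the comments) is an assumption:

import pymysql

# Hypothetical DDL matching the INSERT in save_data(); column types are assumptions.
DDL = """
create table if not exists lvmama (
    id_num        varchar(32),   -- product id, the last path segment of the URL
    title         varchar(256),
    price         int,           -- lowest advertised price in CNY
    praise        float,         -- praise rate in percent, 0-100
    starting_city varchar(64),   -- departure city
    target_city   varchar(256),  -- destination(s)
    days_spent    varchar(64),   -- trip length as shown on the page
    type_         varchar(64),   -- product category label
    link          varchar(512)
) default charset=utf8mb4
"""

con = pymysql.connect(host='127.0.0.1', db='app_mark', user='root', passwd='123456', charset='utf8mb4')
with con.cursor() as cur:
    cur.execute(DDL)
con.commit()
con.close()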


Original post: https://www.cnblogs.com/MC-Curry/p/10529578.html