码迷,mamicode.com
首页 > 编程语言 > 详细

python反反爬,爬取猫眼评分

时间:2019-04-26 13:35:00      阅读:189      评论:0      收藏:0      [点我收藏+]

标签:创建   爬取   amp   win   tar   new   image   ttf   对象   

python反反爬,爬取猫眼评分.
解决爬取网站时,页面中的数字显示为类似 &#x12E0; 的字体编码,且每次请求返回的字体文件都会变化的问题。
下载FontCreator

技术图片

.
用FontCreator打开base.woff,查看字形编号与数字的对应关系

技术图片

初始化时将对应关系写入字典中。





 1 #!/usr/bin/env python
 2 # coding:utf-8
 3 # __author__ = "南楼"
 4 
 5 
 6 import requests
 7 import re
 8 import os
 9 
10 from fontTools.ttLib import TTFont
11 
12 #下载字体
class MaoYan(object):
    """Read the star rating of a Maoyan film page whose digits are font-obfuscated.

    The page renders each digit as an HTML entity such as ``&#xe5a7;`` that
    points into a web font downloaded with the page.  The glyph names change
    with every font file, so digits are recognised by comparing each glyph
    object of the page font with the glyph objects of a reference font
    (``./fonts/base.woff``) whose name-to-digit mapping is hard-coded below.
    """

    def __init__(self):
        """Load the reference font and build the glyph-name -> digit tables.

        Requires ``./fonts/base.woff`` to exist relative to the working
        directory.
        """
        self.url = "http://maoyan.com/films/1198214"
        self.headers = {
            "User-Agent": "Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/59.0.3071.115 Safari/537.36"
        }
        # base.woff is one font file previously downloaded from the site.
        self.base_font_file = TTFont("./fonts/base.woff")
        # Glyph name -> digit.  The mapping was read off by opening base.woff
        # in a font editor (FontCreator).
        self.base_num = {
            "uniF3BA": "0",
            "uniF2A9": "1",
            "uniE6A5": "2",
            "uniF680": "3",
            "uniE69C": "4",
            "uniE710": "5",
            "uniE07D": "6",
            "uniE5A7": "7",
            "uniEC7A": "8",
            "uniE2A3": "9",
        }
        # Glyph name -> glyph object taken from the reference font.
        self.base_obj = {}
        self.baseobj()

    def baseobj(self):
        """Refresh and return the glyph-name -> glyph-object table.

        Returns:
            ``self.base_obj`` with one entry for every key of ``self.base_num``.
        """
        glyf_table = self.base_font_file["glyf"]
        for key in self.base_num:
            # Glyph object stored in the reference font under this glyph name.
            self.base_obj[key] = glyf_table[key]
        # Return only after every glyph has been refreshed (the return used to
        # sit inside the loop, so only the first glyph was updated).
        return self.base_obj

    def get_html(self, url, timeout=10):
        """Send a GET request and return the raw response body.

        Args:
            url: address to fetch.
            timeout: seconds to wait before giving up, so a stalled
                connection cannot hang the crawler.

        Returns:
            The response body as bytes.

        Raises:
            requests.RequestException: on network errors, timeouts or a
                non-success HTTP status.
        """
        response = requests.get(url, headers=self.headers, timeout=timeout)
        # Fail loudly instead of handing an error page to the caller, which
        # would otherwise be cached on disk as a font file.
        response.raise_for_status()
        return response.content

    def create_font(self, re_font_file):
        """Make sure the page font is cached in ``./fonts`` and open it.

        Sets ``self.font_file`` to the opened font.

        Args:
            re_font_file: file name of the font referenced by the page,
                e.g. ``"abcdef.woff"``.
        """
        font_dir = "./fonts"
        font_path = os.path.join(font_dir, re_font_file)
        if not os.path.isdir(font_dir):
            os.makedirs(font_dir)
        # Download only when the font is not already cached.
        if not os.path.isfile(font_path):
            print("不在字体库中, 下载:", re_font_file)
            url = "http://vfile.meituan.net/colorstone/" + re_font_file
            new_file = self.get_html(url)
            with open(font_path, "wb") as f:
                f.write(new_file)

        # Open the font file and expose it as self.font_file.
        self.font_file = TTFont(font_path)

    def get_num_from_font_file(self, re_star):
        """Decode an obfuscated rating such as ``"&#xe5a7;.&#xe2a3;"``.

        Every ``&#x....;`` entity is turned into the glyph name ``uniXXXX``,
        looked up in the page font (``self.font_file``) and compared with the
        reference glyphs to find its digit.  Parts with several digits
        (e.g. "10.0") are supported.

        Args:
            re_star: the raw text found inside the ``stonefont`` span.

        Returns:
            The rating as text, e.g. ``"7.9"``.

        Raises:
            KeyError: if an entity names a glyph that is not in the page font.
            ValueError: if a glyph matches none of the reference glyphs.
        """
        base_glyphs = self.baseobj()
        glyf_table = self.font_file["glyf"]
        parts = []
        for part in re_star.split("."):
            digits = []
            for code in re.findall(r"&#x([0-9a-f]+);", part, re.I):
                glyph_name = "uni" + code.upper()
                glyph = glyf_table[glyph_name]
                for key, base_glyph in base_glyphs.items():
                    if glyph == base_glyph:
                        digits.append(self.base_num[key])
                        break
                else:
                    raise ValueError(
                        "glyph {0} matches no reference digit".format(glyph_name)
                    )
            parts.append("".join(digits))
        return ".".join(parts)

    def start_crawl(self):
        """Fetch the film page, decode its star rating and print it.

        Raises:
            ValueError: if the font file name or the rating markup cannot be
                found in the page.
        """
        html = self.get_html(self.url).decode("utf-8")

        # Locate the name of the font file referenced by the page.
        font_match = re.search(r'vfile\.meituan\.net\/colorstone\/(\w+\.woff)', html)
        if font_match is None:
            raise ValueError("font file reference not found in page")
        self.create_font(font_match.group(1))

        # Locate the obfuscated star rating.
        star_match = re.search(
            r'<span class="index-left info-num ">\s+<span class="stonefont">(.*?)</span>\s+</span>',
            html,
        )
        if star_match is None:
            raise ValueError("star rating not found in page")
        star_rating = self.get_num_from_font_file(star_match.group(1))
        print("星级评分:", star_rating)
91 
92 
# Run the crawler only when this file is executed as a script.
if __name__ == "__main__":
    m = MaoYan()
    m.start_crawl()

 

python反反爬,爬取猫眼评分

标签:创建   爬取   amp   win   tar   new   image   ttf   对象   

原文地址:https://www.cnblogs.com/cola-lxj/p/10773563.html

(0)
(0)
   
举报
评论 一句话评论(0
登录后才能评论!
© 2014 mamicode.com 版权所有  联系我们:gaon5@hotmail.com
迷上了代码!