标签:创建 爬取 amp win tar new image ttf 对象
python反反爬,爬取猫眼评分.
解决爬取网站时,页面内容显示为类似 &#x12E0; 的字符实体,且每次请求返回的字体文件都会变化的问题。
下载FontCreator
.
用 FontCreator 打开 base.woff,查看字形编号与数字的对应关系。
初始化时将对应关系写入字典中。
#!/usr/bin/env python
# coding:utf-8
# __author__ = "南楼"


import os
import re

import requests
from fontTools.ttLib import TTFont


# Downloads the obfuscation fonts and decodes the rating digits.
class MaoYan(object):
    """Scrape a Maoyan film rating that is hidden behind a custom web font.

    The page renders digits as private-use character entities (for example
    ``&#xe69c;``) and serves a different font file on each request, so the
    entity codes cannot be mapped to digits directly.  Instead, a reference
    font (``./fonts/base.woff``) is mapped to digits by hand once, and the
    glyphs of every newly served font are compared against the reference
    glyphs to recover the digits.
    """

    # Directory holding base.woff and every font downloaded from the site.
    FONT_DIR = './fonts'
    # Seconds to wait for the server before giving up on a request.
    REQUEST_TIMEOUT = 10

    def __init__(self):
        """Load the reference font and build the glyph lookup tables.

        Raises whatever ``TTFont`` raises when ``./fonts/base.woff`` is
        missing or unreadable, and ``KeyError`` when one of the hand-mapped
        glyph names is absent from that font.
        """
        self.url = 'http://maoyan.com/films/1198214'
        self.headers = {
            "User-Agent": "Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/59.0.3071.115 Safari/537.36"
        }
        self.base_num = {}  # glyph name -> digit
        self.base_obj = {}  # glyph name -> glyph object
        # base.woff is one font file previously downloaded from the site.
        self.base_font_file = TTFont('./fonts/base.woff')
        # The mapping below was read off manually with a font editor
        # (FontCreator) and is only valid for this particular base.woff.
        self.base_num["uniF3BA"] = "0"
        self.base_num["uniF2A9"] = "1"
        self.base_num["uniE6A5"] = "2"
        self.base_num["uniF680"] = "3"
        self.base_num["uniE69C"] = "4"
        self.base_num["uniE710"] = "5"
        self.base_num["uniE07D"] = "6"
        self.base_num["uniE5A7"] = "7"
        self.base_num["uniEC7A"] = "8"
        self.base_num["uniE2A3"] = "9"

        for key in self.base_num:
            self.base_obj[key] = self.base_font_file['glyf'][key]

    def baseobj(self):
        """Rebuild and return the glyph-name -> glyph-object table.

        Every name in ``self.base_num`` is looked up again in the ``glyf``
        table of the reference font, so entries added to ``self.base_num``
        after construction are picked up.
        """
        for key in self.base_num:
            # Fetch the glyph object stored under this name in the base font.
            self.base_obj[key] = self.base_font_file['glyf'][key]
        return self.base_obj

    # Send the request and return the response body.
    def get_html(self, url):
        """Fetch *url* with the configured headers and return the raw bytes.

        Raises ``requests.HTTPError`` for a 4xx/5xx response so that an
        error page is never mistaken for real content (in particular, never
        saved to disk as a font file), and ``requests.Timeout`` when the
        server does not answer within ``REQUEST_TIMEOUT`` seconds.
        """
        response = requests.get(
            url, headers=self.headers, timeout=self.REQUEST_TIMEOUT)
        response.raise_for_status()
        return response.content

    def create_font(self, re_font_file):
        """Make sure font *re_font_file* is cached locally, then open it.

        *re_font_file* is the bare file name of a ``.woff`` served from
        ``vfile.meituan.net/colorstone/``.  The file is downloaded only when
        it is not already present in ``FONT_DIR``.  On return,
        ``self.font_file`` is the opened ``TTFont``.
        """
        font_path = os.path.join(self.FONT_DIR, re_font_file)
        # Check for the one file we need instead of listing the directory.
        if not os.path.isfile(font_path):
            print('不在字体库中, 下载:', re_font_file)
            url = 'http://vfile.meituan.net/colorstone/' + re_font_file
            new_file = self.get_html(url)
            os.makedirs(self.FONT_DIR, exist_ok=True)
            with open(font_path, 'wb') as f:
                f.write(new_file)

        # Open the font file and expose it as self.font_file.
        self.font_file = TTFont(font_path)

    def get_num_from_font_file(self, re_star):
        """Decode an obfuscated rating string into plain digits.

        *re_star* is the text scraped from the page, e.g.
        ``"&#xe69c;.&#xf680;"``.  Each ``&#x....;`` entity is turned into a
        glyph name (``uniE69C``), looked up in the current ``self.font_file``
        and compared with the reference glyphs; any other non-blank
        character (such as the decimal point) is copied through unchanged.
        Because entities are decoded one by one, ratings with any number of
        digits on either side of the point (including ``10.0``) are handled.

        ``create_font`` must have been called first.  Raises ``KeyError``
        when an entity names a glyph missing from the current font and
        ``ValueError`` when a glyph matches none of the reference glyphs.
        """
        # For reference, getGlyphOrder() on a downloaded font gives names
        # such as ['glyph00000', 'x', 'uniF680', 'uniE2A3', 'uniE710', ...].
        glyph_table = self.font_file['glyf']
        # Refresh the reference table once, not once per decoded glyph.
        base_obj = self.baseobj()

        decoded = []
        tokens = re.findall(r'&#[xX]([0-9a-fA-F]+);|(\S)', re_star)
        for entity_code, literal in tokens:
            if not entity_code:
                decoded.append(literal)
                continue

            glyph_name = 'uni' + entity_code.upper()
            glyph = glyph_table[glyph_name]
            for key, base_glyph in base_obj.items():
                if glyph == base_glyph:
                    decoded.append(self.base_num[key])
                    break
            else:
                raise ValueError(
                    'glyph %s matches no glyph of the reference font'
                    % glyph_name)
        return ''.join(decoded)

    def start_crawl(self):
        """Fetch the film page, decode its star rating, print and return it.

        Raises ``ValueError`` when the font URL or the rating markup cannot
        be found in the page (for example after a layout change or when an
        anti-bot page is served instead).
        """
        html = self.get_html(self.url).decode('utf-8')

        # Locate the name of the font file referenced by the page.
        font_match = re.search(
            r'vfile\.meituan\.net\/colorstone\/(\w+\.woff)', html)
        if font_match is None:
            raise ValueError('font file URL not found in page')
        self.create_font(font_match.group(1))

        # Locate the obfuscated star rating.
        star_match = re.search(
            r'<span class="index-left info-num ">\s+<span class="stonefont">(.*?)</span>\s+</span>',
            html)
        if star_match is None:
            raise ValueError('star rating markup not found in page')
        star_rating = self.get_num_from_font_file(star_match.group(1))
        print("星级评分:", star_rating)
        return star_rating


if __name__ == '__main__':

    m = MaoYan()
    m.start_crawl()
标签:创建 爬取 amp win tar new image ttf 对象
原文地址:https://www.cnblogs.com/cola-lxj/p/10773563.html