# 标签: detail dir decode 爬取 exception gecko XML main 方式
# -*- coding: utf-8 -*-
# __author__ = "maple"
"""Scrape one ixigua.com video page and save the referenced video as an .mp4."""

from base64 import b64decode
import json
import os
import re

from lxml import etree
import requests


class XiGuaSpider:
    """Fetch a video detail page, locate its media URL and download the file.

    Attributes:
        headers: HTTP headers sent with every request (referer, user agent
            and a hard-coded cookie string).
        video_dirs: Directory the downloaded ``.mp4`` files are written to.
        request_timeout: Seconds passed to ``requests`` as ``timeout``. It
            bounds the connect phase and each wait for data, not the total
            duration of a download.
    """

    request_timeout = 30

    # Anything that is not a CJK ideograph, ASCII digit or ASCII letter is
    # replaced in titles so the result is usable as a file name.
    _TITLE_UNSAFE = re.compile(
        "[^\u4e00-\u9fa5\u0030-\u0039\u0041-\u005a\u0061-\u007a]"
    )

    # Inline <script> that assigns the page state to window._SSR_HYDRATED_DATA.
    _HYDRATED_DATA = re.compile(
        r'\<script.*?\>window\._SSR_HYDRATED_DATA=(.*?)\</script\>'
    )

    # Key paths tried, in order, to reach the ``video_list`` mapping.
    _VIDEO_LIST_PATHS = (
        ('Projection', 'video', 'videoResource', 'normal', 'video_list'),
        ('Teleplay', 'videoResource', 'normal', 'video_list'),
    )

    # Entries tried, in order, inside ``video_list``. The original code only
    # looked at video_3 and video_2; video_1 is a last resort.
    # NOTE(review): presumably a higher number means higher quality - confirm.
    _VIDEO_KEYS = ('video_3', 'video_2', 'video_1')

    def __init__(self):
        # NOTE(review): the cookie looks like a captured browser session and
        # has probably expired - confirm whether the site still accepts it.
        self.headers = {
            'Referer': 'https://www.ixigua.com',
            'User-Agent': 'Mozilla/5.0 (Windows NT 10.0; WOW64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/69.0.3497.100 Safari/537.36',
            'cookie': 'wafid=8b91d940-81ec-4620-af0f-f45d479a62c2; wafid.sig=BZgx1eD0aFGn25mL-y-SEh17cng; ttwid=6841106955945346564; ttwid.sig=glkPgElc0Yh0OEDyNL0P91fmbZg; xiguavideopcwebid=6841106955945346564; xiguavideopcwebid.sig=avM_v_QTwC7VqM26Yqde9eer3xA; _ga=GA1.2.1235075053.1592819342; SLARDAR_WEB_ID=fa1eb835-d608-4ade-850d-bc0409bd541f; _gid=GA1.2.303152420.1593089518; ixigua-a-s=1; Hm_lvt_db8ae92f7b33b6596893cdf8c004a1a2=1593094562,1593095154,1593098009,1593147688; Hm_lpvt_db8ae92f7b33b6596893cdf8c004a1a2=1593153331',
        }

        self.video_dirs = './video'

    def download_file(self, file_path, download_url):
        """Stream ``download_url`` into ``file_path`` and print the progress.

        Args:
            file_path: Destination path; the file is opened in binary mode and
                overwritten if it exists.
            download_url: URL of the media file.

        Raises:
            requests.RequestException: If the request fails or the server
                answers with an error status. Raising keeps an error page
                from being saved under a ``.mp4`` name.
        """
        print('*' * 100)
        print(f"保存路径:{file_path}")
        print(f'下载URL:{download_url}')
        # The context manager releases the streamed connection even when the
        # download is interrupted.
        with requests.get(url=download_url, headers=self.headers, stream=True,
                          timeout=self.request_timeout) as response:
            response.raise_for_status()
            # Total size in bytes; 0 when the server sends no Content-Length.
            content_size = int(response.headers.get("content-length", 0))
            size = 0
            # Binary mode: the payload is not text.
            with open(file_path, "wb") as file:
                for data in response.iter_content(chunk_size=1024):
                    file.write(data)
                    size += len(data)
                    if content_size:
                        print("\r文件下载进度:%d%%(%0.2fMB/%0.2fMB)" % (
                            float(size / content_size * 100), (size / 1024 / 1024),
                            (content_size / 1024 / 1024)),
                            end=" ")
                    else:
                        # Unknown total: a percentage cannot be computed.
                        print("\r文件下载进度:%0.2fMB" % (size / 1024 / 1024),
                              end=" ")
        print()

    def get_response(self, url):
        """GET ``url`` with the spider's headers.

        Returns:
            The ``requests.Response``, or ``None`` when the request raised a
            ``requests.RequestException`` (the error is printed).
        """
        try:
            return requests.get(url, headers=self.headers,
                                timeout=self.request_timeout)
        except requests.RequestException as e:
            print(e)
            return None

    def _extract_title(self, html, url):
        """Return a file-name-safe title for the page.

        Tries the two title XPaths in order, then falls back to the last path
        segment of ``url`` so the file is never named just ``.mp4``.
        """
        title = ''
        document = etree.HTML(html) if html.strip() else None
        if document is not None:
            title = ''.join(document.xpath('//*[@class="hasSource"]/text()'))
            if not title:
                title = ''.join(document.xpath(
                    '//*[@class="teleplayPage__Description__header"]/h1/text()'))
        if not title:
            title = url.rstrip('/').rsplit('/', 1)[-1]
        return self._TITLE_UNSAFE.sub("-", title) or 'video'

    def _find_video_list(self, data):
        """Return the first ``video_list`` reachable via a known key path, else None."""
        for path in self._VIDEO_LIST_PATHS:
            node = data
            try:
                for key in path:
                    node = node[key]
            except (KeyError, TypeError, IndexError) as e:
                print('异常信息:', e)
                continue
            return node
        return None

    def _pick_video_url(self, data):
        """Return the decoded media URL of the preferred entry, or None."""
        video_list = self._find_video_list(data)
        if not isinstance(video_list, dict):
            return None
        for key in self._VIDEO_KEYS:
            entry = video_list.get(key)
            if isinstance(entry, dict) and entry.get('main_url'):
                # main_url is stored base64-encoded in the page data.
                return b64decode(entry['main_url']).decode('utf-8')
        return None

    def parse_detail(self, url):
        """Download the video referenced by the detail page at ``url``.

        The page's ``window._SSR_HYDRATED_DATA`` JSON is also dumped to
        ``video.json`` in the current working directory. Returns ``None`` in
        every case; failures to locate a video are reported on stdout.
        """
        response = self.get_response(url)
        # requests defines Response truthiness as ``response.ok``, so this
        # rejects 4xx/5xx answers as well as failed requests.
        if not response:
            return None
        html = response.text
        title = self._extract_title(html, url)

        match = self._HYDRATED_DATA.search(html)
        if match is None:
            print('没有找到下载链接。。。')
            return None
        try:
            data = json.loads(match.group(1))
        except json.JSONDecodeError as e:
            print('异常信息:', e)
            return None
        with open('video.json', 'w', encoding='utf-8') as f:
            json.dump(data, f)

        video_url = self._pick_video_url(data)
        if not video_url:
            print('没有找到下载链接。。。')
            return None

        # exist_ok avoids the check-then-create race and creates parent
        # directories when video_dirs is nested.
        os.makedirs(self.video_dirs, exist_ok=True)
        file_path = f"{self.video_dirs}/{title}.mp4"
        self.download_file(file_path, video_url)
        return None

    def start_requests(self):
        """Process the single hard-coded detail page."""
        url = 'https://www.ixigua.com/i6618828724525597192'
        self.parse_detail(url)

    def run(self):
        """Entry point: start the crawl."""
        self.start_requests()


if __name__ == '__main__':
    XiGuaSpider().run()
# 标签: detail dir decode 爬取 exception gecko XML main 方式
# 原文地址: https://www.cnblogs.com/shiguanggege/p/13195073.html