标签:pil att cep requests spl class html write width
"""Scrape the Douban "2018" movie tag listing and append each entry to 2018.txt.

Each listing page is fetched over HTTP, parsed with a regular expression, and
every extracted movie is printed and written as one JSON object per line.
"""
import json
import re
from typing import Iterator, Optional

# Seconds to wait for the server before a request is abandoned.
REQUEST_TIMEOUT = 10

# One listing entry. Capture groups, in order:
#   1. text of the title link inside <div class="pl2">
#   2. the <p class="pl"> info paragraph ("date / actor / actor ...")
#   3. the numeric rating inside <span class="rating_nums">
#   4. the text of the following class="pl" span
# The wildcard after `<table width="` must be non-greedy: with re.S a greedy
# `.*` would run to the last entry on the page and yield a single match.
# NOTE(review): an entry without a rating_nums span lets the lazy wildcards
# run on into the next entry, so such entries get merged with their neighbour.
_ITEM_PATTERN = re.compile(
    r'<table width=".*?<div class="pl2">.*?>(.*?)</a>.*?class="pl">(.*?)</p>'
    r'.*?<span class="rating_nums">(.*?)</span>.*?class="pl">(.*?)</span>',
    re.S,
)


def get_one_page(url: str) -> Optional[str]:
    """Fetch *url* and return the response body as text.

    Returns ``None`` when the request fails (connection error, timeout, ...)
    or when the server answers with any status other than 200.
    """
    # Imported here so the parsing and file helpers stay importable even when
    # the third-party HTTP client is not installed.
    import requests
    from requests import exceptions

    try:
        response = requests.get(url, timeout=REQUEST_TIMEOUT)
    except exceptions.RequestException:
        return None
    if response.status_code == 200:
        return response.text
    return None


def parse_one_page(html: Optional[str]) -> Iterator[dict]:
    """Yield one dict per movie entry found in *html*.

    Each dict has the keys ``title``, ``time``, ``actor`` (a list),
    ``average`` and ``content``. Nothing is yielded when *html* is ``None``
    or empty, so a failed download can be passed straight through.
    """
    if not html:
        return
    for title, info, average, content in _ITEM_PATTERN.findall(html):
        # The info paragraph is "/"-separated: first field is the date,
        # everything after it is treated as the cast list.
        info_fields = [field.strip() for field in info.split("/")]
        yield {
            'title': title.split("/")[0].strip(),
            'time': info_fields[0],
            'actor': info_fields[1:],
            'average': average,
            'content': content,
        }


def write_to_file(content: dict) -> None:
    """Append *content* to ``2018.txt`` as a single line of UTF-8 JSON."""
    # The with-block closes the file; no explicit close() is needed.
    with open('2018.txt', 'a', encoding='utf-8') as f:
        f.write(json.dumps(content, ensure_ascii=False) + '\n')


def main(start: int) -> None:
    """Download the listing page at item offset *start*, print and save it.

    A page that cannot be downloaded is skipped.
    """
    # The offset is a query parameter, so it is introduced with "?".
    url = f'https://movie.douban.com/tag/2018?start={start}&type=T'
    html = get_one_page(url)
    if html is None:
        return
    for item in parse_one_page(html):
        print(item)
        write_to_file(item)


if __name__ == '__main__':
    # Offsets advance in steps of 20, one step per page index.
    for i in range(84, 194):
        main(i * 20)
标签:pil att cep requests spl class html write width
原文地址:https://www.cnblogs.com/wangtao27/p/8992063.html