标签:file div usr imp headers 程序 like enc utf-8
#!/usr/bin/env python
# -*- coding: utf-8 -*-
# get_transcript.py
"""
Download the episode transcripts from https://podcast.duolingo.com/spanish.

The listing pages are walked in order (``.../spanish``, ``.../spanish2``,
``.../spanish3``, ...) until one answers 404.  Every episode linked from a
listing page is fetched and its ``<strong>speaker</strong>text</p>``
paragraphs are written to ``transcript/<episode>.txt``.  Episodes whose file
already exists are skipped, so the script can be re-run incrementally.

HTTP status code reminder:
    200  success
    4xx  client error (404 = Page Not Found)
    5xx  server error
"""
import os
import re
from urllib.parse import urljoin

main = 'https://podcast.duolingo.com/spanish'  # listing (home) page
headers = {
    'User-Agent': 'Mozilla/5.0 (X11; Linux x86_64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/70.0.3538.77 Safari/537.36',
}

OUT_DIR = 'transcript'  # directory the transcript files are written to
TIMEOUT = 30            # seconds; without it a stalled connection hangs forever

# Non-greedy groups so several links/paragraphs on one line of HTML are
# matched individually instead of being swallowed by a single match.
EPISODE_LINK_RE = re.compile(r'entry-title">\s*<a href="(.*?)" rel')
PARAGRAPH_RE = re.compile(r'strong>(.*?)</strong>(.*?)</p>')


def page_url(index):
    """Return the URL of listing page *index* (1-based).

    Page 1 is the home page itself; later pages append the page number
    directly to the home URL, e.g. ``.../spanish2``.
    """
    return main if index == 1 else main + str(index)


def extract_episode_hrefs(html):
    """Return the episode hrefs found in a listing page, in document order."""
    return EPISODE_LINK_RE.findall(html)


def episode_url(href):
    """Resolve an episode href (e.g. ``./episode-1``) against the home page URL."""
    return urljoin(main, href)


def episode_slug(href):
    """Return the last path segment of *href*, used as the file base name.

    Taking only the last segment keeps path separators coming from the
    scraped page out of the local file name.
    """
    return href.rstrip('/').rsplit('/', 1)[-1]


def extract_transcript(html):
    """Return the transcript paragraphs of an episode page.

    Each paragraph is the text inside ``<strong>`` (the speaker label on the
    pages this was written for) followed by the rest of the paragraph up to
    ``</p>``.
    """
    return [speaker + text for speaker, text in PARAGRAPH_RE.findall(html)]


def download_transcripts(max_pages=9):
    """Fetch up to *max_pages* listing pages and save every new transcript.

    Stops early at the first listing page that answers 404.  Listing pages
    or episodes answering with any other non-200 status are skipped without
    writing a file, so they are retried on the next run.
    """
    # Imported here so the pure parsing helpers above stay importable
    # without the third-party dependency.
    import requests

    for i in range(1, max_pages + 1):
        page = page_url(i)
        r = requests.get(page, headers=headers, timeout=TIMEOUT)
        print('{page} with status code {status}.'.format(page=page, status=r.status_code))
        if r.status_code == 404:  # no more listing pages
            print('404 Page Not Found!')
            break
        if r.status_code != 200:
            continue

        for href in extract_episode_hrefs(r.text):
            title = episode_slug(href)
            if not title:
                continue
            episode = episode_url(href)
            filename = '{}/{}.txt'.format(OUT_DIR, title)
            if os.path.exists(filename):
                print(filename, 'existed!')
                continue

            req = requests.get(episode, headers=headers, timeout=TIMEOUT)
            print('{episode} with status code {status}.'.format(episode=episode, status=req.status_code))
            if req.status_code != 200:
                # Do not leave an empty file behind: it would make every
                # later run treat this episode as already downloaded.
                continue

            os.makedirs(OUT_DIR, exist_ok=True)
            # Explicit encoding: the transcripts contain non-ASCII Spanish text.
            with open(filename, 'w', encoding='utf-8') as fp:
                fp.writelines(p + '\n\n' for p in extract_transcript(req.text))
            print(filename, 'added!')


if __name__ == '__main__':
    download_transcripts()
结果:
标签:file div usr imp headers 程序 like enc utf-8
原文地址:https://www.cnblogs.com/noonjuan/p/11192582.html