标签:usr oge for isa mail https x86 email x86_64
下载后会在给定目录下生成多个 PDF 文件,文件名为每一节的名称。
#!/usr/bin/env python3.5
# -*- coding: utf-8 -*-
# @Time : 2019/11/18 下午10:48
# @Author : yon
# @Email : 2012@qq.com
# @File : day1.py
import os
import re
import time
import logging
import pdfkit
from bs4 import BeautifulSoup
import requests
def gethtml(url, filepath='/home/yon/Desktop/pdf/', timeout=30):
    """Download one transcript page and save its <article> element as a PDF.

    The PDF is written into ``filepath`` and named after the text of the
    article's <h1> heading (spaces removed, path separators replaced).

    Args:
        url: Address of the page to fetch.
        filepath: Directory that receives the PDF; created when missing.
        timeout: Seconds to wait for the server before giving up.

    Raises:
        requests.HTTPError: The server answered with a 4xx/5xx status.
        ValueError: The page has no <article> element, or the article has
            no <h1> heading to derive a file name from.
    """
    headers = {
        'Accept': '*/*',
        'Accept-Language': 'zh-CN,zh;q=0.9,en;q=0.8,en-US;q=0.7',
        'Cache-Control': 'no-cache',
        'accept-encoding': 'gzip, deflate, br',
        'User-Agent': 'Mozilla/5.0 (X11; Linux x86_64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/78.0.3904.97 Safari/537.36',
        'Referer': 'https://www.google.com/'
    }
    # Without a timeout a stalled connection would block the caller forever.
    resp = requests.get(url, headers=headers, timeout=timeout)
    # Fail loudly on 4xx/5xx instead of rendering an error page to PDF.
    resp.raise_for_status()

    soup = BeautifulSoup(resp.content, "html.parser")
    article = soup.find("article")
    if article is None:
        raise ValueError("no <article> element found at {}".format(url))
    if article.h1 is None:
        raise ValueError("no <h1> heading inside <article> at {}".format(url))

    # Build a safe file name: drop surrounding whitespace and inner spaces,
    # and keep path separators in the heading from pointing elsewhere.
    name = article.h1.text.strip().replace(" ", "")
    name = name.replace("/", "_").replace(os.sep, "_")

    if filepath:
        os.makedirs(filepath, exist_ok=True)
    target = os.path.join(filepath, name + ".pdf")
    pdfkit.from_string(str(article), target)
if __name__ == '__main__':
    # Convert the transcripts of episodes 665 through 686, one PDF each.
    url_template = "https://www.thisamericanlife.org/{}/transcript"
    for episode in range(665, 687):
        gethtml(url_template.format(episode))
        # Pause between requests (presumably to go easy on the server).
        time.sleep(10)
将多个 PDF 合并,并根据每一节分别生成书签
标签:usr oge for isa mail https x86 email x86_64
原文地址:https://www.cnblogs.com/g2thend/p/12003179.html