爬取王垠的博客并生成pdf

时间：2019-01-28 10:43:40 阅读：351 评论：0 收藏：0 [点我收藏+]

标签：.text find 获得 3.4 end 爬取 soup ons sts

尚未完善，有待改进

#!/usr/bin/env python3
# -*- coding: utf-8 -*-

# Module author metadata.
__author__ = 'jiangwenwen'
import pdfkit
import time
import requests
import random
from bs4 import BeautifulSoup
from fake_useragent import UserAgent

# Request headers sent with every HTTP request to the blog.
ua = UserAgent()

headers = {
    "cache-control": "no-cache",
    "Host": "www.yinwang.org",
    # Initial random User-Agent; print_pdf() overwrites it before each download.
    "User-Agent": ua.random,
    "Referer": "http://www.yinwang.org/",
}

# Proxy pool: candidate HTTP proxies as "host:port" strings.
# get_proxy() probes them in list order and uses the first one that responds.
ip_pool = [
    "123.55.114.217:9999",
    "110.52.235.91:9999",
    "183.163.43.61:9999",
    "119.101.126.52:9999",
    "119.101.124.165:9999",
    "119.101.125.38:9999",
    "119.101.125.84:9999",
    "110.52.235.80:9999",
    "119.101.125.49:9999",
    "110.52.235.162:9999",
    "119.101.124.23:9999",
]


# Render one page to a PDF file.
def print_pdf(url, file_name):
    """Download the page at ``url`` and render its HTML to a PDF file.

    The request goes through the first working proxy from ``ip_pool`` and
    uses a freshly randomized User-Agent. Progress and elapsed time are
    printed to stdout.

    Args:
        url: Address of the page to download.
        file_name: Output path handed to ``pdfkit.from_string``.

    Raises:
        requests.RequestException: If the download fails, times out, or the
            server answers with an HTTP error status.
    """
    start = time.time()
    print("正在打印中...")
    # Rotate the User-Agent per download; this mutates the shared
    # module-level ``headers`` dict.
    headers["User-Agent"] = ua.random
    print("User-Agent是：{0}".format(headers["User-Agent"]))
    response = requests.get(url, headers=headers, timeout=3, proxies=get_proxy(ip_pool))
    # Fail loudly on 4xx/5xx instead of rendering an error page and
    # reporting success.
    response.raise_for_status()
    # When the server declares no charset for text content, requests falls
    # back to ISO-8859-1, which garbles non-Latin pages; use the encoding
    # detected from the body in that case.
    if "charset" not in response.headers.get("Content-Type", "").lower():
        response.encoding = response.apparent_encoding
    pdfkit.from_string(response.text, file_name)
    end = time.time()
    print("打印成功，本次打印耗时：%0.2f秒" % (end - start))


# Find a working proxy.
def get_proxy(ip_pool):
    """Return a ``requests`` proxies mapping for the first reachable proxy.

    Each candidate is probed, in iteration order, by fetching the blog's
    front page through it with a 3-second timeout.

    Args:
        ip_pool: Iterable of proxy addresses as ``"host:port"`` strings.

    Returns:
        A dict with ``"http"`` and ``"https"`` keys both pointing at the
        first proxy whose probe request succeeded, or ``None`` when
        ``ip_pool`` is empty or every probe failed.
    """
    probe_url = "http://www.yinwang.org/"
    for ip in ip_pool:
        proxy_url = "http://{}".format(ip)
        try:
            # Use requests itself to check whether the proxy is usable.
            requests.get(probe_url, proxies={"http": proxy_url}, timeout=3)
        except requests.RequestException:
            # Only network/HTTP failures mean "try the next proxy"; a bare
            # except would also swallow KeyboardInterrupt and SystemExit.
            continue
        return {"http": proxy_url, "https": proxy_url}
    return None


# Fetch the blog index page and collect the list items that hold the article links.
# timeout=3 matches the other requests in this file so a dead proxy cannot hang the script.
response = requests.get("http://www.yinwang.org/", headers=headers, timeout=3, proxies=get_proxy(ip_pool))
soup = BeautifulSoup(response.content, "html.parser")
tags = soup.find_all("li", class_="list-group-item title")

# Print every linked article to its own PDF, named after the link text.
for child in tags:
    article_url = "http://www.yinwang.org" + child.a.get("href")
    article_file_name = "桌面\\" + child.a.string + ".pdf"
    print_pdf(article_url, article_file_name)

爬取王垠的博客并生成pdf

标签：.text find 获得 3.4 end 爬取 soup ons sts

原文地址：https://www.cnblogs.com/jiangwenwen1/p/10328339.html

踩

(0)

评论一句话评论（0）

分享档案

更多>

2021年07月29日 (22)
2021年07月28日 (40)
2021年07月27日 (32)
2021年07月26日 (79)
2021年07月23日 (29)
2021年07月22日 (30)
2021年07月21日 (42)
2021年07月20日 (16)
2021年07月19日 (90)
2021年07月16日 (35)

周排行