Python 爬取豆瓣新书清单

时间：2019-04-02 16:49:57 阅读：156 评论：0 收藏：0 [点我收藏+]

标签：爬取 strip() pool 注意 ffffff os.path return __name__ code

使用 Python 3 的 requests 库快速获取豆瓣图书推荐的新书清单，并将书籍信息和图书封面缩略图保存到本地。

#!/usr/bin/env python
# -*- coding:utf-8 -*-
"""
@author:Aiker Zhao
@file:douban3.py
@time:上午10:34
"""
import json
import os
import re
from multiprocessing import Pool
import requests
from requests.exceptions import RequestException

# Output directory for info.txt and the downloaded cover images.
# NOTE: the name shadows the builtin ``dir``; it is kept because the
# save_image/save_info functions in this file reference it by this name.
dir = 'z:\\douban\\'

def get_web(url, timeout=10):
    """Fetch *url* and return the response body as text.

    Args:
        url: Address of the page to download.
        timeout: Seconds to wait for the server before giving up. Without
            a timeout ``requests.get`` may block indefinitely.

    Returns:
        The response text when the server answers with HTTP 200; ``None``
        for any other status code or when the request fails.
    """
    try:
        response = requests.get(url, timeout=timeout)
    except RequestException:
        # Connection errors and timeouts are reported as "no page".
        return None
    if response.status_code == 200:
        return response.text
    return None

# One <li class=""> entry of the book list. Capture groups, in order:
# detail URL, title, cover image URL, author, text of the "year" element,
# publisher. re.S lets ".*?" span the line breaks inside an entry.
_BOOK_PATTERN = re.compile(
    r'<li\sclass="">.*?cover".*?href="(.*?)"\stitle="(.*?)".*?img\ssrc="(.*?)"'
    r'.*?class="author">(.*?)<.*?year">(.*?)<.*?publisher">(.*?)<.*?</li>',
    re.S,
)


def parse_web(html):
    """Yield one dict per book entry found in *html*.

    Args:
        html: Page markup as a string. ``None`` or an empty string (for
            example after a failed download) yields nothing.

    Yields:
        Dicts with the keys ``title``, ``url``, ``img``, ``author``,
        ``yeah`` and ``publisher``. The last three values have surrounding
        whitespace stripped.
    """
    if not html:
        return
    for url, title, img, author, year, publisher in _BOOK_PATTERN.findall(html):
        yield {
            'title': title,
            'url': url,
            'img': img,
            'author': author.strip(),
            # The key is spelled 'yeah' (sic); it is kept unchanged so that
            # records stay compatible with already written output.
            'yeah': year.strip(),
            'publisher': publisher.strip(),
        }

def save_image(title, img, timeout=10):
    """Download the image at URL *img* and store it as ``<title>.jpg``.

    The file is written into the module-level ``dir`` folder. Nothing is
    downloaded when the target file already exists.

    Args:
        title: Book title, used as the file name.
        img: URL of the image to download.
        timeout: Seconds to wait for the server before giving up.
    """
    # Titles can contain characters that are not allowed in file names
    # (the output folder is a Windows path), so replace them.
    safe_title = re.sub(r'[\\/:*?"<>|]', '_', title)
    target = dir + safe_title + '.jpg'
    if os.path.exists(target):
        return
    # Fetch before opening the file so a failed download does not leave an
    # empty .jpg behind that later runs would treat as already saved.
    response = requests.get(img, timeout=timeout)
    if response.status_code != 200:
        # Do not store an error page as if it were an image.
        return
    with open(target, 'wb') as f:
        f.write(response.content)

def save_info(content):
    """Append *content* as one JSON line to ``info.txt`` in ``dir``.

    Args:
        content: JSON-serialisable object, e.g. a book dict.
    """
    info = dir + 'info.txt'
    # utf-8 together with ensure_ascii=False keeps non-ASCII text readable
    # in the file instead of turning it into ASCII escape sequences.
    with open(info, 'a', encoding='utf-8') as fd:
        fd.write(json.dumps(content, ensure_ascii=False) + '\n')

def main():
    """Fetch the Douban books front page and store every book found.

    Each parsed book is printed, appended to the info file, and its cover
    image is downloaded.
    """
    url = 'https://book.douban.com/'
    html = get_web(url)
    if html is None:
        # get_web returns None on any failure; there is nothing to parse.
        print('failed to fetch ' + url)
        return
    for book in parse_web(html):
        print(book)
        save_info(book)
        save_image(book['title'], book['img'])

# Run the scraper only when executed as a script, not when imported.
if __name__ == '__main__':
    main()

技术图片

心得：
- 需要注意正则表达式匹配规则的准确度，否则程序可能没有响应，或者一直超时

原文地址：https://blog.51cto.com/m51cto/2373119

踩

(0)

评论一句话评论（0）

分享档案

更多>

2021年07月29日 (22)
2021年07月28日 (40)
2021年07月27日 (32)
2021年07月26日 (79)
2021年07月23日 (29)
2021年07月22日 (30)
2021年07月21日 (42)
2021年07月20日 (16)
2021年07月19日 (90)
2021年07月16日 (35)

周排行