码迷,mamicode.com
首页 > 其他好文 > 详细

爬取豆瓣 Top250书籍

时间:2020-07-02 16:39:12      阅读:70      评论:0      收藏:0      [点我收藏+]

标签:book   pen   item   tps   else   open   except   --   load   

"""Scrape the Douban Top 250 books list (爬取豆瓣 Top250 书籍).

Crawls every page of https://book.douban.com/top250, collects
(title, rating, brief, link) rows into `books`, then writes them to CSV.
"""
import requests
import json
import csv
from bs4 import BeautifulSoup

# Accumulator for all scraped rows; each row is [title, rating, brief, link].
books = []


def book_name(url):
    """Scrape one Top250 listing page and append its book rows to `books`.

    Args:
        url: absolute URL of a Douban Top250 page.

    Returns:
        The absolute URL of the next page, or 0 when the last page has
        been reached (the paginator has no "next" link).
    """
    headers = {
        # Plain requests are blocked by Douban; spoof a desktop browser UA.
        "User-Agent": ("Mozilla/5.0 (Windows NT 10.0; Win64; x64) "
                       "AppleWebKit/537.36 (KHTML, like Gecko) "
                       "Chrome/83.0.4103.61 Safari/537.36"),
    }
    res = requests.get(url=url, headers=headers)
    soup = BeautifulSoup(res.text, "html.parser")
    # Each book on the listing page is rendered as one <table> inside
    # the "indent" container.
    items = soup.find("div", attrs={"class": "grid-16-8 clearfix"}) \
                .find("div", attrs={"class": "indent"}) \
                .find_all("table")

    for item in items:
        book = []

        title = item.find("div", attrs={"class": "pl2"}).find("a")
        # Titles contain embedded newlines/padding from the markup.
        book.append(title.text.replace("\n", "").strip())

        star = item.find(class_="star clearfix").find(class_="rating_nums")
        book.append(star.text)

        try:
            # Books without a one-line quote have no "quote" span, so
            # find() returns None and .find() raises AttributeError (EAFP).
            brief = item.find(class_="quote").find(class_="inq")
        except AttributeError:
            book.append("~暂无简介~!")
        else:
            book.append(brief.text)

        # Append the link unconditionally so every row has 4 columns;
        # previously it was only added when a brief existed.
        link = item.find(class_="pl2").find("a")["href"]
        book.append(link)

        global books
        books.append(book)
        print(book)

    try:
        next_url = soup.find(class_="paginator").find(
            class_="next").find("a")["href"]
    # On the last page the "next" slot holds no <a>, so find() returns
    # None and subscripting raises TypeError.
    except TypeError:
        return 0
    else:
        return next_url

# Crawl every page: book_name() returns the next page URL, or 0 at the end.
# (Renamed from `next` to avoid shadowing the builtin.)
next_page = "https://book.douban.com/top250?start=0&filter="
count = 0

while next_page != 0:
    count += 1
    next_page = book_name(next_page)
    print("-----------以上是第" + str(count) + "页的内容-----------")

# Write all collected rows; `with` guarantees the file is flushed and
# closed (the original opened it and never closed it).
with open("top250_books.csv", "w", newline="", encoding="utf-8") as csv_file:
    w = csv.writer(csv_file)
    w.writerow(["书名", "评分", "简介", "链接"])
    for b in books:
        w.writerow(b)

 技术图片

技术图片

爬取豆瓣 Top250书籍

标签:book   pen   item   tps   else   open   except   --   load   

原文地址:https://www.cnblogs.com/memory-ccy/p/13225055.html

(0)
(0)
   
举报
评论 一句话评论(0
登录后才能评论!
© 2014 mamicode.com 版权所有  联系我们:gaon5@hotmail.com
迷上了代码!