网络爬虫: 抓取allitebooks.com书籍信息及ISBN码, save books name and ISBN into .csv(2) from backslash112

时间：2017-07-12 21:16:25 阅读：193 评论：0 收藏：0 [点我收藏+]

标签：书籍 images ret next lxml bs4 title writer .com

from urllib2 import urlopen
from bs4 import BeautifulSoup
import csv

# Get the next page url from the current page url
def get_next_page_url(url):
    page = urlopen(url)
    soup_page = BeautifulSoup(page, ‘lxml‘)
    page.close()
    # Get current page and next page tag
    current_page_tag = soup_page.find(class_="current")
    next_page_tag = current_page_tag.find_next_sibling()
    # Check if the current page is the last one
    if next_page_tag is None:
        next_page_url = None
    else:
        next_page_url = next_page_tag[‘href‘]
    return next_page_url 

# Get the book detail urls by page url 
def get_book_detail_urls(url):
    page = urlopen(url)
    soup = BeautifulSoup(page, ‘lxml‘)
    page.close()
    urls = []
    book_header_tags = soup.find_all(class_="entry-title")
    for book_header_tag in book_header_tags:
        urls.append(book_header_tag.a[‘href‘])
    return urls
    

# Get the book detail info by book detail url
def get_book_isbn(url):
    page = urlopen(url)
    book_isbn_soup = BeautifulSoup(page, ‘lxml‘)
    page.close()
    title_tag = book_isbn_soup.find(class_="single-title")
    title = title_tag.string
    isbn_key_tag = book_isbn_soup.find(text="ISBN-10:").parent
    isbn_tag = isbn_key_tag.find_next_sibling()
    isbn = isbn_tag.string.strip() # Remove the whitespace with the strip method
    return [  title, isbn ]
    # return [‘title:‘, title, ‘isbn:‘, isbn]

    
def save_to_csv(book_info_list):
    path=‘/Users/zhouxin/Desktop/books.csv‘
    with open(path, ‘w‘) as fp:
        a = csv.writer(fp, delimiter=‘,‘)
        a.writerow([‘title‘,‘isbn‘])
        a.writerows(book_info_list)
    
def start():
    url = "http://www.allitebooks.com/marketing-seo/"
    book_info_list = []
    def next_page(page_url):
        book_detail_urls = get_book_detail_urls(page_url)
        for book_detail_url in book_detail_urls: #print all books ISBN one by one
            # print(book_detail_url)
            book_info = get_book_isbn(book_detail_url)
            # print (book_info)     
            print",".join(book_info) # print ISBD
            book_info_list.append(book_info)
        next_page_url = get_next_page_url(page_url)
        if next_page_url is not None:
            next_page(next_page_url)
        else:
            return 
    next_page(url)
    save_to_csv(book_info_list)

Output:

技术分享

网络爬虫: 抓取allitebooks.com书籍信息及ISBN码, save books name and ISBN into .csv(2) from backslash112

标签：书籍 images ret next lxml bs4 title writer .com

原文地址：http://www.cnblogs.com/XinZhou-Annie/p/7157240.html

踩

(0)

评论一句话评论（0）

分享档案

更多>

2021年07月29日 (22)
2021年07月28日 (40)
2021年07月27日 (32)
2021年07月26日 (79)
2021年07月23日 (29)
2021年07月22日 (30)
2021年07月21日 (42)
2021年07月20日 (16)
2021年07月19日 (90)
2021年07月16日 (35)

周排行