python--批量下载豆瓣图片

时间：2016-10-07 11:15:36 阅读：215 评论：0 收藏：0 [点我收藏+]

标签：

溜达豆瓣的时候，发现一些图片，懒得一个一个扒。之前写过 C# 和 Python 版本的图片下载，因此拿之前的 Python 代码来改了改，折腾出一个豆瓣版本，方便各位使用。

# -*- coding:utf8 -*-
import urllib2, urllib, socket
import re
import requests
from lxml import etree
import os, time

# Timeout, in seconds, used for the network requests made by this script
# (passed to urllib2.urlopen and to socket.setdefaulttimeout).
DEFAULT_DOWNLOAD_TIMEOUT = 30


class AppURLopener(urllib.FancyURLopener):
    """``FancyURLopener`` subclass that overrides the ``version`` attribute.

    Per the Python 2 ``urllib`` documentation, ``version`` is the user-agent
    string the opener sends; here it is set to a browser-like value.
    """
    version = "Mozilla/4.0 (compatible; MSIE 6.0; Windows NT)"


def check_save_path(save_path):
    """Create the directory ``save_path`` (including parents) if it is missing.

    Does nothing when the path already exists. Returns None.
    """
    if os.path.exists(save_path):
        return
    os.makedirs(save_path)


def get_image_name(image_link):
    """Return the file name component of an image URL.

    The query string and fragment are dropped before the base name is taken,
    so ``http://host/a/b.jpg?size=large`` yields ``b.jpg`` rather than a name
    containing ``?`` (which is not a valid file name character on Windows).

    Args:
        image_link: URL (or plain path) of the image.

    Returns:
        The last path segment of ``image_link``; an empty string when the
        link is empty or ends with a slash.
    """
    # The fragment always follows the query, so cut at '#' first, then '?'.
    path_part = image_link.split("#", 1)[0].split("?", 1)[0]
    return os.path.basename(path_part)


def save_image1(image_link, save_path):
    """Download one image into ``save_path`` using ``urllib.urlretrieve``.

    The file is stored under the name returned by ``get_image_name``.

    Args:
        image_link: URL of the image to download.
        save_path: Existing directory that receives the file.

    Returns:
        True when the download completed, False when any exception occurred
        (the error is printed, not re-raised).
    """
    file_name = get_image_name(image_link)
    file_path = os.path.join(save_path, file_name)
    print("准备下载{0} 到{1}".format(image_link, file_path))
    try:
        # urlretrieve goes through the module-level opener, so install the
        # opener that carries the custom user-agent string.
        urllib._urlopener = AppURLopener()
        # urlretrieve has no timeout parameter; the process-wide socket
        # default is the only way to bound the request.
        socket.setdefaulttimeout(DEFAULT_DOWNLOAD_TIMEOUT)
        # Write to the target file, not to the directory itself.
        urllib.urlretrieve(url=image_link, filename=file_path)
        return True
    except Exception as ex:
        print(ex.args)
        # Format the exception itself: ``ex.message`` is deprecated and is
        # empty for IOError/URLError, which are the errors raised here.
        print("下载文件出错:{0}".format(ex))
        return False


def save_image(image_link, save_path):
    """Download one image into ``save_path`` using ``urllib2.urlopen``.

    The image is fetched completely before the output file is opened, so a
    failed request does not leave an empty file behind.

    Args:
        image_link: URL of the image to download.
        save_path: Existing directory that receives the file.

    Returns:
        True when the image was written, False when any exception occurred
        (the error is printed, not re-raised).
    """
    file_name = get_image_name(image_link)
    file_path = os.path.join(save_path, file_name)
    print("准备下载{0} 到{1}".format(image_link, file_path))
    try:
        response = urllib2.urlopen(url=image_link, timeout=DEFAULT_DOWNLOAD_TIMEOUT)
        try:
            image_data = response.read()
        finally:
            # urllib2 responses are not context managers in Python 2.
            response.close()
        with open(file_path, "wb") as file_handler:
            file_handler.write(image_data)
        return True
    except Exception as ex:
        # Format the exception itself: ``ex.message`` is deprecated and is
        # empty for IOError/URLError, which are the errors raised here.
        print("下载文件出错:{0}".format(ex))
        return False


def get_thumb_picture_link(thumb_page_link):
    """Collect the thumbnail image URLs found on one album page.

    The page is fetched with ``urllib2`` and parsed with ``lxml``; the ``src``
    of every ``img`` inside ``div.photo_wrap > a.photolst_photo`` is returned.

    Args:
        thumb_page_link: URL of the album page to scan.

    Returns:
        A list of image URLs in document order, or an empty list when the
        page cannot be fetched or parsed (the error is printed).
    """
    try:
        response = urllib2.urlopen(url=thumb_page_link, timeout=DEFAULT_DOWNLOAD_TIMEOUT)
        try:
            html_content = response.read()
        finally:
            # urllib2 responses are not context managers in Python 2.
            response.close()
        html_tree = etree.HTML(html_content)
        return list(html_tree.xpath(
            '//div[@class="photo_wrap"]/a[@class="photolst_photo"]/img/@src'))
    except Exception as ex:
        print(ex)
        return []


def download_pictures(album_link, min_page_id, max_page_id, picture_count_per_page, save_path):
    """Download the pictures of an album, page by page.

    For each thumbnail found on a page, the "large" variant is tried first
    and the "photo" variant is used as a fallback when that download fails.

    Args:
        album_link: Base URL of the album; ``?start=N`` is appended to it.
        min_page_id: Zero-based index of the first page to download.
        max_page_id: Page index at which to stop (exclusive).
        picture_count_per_page: Number of pictures per album page, used to
            compute the ``start`` offset of each page.
        save_path: Directory that receives the files; created if missing.
    """
    check_save_path(save_path)
    # Honour the caller's starting page instead of always restarting at 0.
    for page_id in range(min_page_id, max_page_id):
        thumb_page_link = album_link + "?start={0}".format(page_id * picture_count_per_page)
        for thumb_picture_link in get_thumb_picture_link(thumb_page_link):
            full_picture_link = thumb_picture_link.replace("photo/thumb", "photo/large")
            if not save_image(image_link=full_picture_link, save_path=save_path):
                # Fall back to the "photo" variant of the same picture.
                fallback_link = thumb_picture_link.replace("photo/thumb", "photo/photo")
                save_image(image_link=fallback_link, save_path=save_path)
            # Pause between pictures to keep the request rate low.
            time.sleep(1)
    print("下载完成")


# Local folder where the downloaded pictures are saved.
save_path = "J:\\douban\\meiren2"
# Album URL; note that it ends with a forward slash.
album_link = "https://www.douban.com/photos/album/43697061/"
# Zero-based index of the first album page to download.
min_page_id = 0
# Total number of pages in the album.
max_page_id = 9
# Number of pictures per page (18 by default).
picture_count_per_page = 18
# download_pictures takes five arguments; min_page_id was missing from the call.
download_pictures(album_link, min_page_id, max_page_id, picture_count_per_page, save_path)

View Code

=============================================================

相对 urllib2 来说，urllib 真的比较坑：如果不设置 User-Agent，下载速度会超慢无比；另外还需要调用 socket 模块来设置超时时间，比较折腾；最终可能还会踩到其他坑里去，比如我下着下着就被豆瓣给“屏蔽”啦。所以建议使用 urllib2。