python中urllib2与多线程使用

时间：2015-08-15 20:01:43 阅读：161 评论：0 收藏：0 [点我收藏+]

问题提出
几天前，我在上一篇博客中写了如何使用urllib2模块来批量下载wallhaven上的图片资源，但是在几次运行之后我发现了一个非常严重的问题：如果下载的图片数量非常多，程序需要运行很长时间。显然这不是一个很好的解决方法，所以后来我在程序中加入了多线程，程序性能提升了何止数倍，下面是具体的解决过程。
问题解决
从我上一篇博客中不难看出，第一次的下载程序每次只能下载一张图片，这样完全浪费了计算机的内存资源和网络资源。所以之后我加入了多线程，每次可以根据不同的需求开启更多的下载线程，下面是源代码：

# -*- coding: utf-8 -*-
"""
Created on Tue Aug 11 10:25:34 2015

@author: Gryps
"""

from Queue import Queue
from threading import Thread
from time  import ctime,time,sleep
from urllib2 import HTTPError,URLError
from pathlib import Path

import urllib2,sys,math,os,socket

def down(down_dir,link):
    """
    Download the image at the given link and store it in the designated directory.

    The file is saved inside down_dir under the last path component of the
    link (os.path.basename(link)).

    Args:
        down_dir:the image storage folder; it is combined with the file name
            using the "/" operator and the result is opened with .open(),
            so a pathlib.Path is expected
        link:the download link
    Raises:
        HTTPError:the server answered with an HTTP error status
        URLError:the server could not be reached
        socket.error:an error occurred on the TCP/IP connection
        Any error raised while opening or writing the target file is
        propagated unchanged.
    """
    req = urllib2.Request(link)
    req.add_header("User-Agent", "Mozilla 5.0")
    conn = urllib2.urlopen(req)
    try:
        # Fetch the whole body before touching the disk so that a failed
        # transfer does not leave an empty or truncated file behind.
        data = conn.read()
    finally:
        # The urllib2 response object is closed explicitly so the
        # connection is released even when the read fails.
        conn.close()
    target = down_dir / os.path.basename(link)
    with target.open('wb') as f:
        f.write(data)

def setup_dir():
    """
    Provide the folder in which the downloaded images are stored.
    Return:
        a Path object for the fixed storage folder "pic" on drive H:
    """
    return Path("H:\\pic")

def get_link(start,stop):
    """
    Build the download links for a range of wallhaven image indexes.
    Args:
        start:the first image index (inclusive)
        stop:the index at which to stop (exclusive, as with range())
    Return:
        links:list with one full-size image URL per index, in ascending
            order; empty when start >= stop
    """
    prefix = 'https://wallpapers.wallhaven.cc/wallpapers/full/wallhaven-'
    return [prefix + str(x) + '.jpg' for x in range(start, stop)]

class DownloadWorker(Thread):
    """
    Worker thread that downloads images taken from a shared queue.

    Every queue item is a (down_dir, link) tuple which is passed to down().
    Function:
        __init__():store the queue the worker reads its jobs from
        run():fetch jobs forever and download them one by one

    run() never returns on its own, so the thread should be started as a
    daemon and the caller should wait for completion with queue.join().
    """
    def __init__(self,queue):
        Thread.__init__(self)
        self.queue = queue

    def run(self):
        while True:
            down_dir, link = self.queue.get()
            name = os.path.basename(link)
            print(name + "\n")
            try:
                down(down_dir,link)
            except HTTPError as e:
                print('{}: {}'.format(name, e.reason))
            except URLError as e:
                print('{}: {}'.format(name, e.reason))
            except socket.error as e:
                print('{}: {}'.format(name, e))
            except Exception as e:
                # Thread boundary: report anything else (e.g. a failure to
                # write the file) and keep the worker alive for the
                # remaining jobs instead of letting the thread die.
                print('{}: {}'.format(name, e))
            finally:
                # Always acknowledge the job; a missing task_done() would
                # make queue.join() wait forever.
                self.queue.task_done()

def main():
    """
    Ask for an index range and a thread count, then download all images
    in that range concurrently and report the elapsed time.

    Raises:
        ValueError:one of the answers typed at the prompts is not an integer
    """
    # int(raw_input()) instead of input(): Python 2's input() evaluates
    # whatever the user types as a Python expression.
    start = int(raw_input("Please input the start value: "))
    stop = int(raw_input("Please input the stop value: "))
    # set the number of threads
    threads = int(raw_input("Please input the download numbers every piece: "))
    # start the clock only after the prompts so that the reported duration
    # covers the download and not the time spent typing
    ts = time()
    # acquire the download links
    links = get_link(start,stop)
    # set the download storage directory
    down_dir = setup_dir()
    queue = Queue()
    # Never start more workers than there are links, but always start at
    # least one when there is work to do: with no worker running,
    # queue.join() below would block forever.
    for _ in range(min(len(links), max(threads, 1))):
        worker = DownloadWorker(queue)
        # daemon threads do not keep the process alive once main() returns
        worker.daemon = True
        worker.start()
    # traverse the links and put the link to queue
    for link in links:
        queue.put((down_dir,link))
    # block until every queued link has been processed
    queue.join()
    print('Took {}'.format(time()-ts))
    print("The finished time:" + ctime())
# Run the downloader only when this file is executed as a script,
# not when it is imported as a module.
if __name__ == '__main__':
    main()

标签：python 多线程性能 urllib2

原文地址：http://blog.csdn.net/yuanpengfred/article/details/47684809

踩

(0)

评论一句话评论（0）

分享档案

更多>

2021年07月29日 (22)
2021年07月28日 (40)
2021年07月27日 (32)
2021年07月26日 (79)
2021年07月23日 (29)
2021年07月22日 (30)
2021年07月21日 (42)
2021年07月20日 (16)
2021年07月19日 (90)
2021年07月16日 (35)

周排行