标签:
#!/usr/bin/env python
# -*- coding: utf-8 -*-
"""Download the images of an imgur album into a local folder.

Usage: python <this script> ALBUM_URL [FOLDER]

The album's ``/noscript`` page is fetched, image links are extracted from it
with a regular expression, and every image is saved under FOLDER (default:
the album key taken from the URL).  Downloads run concurrently on gevent
greenlets when gevent is installed, and one after another otherwise.

Runs on Python 2.6+ and Python 3.
"""

# gevent's monkey patching should happen as early as possible, before the
# modules it patches (socket, ssl, ...) are pulled in by urllib.
try:
    import gevent
    from gevent import monkey
except ImportError:
    # gevent is optional: without it the images are downloaded sequentially.
    gevent = None
else:
    monkey.patch_all()

import contextlib
import os
import random
import re
import sys

try:  # Python 2
    import urllib2
    from urlparse import urlparse
except ImportError:  # Python 3: same functions, different module names
    import urllib.request as urllib2
    from urllib.parse import urlparse


# Captures (full URL, file name, extension) for every matching <img> tag.
# NOTE(review): only plain-http i.imgur.com links with a 5-character image id
# match; confirm this still fits the markup imgur serves.
IMAGE_RE = re.compile(
    r'<img src="(http\:\/\/i\.imgur\.com\/([a-zA-Z0-9]{5}\.(jpg|png|gif)))"')


def get(url):
    """Open *url* and return the response object.

    The caller is responsible for reading and closing the response.

    Raises:
        URLError (or its subclass HTTPError) if the request fails.  The
        previous ``try/except`` only wrapped the ``Request`` constructor,
        which never raises those, so errors have always propagated from here.
    """
    opener = urllib2.build_opener()
    # TODO: Write appropriate headers, e.g.
    #     opener.addheaders = [('User-Agent', '...')]
    # The attribute is ``addheaders``; the old code assigned a placeholder to
    # a misspelled ``add_headers``, which had no effect on the request.
    return opener.open(urllib2.Request(url))


def is_url(url):
    """Return True when the host part of *url* contains ``imgur.com``."""
    return 'imgur.com' in urlparse(url).netloc


def fetch(url):
    """Download the ``/noscript`` page of the album at *url*.

    The album key is the third ``/``-separated piece of the URL path, i.e.
    ``KEY`` in ``https://imgur.com/a/KEY``.

    Returns:
        A ``(page, key)`` tuple: the page body exactly as returned by
        ``read()`` and the album key.

    Raises:
        IndexError: if the URL path has fewer than three pieces.
        URLError: if the page cannot be downloaded.
    """
    key = urlparse(url).path.split('/')[2]
    page_url = 'https://imgur.com/a/%s/noscript' % key
    with contextlib.closing(get(page_url)) as response:
        return response.read(), key


def extract_images(data):
    """Return the ``(url, filename, extension)`` tuples found in *data*.

    *data* may be text or the raw bytes read from the network.  On Python 3
    bytes are decoded (UTF-8, undecodable bytes replaced) because a text
    pattern cannot be applied to bytes; on Python 2 the byte string is
    matched as-is, exactly as before.
    """
    if bytes is not str and isinstance(data, bytes):
        data = data.decode('utf-8', 'replace')
    return IMAGE_RE.findall(data)


def get_or_create_folder(key, folder=None):
    """Ensure the destination directory exists and return its name.

    Args:
        key: album key, used as the directory name when *folder* is None.
        folder: explicit directory name overriding *key*.

    Raises:
        OSError: if the directory cannot be created.
    """
    foldername = key if folder is None else folder
    try:
        os.makedirs(foldername)
    except OSError:
        # An already existing directory is fine (including the case where
        # somebody else created it in the meantime); anything else is real.
        if not os.path.isdir(foldername):
            raise
    return foldername


def fetch_images(foldername, images):
    """Download a single image into *foldername*.

    Args:
        foldername: existing directory to write into.
        images: one match from ``extract_images``; ``images[0]`` is the
            image URL and ``images[1]`` the file name to save it under.

    Raises:
        URLError / IOError / OSError if the download or the write fails.
    """
    if gevent is not None:
        # Sleep 0 or 0.1 ms so the other greenlets get a chance to start.
        gevent.sleep(random.randint(0, 1) * 0.0001)
    source, filename = images[0], images[1]
    # Download before opening the file so a failed request does not leave
    # an empty file behind.
    with contextlib.closing(get(source)) as response:
        payload = response.read()
    with open(os.path.join(foldername, filename), 'wb') as img:
        img.write(payload)
    print('Done:\t%s' % source)


def save(url, folder=None):
    """Fetch the album at *url* and prepare its destination directory.

    Returns:
        A ``(foldername, images)`` tuple where *images* is the list produced
        by ``extract_images`` for the album page.
    """
    data, key = fetch(url)
    images = extract_images(data)
    foldername = get_or_create_folder(key, folder)
    return foldername, images


def _download(foldername, image):
    """Run ``fetch_images`` and report a failure instead of raising it.

    Returns True on success and False when the download or write failed, so
    one broken image does not abort the remaining downloads.
    """
    try:
        fetch_images(foldername, image)
    except EnvironmentError as exc:  # URLError, socket and file errors
        sys.stderr.write('Failed:\t%s (%s)\n' % (image[0], exc))
        return False
    return True


def main(argv=None):
    """Command-line entry point: ``ALBUM_URL [FOLDER]``.

    Exits with a non-zero status when the album cannot be fetched or when at
    least one image failed to download.
    """
    args = sys.argv[1:] if argv is None else argv
    if not args:
        sys.exit('usage: %s ALBUM_URL [FOLDER]' % sys.argv[0])
    url = args[0]
    folder = args[1] if len(args) > 1 else None

    try:
        foldername, images = save(url, folder=folder)
    except IndexError:
        sys.exit('error: no album key found in URL: %s' % url)
    except EnvironmentError as exc:
        sys.exit('error: %s: %s' % (url, exc))

    if not images:
        sys.stderr.write('No images found in the album page\n')

    if gevent is None:
        results = [_download(foldername, image) for image in images]
    else:
        threads = [gevent.spawn(_download, foldername, image)
                   for image in images]
        gevent.joinall(threads)
        # ``value`` is None for a greenlet that died with an exception.
        results = [thread.value for thread in threads]

    if not all(results):
        sys.exit(1)


if __name__ == '__main__':
    main()
标签:
原文地址:http://www.cnblogs.com/bergus/p/4592772.html