标签:
当写一个任务较多的爬虫时需要用到线程,如果说我有一群虫子,我把它分成4队,每一队虫子都有自己的任务,互相执行自己的任务而不干扰。线程分为串行线程和并行线程,串行线程是线程1执行完执行线程2,并行线程是线程3和线程4同时进行,加快任务完成速度。
目标网站:http://jandan.net/,爬取网站的所有图片,因工作量太大,只取几页做实验。
编程思路:1队虫子首先爬出所有需要爬取的URL放入队列中,1队虫子任务结束。然后2队虫子从队列中逐个取出URL进行分析爬取该URL下的所有需要下载的img_url,放入列表中,2队虫子任务结束。然后3队虫子和4队虫子从列表中各取一半任务同时进行下载。
import threading
from time import time
from Queue import Queue
import urllib2,urllib
from bs4 import BeautifulSoup
# --- shared module state for the four crawler threads ---
# NOTE(review): the published snippet used curly quotes (mojibake); plain
# ASCII quotes are restored here, otherwise nothing below even parses.
start = time()  # wall-clock start; progress prints report elapsed time
baseurl = 'http://jandan.net/ooxx/'  # base URL of the target gallery
threads1 = []  # stage 1/2 threads: URL producer + page parser (run serially)
threads2 = []  # stage 3 threads: the two image downloaders (run in parallel)
# WARNING: shadows the builtin `list`. Kept under this name because every
# other class in this file appends to / slices it as a module global.
list = []
class Geturl(threading.Thread):
def __init__(self,G_name,queue):
threading.Thread.__init__(self,name = G_name)
self.data1 = queue
def run(self):
for pageid in range(1000,1005):
url = baseurl+‘page-%s‘%pageid
self.data1.put(url)
print ‘Geturl:There is %d pages‘%self.data1.qsize()
class Jpgurl(threading.Thread):
def __init__(self,J_name,queue):
threading.Thread.__init__(self,name = J_name)
self.data2 = queue
def run(self):
while not self.data2.empty():
pageurl = self.data2.get()
print pageurl
req_header = {‘User-Agent‘:‘Mozilla/5.0 (Windows; U; Windows NT 6.1; en-US; rv:1.9.1.6) Gecko/20091201 Firefox/3.5.6‘}
req_timeout = 20
req = urllib2.Request(pageurl,None,req_header)
html = urllib2.urlopen(req,None,req_timeout)
#html = urllib2.urlopen(pageurl).read()
soup = BeautifulSoup(html)
img = soup.find_all([‘img‘])
print ‘img is %d‘%len(img)
for myimg in img:
Jpgurl = myimg.get(‘src‘)
list.append(Jpgurl)
print ‘list is %d‘%len(list)
print ‘Jpgurl is finished! Yong shi:%s‘%(time()-start)
class Downjpg1(threading.Thread):
def __init__(self,queue):
threading.Thread.__init__(self)
def run(self):
list1 = list[:len(list)/2]
for downurl1 in list1:
urllib.urlretrieve(downurl1,‘D:/Python/picture‘+‘/‘+downurl1[-11:])
print ‘Now is loading the %dst picture‘%list.index(downurl1)
class Downjpg2(threading.Thread):
def __init__(self,queue):
threading.Thread.__init__(self)
def run(self):
list2 = list[len(list)/2:]
for downurl2 in list2:
urllib.urlretrieve(downurl2,‘D:/Python/picture‘+‘/‘+downurl2[-11:])
print ‘Now is loading the %dst picture‘%list.index(downurl2)
def main():
    """Wire up the four crawler threads: serial stage 1/2, parallel stage 3."""
    queue = Queue()
    t1 = Geturl('a', queue)
    t2 = Jpgurl('b', queue)
    t3 = Downjpg1(queue)
    t4 = Downjpg2(queue)
    threads1.append(t1)
    threads1.append(t2)
    threads2.append(t3)
    threads2.append(t4)
    # Stage 1 then stage 2 MUST run serially: Jpgurl's `while not empty()`
    # loop only works if Geturl has already finished filling the queue, so
    # each thread is joined before the next one starts.
    for worker in threads1:
        worker.setDaemon(True)
        worker.start()
        worker.join()
    # BUG FIX: the original joined each downloader inside its start loop,
    # which made them run one after the other. Start both first, then join
    # both, so the two downloaders truly run in parallel as the design
    # (described at the top of the post) intends.
    for worker in threads2:
        worker.setDaemon(True)
        worker.start()
    for worker in threads2:
        worker.join()
if __name__ == ‘__main__‘:
main()
print ‘Downjpg is finished! Yong shi:%s‘%(time()-start)
标签:
原文地址:http://www.cnblogs.com/pylab/p/4621630.html