Multithreading:
import threading
from multiprocessing import Queue  # note: for threads alone, the stdlib queue.Queue is the usual choice
from time import sleep
from bs4 import BeautifulSoup
from requests import get
import re

class myThread(threading.Thread):
    def __init__(self, qlock, queue):
        threading.Thread.__init__(self)
        self.qlock = qlock
        self.queue = queue

    def run(self):
        process(self.qlock, self.queue)

def process(qlock, queue):
    qlock.acquire()  # mutex (the Queue is already thread-safe, so this mainly serializes the prints)
    try:
        data = queue.get()  # read from the queue
        print(data)
    finally:
        qlock.release()  # release the lock
        sleep(1)

# build the queue
workQueue = Queue(50)
qlock = threading.Lock()
url = 'https://www.pixiv.net/ranking.php?mode=daily'
r = get(url, timeout=1)
html = r.text
soup = BeautifulSoup(html, 'lxml')
urls = soup.find_all('img')
pattern = re.compile(r'data-src="(.+?)"')  # compile once, outside the loop
links = []
for u in urls:
    link = pattern.findall(str(u))
    workQueue.put(link)  # write to the queue
    links.append(link)

threads = []
for u in links:
    thread = myThread(qlock, workQueue)
    thread.daemon = True
    thread.start()
    threads.append(thread)

# busy-wait until the queue is drained
while not workQueue.empty():
    pass

# wait for the threads to finish
for t in threads:
    t.join()
Multiprocessing:
1. Using the Pool module to create a process pool:
from multiprocessing import Pool
from bs4 import BeautifulSoup
from requests import get
import re
import os

def run_process(url):
    print(url)

if __name__ == '__main__':
    url = 'https://www.pixiv.net/ranking.php?mode=daily'
    html = get(url, timeout=1).text
    soup = BeautifulSoup(html, 'lxml')
    urls = soup.find_all('img')
    pattern = re.compile(r'data-src="(.+?\.jpg)"')  # escape the dot so it matches only ".jpg"
    links = []
    for u in urls:
        link = pattern.findall(str(u))
        links.append(link)
    process = Pool(os.cpu_count())  # one worker per CPU core
    for u in links:
        process.apply_async(run_process, args=(u,))
    process.close()  # no more tasks will be submitted
    process.join()   # wait for the workers to finish
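As a side note (my own addition, not from the original code): the apply_async loop above can be written more compactly with Pool.map, which dispatches the tasks and blocks until every one has finished:

from multiprocessing import Pool
import os

def run_process(url):
    print(url)

if __name__ == '__main__':
    links = ['001.jpg', '002.jpg']  # hypothetical stand-in for the scraped links
    with Pool(os.cpu_count()) as pool:
        pool.map(run_process, links)  # dispatches and waits in one call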
2. Using the Process and Queue modules for inter-process communication (though my queue writes are not themselves done in a separate process; a sketch of that follows the code below):
from multiprocessing import Process, Queue
from bs4 import BeautifulSoup
from requests import get
import re

class myProcess(Process):
    def __init__(self, queue):
        Process.__init__(self)
        self.queue = queue

    def run(self):
        run_process(self.queue)

def run_process(queue):
    data = queue.get()  # read from the queue
    print(data)

if __name__ == '__main__':
    url = 'https://www.pixiv.net/ranking.php?mode=daily'
    html = get(url, timeout=1).text
    soup = BeautifulSoup(html, 'lxml')
    urls = soup.find_all('img')
    queue = Queue(50)
    pattern = re.compile(r'data-src="(.+?\.jpg)"')
    links = []
    for u in urls:
        link = pattern.findall(str(u))
        queue.put(link)  # write to the queue (still in the main process)
        links.append(link)
    processes = []
    for u in links:
        p = myProcess(queue)
        p.start()
        processes.append(p)
    while not queue.empty():  # busy-wait until the queue is drained
        pass
    for p in processes:  # join every process, not just the last one
        p.join()
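If the queue writes should also happen in another process, one option is a dedicated producer process feeding the queue while a consumer reads from it. A minimal sketch of mine, with a dummy link list standing in for the scraped URLs and a None sentinel to stop the consumer:

from multiprocessing import Process, Queue

def producer(queue, links):
    for link in links:
        queue.put(link)  # the writes now happen in their own process
    queue.put(None)      # sentinel: tells the consumer to stop

def consumer(queue):
    while True:
        data = queue.get()
        if data is None:
            break
        print(data)

if __name__ == '__main__':
    queue = Queue(50)
    links = ['001.jpg', '002.jpg', '003.jpg']  # hypothetical stand-in
    p = Process(target=producer, args=(queue, links))
    c = Process(target=consumer, args=(queue,))
    p.start()
    c.start()
    p.join()
    c.join()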
The second version is noticeably slower than the first; I didn't know why at first, but a likely reason is that it spawns a brand-new process for every link, and process creation (plus pickling the queue data between processes) is expensive, whereas Pool creates a fixed set of workers once and reuses them.
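To check that explanation, here is a minimal timing sketch of my own (a trivial task, not the crawler) that makes the spawning overhead visible:

from multiprocessing import Process, Pool
from time import perf_counter

def work(x):
    return x * x

if __name__ == '__main__':
    items = list(range(100))

    # strategy 1: one new process per task
    start = perf_counter()
    procs = [Process(target=work, args=(i,)) for i in items]
    for p in procs:
        p.start()
    for p in procs:
        p.join()
    print('one Process per task:', perf_counter() - start)

    # strategy 2: a fixed pool of reused workers
    start = perf_counter()
    with Pool(4) as pool:
        pool.map(work, items)
    print('one reused Pool     :', perf_counter() - start)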
But the above is only CPU-bound work; let's test an IO-bound mini crawler and see the effect:
1. Multithreading:
import threading
from multiprocessing import Queue
from bs4 import BeautifulSoup
from requests import get
import re

class myThread(threading.Thread):
    def __init__(self, qlock, queue):
        threading.Thread.__init__(self)
        self.qlock = qlock
        self.queue = queue

    def run(self):
        process(self.qlock, self.queue)

def process(qlock, queue):
    with qlock:  # hold the mutex only for the queue read; holding it across the
        url = queue.get()[0]  # download would serialize the threads completely
    img = get(url, timeout=1).content
    name = url.split('/')[-1]
    imgid = name[:8]
    with open('C:/Users/adimin/Desktop/video/{}.jpg'.format(imgid), 'wb') as fp:
        fp.write(img)
    print('download: ' + url)

# build the queue
workQueue = Queue(50)
qlock = threading.Lock()
url = 'https://www.pixiv.net/ranking.php?mode=daily'
html = get(url, timeout=1).text
soup = BeautifulSoup(html, 'lxml')
urls = soup.find_all('img')
pattern = re.compile(r'data-src="(.+?\.jpg)"')
links = []
for u in urls:
    link = pattern.findall(str(u))
    workQueue.put(link)  # write to the queue
    links.append(link)

threads = []
for u in links:
    thread = myThread(qlock, workQueue)
    thread.start()
    threads.append(thread)

# busy-wait until the queue is drained
while not workQueue.empty():
    pass

# wait for the threads to finish
for t in threads:
    t.join()
2. Multiprocessing:
from multiprocessing import Process, Queue
from bs4 import BeautifulSoup
from requests import get
import re

class myProcess(Process):
    def __init__(self, queue):
        Process.__init__(self)
        self.queue = queue

    def run(self):
        run_process(self.queue)

def run_process(queue):
    url = queue.get()[0]  # read from the queue
    img = get(url, timeout=1).content
    name = url.split('/')[-1]
    imgid = name[:8]
    with open('C:/Users/adimin/Desktop/video/{}.jpg'.format(imgid), 'wb') as fp:
        fp.write(img)
    print('download: ' + url)

if __name__ == '__main__':
    url = 'https://www.pixiv.net/ranking.php?mode=daily'
    html = get(url, timeout=1).text
    soup = BeautifulSoup(html, 'lxml')
    urls = soup.find_all('img')
    queue = Queue(50)
    pattern = re.compile(r'data-src="(.+?\.jpg)"')
    links = []
    for u in urls:
        link = pattern.findall(str(u))
        queue.put(link)  # write to the queue
        links.append(link)
    processes = []
    for u in links:
        p = myProcess(queue)
        p.start()
        processes.append(p)
    while not queue.empty():  # busy-wait until the queue is drained
        pass
    for p in processes:  # join every process, not just the last one
        p.join()
In the end, the run times feel about the same either way... the gap is still hard to see. That is not too surprising: for IO-bound work like downloading, both threads and processes spend most of their time waiting on the network, so both approaches overlap that waiting about equally well, and with only ~50 small images the startup overhead and network latency swamp any remaining difference.
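To make the comparison more visible, one could time the whole run and scale up the workload. Here is a sketch of mine using concurrent.futures, with a sleep standing in for the network IO (swap in the real download function to test for real):

from concurrent.futures import ThreadPoolExecutor, ProcessPoolExecutor
from time import perf_counter, sleep

def fake_download(url):
    sleep(0.2)  # stand-in for one network round trip

if __name__ == '__main__':
    links = ['{:03d}.jpg'.format(i) for i in range(50)]
    for executor_cls in (ThreadPoolExecutor, ProcessPoolExecutor):
        start = perf_counter()
        with executor_cls(max_workers=8) as ex:
            list(ex.map(fake_download, links))  # dispatch all tasks and wait
        print(executor_cls.__name__, round(perf_counter() - start, 2), 's')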