
python - multithreading / multiprocessing



  Multithreading:

import threading
from multiprocessing import Queue
from time import sleep
from bs4 import BeautifulSoup
from requests import get
import re

class myThread(threading.Thread):
    def __init__(self, qlock, queue):
        threading.Thread.__init__(self)
        self.qlock = qlock
        self.queue = queue

    def run(self):
        process(self.qlock, self.queue)

def process(qlock, queue):
    qlock.acquire()  # acquire the mutex
    try:
        data = queue.get()  # take an item from the queue
        print(data)
    finally:
        qlock.release()  # release the lock
    sleep(1)

# set up the queue
workQueue = Queue(50)
qlock = threading.Lock()

url = 'https://www.pixiv.net/ranking.php?mode=daily'

r = get(url, timeout=1)
html = r.text
soup = BeautifulSoup(html, 'lxml')

urls = soup.find_all('img')

links = []
for u in urls:
    r = re.compile(r'data-src="(.+?)"')
    link = r.findall(str(u))
    workQueue.put(link)  # enqueue the match list
    links.append(link)

threads = []
for u in links:
    thread = myThread(qlock, workQueue)
    thread.daemon = True
    thread.start()
    threads.append(thread)

# busy-wait until the queue has been drained
while not workQueue.empty():
    pass

# wait for the threads to finish
for t in threads:
    t.join()
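
  Side note: within a single process, the standard library's queue.Queue is already thread-safe, so the explicit Lock above is not strictly needed. A minimal sketch of the same consumer pattern with queue.Queue and sentinel values; the data is a placeholder:

import threading
from queue import Queue  # thread-safe within one process; no Lock required

workQueue = Queue()

def worker():
    while True:
        data = workQueue.get()  # blocks until an item is available
        if data is None:        # sentinel value: no more work
            break
        print(data)

threads = [threading.Thread(target=worker) for _ in range(4)]
for t in threads:
    t.start()

for item in ['a', 'b', 'c']:  # placeholder data
    workQueue.put(item)

for _ in threads:
    workQueue.put(None)  # one sentinel per worker
for t in threads:
    t.join()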

  Multiprocessing:

  1. Using the Pool module to create a process pool:

from multiprocessing import Pool
from bs4 import BeautifulSoup
from requests import get
import re
import os

def run_process(url):
    print(url)

if __name__ == '__main__':
    url = 'https://www.pixiv.net/ranking.php?mode=daily'
    html = get(url, timeout=1).text
    soup = BeautifulSoup(html, 'lxml')
    urls = soup.find_all('img')

    links = []
    for u in urls:
        r = re.compile(r'data-src="(.+?\.jpg)"')
        link = r.findall(str(u))
        links.append(link)

    process = Pool(os.cpu_count())  # one worker per CPU core
    for u in links:
        process.apply_async(run_process, args=(u,))
    process.close()
    process.join()
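
  If the apply_async handles are not needed, the same pool reads more simply with Pool.map, which blocks until every task completes. A sketch under the same setup; the links list is replaced by placeholder URLs:

from multiprocessing import Pool
import os

def run_process(url):
    print(url)

if __name__ == '__main__':
    links = ['https://example.com/a.jpg', 'https://example.com/b.jpg']  # placeholders
    with Pool(os.cpu_count()) as pool:  # the context manager closes the pool on exit
        pool.map(run_process, links)    # blocks until all tasks have finished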

  2. Using the Process and Queue modules for inter-process communication (though my queue-filling step does not use multiple processes):

from multiprocessing import Process, Queue
from bs4 import BeautifulSoup
from requests import get
import re

class myProcess(Process):
    def __init__(self, queue):
        Process.__init__(self)
        self.queue = queue

    def run(self):
        run_process(self.queue)

def run_process(queue):
    data = queue.get()
    print(data)

if __name__ == '__main__':
    url = 'https://www.pixiv.net/ranking.php?mode=daily'
    html = get(url, timeout=1).text
    soup = BeautifulSoup(html, 'lxml')
    urls = soup.find_all('img')

    queue = Queue(50)
    links = []
    for u in urls:
        r = re.compile(r'data-src="(.+?\.jpg)"')
        link = r.findall(str(u))
        queue.put(link)
        links.append(link)

    processes = []
    for u in links:
        process = myProcess(queue)
        process.start()
        processes.append(process)

    # busy-wait until the queue has been drained
    while not queue.empty():
        pass

    # wait for every worker process to finish
    for p in processes:
        p.join()
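
  The busy-wait loop can also be avoided with multiprocessing.JoinableQueue: each worker calls task_done() per item, and the parent blocks in queue.join() until everything has been processed. A minimal sketch with placeholder data:

from multiprocessing import Process, JoinableQueue

def run_process(queue):
    while True:
        data = queue.get()
        if data is None:       # sentinel value: stop this worker
            queue.task_done()
            break
        print(data)
        queue.task_done()      # mark the item as processed

if __name__ == '__main__':
    queue = JoinableQueue()
    workers = [Process(target=run_process, args=(queue,)) for _ in range(4)]
    for w in workers:
        w.start()
    for item in ['a', 'b', 'c']:  # placeholder data
        queue.put(item)
    for _ in workers:
        queue.put(None)  # one sentinel per worker
    queue.join()         # blocks until every item is marked done
    for w in workers:
        w.join()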

  The second version is noticeably slower than the first; I'm not sure why... (presumably the cost of spawning one process per link, while the pool reuses a fixed set of workers).
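  To make that comparison concrete, the two approaches can be wrapped in time.perf_counter(). A sketch with a placeholder worker:

from multiprocessing import Pool, Process
from time import perf_counter

def work(x):
    print(x)

if __name__ == '__main__':
    items = list(range(50))  # placeholder data

    start = perf_counter()
    with Pool(4) as pool:
        pool.map(work, items)
    print('pool of reused workers:', perf_counter() - start)

    start = perf_counter()
    procs = [Process(target=work, args=(i,)) for i in items]
    for p in procs:
        p.start()
    for p in procs:
        p.join()
    print('one process per item:', perf_counter() - start)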

  But the above is only CPU-bound work. Let's test a small IO-bound crawler to see the effect:

  1. Multithreading:

import threading
from multiprocessing import Queue
from time import sleep
from bs4 import BeautifulSoup
from requests import get
import re

class myThread(threading.Thread):
    def __init__(self, qlock, queue):
        threading.Thread.__init__(self)
        self.qlock = qlock
        self.queue = queue

    def run(self):
        process(self.qlock, self.queue)

def process(qlock, queue):
    qlock.acquire()  # acquire the mutex
    try:
        url = queue.get()[0]  # take a URL from the queue
        img = get(url, timeout=1).content
        name = url.split('/')[-1]
        imgid = name[:8]
        with open('C:/Users/adimin/Desktop/video/{}.jpg'.format(imgid), 'wb') as fp:
            fp.write(img)
        print('download: ' + url)
    finally:
        qlock.release()  # release the lock
    sleep(1)

# set up the queue
workQueue = Queue(50)
qlock = threading.Lock()

url = 'https://www.pixiv.net/ranking.php?mode=daily'

html = get(url, timeout=1).text
soup = BeautifulSoup(html, 'lxml')
urls = soup.find_all('img')

links = []
for u in urls:
    r = re.compile(r'data-src="(.+?\.jpg)"')
    link = r.findall(str(u))
    if link:                 # skip <img> tags without a jpg data-src
        workQueue.put(link)  # enqueue the match
        links.append(link)

threads = []
for u in links:
    thread = myThread(qlock, workQueue)
    thread.start()
    threads.append(thread)

# busy-wait until the queue has been drained
while not workQueue.empty():
    pass

# wait for the threads to finish
for t in threads:
    t.join()
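
  For comparison, concurrent.futures.ThreadPoolExecutor manages the queue and the worker threads internally. A sketch of the same downloader, where download_img mirrors the body of process() above and the URL list is a placeholder:

from concurrent.futures import ThreadPoolExecutor
from requests import get

def download_img(url):
    # mirrors the body of process() above, without the explicit lock and queue
    img = get(url, timeout=1).content
    imgid = url.split('/')[-1][:8]
    with open('C:/Users/adimin/Desktop/video/{}.jpg'.format(imgid), 'wb') as fp:
        fp.write(img)
    print('download: ' + url)

urls = ['https://example.com/a.jpg']  # placeholder list of extracted jpg URLs
with ThreadPoolExecutor(max_workers=8) as pool:
    pool.map(download_img, urls)  # tasks finish before the with-block exits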

  2. Multiprocessing:

from multiprocessing import Process, Queue
from bs4 import BeautifulSoup
from requests import get
import re

class myProcess(Process):
    def __init__(self, queue):
        Process.__init__(self)
        self.queue = queue

    def run(self):
        run_process(self.queue)

def run_process(queue):
    url = queue.get()[0]  # take a URL from the queue
    img = get(url, timeout=1).content
    name = url.split('/')[-1]
    imgid = name[:8]
    with open('C:/Users/adimin/Desktop/video/{}.jpg'.format(imgid), 'wb') as fp:
        fp.write(img)
    print('download: ' + url)

if __name__ == ‘__main__‘:
    url = 'https://www.pixiv.net/ranking.php?mode=daily'
    html = get(url, timeout=1).text
    soup = BeautifulSoup(html, 'lxml')
    urls = soup.find_all('img')

    queue = Queue(50)
    links = []
    for u in urls:
        r = re.compile(r'data-src="(.+?\.jpg)"')
        link = r.findall(str(u))
        if link:  # skip <img> tags without a jpg data-src
            queue.put(link)
            links.append(link)

    processes = []
    for u in links:
        process = myProcess(queue)
        process.start()
        processes.append(process)

    # busy-wait until the queue has been drained
    while not queue.empty():
        pass

    # wait for every worker process to finish
    for p in processes:
        p.join()

  In the end, the running times feel about the same... I still can't see much of a gap. That is plausible for IO-bound work: a thread releases the GIL while it waits on the network, so threads and processes both spend most of their time waiting.

 

Original post: https://www.cnblogs.com/darkchii/p/8463147.html