
Crawler Performance


1. Single process, single thread: each request blocks and waits
import requests

def fetch_async(url):
    response = requests.get(url)
    return response

url_list = ['http://www.github.com', 'http://www.baidu.com', 'http://www.bing.com']

# requests.get() blocks, so the URLs are fetched strictly one after another
for url in url_list:
    fetch_async(url)
    
Multi-threaded execution
from concurrent.futures import ThreadPoolExecutor
import requests

def fetch_async(url):
    response = requests.get(url)
    return response

url_list = ['http://www.github.com', 'http://www.baidu.com', 'http://www.bing.com']
pool = ThreadPoolExecutor(5)  # at most 5 worker threads
for url in url_list:
    pool.submit(fetch_async, url)
pool.shutdown(wait=True)  # wait for all submitted tasks to finish


Multi-threading + callback
from concurrent.futures import ThreadPoolExecutor
import requests

def fetch_async(url):
    response = requests.get(url)
    return response

def callback(future):
    print(future.result())

url_list = ['http://www.github.com', 'http://www.baidu.com', 'http://www.bing.com']
pool = ThreadPoolExecutor(5)
for url in url_list:
    v = pool.submit(fetch_async, url)
    # the callback receives the finished Future; result() returns the response
    v.add_done_callback(callback)

pool.shutdown(wait=True)
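
A callback-free alternative, for comparison, is to collect the futures and iterate over them as they complete; a minimal sketch with concurrent.futures.as_completed:

from concurrent.futures import ThreadPoolExecutor, as_completed
import requests

def fetch_async(url):
    return requests.get(url)

url_list = ['http://www.github.com', 'http://www.baidu.com', 'http://www.bing.com']

with ThreadPoolExecutor(5) as pool:
    futures = [pool.submit(fetch_async, url) for url in url_list]
    # yields each future as soon as its request finishes, regardless of submit order
    for future in as_completed(futures):
        print(future.result())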

Multi-process execution
from concurrent.futures import ProcessPoolExecutor
import requests

def fetch_async(url):
    response = requests.get(url)
    return response

if __name__ == '__main__':
    url_list = ['http://www.github.com', 'http://www.baidu.com', 'http://www.bing.com']
    pool = ProcessPoolExecutor(5)
    for url in url_list:
        pool.submit(fetch_async, url)
    pool.shutdown(wait=True)
    
Multi-processing + callback
from concurrent.futures import ProcessPoolExecutor
import requests

def fetch_async(url):
    response = requests.get(url)
    return response

def callback(future):
    print(future.result())

url_list = ['http://www.github.com', 'http://www.baidu.com', 'http://www.bing.com']
if __name__ == '__main__':
    pool = ProcessPoolExecutor(5)
    for url in url_list:
        v = pool.submit(fetch_async, url)
        v.add_done_callback(callback)
    pool.shutdown(wait=True)
    
    
2. The async IO library asyncio. asyncio's programming model is an event loop: obtain a reference to the EventLoop from the asyncio module, submit the coroutines to be executed to that loop, and asynchronous IO is achieved.
Example 1
import asyncio

@asyncio.coroutine
def func1():
    print('before...func1......')
    yield from asyncio.sleep(5)
    print('end...func1......')

@asyncio.coroutine
def func2():
    print('before...func2......')
    yield from asyncio.sleep(5)
    print('end...func2......')

tasks = [func1(), func2()]

loop = asyncio.get_event_loop()
loop.run_until_complete(asyncio.gather(*tasks))
loop.close()
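
Note that the @asyncio.coroutine / yield from style shown above was later deprecated (and removed in Python 3.11); a minimal sketch of the same example in async/await syntax, assuming Python 3.7+ for asyncio.run():

import asyncio

async def func1():
    print('before...func1......')
    await asyncio.sleep(5)
    print('end...func1......')

async def func2():
    print('before...func2......')
    await asyncio.sleep(5)
    print('end...func2......')

async def main():
    # run both coroutines concurrently on the running loop
    await asyncio.gather(func1(), func2())

asyncio.run(main())  # creates, runs, and closes the event loop in one call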

Example 2
import asyncio

@asyncio.coroutine
def fetch_async(host, url='/'):
    print(host, url)
    # open a raw TCP connection and hand-write an HTTP/1.0 request over it
    reader, writer = yield from asyncio.open_connection(host, 80)

    request_header_content = """GET {0} HTTP/1.0\r\nHost: {1}\r\n\r\n""".format(url, host)
    request_header_content = bytes(request_header_content, encoding='utf-8')

    writer.write(request_header_content)
    yield from writer.drain()
    text = yield from reader.read()
    print(host, url, text)
    writer.close()

tasks = [
    fetch_async('www.cnblogs.com', '/wupeiqi/'),
    fetch_async('dig.chouti.com', '/pic/show?nid=4073644713430508&lid=10273091')
]

loop = asyncio.get_event_loop()
results = loop.run_until_complete(asyncio.gather(*tasks))
loop.close()



aiohttp + asyncio

import aiohttp
import asyncio

@asyncio.coroutine
def fetch_async(url):
    print(url)
    response = yield from aiohttp.request('GET', url)
    print(url, response)
    response.close()

tasks = [fetch_async('http://www.baidu.com'), fetch_async('http://www.chouti.com')]
event_loop = asyncio.get_event_loop()
results = event_loop.run_until_complete(asyncio.gather(*tasks))
event_loop.close()
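
The aiohttp.request() usage above matches the early aiohttp releases this article was written against; current aiohttp (3.x) code is normally written with async/await and a ClientSession. A minimal sketch of the same two fetches under that API:

import aiohttp
import asyncio

async def fetch_async(session, url):
    # a session reuses one connection pool across all requests
    async with session.get(url) as response:
        print(url, response.status)
        return await response.text()

async def main():
    async with aiohttp.ClientSession() as session:
        await asyncio.gather(
            fetch_async(session, 'http://www.baidu.com'),
            fetch_async(session, 'http://www.chouti.com'),
        )

asyncio.run(main())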


asyncio + requests
import asyncio
import requests

@asyncio.coroutine
def fetch_async(func, *args):
    loop = asyncio.get_event_loop()
    # run_in_executor hands the blocking requests call off to a thread pool
    future = loop.run_in_executor(None, func, *args)
    response = yield from future
    print(response.url, response.content)

tasks = [
    fetch_async(requests.get, 'http://www.cnblogs.com/wupeiqi/'),
    fetch_async(requests.get, 'http://dig.chouti.com/pic/show?nid=4073644713430508&lid=10273091')
]

loop = asyncio.get_event_loop()
results = loop.run_until_complete(asyncio.gather(*tasks))
loop.close()


gevent + requests
from gevent import monkey

# patch the standard library (sockets etc.) before requests is imported,
# so that its blocking socket calls become cooperative
monkey.patch_all()

import gevent
import requests

def fetch_async(method, url, req_kwargs):
    print(method, url, req_kwargs)
    response = requests.request(method=method, url=url, **req_kwargs)
    print(response.url, response.content)

gevent.joinall([
    gevent.spawn(fetch_async, method='get', url='https://www.python.org/', req_kwargs={}),
    gevent.spawn(fetch_async, method='get', url='https://www.baidu.com/', req_kwargs={}),
    gevent.spawn(fetch_async, method='get', url='https://www.sina.com/', req_kwargs={})
])
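
gevent can also cap how many greenlets run at once via a pool; a minimal sketch reusing the fetch_async above (the pool size of 2 is arbitrary):

from gevent.pool import Pool

pool = Pool(2)  # at most 2 requests in flight at any moment
gevent.joinall([
    pool.spawn(fetch_async, method='get', url='https://www.python.org/', req_kwargs={}),
    pool.spawn(fetch_async, method='get', url='https://www.baidu.com/', req_kwargs={}),
    pool.spawn(fetch_async, method='get', url='https://www.sina.com/', req_kwargs={})
])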


grequests
import grequests

request_list = [
    grequests.get('http://httpbin.org/delay/1', timeout=0.001),
    grequests.get('http://fakedomain/'),
    grequests.get('http://httpbin.org/status/500')
]

# send all requests concurrently; requests that fail come back as None
response_list = grequests.map(request_list)
print(response_list)

twisted
from twisted.web.client import getPage, defer
from twisted.internet import reactor

def all_done(arg):
    reactor.stop()

def callback(contents):
    print(contents)

deferred_list = []
url_list = ['http://www.bing.com', 'http://www.baidu.com']
for url in url_list:
    deferred = getPage(bytes(url, encoding='utf8'))
    deferred.addCallback(callback)
    deferred_list.append(deferred)

# fires all_done once every deferred has either succeeded or failed
dlist = defer.DeferredList(deferred_list)
dlist.addBoth(all_done)

reactor.run()


tornado
from tornado.httpclient import AsyncHTTPClient
from tornado.httpclient import HTTPRequest
from tornado import ioloop

COUNT = 0

def handle_response(response):
    # stop the IOLoop once the last response arrives,
    # otherwise the program would run forever
    global COUNT
    COUNT -= 1
    if response.error:
        print("Error:", response.error)
    else:
        print(response.body)
    if COUNT == 0:
        ioloop.IOLoop.current().stop()

def func():
    global COUNT
    url_list = [
        'http://www.baidu.com',
        'http://www.bing.com',
    ]
    COUNT = len(url_list)
    http_client = AsyncHTTPClient()
    for url in url_list:
        print(url)
        http_client.fetch(HTTPRequest(url), handle_response)

ioloop.IOLoop.current().add_callback(func)
ioloop.IOLoop.current().start()


More twisted: sending a POST request
from twisted.internet import reactor
from twisted.web.client import getPage
import urllib.parse

def one_done(arg):
    print(arg)
    reactor.stop()

post_data = urllib.parse.urlencode({'check_data': 'adf'})
post_data = bytes(post_data, encoding='utf8')
headers = {b'Content-Type': b'application/x-www-form-urlencoded'}
response = getPage(bytes('http://dig.chouti.com/login', encoding='utf8'),
                   method=bytes('POST', encoding='utf8'),
                   postdata=post_data,
                   cookies={},
                   headers=headers)
response.addBoth(one_done)

reactor.run()



All of the above are asynchronous IO request modules, either built into Python or provided by third parties; they are simple to use and greatly improve efficiency. Under the hood, an asynchronous IO request boils down to [non-blocking sockets] + [IO multiplexing].
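
To make that concrete, here is a minimal hand-rolled sketch of [non-blocking sockets] + [IO multiplexing] using only the standard library's socket and select modules (the target hosts and the HTTP/1.0 request line are illustrative, and a real client would keep reading until EOF):

import select
import socket

class AsyncRequest:
    def __init__(self, host):
        self.host = host
        self.sent = False
        self.sock = socket.socket()
        self.sock.setblocking(False)  # non-blocking: calls return immediately
        try:
            self.sock.connect((host, 80))
        except BlockingIOError:
            pass  # expected: the TCP handshake completes in the background

    def fileno(self):
        # allows select() to monitor this object directly
        return self.sock.fileno()

pending = [AsyncRequest('www.baidu.com'), AsyncRequest('www.bing.com')]

while pending:
    # IO multiplexing: a single thread waits on all sockets at once
    rlist, wlist, _ = select.select(pending, pending, [], 1)
    for req in wlist:
        if not req.sent:
            # writable means the connection is established; send the request
            data = 'GET / HTTP/1.0\r\nHost: {0}\r\n\r\n'.format(req.host)
            req.sock.sendall(data.encode('utf-8'))
            req.sent = True
    for req in rlist:
        # readable means (part of) the response has arrived
        chunk = req.sock.recv(8096)
        print(req.host, chunk[:80])
        req.sock.close()
        pending.remove(req)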


This article comes from the "linux技术" blog; please keep the original source: http://haoyonghui.blog.51cto.com/4278020/1975697
