Scrapy 框架中间件，信号，定制命令

时间：2019-02-10 09:35:31 阅读：177 评论：0 收藏：0 [点我收藏+]

标签：war 判断 yield imp line .com mozilla define 自定义

中间件

下载器中间件

写中间件

from scrapy.http import HtmlResponse
from scrapy.http import Request

class Md1(object):
    """Example downloader middleware that logs every request and response.

    Scrapy calls ``process_request`` for each request on its way to the
    downloader, ``process_response`` for each response on its way back, and
    ``process_exception`` when the download handler or another middleware's
    ``process_request`` raises.
    """

    @classmethod
    def from_crawler(cls, crawler):
        """Create the middleware instance; ``crawler`` is not used here."""
        return cls()

    def process_request(self, request, spider):
        """Log the outgoing request and let processing continue.

        Returns:
            None, so the next middleware's ``process_request`` runs.
        """
        print('md1.process_request', request)

        # Other possible outcomes of process_request:
        #
        # 1. Return a Response.
        #    The download is skipped and the response is passed through the
        #    process_response() chain, starting at the last middleware.
        #    import requests
        #    result = requests.get(request.url)
        #    return HtmlResponse(url=request.url, status=200, headers=None, body=result.content)
        #
        # 2. Return a Request.
        #    The current request is dropped and the new one goes back to the
        #    scheduler as a new task.
        #    return Request("https://dig.chouti.com/r/tec/hot/1")
        #
        # 3. Raise an exception.
        #    The process_exception() methods of the installed middlewares
        #    are called to handle it.
        #    from scrapy.exceptions import IgnoreRequest
        #    raise IgnoreRequest
        #
        # 4. Modify the request in place (*).
        #    This is the usual use: adjust the request, then return None so
        #    processing continues.
        #    request.headers["user-agent"] = "Mozilla/5.0 (Windows NT 6.1; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/63.0.3239.132 Safari/537.36"
        #    return None
        return None

    def process_response(self, request, response, spider):
        """Log the response returned by the downloader and pass it on.

        A ``process_response`` implementation must either:
        - return a Response object, which replaces the current response;
        - return a Request object, which is rescheduled as a new task; or
        - raise IgnoreRequest, which hands the request to its errback.
        """
        print('m1.process_response', request, response)
        return response

    def process_exception(self, request, exception, spider):
        """Handle an exception raised while downloading ``request``.

        Called when a download handler or a ``process_request()`` of another
        downloader middleware raises. An implementation must either:
        - return None: keep processing this exception with the remaining
          middlewares (this is what is usually wanted);
        - return a Response object: stops the process_exception() chain and
          starts the process_response() chain with that response; or
        - return a Request object: stops the process_exception() chain and
          reschedules the returned request as a new task.
        """
        return None

配置文件

# Downloader middlewares to enable, mapped to their order value.
DOWNLOADER_MIDDLEWARES = {
    # 'xdb.middlewares.XdbDownloaderMiddleware': 543,
    # 'xdb.proxy.XdbProxyMiddleware': 751,
    'xdb.md.Md1': 666,  # order is in the 0-1000 range; smaller runs first
    'xdb.md.Md2': 667,
}

执行顺序梳理

调度器 给 下载器的时候先走 process_request（从第一个中间件往最后一个走），然后根据返回情况判断接下来的方向
　　返回 None 继续下一个中间件的 process_request
　　返回 Response 进入 最后一个下载中间件的 process_response 流程
　　返回 Request 返回 调度器开启新任务 
　　抛出异常  进入下载中间件的 process_exception 进行异常处理

下载器 还给 爬虫的时候要走 process_response（从最后一个中间件往第一个走），然后根据返回情况判断接下来的方向
　　不能返回 None（process_response 必须返回 Response、Request 或抛出 IgnoreRequest）
　　返回 Response 替换当前 Response，进入上一个下载中间件的 process_response 流程
　　返回 Request 放弃当前的任务，返回调度器开启新任务
　　抛出 IgnoreRequest 异常  交给该请求的 errback 处理

应用场景

- user-agent # 所有的请求都加 user-agent    
    # 其实不需要做，默认自带一个 可以添加 user-agent 的功能
    # 在 settings 中直接配置 USER_AGENT = '...' 就可以实现这个功能
- 代理     # 请求代理操作

爬虫中间件

写中间件

class Sd1(object):
    """Pass-through spider middleware.

    Every hook is optional: when one is not defined, Scrapy behaves as if
    the middleware left the passed objects untouched.
    """

    @classmethod
    def from_crawler(cls, crawler):
        """Create the middleware instance; ``crawler`` is not used here."""
        return cls()

    def process_spider_input(self, response, spider):
        """Hook for each response on its way into the spider.

        Should return None or raise an exception; this one returns None.
        """
        return None

    def process_spider_output(self, response, result, spider):
        """Forward everything the spider produced for ``response``.

        Must yield an iterable of Request, dict or Item objects; here the
        spider's ``result`` is re-emitted unchanged.
        """
        yield from result

    def process_spider_exception(self, response, exception, spider):
        """Hook for exceptions raised by a spider or a process_spider_input().

        Should return None or an iterable of Request, dict or Item objects;
        this one returns None.
        """
        return None

    def process_start_requests(self, start_requests, spider):
        """Forward the spider's start requests unchanged.

        Runs only once, when the spider starts. It works like
        process_spider_output() except that no response is associated, and
        it must yield requests only (not items).
        """
        yield from start_requests

配置文件

# Spider middlewares to enable, mapped to their order value.
SPIDER_MIDDLEWARES = {
    # 'xdb.middlewares.XdbSpiderMiddleware': 543,
    'xdb.sd.Sd1': 666,  # same ordering rule as the downloader middlewares
    'xdb.sd.Sd2': 667,
}

执行流程

1. 第一次启动爬虫文件封装好 request 之后走 process_start_requests 上传给引擎

2. 引擎将封装好的 request 给调度器

3. 调度器继续执行给下载器

4. 下载器下载了内容之后给引擎

5. 引擎再给爬虫文件的时候要走 process_spider_input

6. 爬虫文件处理完之后如果有 yield 就要再走 process_spider_output 给引擎

应用

- 深度

- 优先级

信号

使用框架预留的位置，帮助你自定义一些功能

使用实例

from scrapy import signals

class MyExtend(object):
    """Example extension that prints a message when a spider opens/closes."""

    def __init__(self):
        pass

    @classmethod
    def from_crawler(cls, crawler):
        """Create the extension and connect its handlers to crawler signals.

        Args:
            crawler: object exposing ``signals.connect(receiver, signal=...)``.

        Returns:
            The new extension instance.
        """
        ext = cls()

        # Register the functions to run when each signal fires.
        crawler.signals.connect(ext.x1, signal=signals.spider_opened)
        crawler.signals.connect(ext.x2, signal=signals.spider_closed)

        return ext

    def x1(self, spider):
        """Handler connected to ``spider_opened``."""
        print('open')

    def x2(self, spider):
        """Handler connected to ``spider_closed``."""
        print('close')

# Available signal types, as listed in ``from scrapy import signals``.
# Every signal is its own distinct sentinel object.

# Engine signals
engine_started, engine_stopped = (object() for _ in range(2))

# Spider signals
spider_opened, spider_idle, spider_closed, spider_error = (
    object() for _ in range(4)
)

# Request / response signals
request_scheduled, request_dropped, response_received, response_downloaded = (
    object() for _ in range(4)
)

# Item signals
item_scraped, item_dropped = (object() for _ in range(2))

# settings.py 

# Extensions to enable, mapped to their order value.
EXTENSIONS = {
    'xdb.ext.MyExtend': 666,
}

定制命令

单爬虫运行

import sys
from scrapy.cmdline import execute

if __name__ == '__main__':
    # Same as running ``scrapy crawl chouti --nolog`` from the command line.
    execute(["scrapy", "crawl", "chouti", "--nolog"])

所有爬虫

- 在spiders同级创建任意目录，如：commands
- 在其中创建 crawlall.py 文件 （此处文件名就是自定义的命令）
- 在 settings.py 中添加配置 COMMANDS_MODULE = '项目名称.目录名称'
- 在项目目录执行命令：scrapy crawlall

# crawlall.py

from scrapy.commands import ScrapyCommand
from scrapy.utils.project import get_project_settings


class Command(ScrapyCommand):
    """Custom ``scrapy crawlall`` command that runs every spider in the project."""

    # The command only makes sense inside a Scrapy project.
    requires_project = True

    def syntax(self):
        """Return the argument synopsis for this command."""
        return '[options]'

    def short_desc(self):
        """Return the one-line description of this command."""
        return 'Runs all of the spiders'

    def run(self, args, opts):
        """Schedule every spider known to the project, then start crawling.

        Args:
            args: positional command-line arguments (unused).
            opts: parsed command-line options; all of them are forwarded to
                each spider as keyword arguments.
        """
        process = self.crawler_process
        # ``spiders`` is the legacy name of the spider loader; prefer
        # ``spider_loader`` and fall back only when it is missing.
        loader = getattr(process, "spider_loader", None)
        if loader is None:
            loader = process.spiders
        for name in loader.list():
            process.crawl(name, **vars(opts))
        # Start once, after all spiders are scheduled.
        process.start()

# settings.py

# Dotted path "<project name>.<directory name>" of the package that holds
# the custom command modules (here the ``commands`` directory with crawlall.py).
COMMANDS_MODULE  = "xdb.commands"

Scrapy 框架中间件，信号，定制命令

标签：war 判断 yield imp line .com mozilla define 自定义

原文地址：https://www.cnblogs.com/shijieli/p/10358611.html

踩

(0)

评论一句话评论（0）

分享档案

更多>

2021年07月29日 (22)
2021年07月28日 (40)
2021年07月27日 (32)
2021年07月26日 (79)
2021年07月23日 (29)
2021年07月22日 (30)
2021年07月21日 (42)
2021年07月20日 (16)
2021年07月19日 (90)
2021年07月16日 (35)

周排行

Scrapy 框架 中间件，信号，定制命令

中间件

下载器中间件

写中间件

配置文件

执行顺序梳理

应用场景

爬虫中间件

写中间件

配置文件

执行流程

应用

信号

定制命令

单爬虫运行

所有爬虫

Scrapy 框架中间件，信号，定制命令