码迷,mamicode.com
首页 > 编程语言 > 详细

scrapy主动退出爬虫的代码片段(python3)

时间:2019-01-16 19:17:25      阅读:182      评论:0      收藏:0      [点我收藏+]

标签:.com   更改   bsp   cti   sub   lin   文件的   pass   写法   

问题:在运行scrapy的过程中,如果想主动退出该怎么做?

背景:比如说我只要爬取当日的新闻,那么在遍历的时候,如果出现了超过1条不是当日的新闻,那么就不爬取了,就主动退出爬虫,这个时候该怎么做呢?

IDE:pycharm

版本:python3

框架:scrapy

系统:windows10

代码如下:

# -*- coding: utf-8 -*-
import scrapy
from torrentSpider.items.NavigationItem import NavigationItem
from torrentSpider.items.TorrentItem import TorrentItem
import time
import random
import logging
import os


class XxxSpider(scrapy.Spider):
    """Spider that crawls listing pages and closes itself on stale entries.

    ``parse`` walks a listing page, schedules one request per detail link and
    follows the pagination link.  ``parse_links`` fills one ``TorrentItem``
    per detail page and yields it only when its update time equals today's
    date; otherwise it increments ``count`` and, once ``count`` is greater
    than 1, asks the engine to close the spider.
    """

    name = "xxx_spider"
    allowed_domains = ['www.xxx.com']
    start_urls = ['http://www.xxx.com/1.html']

    # Site prefix prepended to the relative links extracted from pages.
    web_pre_url = 'http://xxx.com'
    # Number of detail pages seen whose update time is not today's date.
    count = 0

    # (item field, XPath) pairs read from a detail page, in assignment order.
    _TORRENT_FIELD_XPATHS = (
        # Title
        ('torrent_title', '/html/body/div[2]/table[1]/tbody/tr/td/font/text()'),
        # Movie name
        ('torrent_name', '/html/body/div[2]/table[2]/tbody/tr/td/div[1]/font[1]/text()'),
        # Director
        ('torrent_director', '/html/body/div[2]/table[2]/tbody/tr/td/div[1]/font[2]/text()'),
        # Actors
        ('torrent_actor', '/html/body/div[2]/table[2]/tbody/tr/td/div[1]/span/font[2]/text()'),
        # Language
        ('torrent_language', '/html/body/div[2]/table[2]/tbody/tr/td/div[1]/font[3]/text()'),
        # Movie type
        ('torrent_type', '/html/body/div[2]/table[2]/tbody/tr/td/div[1]/font[4]/text()'),
        # Region
        ('torrent_region', '/html/body/div[2]/table[2]/tbody/tr/td/div[1]/font[5]/text()'),
        # Update time
        ('torrent_update_time', '/html/body/div[2]/table[2]/tbody/tr/td/div[1]/font[6]/text()'),
        # Status
        ('torrent_status', '/html/body/div[2]/table[2]/tbody/tr/td/div[1]/font[7]/text()'),
        # Release date
        ('torrent_show_time', '/html/body/div[2]/table[2]/tbody/tr/td/div[1]/font[8]/text()'),
        # Plot introduction
        ('torrent_introduction', '/html/body/div[2]/table[2]/tbody/tr/td/div[2]/text()'),
        # Download address
        ('torrent_url', '//*[@id="plist"]/table[2]/tbody/tr[2]/td/ul/li/input/@value'),
    )

    def parse(self, response):
        """Parse a listing page.

        Yields one ``scrapy.Request`` per detail link (handled by
        ``parse_links``) and, when a pagination link is found, one request
        for the next listing page (handled by ``parse`` again).
        """
        # Random delay of 0-5 seconds before processing each listing page.
        # NOTE(review): time.sleep blocks while it runs; Scrapy's
        # DOWNLOAD_DELAY setting is the usual way to throttle - confirm
        # before changing.
        time.sleep(random.randint(0, 5))

        # Navigation bar entries.
        # NOTE(review): the items built in the two navigation loops are never
        # yielded, so they never leave this method - confirm whether they
        # should be.
        navigation_type_number = response.xpath('//*[@id="hypoNav"]/div/ul/li/em/a/text()').extract()
        for n_k in range(1, len(navigation_type_number)):
            navigation_item = NavigationItem()
            # Site title
            navigation_item['navigation_title'] = response.xpath('//*[@id="logoSea"]/div[1]/a/img/@alt').extract()[0]
            # Navigation category name (li[n_k + 1], so the first <li> is skipped)
            navigation_item['navigation_type'] = response.xpath('//*[@id="hypoNav"]/div/ul/li[' + str(n_k + 1) + ']/em/a/text()').extract()[0]
            # Navigation link
            navigation_item['navigation_url'] = response.xpath('//*[@id="hypoNav"]/div/ul/li[' + str(n_k + 1) + ']/em/a/@href').extract()[0]

        # Sub-navigation bar entries.
        sub_navigation_type_number = response.xpath('//*[@id="nodeNav"]/div/ul/li/em/a/span/text()').extract()
        for sub_k in range(1, len(sub_navigation_type_number)):
            sub_navigation_item = NavigationItem()
            # Site title
            sub_navigation_item['navigation_title'] = response.xpath('//*[@id="logoSea"]/div[1]/a/img/@alt').extract()[0]
            # Sub-navigation category name
            sub_navigation_item['sub_navigation_type'] = response.xpath('//*[@id="nodeNav"]/div/ul/li[' + str(sub_k) + ']/em/a/span/text()').extract()[0]
            # Sub-navigation link
            sub_navigation_item['sub_navigation_url'] = response.xpath('//*[@id="nodeNav"]/div/ul/li[' + str(sub_k) + ']/em/a/@href').extract()[0]

        # Rows of the movie table on this page.
        # NOTE(review): range(1, len(...)) visits tr[1]..tr[len - 1], so the
        # last matched row is not requested - confirm this is intended.
        movie_name_tr_array = response.xpath('/html/body/div[2]/table[1]/tr/td[1]/table[2]/tbody/tr').extract()
        for i_k in range(1, len(movie_name_tr_array)):
            # Relative link to the detail page of row i_k.
            str_sub_url = '/html/body/div[2]/table[1]/tr/td[1]/table[2]/tbody/tr[' + str(i_k) + ']/td[1]/a/@href'
            m_link = self.web_pre_url + response.xpath(str_sub_url).extract()[0]
            yield scrapy.Request(url=m_link, callback=self.parse_links, dont_filter=True)

        # Follow the pagination link: the only match when there is one,
        # otherwise the second match.
        next_link = response.xpath('//*[@class="pagegbk"]/@href').extract()
        if next_link:
            if len(next_link) == 1:
                next_link = next_link[0]
            else:
                next_link = next_link[1]
            yield scrapy.Request(self.web_pre_url + next_link, callback=self.parse)

    def parse_links(self, response):
        """Parse a detail page into a ``TorrentItem``.

        Yields the item only when its ``torrent_update_time`` equals today's
        date (``%Y-%m-%d``).  Otherwise ``count`` is incremented and, once it
        is greater than 1, the engine is asked to close the spider.
        """
        torrent_item = TorrentItem()
        for field_name, field_xpath in self._TORRENT_FIELD_XPATHS:
            torrent_item[field_name] = self.check_xpath_value(response, field_xpath)

        # Today's date, formatted like the update time it is compared with.
        current_date = time.strftime('%Y-%m-%d', time.localtime())
        print('current_date = %s' % str(current_date))
        print('torrent_update_time = %s' % torrent_item['torrent_update_time'])

        if torrent_item['torrent_update_time'] == str(current_date):
            yield torrent_item
        else:
            # Not updated today: skip the item and count it.
            self.count = self.count + 1
            # Stop once more than one stale entry has been seen.  The reason
            # text below mentions 10, but the threshold in code is 1; the
            # string is kept exactly as in the original.
            if self.count > 1:
                self.crawler.engine.close_spider(self, '计数超过10,停止爬虫!')

    @staticmethod
    def check_xpath_value(response, xpath_url):
        """Return the first match of ``xpath_url``, or ``"null"``.

        ``"null"`` is returned when the XPath matches nothing or when the
        first match is empty after stripping whitespace.  A non-empty first
        match is returned as extracted, without stripping.
        """
        xpath_value = response.xpath(xpath_url).extract()
        if xpath_value and xpath_value[0].strip() != '':
            return xpath_value[0]
        return "null"

注意以上代码中标红的地方:

self.crawler.engine.close_spider(self, '计数超过10,停止爬虫!')

1,此行代码是写在spider文件中的

2,虽然这一行代码会停止爬虫,但是这一行代码的停止并不是立即停止

原因是:当我们不更改爬虫的 settings.py 文件时,默认配置是:

# Configure maximum concurrent requests performed by Scrapy (default: 16)
# CONCURRENT_REQUESTS = 32

含义就是:Scrapy downloader 并发请求(concurrent requests)的最大值,默认: 16

那么这个时候的问题来了,按照以上的写法,在队列里就已经有十几个请求了,你停止之后,这十几个请求依旧会执行下去,所以并不是立即停止,如果想改变的话,就必须改变此项配置,设为:

CONCURRENT_REQUESTS = 1

 

具体scrapy爬虫原理请自行百度,并请自行调试,谢谢~

 

scrapy主动退出爬虫的代码片段(python3)

标签:.com   更改   bsp   cti   sub   lin   文件的   pass   写法   

原文地址:https://www.cnblogs.com/huangtao1927/p/10278501.html

(0)
(0)
   
举报
评论 一句话评论(0
登录后才能评论!
© 2014 mamicode.com 版权所有  联系我们:gaon5@hotmail.com
迷上了代码!