标签:递归 直接 程序 page 实例 粘贴 odi ini font
# 新建py文件:duplication.py
# 我们新建了一个文件,专门用来去重。在scrapy源码中已经把结构写好了,我们只需复制粘贴过来
from scrapy.dupefilter import BaseDupeFilter
'''
class BaseDupeFilter(object):

    @classmethod
    def from_settings(cls, settings):
        return cls()

    def request_seen(self, request):
        return False

    def open(self):  # can return deferred
        pass

    def close(self, reason):  # can return a deferred
        pass

    def log(self, request, spider):  # log that a request has been filtered
        pass
'''
# 可以看到,以上就是scrapy中BaseDupeFilter这个类,框架结构帮我们搭好了,因此我们只需要自定制一下即可
class DupeFilter(object):
    """Custom request duplicate filter that remembers every URL it has seen.

    Exposes the same methods as the BaseDupeFilter skeleton (from_settings,
    request_seen, open, close, log) so the framework can drive it without
    the caller ever instantiating it by hand.
    """

    def __init__(self):
        # URLs that have already been passed through request_seen().
        self.urls = set()

    @classmethod
    def from_settings(cls, settings):
        """Alternate constructor; `settings` is accepted but not used."""
        return cls()

    def request_seen(self, request):
        """Report whether `request.url` has been seen before.

        Returns True when the URL is already recorded (the request is a
        duplicate and need not be crawled again). Otherwise records the URL
        and returns False, meaning it has not been crawled yet.
        """
        if request.url in self.urls:
            return True
        self.urls.add(request.url)
        return False

    def open(self):
        """Hook called when filtering starts; nothing to set up."""
        pass

    def close(self, reason):
        """Hook called when filtering ends; nothing to tear down."""
        pass

    def log(self, request, spider):
        """Hook for logging a filtered request; intentionally a no-op."""
        pass
# 可以看到@classmethod下的类方法,直接返回cls(),这在scrapy中非常常见,因此我们不用实例化
# scrapy会自动地调用这个方法,生成一个实例对象,因此我们只需要写好相应的结构即可
主程序:
# -*- coding: utf-8 -*-
import scrapy
from scrapy.http import Request
class GetChoutiSpider(scrapy.Spider):
    """Spider that follows the pagination links of dig.chouti.com.

    Every response is handed back to parse(), so the crawl recurses through
    the pager links; suppressing repeated URLs is left to the duplicate
    filter rather than done inside the spider.
    """

    name = 'get_chouti'
    allowed_domains = ['chouti.com']
    start_urls = ['https://dig.chouti.com/']

    # parse() runs again for every followed page, so a hand-rolled "seen"
    # set could not be defined inside parse(); it would have to live here:
    #     md5_urls = set()
    # Collecting URLs in such a set was our own home-made approach; Scrapy
    # already provides a better de-duplication mechanism.

    def parse(self, response):
        """Print the visited URL and schedule every pagination link.

        Yields one Request per href found under the element whose id is
        "dig_lcpage", each calling back into this same method.
        """
        # The output shows that duplicates really are filtered out:
        # each URL is printed only once.
        print(response.url)
        # Sample output of one run:
        #   https://dig.chouti.com/
        #   https://dig.chouti.com/all/hot/recent/2
        #   https://dig.chouti.com/all/hot/recent/10
        #   https://dig.chouti.com/all/hot/recent/8
        #   https://dig.chouti.com/all/hot/recent/6
        #   https://dig.chouti.com/all/hot/recent/9
        #   https://dig.chouti.com/all/hot/recent/4
        #   https://dig.chouti.com/all/hot/recent/5
        #   https://dig.chouti.com/all/hot/recent/7
        #   https://dig.chouti.com/all/hot/recent/3
        #   https://dig.chouti.com/all/hot/recent/1
        #   https://dig.chouti.com/all/hot/recent/11
        #   https://dig.chouti.com/all/hot/recent/12
        #   https://dig.chouti.com/all/hot/recent/14
        #   https://dig.chouti.com/all/hot/recent/13
        #   https://dig.chouti.com/all/hot/recent/18
        #   https://dig.chouti.com/all/hot/recent/16
        #   https://dig.chouti.com/all/hot/recent/17
        #   https://dig.chouti.com/all/hot/recent/15
        #   https://dig.chouti.com/all/hot/recent/19
        #   https://dig.chouti.com/all/hot/recent/20
        #   https://dig.chouti.com/all/hot/recent/21
        #   https://dig.chouti.com/all/hot/recent/23
        #   https://dig.chouti.com/all/hot/recent/25
        #   https://dig.chouti.com/all/hot/recent/24
        #   https://dig.chouti.com/all/hot/recent/27
        #   https://dig.chouti.com/all/hot/recent/29
        #   https://dig.chouti.com/all/hot/recent/26
        #   https://dig.chouti.com/all/hot/recent/28
        #   https://dig.chouti.com/all/hot/recent/22
        #   https://dig.chouti.com/all/hot/recent/30
        #   https://dig.chouti.com/all/hot/recent/33
        #   https://dig.chouti.com/all/hot/recent/31
        #   https://dig.chouti.com/all/hot/recent/32
        #   https://dig.chouti.com/all/hot/recent/34
        #   https://dig.chouti.com/all/hot/recent/37
        #   https://dig.chouti.com/all/hot/recent/36
        #   https://dig.chouti.com/all/hot/recent/41
        #   https://dig.chouti.com/all/hot/recent/38
        #   https://dig.chouti.com/all/hot/recent/40
        #   https://dig.chouti.com/all/hot/recent/39
        #   https://dig.chouti.com/all/hot/recent/45
        #   https://dig.chouti.com/all/hot/recent/42
        #   https://dig.chouti.com/all/hot/recent/44
        #   https://dig.chouti.com/all/hot/recent/43
        #   https://dig.chouti.com/all/hot/recent/49
        #   https://dig.chouti.com/all/hot/recent/47
        #   https://dig.chouti.com/all/hot/recent/46
        #   https://dig.chouti.com/all/hot/recent/48
        #   https://dig.chouti.com/all/hot/recent/50
        #   https://dig.chouti.com/all/hot/recent/53
        #   https://dig.chouti.com/all/hot/recent/51
        #   https://dig.chouti.com/all/hot/recent/52
        #   https://dig.chouti.com/all/hot/recent/56
        #   https://dig.chouti.com/all/hot/recent/57
        #   https://dig.chouti.com/all/hot/recent/55
        #   https://dig.chouti.com/all/hot/recent/35
        #   https://dig.chouti.com/all/hot/recent/54
        #   https://dig.chouti.com/all/hot/recent/59
        #   https://dig.chouti.com/all/hot/recent/60
        #   https://dig.chouti.com/all/hot/recent/61
        #   https://dig.chouti.com/all/hot/recent/58
        #   https://dig.chouti.com/all/hot/recent/62
        #   https://dig.chouti.com/all/hot/recent/63
        #   https://dig.chouti.com/all/hot/recent/64
        #   https://dig.chouti.com/all/hot/recent/65
        #   https://dig.chouti.com/all/hot/recent/66
        #   https://dig.chouti.com/all/hot/recent/67
        #   https://dig.chouti.com/all/hot/recent/68
        #   https://dig.chouti.com/all/hot/recent/69
        #   https://dig.chouti.com/all/hot/recent/70
        #   https://dig.chouti.com/all/hot/recent/71
        #   https://dig.chouti.com/all/hot/recent/73
        #   https://dig.chouti.com/all/hot/recent/72
        #   https://dig.chouti.com/all/hot/recent/74
        #   https://dig.chouti.com/all/hot/recent/76
        #   https://dig.chouti.com/all/hot/recent/75
        #   https://dig.chouti.com/all/hot/recent/77
        #   https://dig.chouti.com/all/hot/recent/78
        #   https://dig.chouti.com/all/hot/recent/79
        #   https://dig.chouti.com/all/hot/recent/80
        #   https://dig.chouti.com/all/hot/recent/81
        #   https://dig.chouti.com/all/hot/recent/82
        #   https://dig.chouti.com/all/hot/recent/83
        #   https://dig.chouti.com/all/hot/recent/84
        #   https://dig.chouti.com/all/hot/recent/85
        #   https://dig.chouti.com/all/hot/recent/86
        #   https://dig.chouti.com/all/hot/recent/87
        #   https://dig.chouti.com/all/hot/recent/88
        #   https://dig.chouti.com/all/hot/recent/89
        #   https://dig.chouti.com/all/hot/recent/90
        #   https://dig.chouti.com/all/hot/recent/92
        #   https://dig.chouti.com/all/hot/recent/91
        #   https://dig.chouti.com/all/hot/recent/93
        #   https://dig.chouti.com/all/hot/recent/94
        #   https://dig.chouti.com/all/hot/recent/97
        #   https://dig.chouti.com/all/hot/recent/95
        #   https://dig.chouti.com/all/hot/recent/96
        #   https://dig.chouti.com/all/hot/recent/98
        #   https://dig.chouti.com/all/hot/recent/99
        #   https://dig.chouti.com/all/hot/recent/100
        #   https://dig.chouti.com/all/hot/recent/101
        #   https://dig.chouti.com/all/hot/recent/102
        #   https://dig.chouti.com/all/hot/recent/103
        #   https://dig.chouti.com/all/hot/recent/104
        #   https://dig.chouti.com/all/hot/recent/105
        #   https://dig.chouti.com/all/hot/recent/108
        #   https://dig.chouti.com/all/hot/recent/106
        #   https://dig.chouti.com/all/hot/recent/107
        #   https://dig.chouti.com/all/hot/recent/109
        #   https://dig.chouti.com/all/hot/recent/111
        #   https://dig.chouti.com/all/hot/recent/110
        #   https://dig.chouti.com/all/hot/recent/112
        #   https://dig.chouti.com/all/hot/recent/113
        #   https://dig.chouti.com/all/hot/recent/114
        #   https://dig.chouti.com/all/hot/recent/115
        #   https://dig.chouti.com/all/hot/recent/116
        #   https://dig.chouti.com/all/hot/recent/117
        #   https://dig.chouti.com/all/hot/recent/120
        #   https://dig.chouti.com/all/hot/recent/118
        #   https://dig.chouti.com/all/hot/recent/119

        # How are duplicates removed? By a separate class defined in its own
        # file, so none of the earlier manual bookkeeping is needed here.
        hrefs = response.xpath('//div[@id="dig_lcpage"]//a/@href').extract()
        for href in hrefs:
            # urljoin resolves the href against the page URL, giving the same
            # result as prefixing the site root for root-relative links while
            # also coping with absolute or page-relative hrefs.
            yield Request(url=response.urljoin(href), callback=self.parse)
配置文件:
# 0 imposes no limit on crawl depth.
DEPTH_LIMIT = 0

# The filter class must be named in the settings file; only then is the
# custom class used for duplicate filtering.
DUPEFILTER_CLASS = 'chouti.duplication.DupeFilter'
标签:递归 直接 程序 page 实例 粘贴 odi ini font
原文地址:https://www.cnblogs.com/traditional/p/9257702.html