标签:crawl false pytho 允许 管道 pid The art lse
movie.py
import scrapy
from scrapy.linkextractors import LinkExtractor
from scrapy.spiders import CrawlSpider, Rule
from zlsPro.items import ZlsproItem
from redis import Redis
class MovieSpider(CrawlSpider):
    """Incremental crawler for the movie listing pages of www.4567kan.com.

    Listing pages are discovered through ``rules``. Every movie link found
    on a listing page is added to the Redis set ``movie_url``; only links
    that were not in the set yet are followed to their detail page, so
    repeated runs only scrape newly published entries.
    """

    name = 'movie'
    start_urls = ['https://www.4567kan.com/index.php/vod/show/id/1.html']
    # Follow every pagination link matching the pattern and hand each
    # listing page to ``parse_item``.
    rules = (
        Rule(
            LinkExtractor(allow=r'/index\.php/vod/show/id/1/page/\d+\.html'),
            callback='parse_item',
            follow=True,
        ),
    )
    # Redis client holding the "movie_url" set of already-seen detail URLs.
    coon = Redis(host='127.0.0.1', port=6379)

    def parse_item(self, response):
        """Parse one listing page and request the detail page of new movies.

        Args:
            response: The downloaded listing page.

        Yields:
            scrapy.Request: One request per movie URL that was not yet in
            the ``movie_url`` Redis set. The partially filled item travels
            along in ``meta['item']``.
        """
        for li in response.xpath('//div[1]/div/div/div/div[2]/ul/li'):
            title = li.xpath('./div/div/h4/a/text()').extract_first()
            path = li.xpath('./div/div/h4/a/@href').extract_first()
            if title is None or path is None:
                # An <li> without the expected title link (e.g. an ad or a
                # layout change) must not abort the rest of the page.
                continue

            # Plain prefix concatenation is kept so the URLs stay identical
            # to the ones already stored in the Redis set.
            href = 'https://www.4567kan.com' + path

            # SADD returns 1 when the member was newly added, 0 otherwise.
            # NOTE(review): the URL is marked as seen before its detail page
            # has been scraped; a failed detail request is not retried on
            # the next run - confirm this is acceptable.
            if self.coon.sadd('movie_url', href) == 1:
                print('有新增')
                item = ZlsproItem()
                item['title'] = title
                item['href'] = href
                yield scrapy.Request(
                    url=href,
                    callback=self.parse_href,
                    meta={'item': item},
                )
            else:
                print('暂无新增')

    def parse_href(self, response):
        """Complete the item with the text extracted from the detail page.

        Args:
            response: The downloaded detail page; ``response.meta['item']``
                carries the item started in ``parse_item``.

        Yields:
            ZlsproItem: The item with ``detail`` set (``None`` when the
            XPath matches nothing).
        """
        detail = response.xpath(
            '/html/body/div[1]/div/div/div/div[2]/p[5]/span[2]/text()'
        ).extract_first()
        item = response.meta['item']
        item['detail'] = detail
        yield item
settings.py
# Browser-like User-Agent sent with every request.
USER_AGENT = 'Mozilla/5.0 (Macintosh; Intel Mac OS X 10_12_0) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/68.0.3440.106 Safari/537.36'
# Do not consult robots.txt before crawling.
ROBOTSTXT_OBEY = False
# Only log messages of level ERROR and above.
LOG_LEVEL = 'ERROR'
# Use the duplicate filter provided by scrapy-redis.
DUPEFILTER_CLASS = "scrapy_redis.dupefilter.RFPDupeFilter"
# Use the scheduler provided by scrapy-redis.
SCHEDULER = "scrapy_redis.scheduler.Scheduler"
# Whether the crawl may be paused (scheduler state is persisted).
SCHEDULER_PERSIST = True
# Item pipelines: store scraped items through the scrapy-redis pipeline.
ITEM_PIPELINES = {
    'scrapy_redis.pipelines.RedisPipeline': 400,
}
# Redis server used by the scrapy-redis components.
REDIS_HOST = '127.0.0.1'
REDIS_PORT = 6379
items.py
import scrapy
class ZlsproItem(scrapy.Item):
    """Container for one scraped movie entry."""

    # Text of the movie's title link on the listing page.
    title = scrapy.Field()
    # Absolute URL of the movie's detail page.
    href = scrapy.Field()
    # Text extracted from the detail page.
    detail = scrapy.Field()
运行项目:scrapy crawl movie
标签:crawl false pytho 允许 管道 pid The art lse
原文地址:https://www.cnblogs.com/shiyi525/p/14286167.html