Problems left over from the previous post:
https://www.cnblogs.com/Alexephor/p/11432195.html
- The file/database connection could not be opened once when the spider started and closed once when the spider shut down
- The division of work was unclear

The main shortcoming of the previous post is that the spider's parse method opened and closed the file on every scraped item, and the responsibilities within the spider logic were not clearly separated.

How to fix it:
1. Open the file (or database connection) once when the spider starts, and close it once crawling finishes.
2. Hand the scraped data over to process_item for processing.
Chouti.py
# -*- coding: utf-8 -*-
import scrapy
from scrapy.http import Request

from wyb.items import WybItem


class ChoutiSpider(scrapy.Spider):
    name = 'chouti'
    # Only crawl pages under this domain
    allowed_domains = ['chouti.com']
    start_urls = ['http://chouti.com/']

    def parse(self, response):
        # print(response, type(response))   # response is an HtmlResponse
        # print(response.text)
        item_list = response.xpath('//div[@id="content-list"]/div[@class="item"]')
        for item in item_list:
            text = item.xpath('.//a/text()').extract_first()
            href = item.xpath('.//a/@href').extract_first()
            # Every yielded item is handed to the pipelines' process_item
            yield WybItem(text=text, href=href)
        page_list = response.xpath('//div[@id="dig_lcpage"]//a/@href').extract()
        for page in page_list:
            page = "https://dig.chouti.com" + page
            # Keep requesting the next pages, with parse as the callback
            yield Request(url=page, callback=self.parse)
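Assuming the standard layout created by scrapy startproject wyb, the spider can be started from the project root with:

    scrapy crawl chouti --nolog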
pipelines.py
# -*- coding: utf-8 -*-

# Define your item pipelines here
#
# Don't forget to add your pipeline to the ITEM_PIPELINES setting
# See: https://docs.scrapy.org/en/latest/topics/item-pipeline.html

from .settings import HREF_FILE_PATH  # only used by the commented-out alternative below
from scrapy.exceptions import DropItem


class WybPipeline(object):
    def __init__(self, path):
        self.f = None
        self.path = path

    @classmethod
    def from_crawler(cls, crawler):
        """
        Called when the pipeline object is created; reads the path from settings.
        :param crawler:
        :return:
        """
        path = crawler.settings.get('HREF_FILE_PATH')
        return cls(path)

    def open_spider(self, spider):
        """
        Called once when the spider starts.
        :param spider:
        :return:
        """
        # if spider.name == "chouti":
        #     self.f = open(HREF_FILE_PATH, 'a+')
        self.f = open(self.path, 'a+')

    def process_item(self, item, spider):
        # item is what the spider yielded
        # spider is the current ChoutiSpider instance
        # f = open('news.log', 'a+')
        # f.write(item['href'])
        # f.close()
        self.f.write(item['href'] + '\n')
        # return item  # would pass the item on to the next pipeline's process_item
        raise DropItem()  # later pipelines' process_item will no longer run

    def close_spider(self, spider):
        """
        Called once when the spider closes.
        :param spider:
        :return:
        """
        self.f.close()


class DBPipeline(object):
    def __init__(self, path):
        self.f = None
        self.path = path

    @classmethod
    def from_crawler(cls, crawler):
        """
        Called when the pipeline object is created; reads the path from settings.
        :param crawler:
        :return:
        """
        path = crawler.settings.get('HREF_DB_PATH')
        return cls(path)

    def open_spider(self, spider):
        """
        Called once when the spider starts.
        :param spider:
        :return:
        """
        # self.f = open(HREF_DB_PATH, 'a+')
        self.f = open(self.path, 'a+')

    def process_item(self, item, spider):
        # item is what the spider yielded
        # spider is the current ChoutiSpider instance
        self.f.write(item['href'] + '\n')
        return item

    def close_spider(self, spider):
        """
        Called once when the spider closes.
        :param spider:
        :return:
        """
        self.f.close()
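Note that DBPipeline above is a database pipeline in name only: it still appends to a flat file (db.log). As a rough sketch of what a genuinely database-backed pipeline could look like with the same five-method lifecycle, here is a version using Python's built-in sqlite3. The setting name HREF_SQLITE_PATH, the default file name links.sqlite3 and the table links are illustrative assumptions, not part of the original project.

    import sqlite3


    class SqliteDBPipeline(object):
        """Sketch only: persists item['href'] into a SQLite table."""

        def __init__(self, path):
            self.conn = None
            self.path = path

        @classmethod
        def from_crawler(cls, crawler):
            # HREF_SQLITE_PATH is a hypothetical setting, not in the original settings.py
            return cls(crawler.settings.get('HREF_SQLITE_PATH', 'links.sqlite3'))

        def open_spider(self, spider):
            # Open the connection once, when the spider starts
            self.conn = sqlite3.connect(self.path)
            self.conn.execute('CREATE TABLE IF NOT EXISTS links (href TEXT)')

        def process_item(self, item, spider):
            self.conn.execute('INSERT INTO links (href) VALUES (?)', (item['href'],))
            return item

        def close_spider(self, spider):
            # Commit and close once, when the spider shuts down
            self.conn.commit()
            self.conn.close()

Like the other two pipelines, it would only take effect after being registered in ITEM_PIPELINES.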
items.py
# -*- coding: utf-8 -*-

# Define here the models for your scraped items
#
# See documentation in:
# https://docs.scrapy.org/en/latest/topics/items.html

import scrapy


class WybItem(scrapy.Item):
    # define the fields for your item here like:
    # name = scrapy.Field()
    title = scrapy.Field()
    text = scrapy.Field()   # the spider yields WybItem(text=..., href=...), so 'text' must be declared
    href = scrapy.Field()
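For reference, scrapy.Item instances behave like dicts limited to the declared fields; a quick illustration with made-up values:

    item = WybItem(text='some headline', href='https://dig.chouti.com/link/123')
    print(item['href'])   # dict-style access, exactly what process_item uses
    # item['foo'] = 1     # would raise KeyError: 'foo' is not a declared Field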
settings.py
ITEM_PIPELINES = {
    'wyb.pipelines.WybPipeline': 300,
    'wyb.pipelines.DBPipeline': 301,
    # priority range 0-1000; the smaller the number, the higher the priority
}


# output paths
HREF_FILE_PATH = 'news.log'
HREF_DB_PATH = 'db.log'
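One consequence of this configuration, given the pipelines above: WybPipeline (priority 300) runs first, and because its process_item raises DropItem, DBPipeline (priority 301) never receives any items, so db.log stays empty unless that DropItem is replaced with return item.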
Summary:
- Persistence: pipelines / items
  a. Write the pipeline class first
         class WybPipeline(object):
             def process_item(self, item, spider):
                 print(item)
                 # item is what the spider yielded
                 # spider is the current ChoutiSpider instance
                 return item
  b. Write the Item class
         class WybItem(scrapy.Item):
             # define the fields for your item here like:
             # name = scrapy.Field()
             title = scrapy.Field()
             href = scrapy.Field()
  c. Register it in the settings
         ITEM_PIPELINES = {
             'wyb.pipelines.WybPipeline': 300,
             # priority 0-1000; the smaller the number, the higher the priority
         }
  d. In the spider, every yield of an item triggers one call to process_item
         yield WybItem(text=text, href=href)
A closer look at the pipelines file
The flow in the Scrapy source:
1. Check whether the WybPipeline class defines from_crawler
   If it does:    obj = WybPipeline.from_crawler(crawler)
   If it doesn't: obj = WybPipeline()
2. obj.open_spider(spider)
3. obj.process_item(item, spider), called once for every yielded item
4. obj.close_spider(spider)
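To make that order concrete, here is a minimal illustration of the call sequence (a simplified sketch, not Scrapy's actual source; run_pipeline and its arguments are made up for demonstration):

    def run_pipeline(pipeline_cls, crawler, spider, items):
        # 1. Prefer from_crawler() when the class defines one
        if hasattr(pipeline_cls, 'from_crawler'):
            obj = pipeline_cls.from_crawler(crawler)
        else:
            obj = pipeline_cls()
        obj.open_spider(spider)            # 2. once, when the spider starts
        for item in items:                 # 3. once per yielded item
            obj.process_item(item, spider)
        obj.close_spider(spider)           # 4. once, when the spider closes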
From this we can see that a Pipeline class is defined with five methods:
__init__(self, path)

@classmethod
from_crawler(cls, crawler)

open_spider(self, spider)

process_item(self, item, spider)

close_spider(self, spider)
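Put together, a new pipeline can start from a skeleton like the following (MyPipeline is just a placeholder name; it reads its path from settings the same way as the examples above):

    class MyPipeline(object):
        def __init__(self, path):
            self.path = path

        @classmethod
        def from_crawler(cls, crawler):
            # Pull configuration out of settings when the pipeline object is created
            return cls(crawler.settings.get('HREF_FILE_PATH'))

        def open_spider(self, spider):
            # Open files / connections here, once per crawl
            pass

        def process_item(self, item, spider):
            # Persist the item, then either return it to the next pipeline
            # or raise scrapy.exceptions.DropItem() to stop further processing
            return item

        def close_spider(self, spider):
            # Close files / connections here, once per crawl
            pass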
Original article: https://www.cnblogs.com/Alexephor/p/11436726.html