Scrapy 框架，持久化文件相关

时间：2019-02-10 09:26:41 阅读：220 评论：0 收藏：0 [点我收藏+]

持久化相关

持久化流程

1.爬虫文件爬取到数据后，需要将数据封装到 items对象中。

2.使用 yield 关键字将items对象提交给 pipelines 管道进行持久化操作。

3.在管道文件中的 process_item 方法中接收爬虫文件提交过来的item对象，然后编写持久化存储的代码将item对象中存储的数据进行持久化存储

4.settings.py配置文件中开启管道

示例

爬虫文件

import scrapy
from secondblood.items import SecondbloodItem
class QiubaidemoSpider(scrapy.Spider):
    """Spider that collects author/content pairs from www.qiushibaike.com.

    Every post block found under ``div#content-left`` is wrapped in a
    ``SecondbloodItem`` and yielded, which hands it to the item pipelines
    (pipelines.py) for persistence.
    """

    name = 'qiubaiDemo'
    allowed_domains = ['www.qiushibaike.com']
    start_urls = ['http://www.qiushibaike.com/']

    def parse(self, response):
        """Yield one ``SecondbloodItem`` per post block found in ``response``."""
        # xpath() returns a list of Selector objects; the parsed text lives
        # inside each Selector and has to be pulled out with extract_first().
        for div in response.xpath('//div[@id="content-left"]/div'):
            # default='' keeps a post that lacks the node from yielding None,
            # on which the .strip() call below would raise AttributeError.
            author = div.xpath(
                './/div[@class="author clearfix"]//h2/text()'
            ).extract_first(default='')
            content = div.xpath(
                './/div[@class="content"]/span/text()'
            ).extract_first(default='')

            # Wrap the parsed data in an item, dropping surrounding newlines.
            item = SecondbloodItem()
            item['author'] = author.strip('\n')
            item['content'] = content.strip('\n')
            yield item  # hand the item over to the pipelines (pipelines.py)

items文件

items.py

import scrapy
class SecondbloodItem(scrapy.Item):
    """Item carrying one scraped post: its author and its text content."""
    # define the fields for your item here like:
    # name = scrapy.Field()
    author = scrapy.Field() # stores the author
    content = scrapy.Field() # stores the text content of the post

管道文件

pipelines.py

from scrapy.exceptions import DropItem

class SecondbloodPipeline(object):
    """Item pipeline that appends each item to a text file as ``author:content``.

    The output path comes from the ``HREF_FILE_PATH`` setting. The file is
    opened once when the spider starts and closed once when it finishes, so
    that it is not reopened for every single item.
    """

    def __init__(self, path):
        """Store the output path; the file itself is opened in ``open_spider``."""
        self.f = None
        # Output file path; it lives in the settings and is handed in by
        # from_crawler.
        self.path = path

    @classmethod
    def from_crawler(cls, crawler):
        """Build the pipeline, reading the output path from the crawler settings."""
        print('File.from_crawler')
        path = crawler.settings.get('HREF_FILE_PATH')
        return cls(path)

    def open_spider(self, spider):
        """Open the output file in append mode when the spider starts.

        The ``spider`` argument can be used (e.g. by checking ``spider.name``)
        to customise behaviour per spider.
        """
        print('File.open_spider')
        # Explicit UTF-8 so the scraped text is written the same way on every
        # platform instead of depending on the locale's default encoding.
        self.f = open(self.path, 'a+', encoding='utf-8')

    def process_item(self, item, spider):
        """Write one ``author:content`` line, then drop the item.

        Raises:
            DropItem: always, so that later pipelines do not process the item.
                Return ``item`` instead to pass it on to the next pipeline.
        """
        print('File', item['author'])
        print('File', item['content'])
        self.f.write(item['author'] + ':' + item['content'] + '\n')

        raise DropItem()

    def close_spider(self, spider):
        """Close the output file when the spider finishes."""
        print('File.close_spider')
        # The handle is still None when open_spider never ran or failed.
        if self.f is not None:
            self.f.close()

注意：pipeline是所有爬虫公用，如果想要给某个爬虫定制需要使用spider参数自己进行处理

ps：

数据的处理当然可以写入数据库，或者 redis 如下实例

# -*- coding: utf-8 -*-
# Define your item pipelines here
# Don't forget to add your pipeline to the ITEM_PIPELINES setting
# See: https://doc.scrapy.org/en/latest/topics/item-pipeline.html

#导入数据库的类
import pymysql
 
class QiubaiproPipelineByMysql(object):
    """Item pipeline that inserts each item's author and content into MySQL.

    Rows go into the ``qiubai`` table of the ``qiubai`` database on
    127.0.0.1:3306. The connection is opened when the spider starts and
    closed when it finishes.
    """

    conn = None    # MySQL connection, created in open_spider
    cursor = None  # MySQL cursor, created on the first item and then reused

    def open_spider(self, spider):
        """Connect to the database when the spider starts."""
        print('开始爬虫')
        self.conn = pymysql.Connect(
            host='127.0.0.1',
            port=3306,
            user='root',
            password='123456',
            db='qiubai',
        )

    def process_item(self, item, spider):
        """Insert one row for ``item`` and return the item for later pipelines.

        A failed insert is printed and rolled back rather than raised, so one
        bad row does not stop the crawl.
        """
        # The values are passed as query parameters instead of being formatted
        # into the SQL text: scraped strings are untrusted and may contain
        # quotes, which would otherwise break or hijack the statement.
        sql = 'insert into qiubai values(%s,%s)'
        if self.cursor is None:
            self.cursor = self.conn.cursor()
        try:
            self.cursor.execute(sql, (item['author'], item['content']))
            self.conn.commit()
        except Exception as e:
            print(e)
            self.conn.rollback()
        return item

    def close_spider(self, spider):
        """Release the cursor and the connection when the spider finishes."""
        print('爬虫结束')
        # Either may still be None if the spider produced no items or the
        # connection was never established.
        if self.cursor is not None:
            self.cursor.close()
        if self.conn is not None:
            self.conn.close()

MySQL 的数据处理

# -*- coding: utf-8 -*-
# Define your item pipelines here
# Don't forget to add your pipeline to the ITEM_PIPELINES setting
# See: https://doc.scrapy.org/en/latest/topics/item-pipeline.html
 
import redis
class QiubaiproPipelineByRedis(object):
    """Item pipeline that pushes each item onto the Redis list ``data``."""

    conn = None  # Redis client, created in open_spider

    def open_spider(self, spider):
        """Create the Redis client when the spider starts."""
        print('开始爬虫')
        self.conn = redis.Redis(host='127.0.0.1', port=6379)

    def process_item(self, item, spider):
        """Push ``item`` onto the ``data`` list as a JSON string and return it."""
        import json

        record = {
            'author': item['author'],
            'content': item['content'],
        }
        # Redis values must be bytes, strings or numbers; a raw dict is
        # rejected by current redis-py releases, so the record is serialised
        # to JSON first. ensure_ascii=False keeps non-ASCII text readable.
        self.conn.lpush('data', json.dumps(record, ensure_ascii=False))
        return item

redis 的数据处理

配置文件

settings.py

#开启管道
 
ITEM_PIPELINES = {
    # 300 is the priority: the smaller the value, the higher the priority.
    'secondblood.pipelines.SecondbloodPipeline': 300,
}

ps：

可以写多个Pipeline类

1、如果优先级高的Pipeline的process_item返回一个值或者None，会自动传给下一个Pipeline的process_item，

2、如果只想让第一个Pipeline执行，那得让第一个Pipeline的process_item抛出异常raise DropItem()

3、可以用spider.name == '爬虫名' 来控制哪些爬虫用哪些pipeline

综合实例

'''
#1、settings.py
HOST="127.0.0.1"
PORT=27017
USER="root"
PWD="123"
DB="amazon"
COLLECTION="goods"

'''
from scrapy.exceptions import DropItem
from pymongo import MongoClient

class MongoPipeline(object):
    """Item pipeline that persists fully populated items to MongoDB.

    Connection details (``HOST``, ``PORT``, ``USER``, ``PWD``), the database
    name (``DB``) and the collection name (``COLLECTION``) are read from the
    crawler settings.
    """

    def __init__(self, db, collection, host, port, user, pwd):
        """Store the connection parameters; the client is made in ``open_spider``."""
        self.db = db
        self.collection = collection  # collection (table) name
        self.host = host
        self.port = port
        self.user = user
        self.pwd = pwd
        self.client = None

    @classmethod
    def from_crawler(cls, crawler):
        """Build the pipeline from the crawler settings.

        When a pipeline defines ``from_crawler``, Scrapy calls it to create
        the instance.
        """
        settings = crawler.settings
        db = settings.get("DB")
        # Falls back to the TABLE setting when COLLECTION is not defined.
        collection = settings.get("COLLECTION") or settings.get("TABLE")
        host = settings.get("HOST")
        port = settings.get("PORT")
        user = settings.get("USER")
        pwd = settings.get("PWD")
        return cls(db, collection, host, port, user, pwd)

    def open_spider(self, spider):
        """Connect to MongoDB; runs once when the spider starts."""
        from urllib.parse import quote_plus

        print('==============>爬虫程序刚刚启动')
        # User name and password are percent-escaped so that characters such
        # as '@', ':' or '/' in them cannot corrupt the connection URI.
        self.client = MongoClient('mongodb://%s:%s@%s:%s' % (
            quote_plus(str(self.user)),
            quote_plus(str(self.pwd)),
            self.host,
            self.port,
        ))

    def close_spider(self, spider):
        """Close the MongoDB client; runs once when the spider finishes."""
        print('==============>爬虫程序运行完毕')
        # The client is still None when open_spider never ran.
        if self.client is not None:
            self.client.close()

    def process_item(self, item, spider):
        """Store ``item`` if every field has a truthy value, then return it.

        Returning the item lets later pipelines keep processing it; raise
        ``DropItem`` instead to discard it.
        """
        d = dict(item)
        if all(d.values()):
            target = self.client[self.db][self.collection]
            # Collection.save() no longer exists in PyMongo 4. Its behaviour
            # is reproduced here: upsert by _id when one is present,
            # otherwise insert a new document.
            if '_id' in d:
                target.replace_one({'_id': d['_id']}, d, upsert=True)
            else:
                target.insert_one(d)
        return item



class FilePipeline(object):
    """Item pipeline that writes fully populated items to a text file.

    Each stored item becomes one line holding the ``str()`` of its dict form.
    The output path is read from the ``FILE_PATH`` setting.
    """

    def __init__(self, file_path):
        """Store the output path; the file itself is opened in ``open_spider``."""
        self.file_path = file_path
        self.fileobj = None

    @classmethod
    def from_crawler(cls, crawler):
        """Build the pipeline from the crawler settings.

        When a pipeline defines ``from_crawler``, Scrapy calls it to create
        the instance.
        """
        file_path = crawler.settings.get('FILE_PATH')
        return cls(file_path)

    def open_spider(self, spider):
        """Open (and truncate) the output file; runs once when the spider starts."""
        print('==============>爬虫程序刚刚启动')
        self.fileobj = open(self.file_path, 'w', encoding='utf-8')

    def close_spider(self, spider):
        """Close the output file; runs once when the spider finishes."""
        print('==============>爬虫程序运行完毕')
        # The handle is still None when open_spider never ran.
        if self.fileobj is not None:
            self.fileobj.close()

    def process_item(self, item, spider):
        """Write ``item`` if every field has a truthy value, then return it.

        Returning the item lets later pipelines keep processing it; raise
        ``DropItem`` instead to discard it.
        """
        d = dict(item)
        if all(d.values()):
            # Plain (non-raw) string: a raw string would write the two
            # characters backslash + 'n' instead of a line break.
            self.fileobj.write("%s\n" % str(d))

        return item

Scrapy 框架，持久化文件相关

标签：保存 second 配置文件初始控制关闭 doc 循环需要

原文地址：https://www.cnblogs.com/shijieli/p/10358597.html

踩

(0)

评论一句话评论（0）

分享档案

更多>

2021年07月29日 (22)
2021年07月28日 (40)
2021年07月27日 (32)
2021年07月26日 (79)
2021年07月23日 (29)
2021年07月22日 (30)
2021年07月21日 (42)
2021年07月20日 (16)
2021年07月19日 (90)
2021年07月16日 (35)

周排行

Scrapy 框架，持久化文件相关

持久化相关

相关文件

items.py

pipelines.py

持久化流程

示例

爬虫文件

items文件

管道文件

配置文件

ps：

综合实例