# -*- coding: utf-8 -*-
import scrapy

from secondblood.items import SecondbloodItem


class QiubaidemoSpider(scrapy.Spider):
    """Spider that scrapes author/content pairs from qiushibaike.com."""

    name = 'qiubaiDemo'
    # allowed_domains = ['www.qiushibaike.com']
    start_urls = ['http://www.qiushibaike.com/']

    def parse(self, response):
        # xpath() returns a SelectorList; the parsed text is wrapped in
        # Selector objects and must be pulled out with extract_first().
        odiv = response.xpath('//div[@id="content-left"]/div')
        for div in odiv:
            author = div.xpath('.//div[@class="author clearfix"]//h2/text()').extract_first()
            # extract_first() returns None when the node is missing —
            # guard before stripping the surrounding blank lines.
            if author is not None:
                author = author.strip('\n')
            content = div.xpath('.//div[@class="content"]/span/text()').extract_first()
            if content is not None:
                content = content.strip('\n')
            # Package the parsed fields into an item and submit it to the
            # pipeline (pipelines.py).
            item = SecondbloodItem()
            item['author'] = author
            item['content'] = content
            yield item
- Items file: items.py
import scrapy


class SecondbloodItem(scrapy.Item):
    """Item holding one scraped post."""
    # define the fields for your item here like:
    # name = scrapy.Field()
    author = scrapy.Field()   # author of the post
    content = scrapy.Field()  # body text of the post
- Pipeline file: pipelines.py
# -*- coding: utf-8 -*-
# Define your item pipelines here
#
# Don't forget to add your pipeline to the ITEM_PIPELINES setting
# See: https://doc.scrapy.org/en/latest/topics/item-pipeline.html


class SecondbloodPipeline(object):
    """Pipeline that persists each item to ./data.txt as 'author:content' lines."""

    def __init__(self):
        # File handle; opened in open_spider() and closed in close_spider().
        self.fp = None

    # process_item() is called once per item, so the file open/close is
    # done in open_spider()/close_spider(), which each run exactly once.

    def open_spider(self, spider):
        # Called once when the crawl starts.
        print('爬虫开始')
        # Explicit utf-8 so non-ASCII scraped text writes on any platform.
        self.fp = open('./data.txt', 'w', encoding='utf-8')

    def process_item(self, item, spider):
        # Persist the item submitted by the spider.
        self.fp.write(item['author'] + ':' + item['content'] + '\n')
        return item

    def close_spider(self, spider):
        # Called once when the crawl ends.
        self.fp.close()
        print('爬虫结束')
# -*- coding: utf-8 -*-
# Define your item pipelines here
#
# Don't forget to add your pipeline to the ITEM_PIPELINES setting
# See: https://doc.scrapy.org/en/latest/topics/item-pipeline.html

# Import the database client library.
import pymysql
class QiubaiproPipelineByMysql(object):
conn = None #mysql的连接对象声明
cursor = None#mysql游标对象声明