A few Scrapy framework commands:
scrapy startproject xxxx          # create a new project
scrapy genspider xxx www.ooo.com  # generate a spider named xxx with a start domain
scrapy crawl xxx                  # run the spider named xxx
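For the project in this post, the full sequence would look like this (project name qiubai01 and spider name qiubai are taken from the code below):

scrapy startproject qiubai01
cd qiubai01
scrapy genspider qiubai www.qiushibaike.com
scrapy crawl qiubai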
Pipeline-based persistent storage:
1. Parse the data.
2. Define the matching attributes in the item class (see the items.py sketch after this list).
3. Instantiate an item object in the parse method.
4. Store the parsed data in the item object.
5. Submit the item object to the pipeline with yield item.
6. Receive the data in process_item and persist it (see the pipelines.py sketch after the spider code).
7. Enable the pipeline in the settings file.
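Step 2 lives in the project's items.py. A minimal sketch for this post's project, with the field names taken from the spider code below (the rest follows the default startproject template):

# items.py
import scrapy


class Qiubai01Item(scrapy.Item):
    # one Field per piece of parsed data (step 2)
    author = scrapy.Field()
    content = scrapy.Field()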
# -*- coding: utf-8 -*-
import scrapy
from qiubai01.items import Qiubai01Item


class QiubaiSpider(scrapy.Spider):
    name = 'qiubai'
    # allowed_domains = ['www.xxx.com']
    start_urls = ['https://www.qiushibaike.com/text/']

    # def parse(self, response):
    #     # response is the response object for the request
    #     div_list = response.xpath('//*[@id="content-left"]/div')
    #     for div in div_list:
    #         # author = div.xpath('./div[1]/a[2]/h2/text()')[0].extract()
    #         # the line below is an equivalent way to write it
    #         author = div.xpath('./div[1]/a[2]/h2/text()').extract_first()
    #         content = div.xpath('./a/div/span//text()').extract()
    #         content = "".join(content)
    #         print(author)
    #         print()
    #         print(content)

    # terminal-command-based storage: scrapy crawl qiubai -o qiushi.csv
    # def parse(self, response):
    #     # response is the response object for the request
    #     div_list = response.xpath('//*[@id="content-left"]/div')
    #     all_data_list = []
    #     for div in div_list:
    #         # author = div.xpath('./div[1]/a[2]/h2/text()')[0].extract()
    #         # the line below is an equivalent way to write it
    #         author = div.xpath('./div[1]/a[2]/h2/text()').extract_first()
    #         content = div.xpath('./a/div/span//text()').extract()
    #         content = "".join(content)
    #         dic = {}
    #         dic['author'] = author
    #         dic['content'] = content
    #         all_data_list.append(dic)
    #     return all_data_list

    # pipeline-based persistent storage

    def parse(self, response):
        """
        1. Parse the data.
        2. Define the matching attributes in the item class.
        3. Instantiate an item object in the parse method.
        4. Store the parsed data in the item object.
        5. Submit the item object to the pipeline with yield item.
        6. Receive the data in process_item and persist it.
        7. Enable the pipeline in the settings file.
        """
        # response is the response object for the request
        div_list = response.xpath('//*[@id="content-left"]/div')

        for div in div_list:
            # author = div.xpath('./div[1]/a[2]/h2/text()')[0].extract()
            # the line below is an equivalent way to write it
            author = div.xpath('./div[1]/a[2]/h2/text()').extract_first()
            content = div.xpath('./a/div/span//text()').extract()
            content = "".join(content)
            item = Qiubai01Item()
            item['author'] = author
            item['content'] = content
            # submit the item to the pipeline
            yield item
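The spider above covers steps 1 and 3 through 5; steps 6 and 7 live in pipelines.py and settings.py. A minimal sketch, assuming a Qiubai01Pipeline class and a local text file as the storage target (both are illustrative choices, not shown in the original post):

# pipelines.py
class Qiubai01Pipeline:
    fp = None

    def open_spider(self, spider):
        # called once when the spider starts: open the file here
        # rather than in process_item, which runs once per item
        self.fp = open('./qiubai.txt', 'w', encoding='utf-8')

    def process_item(self, item, spider):
        # step 6: receive each item and persist it
        self.fp.write('{}:{}\n'.format(item['author'], item['content']))
        return item  # hand the item on to any lower-priority pipeline

    def close_spider(self, spider):
        # called once when the spider finishes
        self.fp.close()

# settings.py
ITEM_PIPELINES = {
    # step 7: enable the pipeline; the number (0-1000) is its
    # priority, and lower values run first
    'qiubai01.pipelines.Qiubai01Pipeline': 300,
}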
Original article: https://www.cnblogs.com/gaofeng-d/p/10891342.html