1. Using Items to store scraped data
items.py
import scrapy


class QuoteItem(scrapy.Item):
    # define the fields for your item here like:
    # name = scrapy.Field()
    text = scrapy.Field()
    author = scrapy.Field()
    tags = scrapy.Field()
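Since QuoteItem subclasses scrapy.Item, it behaves like a dict whose keys are restricted to the declared fields. A quick sketch of that behavior (the sample values are made up for illustration):

from toscrapy.items import QuoteItem

item = QuoteItem()
item['text'] = 'To be, or not to be'  # made-up sample values
item['author'] = 'Shakespeare'
item['tags'] = ['/tag/life/']

print(dict(item))
# {'text': 'To be, or not to be', 'author': 'Shakespeare', 'tags': ['/tag/life/']}

# Assigning a field that was never declared raises KeyError,
# which catches typos in field names early:
# item['txet'] = '...'  # KeyError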
quote.py
# -*- coding: utf-8 -*-
import scrapy

from toscrapy.items import QuoteItem


class QuoteSpider(scrapy.Spider):
    name = 'quote'
    allowed_domains = ['quotes.toscrape.com']
    start_urls = ['http://quotes.toscrape.com/']
    """
    Key points:
    1. text() gets a tag's text content
    2. @attribute gets an attribute's value
    3. extract() returns all matches; extract_first() returns the first one
    4. response.urljoin() joins a relative URL onto the current page's URL
    5. scrapy.Request(url=_next, callback=self.parse) schedules the next page
    """

    def parse(self, response):
        quotes = response.xpath('//div[@class="col-md-8"]/div[@class="quote"]')
        for quote in quotes:
            item = QuoteItem()
            # extract_first() returns a single value
            text = quote.xpath('.//span[@class="text"]/text()').extract_first()
            item['text'] = text
            author = quote.xpath('.//span/small[@class="author"]/text()').extract_first()
            item['author'] = author
            # extract() returns a list (here: the href of every tag link)
            tags = quote.xpath('.//div[@class="tags"]/a[@class="tag"]/@href').extract()
            item['tags'] = tags
            yield item
        next_url = response.xpath('//div[@class="col-md-8"]/nav/ul[@class="pager"]/li[@class="next"]/a/@href').extract_first()
        # on the last page there is no "next" link, so guard against None
        if next_url:
            # join the relative href onto the current page's URL
            _next = response.urljoin(next_url)
            # callback: parse the next page with this same method
            yield scrapy.Request(url=_next, callback=self.parse)
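The docstring's points 3 and 4 are easy to check outside a running spider. A minimal, self-contained sketch using Scrapy's Selector directly (the HTML fragment below is invented to mirror the page structure):

from urllib.parse import urljoin

from scrapy.selector import Selector

# invented fragment mirroring the quotes.toscrape.com markup
html = '''
<div class="quote"><span class="text">"First quote"</span></div>
<div class="quote"><span class="text">"Second quote"</span></div>
'''
sel = Selector(text=html)

# extract_first() -> the first match (or None if nothing matches)
print(sel.xpath('//span[@class="text"]/text()').extract_first())
# '"First quote"'

# extract() -> a list of every match
print(sel.xpath('//span[@class="text"]/text()').extract())
# ['"First quote"', '"Second quote"']

# response.urljoin() behaves like urllib.parse.urljoin with the
# current page URL as the base:
print(urljoin('http://quotes.toscrape.com/', '/page/2/'))
# 'http://quotes.toscrape.com/page/2/'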
Command to export the scraped items to a file:
scrapy crawl quote -o quotes.json
Other output formats work the same way: quotes.xml, quotes.jl, quotes.csv, etc.
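On newer Scrapy versions (2.1 and later), exports can also be configured in settings.py via the FEEDS setting instead of passing -o each time; a minimal sketch:

# settings.py (assumes Scrapy >= 2.1, where FEEDS was introduced)
FEEDS = {
    'quotes.json': {
        'format': 'json',
        'encoding': 'utf8',
    },
}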
2.
To be continued.
Original post (in Chinese): https://www.cnblogs.com/wt7018/p/11729742.html