标签:domain text blog content http 注意 ack style utf-8
# -*- coding: utf-8 -*-
# Crawl the Douban Movie Top 250 list with Scrapy.

import scrapy
from douban.items import DoubanItem


class DoubanspiderSpider(scrapy.Spider):
    """Spider that walks the Douban Movie Top 250 pages and yields one item per movie.

    Each yielded ``DoubanItem`` carries the fields ``title``, ``content``,
    ``rating_num``, ``quote`` and ``image``.
    """

    name = "doubanspider"
    # NOTE: allowed_domains is deliberately left unset. The original author
    # had tried "movie.douban.com/top250" and noted that paginated requests
    # could then fall outside the allowed range. If a restriction is wanted,
    # use the bare domain name only (no path) -- TODO confirm before enabling.
    start_urls = ['http://movie.douban.com/top250']

    def parse(self, response):
        """Extract every movie on the current page, then follow the "next" link.

        Args:
            response: The downloaded listing page.

        Yields:
            One ``DoubanItem`` per movie entry found on the page, followed by
            a ``scrapy.Request`` for the next page when a next-page link exists.
        """
        for movie in response.css('.article .grid_view li'):
            # Build a fresh item for every movie: reusing a single instance
            # would make all yielded items alias the same mutable object, so
            # later rows could overwrite earlier ones downstream.
            item = DoubanItem()
            item['title'] = movie.css(
                '.item .hd .title:nth-child(1)::text').extract_first()
            # default='' keeps .strip() from failing on an entry whose
            # description paragraph is missing.
            item['content'] = movie.css(
                '.item .bd p::text').extract_first(default='').strip()
            item['rating_num'] = movie.css(
                '.item .bd .star .rating_num::text').extract_first()
            item['quote'] = movie.css(
                '.item .bd .quote span::text').extract_first()
            item['image'] = movie.css(
                '.item .pic a img::attr(src)').extract_first()
            yield item

        # Build the request for the next page, if there is one.
        next_href = response.css(
            '.paginator .next a::attr(href)').extract_first()
        if next_href:
            # Resolve the href against the URL actually served instead of a
            # hard-coded base, so redirects (e.g. http -> https) are honoured.
            next_url = response.urljoin(next_href)
            self.logger.debug('Following next page: %s', next_url)
            yield scrapy.Request(url=next_url, callback=self.parse)
标签:domain text blog content http 注意 ack style utf-8
原文地址:http://www.cnblogs.com/themost/p/7090247.html