使用 Scrapy 爬取中关村在线（ZOL）手机频道

时间：2017-06-24 09:54:58 阅读：246 评论：0 收藏：0 [点我收藏+]

标签：.sh nbsp llb time dom from main attr css

 1 # -*- coding: utf-8 -*-
 2 import scrapy
 3 from pyquery import PyQuery as pq
 4 
 5 from zolphone.items import ZolphoneItem
 6 
 7 
 8 class PhoneSpider(scrapy.Spider):
 9     name = "phone"
10     # allowed_domains = ["www.zol.com.cn"]
11     # start_url = ‘http://detail.zol.com.cn/cell_phone_index/subcate57_0_list_1_0_1_1_0_1.html‘
12     start_url = ‘http://detail.zol.com.cn/cell_phone_index/subcate57_0_list_1_0_1_1_0_‘
13 
14     def start_requests(self):
15 
16         for page in range(1, 209):
17             url = self.start_url + str(page) + ‘.html‘
18             yield scrapy.Request(url,callback=self.parse_index)
19 
20 
21     def parse_index(self, response):
22         base_url = ‘http://detail.zol.com.cn‘
23         doc = pq(response.text)
24         lis = doc(‘.list-box .list-item‘).items()
25         for result in lis:
26             detail_url = base_url + result.find(‘.pro-intro h3 a‘).attr(‘href‘)
27             yield scrapy.Request(url=detail_url, callback=self.parse_detail)
28 
29     def parse_detail(self,response):
30         doc = pq(response.text)
31         title1 = response.css(‘.page-title h1::text‘).extract_first()
32         title2 = doc(‘.page-title h2‘).text()
33         price = doc(‘.product-price .price-type‘).text()
34         release_time = doc(‘.section div h3 .showdate‘).text()
35         print(title1, title2, price, release_time)
36         item = ZolphoneItem()
37         item[‘title1‘] = title1
38         item[‘title2‘] = title2
39         item[‘price‘] = price
40         item[‘release_time‘] = release_time
41 
42         yield item

 1 import scrapy
 2 
 3 
 4 class ZolphoneItem(scrapy.Item):
 5     # define the fields for your item here like:
 6     # name = scrapy.Field()
 7     title1 = scrapy.Field()
 8     title2 = scrapy.Field()
 9     price = scrapy.Field()
10     release_time = scrapy.Field()

scrapy爬取中关村在线手机频道

标签：.sh nbsp llb time dom from main attr css

原文地址：http://www.cnblogs.com/themost/p/7072431.html

踩

(0)

评论一句话评论（0）

分享档案

更多>

2021年07月29日 (22)
2021年07月28日 (40)
2021年07月27日 (32)
2021年07月26日 (79)
2021年07月23日 (29)
2021年07月22日 (30)
2021年07月21日 (42)
2021年07月20日 (16)
2021年07月19日 (90)
2021年07月16日 (35)

周排行