标签:star set llb roc spi art style title www
# -*- coding: utf-8 -*-
"""Scrapy spider for www.daomubiji.com: index page -> book pages -> chapters."""
import scrapy
import requests
from daomu.items import DaomuItem
from pyquery import PyQuery as pq


class DaomuspiderSpider(scrapy.Spider):
    """Crawl the site index, follow every book link and emit one item per chapter.

    Each emitted ``DaomuItem`` carries the fields ``book_title``,
    ``book_info``, ``book_url``, ``chapter_title``, ``chapter_url`` and
    ``content``.
    """

    name = "daomuspider"
    # allowed_domains = ["www.daomubiji.com"]
    start_urls = ['http://www.daomubiji.com/']
    index_url = 'http://www.daomubiji.com/'

    def start_requests(self):
        """Start the crawl at ``index_url``, handing the page to ``parse_book``."""
        yield scrapy.Request(url=self.index_url, callback=self.parse_book)

    def parse_book(self, response):
        """Yield one request per book link found under ``.article-content``."""
        for link in response.css('.article-content a'):
            book_url = link.css('a::attr(href)').extract_first()
            if not book_url:
                # An anchor without an href cannot be followed.
                continue
            yield scrapy.Request(
                url=response.urljoin(book_url),
                callback=self.parse_chapter,
            )

    def parse_chapter(self, response):
        """Yield one chapter-page request per chapter listed on a book page.

        The partially filled item travels with the request in
        ``meta['item']``; ``parse_content`` adds the chapter text and emits it.
        """
        book_title = response.css(
            '.focusbox .container h1::text').extract_first()
        book_info = response.css(
            '.focusbox .container .focusbox-text::text').extract_first()
        book_url = response.url

        for chapter in response.css('.excerpts-wrapper .excerpts .excerpt'):
            chapter_url = chapter.css('a::attr(href)').extract_first()
            if not chapter_url:
                continue
            chapter_url = response.urljoin(chapter_url)

            # A fresh item per chapter: sharing a single instance across
            # yields would let later chapters overwrite earlier ones.
            item = DaomuItem()
            item['book_title'] = book_title
            item['book_info'] = book_info
            item['book_url'] = book_url
            item['chapter_title'] = self._format_chapter_title(
                chapter.css('a::text').extract_first())
            item['chapter_url'] = chapter_url

            # Fetch the chapter through Scrapy rather than a blocking
            # requests.get() call inside the callback.
            yield scrapy.Request(
                url=chapter_url,
                callback=self.parse_content,
                meta={'item': item},
            )

    def parse_content(self, response):
        """Attach the chapter text to the item carried in ``meta`` and emit it."""
        item = response.meta['item']
        item['content'] = self._extract_content(response.text)
        yield item

    def parse_detail(self, url):
        """Download ``url`` synchronously and return the chapter text.

        This call blocks until the download finishes. The spider itself no
        longer uses it (see ``parse_content``); it is kept so that existing
        external callers keep working.
        """
        response = requests.get(url)
        return self._extract_content(response.text)

    @staticmethod
    def _format_chapter_title(raw_title):
        """Return ``'<second word>:<last word>'`` of a space-separated title.

        Returns ``''`` when the title is missing and the raw text when it
        contains no space, instead of raising.
        """
        if not raw_title:
            return ''
        parts = raw_title.split(' ')
        if len(parts) < 2:
            return raw_title
        return parts[1] + ':' + parts[-1]

    @staticmethod
    def _extract_content(html):
        """Return the text of all ``.article-content p`` elements in ``html``."""
        return pq(html)('.article-content p').text()
import pymongo


class DaomuPipeline(object):
    """Item pipeline that stores every scraped item in MongoDB.

    Items are written to a collection named after the item's class.
    """

    def __init__(self, mongo_uri='localhost', mongo_db='daomu'):
        """Remember the connection target; the connection opens in ``open_spider``.

        Args:
            mongo_uri: host name or URI passed to ``pymongo.MongoClient``.
            mongo_db: name of the database the items are written to.
        """
        self.mongo_uri = mongo_uri
        self.mongo_db = mongo_db

    @classmethod
    def from_crawler(cls, crawler):
        """Build the pipeline from the ``MONGO_URI`` / ``MONGO_DB`` settings.

        The fallbacks equal the previously hard-coded values, so a project
        that defines neither setting behaves exactly as before.
        """
        return cls(
            mongo_uri=crawler.settings.get('MONGO_URI', 'localhost'),
            mongo_db=crawler.settings.get('MONGO_DB', 'daomu'),
        )

    def open_spider(self, spider):
        """Open the MongoDB client and select the target database."""
        self.client = pymongo.MongoClient(self.mongo_uri)
        self.db = self.client[self.mongo_db]

    def process_item(self, item, spider):
        """Insert ``item`` into the collection named after its class; return it."""
        name = item.__class__.__name__
        # Convert the Item to a plain dict before inserting. insert_one()
        # replaces Collection.insert(), which PyMongo deprecated in 3.0 and
        # removed in 4.0.
        self.db[name].insert_one(dict(item))
        return item

    def close_spider(self, spider):
        """Close the MongoDB client."""
        self.client.close()
标签:star set llb roc spi art style title www
原文地址:http://www.cnblogs.com/themost/p/7093116.html