码迷,mamicode.com
首页 > 其他好文 > 详细

scrapy爬取小说盗墓笔记

时间:2017-06-29 11:26:25      阅读:264      评论:0      收藏:0      [点我收藏+]

标签:star   set   llb   roc   spi   art   style   title   www   

# -*- coding: utf-8 -*-
import scrapy
import requests
from daomu.items import DaomuItem
from pyquery import PyQuery as pq

class DaomuspiderSpider(scrapy.Spider):
    """Crawl daomubiji.com and emit one ``DaomuItem`` per novel chapter.

    Crawl flow: index page -> one request per book link -> for each chapter
    listed on the book page, fetch the chapter text and yield an item.
    """

    name = "daomuspider"
    # allowed_domains = ["www.daomubiji.com"]
    start_urls = ["http://www.daomubiji.com/"]
    index_url = "http://www.daomubiji.com/"

    # Seconds to wait for a chapter page before giving up on that chapter.
    detail_timeout = 10

    def start_requests(self):
        """Start the crawl from the site index page."""
        yield scrapy.Request(url=self.index_url, callback=self.parse_book)

    def parse_book(self, response):
        """Yield one request per book link found on the index page."""
        for href in response.css(".article-content a::attr(href)").extract():
            if not href:
                continue
            # urljoin makes relative links usable; absolute links pass through.
            yield scrapy.Request(
                url=response.urljoin(href), callback=self.parse_chapter
            )

    def parse_chapter(self, response):
        """Yield a populated ``DaomuItem`` for every chapter on a book page.

        Chapters that lack a title or link, or whose text cannot be
        downloaded, are skipped so one bad chapter does not abort the book.
        """
        book_title = response.css(".focusbox .container h1::text").extract_first()
        book_info = response.css(
            ".focusbox .container .focusbox-text::text"
        ).extract_first()
        book_url = response.url

        for chapter in response.css(".excerpts-wrapper .excerpts .excerpt"):
            raw_title = chapter.css("a::text").extract_first()
            href = chapter.css("a::attr(href)").extract_first()
            if not raw_title or not href:
                continue

            chapter_url = response.urljoin(href)
            try:
                content = self.parse_detail(chapter_url)
            except requests.RequestException as exc:
                self.logger.warning(
                    "Skipping chapter %s: %s", chapter_url, exc
                )
                continue

            # A fresh item per chapter: re-yielding one shared, mutated item
            # lets later chapters overwrite data still held by consumers.
            item = DaomuItem()
            item["book_title"] = book_title
            item["book_info"] = book_info
            item["book_url"] = book_url
            item["chapter_title"] = self._format_chapter_title(raw_title)
            item["chapter_url"] = chapter_url
            item["content"] = content
            yield item

    @staticmethod
    def _format_chapter_title(raw_title):
        """Return ``"<second token>:<last token>"`` of the link text.

        Falls back to the stripped text when it has fewer than two
        whitespace-separated tokens, instead of raising ``IndexError``.
        """
        parts = raw_title.split()
        if len(parts) < 2:
            return raw_title.strip()
        return parts[1] + ":" + parts[-1]

    def parse_detail(self, url):
        """Download a chapter page and return the text of its paragraphs.

        NOTE(review): this is a blocking ``requests`` call made from inside a
        Scrapy callback, so it bypasses Scrapy's scheduler and middlewares.
        The timeout keeps a stalled server from hanging the crawl forever.

        Raises:
            requests.RequestException: if the page cannot be fetched.
        """
        response = requests.get(url, timeout=self.detail_timeout)
        doc = pq(response.text)
        content = doc(".article-content p").text()
        return content
import pymongo

class DaomuPipeline(object):
    """Scrapy item pipeline that stores every scraped item in MongoDB.

    Items are written to a collection named after the item's class, inside
    the database ``mongo_db`` on the server at ``mongo_uri``.
    """

    def __init__(self, mongo_uri="localhost", mongo_db="daomu"):
        """Remember the connection settings; no connection is opened yet.

        Args:
            mongo_uri: Host name or MongoDB URI passed to ``MongoClient``.
            mongo_db: Name of the database that receives the items.
        """
        self.mongo_uri = mongo_uri
        self.mongo_db = mongo_db

    @classmethod
    def from_crawler(cls, crawler):
        """Build the pipeline from ``MONGO_URI`` / ``MONGO_DB`` settings.

        Either setting falls back to the constructor default when absent.
        """
        return cls(
            mongo_uri=crawler.settings.get("MONGO_URI", "localhost"),
            mongo_db=crawler.settings.get("MONGO_DB", "daomu"),
        )

    def open_spider(self, spider):
        """Open the MongoDB connection and select the target database."""
        self.client = pymongo.MongoClient(self.mongo_uri)
        self.db = self.client[self.mongo_db]

    def process_item(self, item, spider):
        """Insert ``item`` into the collection named after its class.

        Returns:
            The unchanged ``item``, so later pipelines can keep processing it.
        """
        name = item.__class__.__name__
        # The item must be converted to a plain dict before it is inserted.
        # insert_one replaces Collection.insert, which PyMongo 4 removed.
        self.db[name].insert_one(dict(item))
        return item

    def close_spider(self, spider):
        """Close the MongoDB connection opened in ``open_spider``."""
        self.client.close()

 

scrapy爬取小说盗墓笔记

标签:star   set   llb   roc   spi   art   style   title   www   

原文地址:http://www.cnblogs.com/themost/p/7093116.html

(0)
(0)
   
举报
评论 一句话评论(0)
登录后才能评论!
© 2014 mamicode.com 版权所有  联系我们:gaon5@hotmail.com
迷上了代码!