
Scrapy learning (1)

Posted: 2019-08-28 01:23:33


Steps for building a simple crawler with Scrapy:

1. Create the project

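Create the project and the spider skeleton with the scrapy command-line tool; given the project name (BookSpider), spider name (Books), and domain (biqugex.com) used below, the commands are presumably along these lines:

scrapy startproject BookSpider
cd BookSpider
scrapy genspider Books biqugex.com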

The commands above produce the directory structure below.

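This is the standard skeleton that scrapy startproject generates (details can vary slightly between Scrapy versions):

BookSpider/
    scrapy.cfg
    BookSpider/
        __init__.py
        items.py
        middlewares.py
        pipelines.py
        settings.py
        spiders/
            __init__.py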

2. Edit the items file: this is where you declare the fields you want to scrape, i.e. whatever you are interested in.

import scrapy


class BookspiderItem(scrapy.Item):
    # define the fields for your item here like:
    # name = scrapy.Field()
    # book title
    book_name = scrapy.Field()
    # author
    author = scrapy.Field()
    # category
    book_sort = scrapy.Field()
    # status
    book_status = scrapy.Field()
    # word count
    book_size = scrapy.Field()
    # last update time
    book_update = scrapy.Field()
    # latest chapter
    last_chapter = scrapy.Field()
    # book introduction
    book_intro = scrapy.Field()
    # chapter name
    chapter_name = scrapy.Field()
    # chapter URL
    chapter_url = scrapy.Field()
    # chapter content
    chapter_content = scrapy.Field()
    # chapter number, used to keep chapters in order
    chapter_num = scrapy.Field()

3. Write the program that actually crawls each piece of content, i.e. the spider.

# -*- coding: utf-8 -*-
import scrapy
from BookSpider.items import BookspiderItem


class BooksSpider(scrapy.Spider):
    name = 'Books'
    allowed_domains = ['biqugex.com']
    # start_urls = ['http://biqugex.com/']

    def start_requests(self):
        url = 'https://www.biqugex.com/book_{0}'
        for i in range(1, 2):
            yield scrapy.Request(url.format(i), callback=self.parse)

    def parse(self, response):
        books = response.xpath('//div[@class="info"]')
        book_name = books.xpath('./h2/text()').extract()[0]
        # each <span> reads like "作者：xxx"; split on the full-width colon (assumed delimiter) and keep the value
        author = books.xpath('./div[@class="small"]/span[1]/text()').extract()[0].split('：', 2)[1]
        book_sort = books.xpath('./div[@class="small"]/span[2]/text()').extract()[0].split('：', 2)[1]
        book_status = books.xpath('./div[@class="small"]/span[3]/text()').extract()[0].split('：', 2)[1]
        book_size = books.xpath('./div[@class="small"]/span[4]/text()').extract()[0].split('：', 2)[1]
        book_update = books.xpath('./div[@class="small"]/span[5]/text()').extract()[0].split('：', 2)[1]
        last_chapter = books.xpath('./div[@class="small"]/span[6]/a/text()').extract()[0]
        book_intro = books.xpath('./div[@class="intro"]/text()').extract_first()
        chapters = response.xpath('//div[@class="listmain"]')
        urls = chapters.xpath('./dl/dd/a/@href').extract()
        for url in urls[0:4]:
            new_link = 'https://www.biqugex.com' + url
            yield scrapy.Request(url=new_link,
                                 meta={'book_name': book_name, 'author': author, 'book_sort': book_sort,
                                       'book_status': book_status, 'book_size': book_size,
                                       'book_update': book_update, 'last_chapter': last_chapter,
                                       'book_intro': book_intro},
                                 callback=self.detail_parse, dont_filter=False)

    def detail_parse(self, response):
        item = BookspiderItem()
        item['book_name'] = response.meta['book_name']
        item['author'] = response.meta['author']
        item['book_sort'] = response.meta['book_sort']
        item['book_status'] = response.meta['book_status']
        item['book_size'] = response.meta['book_size']
        item['book_update'] = response.meta['book_update']
        item['last_chapter'] = response.meta['last_chapter']
        item['book_intro'] = response.meta['book_intro']
        chapter_page = response.xpath('//div[@class="content"]')
        item['chapter_name'] = chapter_page.xpath('./h1/text()').extract()[0]
        # the second-to-last text node holds the chapter URL wrapped in parentheses; strip them
        item['chapter_url'] = chapter_page.xpath('./div[@class="showtxt"]/text()').extract()[-2].strip().replace('(', '').replace(')', '')
        chapter_content = chapter_page.xpath('./div[@class="showtxt"]/text()').extract()[0:-3]
        item['chapter_content'] = '\n'.join(chapter_content)
        # item['chapter_num'] = filter(str.isdigit, chapter_page.xpath('./h1/text()').extract()[0])
        yield item
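When developing XPath expressions like the ones above, it helps to try them interactively in the Scrapy shell first. For example (the selectors here simply mirror the ones used in the spider):

scrapy shell https://www.biqugex.com/book_1
>>> response.xpath('//div[@class="info"]/h2/text()').extract_first()
>>> response.xpath('//div[@class="listmain"]/dl/dd/a/@href').extract()[:5]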

4. Write the pipelines file, which is mainly used to write the data to a file or database.

# -*- coding: utf-8 -*-

# Define your item pipelines here
#
# Don't forget to add your pipeline to the ITEM_PIPELINES setting
# See: https://docs.scrapy.org/en/latest/topics/item-pipeline.html
import sqlite3


class BookspiderPipeline(object):
    def __init__(self):
        self.conn = sqlite3.connect('books.db')
        self.cur = self.conn.cursor()
        self.cur.execute('create table if not exists notes('
                         + 'id integer primary key autoincrement,'
                         + 'book_name,'
                         + 'author,'
                         + 'book_sort,'
                         + 'book_status,'
                         + 'book_size,'
                         + 'book_update,'
                         + 'last_chapter,'
                         + 'book_intro,'
                         + 'chapter_name,'
                         + 'chapter_url,'
                         + 'chapter_content)')

    def close_spider(self, spider):
        print("=====关闭数据库资源=====")
        self.cur.close()
        self.conn.close()

    def process_item(self, item, spider):
        self.cur.execute('insert into notes values (null, ?, ?, ?, ?, ?, ?, ?, ?, ?, ?, ?)',
                         (item['book_name'], item['author'], item['book_sort'],
                          item['book_status'], item['book_size'], item['book_update'],
                          item['last_chapter'], item['book_intro'], item['chapter_name'],
                          item['chapter_url'], item['chapter_content']))
        self.conn.commit()
        return item

    # def process_item(self, item, spider):
    #     print(item['book_name'])
    #     print(item['author'])
    #     print(item['book_sort'])
    #     print(item['book_status'])
    #     print(item['book_size'])
    #     print(item['book_update'])
    #     print(item['last_chapter'])
    #     print(item['book_intro'])
    #     print(item['chapter_name'])
    #     print(item['chapter_url'])
    #     print(item['chapter_content'])
    #     print(item['chapter_num'])

5. Configure some parameters in settings

# -*- coding: utf-8 -*-

# Scrapy settings for BookSpider project
#
# For simplicity, this file contains only settings considered important or
# commonly used. You can find more settings consulting the documentation:
#
#     https://docs.scrapy.org/en/latest/topics/settings.html
#     https://docs.scrapy.org/en/latest/topics/downloader-middleware.html
#     https://docs.scrapy.org/en/latest/topics/spider-middleware.html

BOT_NAME = 'BookSpider'

SPIDER_MODULES = ['BookSpider.spiders']
NEWSPIDER_MODULE = 'BookSpider.spiders'


# Crawl responsibly by identifying yourself (and your website) on the user-agent
#USER_AGENT = 'BookSpider (+http://www.yourdomain.com)'

# Obey robots.txt rules
ROBOTSTXT_OBEY = False

# Configure maximum concurrent requests performed by Scrapy (default: 16)
#CONCURRENT_REQUESTS = 32

# Configure a delay for requests for the same website (default: 0)
# See https://docs.scrapy.org/en/latest/topics/settings.html#download-delay
# See also autothrottle settings and docs
DOWNLOAD_DELAY = 1
# The download delay setting will honor only one of:
#CONCURRENT_REQUESTS_PER_DOMAIN = 16
#CONCURRENT_REQUESTS_PER_IP = 16

# Disable cookies (enabled by default)
#COOKIES_ENABLED = False

# Disable Telnet Console (enabled by default)
#TELNETCONSOLE_ENABLED = False

# Override the default request headers:
DEFAULT_REQUEST_HEADERS = {
    'User-Agent': 'Mozilla/5.0 (Windows NT 10.0; WOW64) AppleWebKit/537.36 '
                  '(KHTML, like Gecko) Chrome/65.0.3314.0 Safari/537.36 SE 2.X MetaSr 1.0',
    'Accept': 'text/html,application/xhtml+xml,application/xml;q=0.9,image/webp,image/apng,*/*;q=0.8',
}

# Enable or disable spider middlewares
# See https://docs.scrapy.org/en/latest/topics/spider-middleware.html
SPIDER_MIDDLEWARES = {
    'BookSpider.middlewares.BookspiderSpiderMiddleware': 543,
}

# Enable or disable downloader middlewares
# See https://docs.scrapy.org/en/latest/topics/downloader-middleware.html
DOWNLOADER_MIDDLEWARES = {
    'BookSpider.middlewares.BookspiderDownloaderMiddleware': 543,
}

# Enable or disable extensions
# See https://docs.scrapy.org/en/latest/topics/extensions.html
#EXTENSIONS = {
#    'scrapy.extensions.telnet.TelnetConsole': None,
#}

# Configure item pipelines
# See https://docs.scrapy.org/en/latest/topics/item-pipeline.html
ITEM_PIPELINES = {
    'BookSpider.pipelines.BookspiderPipeline': 300,
}

# Enable and configure the AutoThrottle extension (disabled by default)
# See https://docs.scrapy.org/en/latest/topics/autothrottle.html
#AUTOTHROTTLE_ENABLED = True
# The initial download delay
#AUTOTHROTTLE_START_DELAY = 5
# The maximum download delay to be set in case of high latencies
#AUTOTHROTTLE_MAX_DELAY = 60
# The average number of requests Scrapy should be sending in parallel to
# each remote server
#AUTOTHROTTLE_TARGET_CONCURRENCY = 1.0
# Enable showing throttling stats for every response received:
#AUTOTHROTTLE_DEBUG = False

# Enable and configure HTTP caching (disabled by default)
# See https://docs.scrapy.org/en/latest/topics/downloader-middleware.html#httpcache-middleware-settings
HTTPCACHE_ENABLED = True
HTTPCACHE_EXPIRATION_SECS = 0
HTTPCACHE_DIR = 'httpcache'
HTTPCACHE_IGNORE_HTTP_CODES = []
HTTPCACHE_STORAGE = 'scrapy.extensions.httpcache.FilesystemCacheStorage'

All right, the novel crawler is done.
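To run it, execute the crawl command from the project root (the spider name Books comes from the spider class above):

scrapy crawl Books

Each scraped chapter ends up as one row in books.db, which you can spot-check with a few lines of Python (a quick sketch against the notes table created by the pipeline):

import sqlite3

conn = sqlite3.connect('books.db')
for row in conn.execute('select book_name, chapter_name, chapter_url from notes limit 5'):
    print(row)
conn.close()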



Original article: https://www.cnblogs.com/pythoncoder/p/11421605.html
