基于scrapy的一些实例

时间：2019-03-13 15:05:39 阅读：145 评论：0 收藏：0 [点我收藏+]

标签：dia modules nic ons awl store call console receive

一.爬取斗鱼主播

　1.　爬虫文件

# -*- coding: utf-8 -*-
import scrapy
import json
from Douyu.items import DouyuItem

class DouyuSpider(scrapy.Spider):
    """Spider that pages through the Douyu vertical-room API.

    Each response is expected to be a JSON object whose ``data`` entry is a
    list of per-streamer dicts. One ``DouyuItem`` is yielded per entry, then
    the next page is requested until the API returns an empty ``data`` list.
    """

    name = 'douyu'
    # allowed_domains = ['www.xxx.com']
    baseurl = 'http://capi.douyucdn.cn/api/v1/getVerticalRoom?limit=20&offset='
    # Offset of the first record of the current page; starts at 0.
    offset = 0
    start_urls = [baseurl + str(offset)]

    def parse(self, response):
        """Yield one item per streamer, then a request for the next page.

        Args:
            response: Scrapy response whose body is the API's JSON document.

        Yields:
            ``DouyuItem`` objects with ``name`` and ``img_url`` filled in,
            followed by a ``scrapy.Request`` for the next page.
        """
        # Parse the body once. A missing or empty 'data' entry ends the crawl.
        data = json.loads(response.text).get('data')
        if not data:
            return

        # Each entry of the list is a dict describing one streamer.
        for each in data:
            item = DouyuItem()
            item['name'] = each['nickname']
            item['img_url'] = each['vertical_src']
            # The item must be yielded, or the pipelines never receive it.
            yield item

        # Advance by the page size (limit=20 in baseurl) and fetch the next
        # page. The empty-data guard above stops the pagination.
        self.offset += 20
        url = self.baseurl + str(self.offset)
        yield scrapy.Request(url=url, callback=self.parse)

　　2.item

import scrapy


class DouyuItem(scrapy.Item):
    """Container for the fields scraped for a single streamer."""

    # Streamer nickname.
    name = scrapy.Field()
    # URL of the streamer's image.
    img_url = scrapy.Field()

　　3.pipeline

# -*- coding: utf-8 -*-

# Define your item pipelines here
#
# Don't forget to add your pipeline to the ITEM_PIPELINES setting
# See: https://doc.scrapy.org/en/latest/topics/item-pipeline.html
from scrapy.pipelines.images import ImagesPipeline
from Douyu.settings import IMAGES_STORE as images_store
import os
import scrapy

# Text storage
class DouyuPipeline(object):
    """Pipeline that appends ``name:img_url`` lines to ``./douyu.txt``."""

    # Handle of the output file; None whenever no file is open.
    f = None

    def open_spider(self, spider):
        """Open (and truncate) the output file when the spider starts."""
        self.f = open('./douyu.txt', 'w', encoding='utf-8')

    def process_item(self, item, spider):
        """Write one ``name:img_url`` line for *item* and pass the item on.

        Returns:
            The unchanged item, so later pipeline stages still receive it.
        """
        name = item['name']
        img_url = item['img_url']
        # An f-string also tolerates non-string field values such as None.
        self.f.write(f"{name}:{img_url}\n")
        return item

    def close_spider(self, spider):
        """Close the output file if it was opened."""
        # Guard against close_spider running without a successful open_spider.
        if self.f is not None:
            self.f.close()
            self.f = None


# Image storage
class ImagesPipieline(ImagesPipeline):
    """Image pipeline that downloads each item's image and renames it.

    The image URL comes from the item yielded by the spider. Files are
    stored under the ``IMAGES_STORE`` directory configured in settings.py.
    """

    def get_media_requests(self, item, info):
        """Yield the download request for the item's image URL."""
        yield scrapy.Request(item['img_url'])

    def item_completed(self, results, item, info):
        """Rename the downloaded image after the streamer and return the item.

        Args:
            results: Iterable of ``(ok, info)`` pairs; for successful
                downloads ``info['path']`` is the stored file's path
                relative to the images store.
            item: The item whose image was requested.
            info: Pipeline media info (unused here).

        Returns:
            The item, so that later pipeline stages keep receiving it.
        """
        # Keep only the paths of downloads that succeeded.
        img_path = [x['path'] for ok, x in results if ok]
        if img_path:
            # os.path.join works whether or not IMAGES_STORE ends with a
            # separator. os.replace overwrites an existing target on every
            # platform, which matters when two streamers share a nickname.
            os.replace(
                os.path.join(images_store, img_path[0]),
                os.path.join(images_store, item['name'] + '.jpg'),
            )
        # A failed download leaves nothing to rename; still pass the item on.
        return item

　　4.settings

# -*- coding: utf-8 -*-

# Scrapy settings for Douyu project
#
# For simplicity, this file contains only settings considered important or
# commonly used. You can find more settings consulting the documentation:
#
#     https://doc.scrapy.org/en/latest/topics/settings.html
#     https://doc.scrapy.org/en/latest/topics/downloader-middleware.html
#     https://doc.scrapy.org/en/latest/topics/spider-middleware.html

BOT_NAME = 'Douyu'

SPIDER_MODULES = ['Douyu.spiders']
NEWSPIDER_MODULE = 'Douyu.spiders'

# Crawl responsibly by identifying yourself (and your website) on the user-agent
USER_AGENT = 'Mozilla/5.0 (Windows NT 6.1; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/72.0.3626.109 Safari/537.36'

# Obey robots.txt rules
ROBOTSTXT_OBEY = False

# Configure maximum concurrent requests performed by Scrapy (default: 16)
# CONCURRENT_REQUESTS = 32

# Configure a delay for requests for the same website (default: 0)
# See https://doc.scrapy.org/en/latest/topics/settings.html#download-delay
# See also autothrottle settings and docs
# DOWNLOAD_DELAY = 3
# The download delay setting will honor only one of:
# CONCURRENT_REQUESTS_PER_DOMAIN = 16
# CONCURRENT_REQUESTS_PER_IP = 16

# Disable cookies (enabled by default)
# COOKIES_ENABLED = False

# Disable Telnet Console (enabled by default)
# TELNETCONSOLE_ENABLED = False

# Override the default request headers:
# DEFAULT_REQUEST_HEADERS = {
#   'Accept': 'text/html,application/xhtml+xml,application/xml;q=0.9,*/*;q=0.8',
#   'Accept-Language': 'en',
# }

# Enable or disable spider middlewares
# See https://doc.scrapy.org/en/latest/topics/spider-middleware.html
# SPIDER_MIDDLEWARES = {
#    'Douyu.middlewares.DouyuSpiderMiddleware': 543,
# }

# Enable or disable downloader middlewares
# See https://doc.scrapy.org/en/latest/topics/downloader-middleware.html
# DOWNLOADER_MIDDLEWARES = {
#    'Douyu.middlewares.DouyuDownloaderMiddleware': 543,
# }

# Enable or disable extensions
# See https://doc.scrapy.org/en/latest/topics/extensions.html
# EXTENSIONS = {
#    'scrapy.extensions.telnet.TelnetConsole': None,
# }

# Configure item pipelines
# See https://doc.scrapy.org/en/latest/topics/item-pipeline.html
ITEM_PIPELINES = {
    'Douyu.pipelines.DouyuPipeline': 300,
    'Douyu.pipelines.ImagesPipieline': 301,
}

# Enable and configure the AutoThrottle extension (disabled by default)
# See https://doc.scrapy.org/en/latest/topics/autothrottle.html
# AUTOTHROTTLE_ENABLED = True
# The initial download delay
# AUTOTHROTTLE_START_DELAY = 5
# The maximum download delay to be set in case of high latencies
# AUTOTHROTTLE_MAX_DELAY = 60
# The average number of requests Scrapy should be sending in parallel to
# each remote server
# AUTOTHROTTLE_TARGET_CONCURRENCY = 1.0
# Enable showing throttling stats for every response received:
# AUTOTHROTTLE_DEBUG = False

# Enable and configure HTTP caching (disabled by default)
# See https://doc.scrapy.org/en/latest/topics/downloader-middleware.html#httpcache-middleware-settings
# HTTPCACHE_ENABLED = True
# HTTPCACHE_EXPIRATION_SECS = 0
# HTTPCACHE_DIR = 'httpcache'
# HTTPCACHE_IGNORE_HTTP_CODES = []
# HTTPCACHE_STORAGE = 'scrapy.extensions.httpcache.FilesystemCacheStorage'

# Directory where downloaded images are stored.
# Setting names must be upper-case and spelled exactly as shown.
IMAGES_STORE = 'D:/scrapy/Douyu/imgs/'

View Code

基于scrapy的一些实例

标签：dia modules nic ons awl store call console receive

原文地址：https://www.cnblogs.com/tjp40922/p/10523027.html

踩

(0)

评论一句话评论（0）

分享档案

更多>

2021年07月29日 (22)
2021年07月28日 (40)
2021年07月27日 (32)
2021年07月26日 (79)
2021年07月23日 (29)
2021年07月22日 (30)
2021年07月21日 (42)
2021年07月20日 (16)
2021年07月19日 (90)
2021年07月16日 (35)

周排行

基于scrapy的一些实例

一.爬取斗鱼主播

1. 爬虫文件

2.item

3.pipeline

4.settings

　1.　爬虫文件

　　2.item

　　3.pipeline

　　4.settings