Tags: dia modules nic ons awl store call console receive
# -*- coding: utf-8 -*-
import scrapy
import json

from Douyu.items import DouyuItem


class DouyuSpider(scrapy.Spider):
    """Crawl Douyu's vertical-room API page by page and yield one
    DouyuItem (nickname + cover-image URL) per streamer."""

    name = 'douyu'
    # allowed_domains = ['www.xxx.com']

    # API endpoint; `offset` is the paging start index, beginning at 0
    # and advancing 20 records (one page) per request.
    baseurl = 'http://capi.douyucdn.cn/api/v1/getVerticalRoom?limit=20&offset='
    offset = 0
    start_urls = [baseurl + str(offset)]

    def parse(self, response):
        # Decode the JSON payload once; an empty `data` list means we
        # have paged past the last record, so stop recursing.
        data = json.loads(response.text)['data']
        if not data:
            return

        # Each element of `data` is a dict describing one streamer.
        for each in data:
            item = DouyuItem()
            item['name'] = each['nickname']
            item['img_url'] = each['vertical_src']
            # Must yield the item, otherwise the pipelines never see it.
            yield item

        # Schedule the next page; the empty-data guard above terminates
        # the recursion.
        self.offset += 20
        yield scrapy.Request(url=self.baseurl + str(self.offset),
                             callback=self.parse)
import scrapy


class DouyuItem(scrapy.Item):
    """Container for a single streamer scraped from the Douyu API."""

    # define the fields for your item here like:
    # name = scrapy.Field()
    name = scrapy.Field()      # streamer nickname
    img_url = scrapy.Field()   # cover image URL
# -*- coding: utf-8 -*-

# Define your item pipelines here
#
# Don't forget to add your pipeline to the ITEM_PIPELINES setting
# See: https://doc.scrapy.org/en/en/latest/topics/item-pipeline.html
import os

import scrapy
from scrapy.pipelines.images import ImagesPipeline

from Douyu.settings import IMAGES_STORE as images_store


class DouyuPipeline(object):
    """Text storage: append one "name:img_url" line per item to ./douyu.txt."""

    f = None

    def open_spider(self, spider):
        # One handle for the whole crawl; closed in close_spider.
        self.f = open('./douyu.txt', 'w', encoding='utf-8')

    def process_item(self, item, spider):
        self.f.write(item['name'] + ":" + item['img_url'] + "\n")
        # Return the item so the image pipeline (priority 301) still
        # receives it.
        return item

    def close_spider(self, spider):
        self.f.close()


class ImagesPipieline(ImagesPipeline):
    """Image storage: download each item's image and rename it to
    "<nickname>.jpg".

    Downloads land under IMAGES_STORE (configured in settings.py) with a
    hash-based filename, then get renamed in item_completed.
    """

    def get_media_requests(self, item, info):
        # Hand the image URL to Scrapy's media machinery for download.
        yield scrapy.Request(item['img_url'])

    def item_completed(self, results, item, info):
        # `results` is a list of (success, info) pairs; keep only paths
        # of downloads that actually succeeded.
        img_path = [x['path'] for ok, x in results if ok]
        if img_path:  # guard: the download may have failed
            os.rename(images_store + img_path[0],
                      images_store + item['name'] + '.jpg')
        # Return the item so pipeline chaining keeps working (the base
        # class contract expects the item back).
        return item
# -*- coding: utf-8 -*-

# Scrapy settings for Douyu project
#
# For simplicity, this file contains only settings considered important or
# commonly used. You can find more settings consulting the documentation:
#
#     https://doc.scrapy.org/en/latest/topics/settings.html
#     https://doc.scrapy.org/en/latest/topics/downloader-middleware.html
#     https://doc.scrapy.org/en/latest/topics/spider-middleware.html

BOT_NAME = 'Douyu'

SPIDER_MODULES = ['Douyu.spiders']
NEWSPIDER_MODULE = 'Douyu.spiders'

# Crawl responsibly by identifying yourself (and your website) on the user-agent
USER_AGENT = ('Mozilla/5.0 (Windows NT 6.1; Win64; x64) '
              'AppleWebKit/537.36 (KHTML, like Gecko) '
              'Chrome/72.0.3626.109 Safari/537.36')

# Obey robots.txt rules (disabled so the API endpoint can be fetched)
ROBOTSTXT_OBEY = False

# Configure maximum concurrent requests performed by Scrapy (default: 16)
# CONCURRENT_REQUESTS = 32

# Configure a delay for requests for the same website (default: 0)
# See https://doc.scrapy.org/en/latest/topics/settings.html#download-delay
# See also autothrottle settings and docs
# DOWNLOAD_DELAY = 3
# The download delay setting will honor only one of:
# CONCURRENT_REQUESTS_PER_DOMAIN = 16
# CONCURRENT_REQUESTS_PER_IP = 16

# Disable cookies (enabled by default)
# COOKIES_ENABLED = False

# Disable Telnet Console (enabled by default)
# TELNETCONSOLE_ENABLED = False

# Override the default request headers:
# DEFAULT_REQUEST_HEADERS = {
#     'Accept': 'text/html,application/xhtml+xml,application/xml;q=0.9,*/*;q=0.8',
#     'Accept-Language': 'en',
# }

# Enable or disable spider middlewares
# See https://doc.scrapy.org/en/latest/topics/spider-middleware.html
# SPIDER_MIDDLEWARES = {
#     'Douyu.middlewares.DouyuSpiderMiddleware': 543,
# }

# Enable or disable downloader middlewares
# See https://doc.scrapy.org/en/latest/topics/downloader-middleware.html
# DOWNLOADER_MIDDLEWARES = {
#     'Douyu.middlewares.DouyuDownloaderMiddleware': 543,
# }

# Enable or disable extensions
# See https://doc.scrapy.org/en/latest/topics/extensions.html
# EXTENSIONS = {
#     'scrapy.extensions.telnet.TelnetConsole': None,
# }

# Configure item pipelines
# See https://doc.scrapy.org/en/latest/topics/item-pipeline.html
# Text pipeline (300) runs before the image pipeline (301).
ITEM_PIPELINES = {
    'Douyu.pipelines.DouyuPipeline': 300,
    'Douyu.pipelines.ImagesPipieline': 301,
}

# Enable and configure the AutoThrottle extension (disabled by default)
# See https://doc.scrapy.org/en/latest/topics/autothrottle.html
# AUTOTHROTTLE_ENABLED = True
# The initial download delay
# AUTOTHROTTLE_START_DELAY = 5
# The maximum download delay to be set in case of high latencies
# AUTOTHROTTLE_MAX_DELAY = 60
# The average number of requests Scrapy should be sending in parallel to
# each remote server
# AUTOTHROTTLE_TARGET_CONCURRENCY = 1.0
# Enable showing throttling stats for every response received:
# AUTOTHROTTLE_DEBUG = False

# Enable and configure HTTP caching (disabled by default)
# See https://doc.scrapy.org/en/latest/topics/downloader-middleware.html#httpcache-middleware-settings
# HTTPCACHE_ENABLED = True
# HTTPCACHE_EXPIRATION_SECS = 0
# HTTPCACHE_DIR = 'httpcache'
# HTTPCACHE_IGNORE_HTTP_CODES = []
# HTTPCACHE_STORAGE = 'scrapy.extensions.httpcache.FilesystemCacheStorage'

# Image storage root used by ImagesPipeline.
# The setting name must be exactly IMAGES_STORE (all uppercase).
IMAGES_STORE = 'D:/scrapy/Douyu/imgs/'
Tags: dia modules nic ons awl store call console receive
Original article: https://www.cnblogs.com/tjp40922/p/10523027.html