
Crawling NetEase News



Crawl the four sections (Domestic, International, Military, Aviation) and extract each article's headline, thumbnail, keywords, publish time, and URL.
# -*- coding: utf-8 -*-
import scrapy
from selenium import webdriver
from wangyiPro.items import WangyiproItem
from scrapy_redis.spiders import RedisSpider
class WangyiSpider(RedisSpider):
    name = 'wangyi'
    #allowed_domains = ['www.xxxx.com']
    #start_urls = ['https://news.163.com']
    redis_key = 'wangyi'

    def __init__(self, *args, **kwargs):
        super().__init__(*args, **kwargs)
        # Instantiate a single browser object (created only once)
        self.bro = webdriver.Chrome()

    # The browser must only be closed after the whole crawl has finished
    def closed(self, spider):
        print('Spider finished')
        self.bro.quit()

    def parse(self, response):
        lis = response.xpath('//div[@class="ns_area list"]/ul/li')
        indexs = [3, 4, 6, 7]
        li_list = []  # the <li> tags for the Domestic, International, Military and Aviation sections
        for index in indexs:
            li_list.append(lis[index])
        # Grab the link and text title of each of the four sections
        for li in li_list:
            url = li.xpath('./a/@href').extract_first()
            title = li.xpath('./a/text()').extract_first()

            #print(url + ":" + title)

            # Request each section page to get its article data (headline, thumbnail, keywords, publish time, url)
            yield scrapy.Request(url=url, callback=self.parseSecond, meta={'title': title})



    def parseSecond(self,response):
        div_list = response.xpath('//div[@class="data_row news_article clearfix "]')
        #print(len(div_list))
        for div in div_list:
            head = div.xpath('.//div[@class="news_title"]/h3/a/text()').extract_first()
            url = div.xpath('.//div[@class="news_title"]/h3/a/@href').extract_first()
            imgUrl = div.xpath('./a/img/@src').extract_first()
            tag = div.xpath('.//div[@class="news_tag"]//text()').extract()
            tags = []
            for t in tag:
                t = t.strip(' \n\t')
                tags.append(t)
            tag = "".join(tags)

            # Retrieve the section title passed in via meta
            title = response.meta['title']

            # Instantiate an item object and store the parsed values in it
            item = WangyiproItem()
            item['head'] = head
            item['url'] = url
            item['imgUrl'] = imgUrl
            item['tag'] = tag
            item['title'] = title

            # Request the article url to get the news body stored on that page
            yield scrapy.Request(url=url, callback=self.getContent, meta={'item': item})



    def getContent(self,response):
        # Retrieve the item passed in via meta
        item = response.meta['item']

        # Parse the news body stored on the current page
        content_list = response.xpath('//div[@class="post_text"]/p/text()').extract()
        content = "".join(content_list)
        item['content'] = content

        yield item
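Because start_urls is commented out and the spider inherits from RedisSpider, the crawl only begins once a start URL has been pushed into the Redis list named by redis_key. A minimal sketch of seeding that key, assuming a local Redis server and the redis-py package:

import redis

# Connect to the Redis instance shared by the distributed spiders (host/port are assumptions)
r = redis.Redis(host='127.0.0.1', port=6379, db=0)
# Push the start URL onto the 'wangyi' list that the RedisSpider listens on
r.lpush('wangyi', 'https://news.163.com')

The same can be done from redis-cli with an LPUSH command; every spider instance connected to this Redis then pulls requests from the shared queue.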
import scrapy


class WangyiproItem(scrapy.Item):
    # define the fields for your item here like:
    # name = scrapy.Field()
    head = scrapy.Field()
    url = scrapy.Field()
    imgUrl = scrapy.Field()
    tag = scrapy.Field()
    title = scrapy.Field()
    content = scrapy.Field()
items.py
# -*- coding: utf-8 -*-

# Define here the models for your spider middleware
#
# See documentation in:
# https://doc.scrapy.org/en/latest/topics/spider-middleware.html

from scrapy import signals

from scrapy.http import HtmlResponse
import time
# from scrapy.contrib.downloadermiddleware.useragent import UserAgentMiddleware

import random
# UA pool (wrap the user-agent pool in a dedicated downloader middleware class)
# 1. Import the UserAgentMiddleware class
class RandomUserAgent(object):
    def process_request(self, request, spider):
        # Pick a random ua value from the list
        ua = random.choice(spider.settings.get("USER_AGENT_LIST"))
        # Write the chosen ua into the intercepted request's headers
        request.headers.setdefault('User-Agent', ua)

# Swap the IP for every intercepted request
class Proxy(object):
    def process_request(self, request, spider):
        # Check the scheme of the intercepted request's url (http or https)
        # request.url looks like: http://www.xxx.com
        h = request.url.split(':')[0]  # scheme of the request
        if h == 'https':
            ip = random.choice(PROXY_https)
            request.meta['proxy'] = 'https://' + ip
        else:
            ip = random.choice(PROXY_http)
            request.meta['proxy'] = 'http://' + ip

class WangyiproDownloaderMiddleware(object):
    # Not all methods need to be defined. If a method is not defined,
    # scrapy acts as if the downloader middleware does not modify the
    # passed objects.



    def process_request(self, request, spider):
        # Called for each request that goes through the downloader
        # middleware.

        # Must either:
        # - return None: continue processing this request
        # - or return a Response object
        # - or return a Request object
        # - or raise IgnoreRequest: process_exception() methods of
        #   installed downloader middleware will be called
        return None
    # Intercept the response object handed from the downloader to the Spider
    # request: the request object that produced this response
    # response: the intercepted response object
    # spider: the spider instance defined in the spider file
    def process_response(self, request, response, spider):
        # Replace the page data stored in the response object
        if request.url in ['http://news.163.com/domestic/', 'http://news.163.com/world/', 'http://news.163.com/air/', 'http://war.163.com/']:
            spider.bro.get(url=request.url)
            js = 'window.scrollTo(0, document.body.scrollHeight)'
            spider.bro.execute_script(js)
            time.sleep(2)  # give the browser some time to load the dynamic data
            # page_source now contains the dynamically loaded news data
            page_text = spider.bro.page_source
            # Return the tampered response object
            return HtmlResponse(url=spider.bro.current_url, body=page_text, encoding='utf-8', request=request)
        else:
            return response

PROXY_http = [
    '153.180.102.104:80',
    '195.208.131.189:56055',
]
PROXY_https = [
    '120.83.49.90:9000',
    '95.189.112.214:35508',
]
middleware.py
# -*- coding: utf-8 -*-

# Scrapy settings for wangyiPro project
#
# For simplicity, this file contains only settings considered important or
# commonly used. You can find more settings consulting the documentation:
#
#     https://doc.scrapy.org/en/latest/topics/settings.html
#     https://doc.scrapy.org/en/latest/topics/downloader-middleware.html
#     https://doc.scrapy.org/en/latest/topics/spider-middleware.html

BOT_NAME = 'wangyiPro'

SPIDER_MODULES = ['wangyiPro.spiders']
NEWSPIDER_MODULE = 'wangyiPro.spiders'


# Crawl responsibly by identifying yourself (and your website) on the user-agent
#USER_AGENT = 'wangyiPro (+http://www.yourdomain.com)'
USER_AGENT = 'Mozilla/5.0 (Macintosh; Intel Mac OS X 10_12_0) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/68.0.3440.106 Safari/537.36'

# Obey robots.txt rules
ROBOTSTXT_OBEY = False

# Configure maximum concurrent requests performed by Scrapy (default: 16)
#CONCURRENT_REQUESTS = 32

# Configure a delay for requests for the same website (default: 0)
# See https://doc.scrapy.org/en/latest/topics/settings.html#download-delay
# See also autothrottle settings and docs
#DOWNLOAD_DELAY = 3
# The download delay setting will honor only one of:
#CONCURRENT_REQUESTS_PER_DOMAIN = 16
#CONCURRENT_REQUESTS_PER_IP = 16

# Disable cookies (enabled by default)
#COOKIES_ENABLED = False

# Disable Telnet Console (enabled by default)
#TELNETCONSOLE_ENABLED = False

# Override the default request headers:
#DEFAULT_REQUEST_HEADERS = {
#   'Accept': 'text/html,application/xhtml+xml,application/xml;q=0.9,*/*;q=0.8',
#   'Accept-Language': 'en',
#}

# Enable or disable spider middlewares
# See https://doc.scrapy.org/en/latest/topics/spider-middleware.html
#SPIDER_MIDDLEWARES = {
#    'wangyiPro.middlewares.WangyiproSpiderMiddleware': 543,
#}

# Enable or disable downloader middlewares
# See https://doc.scrapy.org/en/latest/topics/downloader-middleware.html
#DOWNLOADER_MIDDLEWARES = {
#    'wangyiPro.middlewares.WangyiproDownloaderMiddleware': 543,
#}

# Enable or disable extensions
# See https://doc.scrapy.org/en/latest/topics/extensions.html
#EXTENSIONS = {
#    'scrapy.extensions.telnet.TelnetConsole': None,
#}

# Configure item pipelines
# See https://doc.scrapy.org/en/latest/topics/item-pipeline.html
#ITEM_PIPELINES = {
#    'wangyiPro.pipelines.WangyiproPipeline': 300,
#}

# Enable and configure the AutoThrottle extension (disabled by default)
# See https://doc.scrapy.org/en/latest/topics/autothrottle.html
#AUTOTHROTTLE_ENABLED = True
# The initial download delay
#AUTOTHROTTLE_START_DELAY = 5
# The maximum download delay to be set in case of high latencies
#AUTOTHROTTLE_MAX_DELAY = 60
# The average number of requests Scrapy should be sending in parallel to
# each remote server
#AUTOTHROTTLE_TARGET_CONCURRENCY = 1.0
# Enable showing throttling stats for every response received:
#AUTOTHROTTLE_DEBUG = False

# Enable and configure HTTP caching (disabled by default)
# See https://doc.scrapy.org/en/latest/topics/downloader-middleware.html#httpcache-middleware-settings
#HTTPCACHE_ENABLED = True
#HTTPCACHE_EXPIRATION_SECS = 0
#HTTPCACHE_DIR = 'httpcache'
#HTTPCACHE_IGNORE_HTTP_CODES = []
#HTTPCACHE_STORAGE = 'scrapy.extensions.httpcache.FilesystemCacheStorage'
USER_AGENT_LIST=["Mozilla/5.0 (Windows NT 6.1; WOW64) AppleWebKit/537.1 "
        "(KHTML, like Gecko) Chrome/22.0.1207.1 Safari/537.1",
        "Mozilla/5.0 (X11; CrOS i686 2268.111.0) AppleWebKit/536.11 "
        "(KHTML, like Gecko) Chrome/20.0.1132.57 Safari/536.11",
        "Mozilla/5.0 (Windows NT 6.1; WOW64) AppleWebKit/536.6 "
        "(KHTML, like Gecko) Chrome/20.0.1092.0 Safari/536.6",
        "Mozilla/5.0 (Windows NT 6.2) AppleWebKit/536.6 "
        "(KHTML, like Gecko) Chrome/20.0.1090.0 Safari/536.6",
        "Mozilla/5.0 (Windows NT 6.2; WOW64) AppleWebKit/537.1 "
        "(KHTML, like Gecko) Chrome/19.77.34.5 Safari/537.1",
        "Mozilla/5.0 (X11; Linux x86_64) AppleWebKit/536.5 "
        "(KHTML, like Gecko) Chrome/19.0.1084.9 Safari/536.5",
        "Mozilla/5.0 (Windows NT 6.0) AppleWebKit/536.5 "
        "(KHTML, like Gecko) Chrome/19.0.1084.36 Safari/536.5",
        "Mozilla/5.0 (Windows NT 6.1; WOW64) AppleWebKit/536.3 "
        "(KHTML, like Gecko) Chrome/19.0.1063.0 Safari/536.3",
        "Mozilla/5.0 (Windows NT 5.1) AppleWebKit/536.3 "
        "(KHTML, like Gecko) Chrome/19.0.1063.0 Safari/536.3",
        "Mozilla/5.0 (Macintosh; Intel Mac OS X 10_8_0) AppleWebKit/536.3 "
        "(KHTML, like Gecko) Chrome/19.0.1063.0 Safari/536.3",
        "Mozilla/5.0 (Windows NT 6.2) AppleWebKit/536.3 "
        "(KHTML, like Gecko) Chrome/19.0.1062.0 Safari/536.3",
        "Mozilla/5.0 (Windows NT 6.1; WOW64) AppleWebKit/536.3 "
        "(KHTML, like Gecko) Chrome/19.0.1062.0 Safari/536.3",
        "Mozilla/5.0 (Windows NT 6.2) AppleWebKit/536.3 "
        "(KHTML, like Gecko) Chrome/19.0.1061.1 Safari/536.3",
        "Mozilla/5.0 (Windows NT 6.1; WOW64) AppleWebKit/536.3 "
        "(KHTML, like Gecko) Chrome/19.0.1061.1 Safari/536.3",
        "Mozilla/5.0 (Windows NT 6.1) AppleWebKit/536.3 "
        "(KHTML, like Gecko) Chrome/19.0.1061.1 Safari/536.3",
        "Mozilla/5.0 (Windows NT 6.2) AppleWebKit/536.3 "
        "(KHTML, like Gecko) Chrome/19.0.1061.0 Safari/536.3",
        "Mozilla/5.0 (X11; Linux x86_64) AppleWebKit/535.24 "
        "(KHTML, like Gecko) Chrome/19.0.1055.1 Safari/535.24",
        "Mozilla/5.0 (Windows NT 6.2; WOW64) AppleWebKit/535.24 "
        "(KHTML, like Gecko) Chrome/19.0.1055.1 Safari/535.24"]
settings.py
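As written, settings.py leaves DOWNLOADER_MIDDLEWARES commented out, so the RandomUserAgent, Proxy, and Selenium middlewares above are never invoked, and it also lacks the scrapy_redis scheduler and Redis connection that a RedisSpider relies on. A sketch of the settings that would typically be added, with the priorities, module path, and Redis host as illustrative assumptions:

# Enable the custom downloader middlewares (module path assumed to match Scrapy's default middlewares.py)
DOWNLOADER_MIDDLEWARES = {
    'wangyiPro.middlewares.RandomUserAgent': 542,
    'wangyiPro.middlewares.WangyiproDownloaderMiddleware': 543,
    # 'wangyiPro.middlewares.Proxy': 544,  # enable only if the proxy IPs are live
}

# scrapy_redis: shared scheduler, shared dupefilter, and the Redis connection
SCHEDULER = "scrapy_redis.scheduler.Scheduler"
DUPEFILTER_CLASS = "scrapy_redis.dupefilter.RFPDupeFilter"
SCHEDULER_PERSIST = True
REDIS_HOST = '127.0.0.1'   # assumed local Redis instance
REDIS_PORT = 6379

# Store scraped items in Redis via the pipeline that ships with scrapy_redis
ITEM_PIPELINES = {
    'scrapy_redis.pipelines.RedisPipeline': 400,
}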

 


Original article: https://www.cnblogs.com/jianxiang/p/11074544.html
