Let's go straight to the code.
Simple middleware usage:
# -*- coding: utf-8 -*-
# Define here the models for your spider middleware
#
# See documentation in:
# https://docs.scrapy.org/en/latest/topics/spider-middleware.html
from scrapy import signals
import random

user_agent_list = [
    "Mozilla/5.0 (Windows NT 6.1; WOW64) AppleWebKit/537.1 (KHTML, like Gecko) Chrome/22.0.1207.1 Safari/537.1",
    "Mozilla/5.0 (X11; CrOS i686 2268.111.0) AppleWebKit/536.11 (KHTML, like Gecko) Chrome/20.0.1132.57 Safari/536.11",
    "Mozilla/5.0 (Windows NT 6.1; WOW64) AppleWebKit/536.6 (KHTML, like Gecko) Chrome/20.0.1092.0 Safari/536.6",
    "Mozilla/5.0 (Windows NT 6.2) AppleWebKit/536.6 (KHTML, like Gecko) Chrome/20.0.1090.0 Safari/536.6",
    "Mozilla/5.0 (Windows NT 6.2; WOW64) AppleWebKit/537.1 (KHTML, like Gecko) Chrome/19.77.34.5 Safari/537.1",
    "Mozilla/5.0 (X11; Linux x86_64) AppleWebKit/536.5 (KHTML, like Gecko) Chrome/19.0.1084.9 Safari/536.5",
    "Mozilla/5.0 (Windows NT 6.0) AppleWebKit/536.5 (KHTML, like Gecko) Chrome/19.0.1084.36 Safari/536.5",
    "Mozilla/5.0 (Windows NT 6.1; WOW64) AppleWebKit/536.3 (KHTML, like Gecko) Chrome/19.0.1063.0 Safari/536.3",
    "Mozilla/5.0 (Windows NT 5.1) AppleWebKit/536.3 (KHTML, like Gecko) Chrome/19.0.1063.0 Safari/536.3",
    "Mozilla/5.0 (Macintosh; Intel Mac OS X 10_8_0) AppleWebKit/536.3 (KHTML, like Gecko) Chrome/19.0.1063.0 Safari/536.3",
    "Mozilla/5.0 (Windows NT 6.2) AppleWebKit/536.3 (KHTML, like Gecko) Chrome/19.0.1062.0 Safari/536.3",
    "Mozilla/5.0 (Windows NT 6.1; WOW64) AppleWebKit/536.3 (KHTML, like Gecko) Chrome/19.0.1062.0 Safari/536.3",
    "Mozilla/5.0 (Windows NT 6.2) AppleWebKit/536.3 (KHTML, like Gecko) Chrome/19.0.1061.1 Safari/536.3",
    "Mozilla/5.0 (Windows NT 6.1; WOW64) AppleWebKit/536.3 (KHTML, like Gecko) Chrome/19.0.1061.1 Safari/536.3",
    "Mozilla/5.0 (Windows NT 6.1) AppleWebKit/536.3 (KHTML, like Gecko) Chrome/19.0.1061.1 Safari/536.3",
    "Mozilla/5.0 (Windows NT 6.2) AppleWebKit/536.3 (KHTML, like Gecko) Chrome/19.0.1061.0 Safari/536.3",
    "Mozilla/5.0 (X11; Linux x86_64) AppleWebKit/535.24 (KHTML, like Gecko) Chrome/19.0.1055.1 Safari/535.24",
    "Mozilla/5.0 (Windows NT 6.2; WOW64) AppleWebKit/535.24 (KHTML, like Gecko) Chrome/19.0.1055.1 Safari/535.24",
]


class MidlleproDownloaderMiddleware(object):
    # Intercept every normal request
    def process_request(self, request, spider):
        # Spoof the User-Agent with a random choice from the pool
        request.headers['User-Agent'] = random.choice(user_agent_list)
        # print(request.headers['User-Agent'])
        # Route the request through a proxy
        request.meta['proxy'] = 'http://111.29.3.194:8080'
        print(request.meta['proxy'])
        return None

    # Intercept every response
    def process_response(self, request, response, spider):
        return response

    # Intercept request objects whose download raised an exception
    def process_exception(self, request, exception, spider):
        print(request)
        return request  # resend the corrected request object
settings.py also needs to be updated:

DOWNLOADER_MIDDLEWARES = {
    'midllePro.middlewares.MidlleproDownloaderMiddleware': 543,
}
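To confirm the middleware is really being applied, a throwaway spider against an echo service helps. This is only a minimal sketch: the spider name and the httpbin.org URLs are my own choices, not part of the original project, and the proxy hard-coded above is almost certainly dead, so requests will fail unless you swap in a live one or comment the proxy line out.

# Hypothetical check spider (not part of the original project): httpbin.org
# echoes the request headers and origin IP back as JSON, so you can see
# whether the random User-Agent and the proxy were actually applied.
import json
import scrapy


class UACheckSpider(scrapy.Spider):
    name = 'ua_check'
    start_urls = ['https://httpbin.org/headers', 'https://httpbin.org/ip']

    def parse(self, response):
        # /headers returns {"headers": {...}}, /ip returns {"origin": "..."}
        data = json.loads(response.text)
        self.logger.info('httpbin echoed: %s', data)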
Using Selenium inside Scrapy
Fetching NetEase (163) news
spiders:
# -*- coding: utf-8 -*-
import scrapy
from selenium import webdriver
from wangyiPeo.items import WangyipeoItem


class WangyiSpider(scrapy.Spider):
    name = 'wangyi'
    # allowed_domains = ['www.xxx.com']
    start_urls = ['https://news.163.com']
    model_urls = []
    # One browser instance shared by the whole spider (also used in the middleware)
    bro = webdriver.Chrome(executable_path=r'D:\study\chromedriver.exe')

    def parse(self, response):
        # Parse out the URLs of the news section (板块) pages
        li_list = response.xpath('/html/body/div[1]/div[1]/div[1]/div[1]/div[1]/div[1]/div[2]/div[1]/div[1]/div[1]/div[1]/ul[1]/li')
        model_index = [1, 2]  # indexes of the sections we actually crawl
        print(len(li_list))
        for index in model_index:
            # li is the <li> tag of one section
            li = li_list[index]
            # the section's URL
            model_url = li.xpath('./a/@href').extract_first()
            self.model_urls.append(model_url)
            # send a manual request for each section page
            yield scrapy.Request(model_url, callback=self.parse_model)

    def parse_model(self, response):
        # Parses the news titles and detail-page URLs on one section page.
        # The response received here does NOT contain the dynamically loaded
        # news data (on its own it does not meet our needs); the downloader
        # middleware swaps it for a Selenium-rendered response.
        div_list = response.xpath('/html/body/div/div[3]/div[4]/div[1]/div[1]/div/ul/li/div')  # 1 + sections + n
        for div in div_list:
            title = div.xpath('./div[1]/div/div[1]/h3/a/text()').extract_first()
            detail_url = div.xpath('./div[1]/a/@href').extract_first()
            item = WangyipeoItem()
            item['title'] = title
            yield scrapy.Request(detail_url, callback=self.parse_new_detail, meta={'item': item})

    def parse_new_detail(self, response):
        # Parse the body text of one news article
        item = response.meta['item']
        content = response.xpath('/html/body/div[3]/div[1]/div[3]/div[2]//text()').extract()
        # Drop whitespace-only fragments and join the rest into one string
        content = ''.join([s for s in content if s.strip()])
        print(content)
        item['content'] = content
        yield item

    # This method runs exactly once, when the whole spider finishes
    def closed(self, spider):
        self.bro.quit()
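The spider above opens a visible Chrome window through a hard-coded Windows driver path. A common tweak is to run Chrome headless; this is only a sketch of how the class-level `bro` could be created with options, keeping the same Selenium 3-style `executable_path` argument as the original code (the driver path is just an example, adjust it for your machine).

# Sketch: build the shared driver headlessly (Selenium 3-style executable_path)
from selenium import webdriver
from selenium.webdriver.chrome.options import Options

chrome_options = Options()
chrome_options.add_argument('--headless')     # no visible browser window
chrome_options.add_argument('--disable-gpu')  # commonly recommended on Windows
chrome_options.add_argument('--no-sandbox')

bro = webdriver.Chrome(executable_path=r'D:\study\chromedriver.exe',
                       options=chrome_options)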
items
# -*- coding: utf-8 -*-
# Define here the models for your scraped items
#
# See documentation in:
# https://docs.scrapy.org/en/latest/topics/items.html
import scrapy


class WangyipeoItem(scrapy.Item):
    # define the fields for your item here like:
    title = scrapy.Field()
    content = scrapy.Field()
middlewares
# -*- coding: utf-8 -*-
# Define here the models for your spider middleware
#
# See documentation in:
# https://docs.scrapy.org/en/latest/topics/spider-middleware.html
from scrapy import signals
from scrapy.http import HtmlResponse
from time import sleep


class WangyipeoDownloaderMiddleware(object):
    # Parameters:
    #   request:  the request whose response was intercepted
    #   response: every intercepted response (1 + sections + n)
    #   spider:   the spider instance, which lets the middleware and the
    #             spider class exchange data
    def process_response(self, request, response, spider):
        # Intercept the responses of the section pages and return new,
        # Selenium-rendered responses in their place.
        # 1. Pick out the section responses that do not meet our needs
        if request.url in spider.model_urls:
            # url:  the URL of the request that produced this response
            # body: the response data, taken from Selenium's page_source
            bro = spider.bro
            bro.get(request.url)
            sleep(5)
            page_text = bro.page_source  # now includes the dynamically loaded news
            new_response = HtmlResponse(url=request.url, body=page_text,
                                        encoding='utf-8', request=request)
            return new_response
        else:
            return response
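The fixed `sleep(5)` either wastes time or is too short on a slow network. A hedged alternative is an explicit wait; the helper below is only a sketch, and the `div.ndi_main` selector is an illustrative guess at the container of the dynamic news list, not something the original post confirms.

# Sketch: wait for the dynamic content instead of sleeping a fixed 5 seconds
from selenium.webdriver.common.by import By
from selenium.webdriver.support.ui import WebDriverWait
from selenium.webdriver.support import expected_conditions as EC


def render_page(bro, url, css_selector='div.ndi_main', timeout=10):
    """Load url in the shared browser and wait until the (illustrative)
    news container is present, then return the rendered HTML."""
    bro.get(url)
    try:
        WebDriverWait(bro, timeout).until(
            EC.presence_of_element_located((By.CSS_SELECTOR, css_selector))
        )
    except Exception:
        pass  # time out quietly and return whatever has rendered so far
    return bro.page_source

Inside process_response this would replace the bro.get(request.url) / sleep(5) pair with page_text = render_page(spider.bro, request.url).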
pipelines, the usual three-step pattern (open_spider / process_item / close_spider):
# -*- coding: utf-8 -*-
# Define your item pipelines here
#
# Don't forget to add your pipeline to the ITEM_PIPELINES setting
# See: https://docs.scrapy.org/en/latest/topics/item-pipeline.html
import pymysql


class WangyipeoPipeline(object):
    conn = None
    curse = None

    def open_spider(self, spider):
        # Open one MySQL connection for the lifetime of the spider
        self.conn = pymysql.Connect(host='127.0.0.1', port=3306, user='root',
                                    password='123', db='Spider', charset='utf8')

    def process_item(self, item, spider):
        sql = 'insert into wangyi values ("%s","%s")' % (item['title'], item['content'])
        self.curse = self.conn.cursor()
        try:
            self.curse.execute(sql)
            self.conn.commit()
        except Exception as e:
            print(e)
            self.conn.rollback()
        return item

    def close_spider(self, spider):
        self.conn.close()
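Building the SQL with `%s` string formatting breaks as soon as a title or article body contains a quote character, and it is injectable. Below is a sketch of the same pipeline with a parameterized insert; the class name is mine, and it assumes the same credentials and the same two-column wangyi (title, content) table used above.

# Sketch: same pipeline, but letting pymysql fill the placeholders safely
import pymysql


class WangyipeoSafePipeline(object):
    conn = None
    cursor = None

    def open_spider(self, spider):
        self.conn = pymysql.Connect(host='127.0.0.1', port=3306, user='root',
                                    password='123', db='Spider', charset='utf8')
        self.cursor = self.conn.cursor()

    def process_item(self, item, spider):
        try:
            # pymysql escapes and substitutes the parameters itself
            self.cursor.execute('insert into wangyi values (%s, %s)',
                                (item['title'], item['content']))
            self.conn.commit()
        except Exception as e:
            print(e)
            self.conn.rollback()
        return item

    def close_spider(self, spider):
        self.cursor.close()
        self.conn.close()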
settings configuration:
DOWNLOADER_MIDDLEWARES = {
    'wangyiPeo.middlewares.WangyipeoDownloaderMiddleware': 543,
}

# Enable or disable extensions
# See https://docs.scrapy.org/en/latest/topics/extensions.html
#EXTENSIONS = {
#    'scrapy.extensions.telnet.TelnetConsole': None,
#}

# Configure item pipelines
# See https://docs.scrapy.org/en/latest/topics/item-pipeline.html
ITEM_PIPELINES = {
    'wangyiPeo.pipelines.WangyipeoPipeline': 300,
}
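Besides registering the middleware and the pipeline, a project like this usually also flips a couple of standard settings. The values below are typical choices, not something the original post specifies:

# Typical companion settings (values are illustrative)
ROBOTSTXT_OBEY = False   # the target pages are disallowed by robots.txt otherwise
LOG_LEVEL = 'ERROR'      # keep the console output readable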
A small example (downloading images with ImagesPipeline):
# -*- coding: utf-8 -*-
import scrapy
from imgPro.items import ImgproItem


class ImgSpider(scrapy.Spider):
    name = 'img'
    # allowed_domains = ['www.xxx.com']
    start_urls = ['http://sc.chinaz.com/tupian/meinvtupian.html']

    def parse(self, response):
        div_list = response.xpath('//*[@id="container"]/div')
        for div in div_list:
            # the real image URL is lazy-loaded into the src2 attribute
            img_src = div.xpath('./div/a/img/@src2').extract_first()
            img_src = 'https:' + img_src
            item = ImgproItem()
            item['img_src'] = img_src
            yield item

pipelines:

# -*- coding: utf-8 -*-
# Define your item pipelines here
#
# Don't forget to add your pipeline to the ITEM_PIPELINES setting
# See: https://docs.scrapy.org/en/latest/topics/item-pipeline.html
import scrapy
from scrapy.pipelines.images import ImagesPipeline


class ImgproPipeline(ImagesPipeline):
    # Issues the requests that download the media resources; item is the
    # item object submitted by the spider
    def get_media_requests(self, item, info):
        yield scrapy.Request(item['img_src'])

    # Specifies the file name (path) the data is stored under
    def file_path(self, request, response=None, info=None):
        return request.url.split('/')[-1]

    # Passes the item on to the next pipeline class to be executed
    def item_completed(self, results, item, info):
        return item

settings:

# Name (path) of the folder where the images are stored
IMAGES_STORE = './imgLibs'
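The post does not show the items.py or the pipeline registration this example depends on. Inferred from the spider above, they would look roughly like this (the priority value 300 is just the usual choice, and the module paths assume the project is called imgPro as in the imports):

# imgPro/items.py -- inferred from the spider above
import scrapy


class ImgproItem(scrapy.Item):
    img_src = scrapy.Field()

# imgPro/settings.py -- register the custom ImagesPipeline subclass
ITEM_PIPELINES = {
    'imgPro.pipelines.ImgproPipeline': 300,
}
IMAGES_STORE = './imgLibs'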
The most important part of all: distributed crawling + deep crawling + incremental crawling
Here I'll only cover deep crawling.
Distributed crawling relies on Redis; incremental crawling means monitoring a site and grabbing new data the moment it appears (to put it simply).
If you're interested, feel free to contact me.
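Before moving on, and only for reference: distributed crawling with scrapy-redis mostly comes down to a handful of settings. This is a minimal sketch that assumes a Redis instance on localhost, not a full walkthrough.

# Minimal scrapy-redis settings sketch (assumes Redis on 127.0.0.1:6379)
SCHEDULER = 'scrapy_redis.scheduler.Scheduler'               # shared request queue
DUPEFILTER_CLASS = 'scrapy_redis.dupefilter.RFPDupeFilter'   # shared de-duplication
SCHEDULER_PERSIST = True                                     # keep the queue between runs
REDIS_HOST = '127.0.0.1'
REDIS_PORT = 6379
ITEM_PIPELINES = {
    'scrapy_redis.pipelines.RedisPipeline': 400,             # push items into Redis
}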
Deep crawling with CrawlSpider and Rule:

# -*- coding: utf-8 -*-
import scrapy
from scrapy.linkextractors import LinkExtractor
from scrapy.spiders import CrawlSpider, Rule
from sunCrawlPro.items import SuncrawlproItem, Detail_item


class SunSpider(CrawlSpider):
    name = 'sun'
    # allowed_domains = ['www.xxx.com']
    start_urls = ['http://wz.sun0769.com/political/index/politicsNewest?id=1&page=1']

    # Instantiate a link extractor.
    # Purpose: extract the links that match the given rule (allow='regular expression').
    link = LinkExtractor(allow=r'id=1&page=\d+')  # pagination links
    # Extractor for the detail-page links (note the '?' must be escaped in the regex)
    link_detail = LinkExtractor(allow=r'politics/index\?id=\d+')

    rules = (
        # Pass the link extractor as the first argument of Rule.
        # Purpose: request every extracted link and parse the response with the
        # given callback.
        # follow=True: keep applying the link extractor to the pages reached
        # through the links it extracted.
        Rule(link, callback='parse_item', follow=True),
        Rule(link_detail, callback='parse_detail', follow=False),
    )

    def parse_item(self, response):
        # tbody tags must not appear in XPath expressions
        tr_list = response.xpath('/html/body/div[2]/div[3]/ul[2]/li')
        for tr in tr_list:
            link_detail = tr.xpath('./span[3]/a/@href').extract_first()
            link_detail = 'http://wz.sun0769.com/' + link_detail
            title = tr.xpath('./span[3]/a/text()').extract_first()
            num = tr.xpath('./span[1]/text()').extract_first()
            item = SuncrawlproItem()
            item['title'] = title
            item['num'] = num
            yield scrapy.Request(link_detail, callback=self.parse_detail, meta={'item': item})
            yield item

    def parse_detail(self, response):
        content = response.xpath('/html/body/div[3]/div[2]/div[2]/div[2]/pre/text()').extract_first()
        # drop blank lines from the post body
        content = ''.join([s for s in content.splitlines(True) if s.strip()])
        print(type(content), content)
        num = response.xpath('/html/body/div[3]/div[2]/div[2]/div[1]/span[4]/text()').extract_first()
        num = num.split(':')[-1]
        item = Detail_item()
        item['content'] = content
        item['num'] = num
        yield item

items:

# -*- coding: utf-8 -*-
# Define here the models for your scraped items
#
# See documentation in:
# https://docs.scrapy.org/en/latest/topics/items.html
import scrapy


class SuncrawlproItem(scrapy.Item):
    title = scrapy.Field()
    num = scrapy.Field()


class Detail_item(scrapy.Item):
    content = scrapy.Field()
    num = scrapy.Field()

pipelines:

# -*- coding: utf-8 -*-
# Define your item pipelines here
#
# Don't forget to add your pipeline to the ITEM_PIPELINES setting
# See: https://docs.scrapy.org/en/latest/topics/item-pipeline.html
class SuncrawlproPipeline(object):
    fp1 = open('content.txt', 'w+', encoding='utf-8')
    fp2 = open('title.txt', 'w+', encoding='utf-8')

    def process_item(self, item, spider):
        # item is the item object submitted by the spider; write each item
        # type to its own file
        if item.__class__.__name__ == 'Detail_item':
            self.fp1.write(item['content'] + ':' + item['num'] + '\n')
        else:
            self.fp2.write(item['title'] + ':' + item['num'] + '\n')
        return item

    def close_spider(self, spider):
        self.fp1.close()
        self.fp2.close()

settings:

DOWNLOAD_DELAY = 1
# The download delay setting will honor only one of:
CONCURRENT_REQUESTS_PER_DOMAIN = 10
#CONCURRENT_REQUESTS_PER_IP = 16

ITEM_PIPELINES = {
    'sunCrawlPro.pipelines.SuncrawlproPipeline': 300,
}

# Enable and configure the AutoThrottle extension (disabled by default)
# See https://docs.scrapy.org/en/latest/topics/autothrottle.html
AUTOTHROTTLE_ENABLED = True
# The initial download delay
# AUTOTHROTTLE_START_DELAY = 5
# The maximum download delay to be set in case of high latencies
AUTOTHROTTLE_MAX_DELAY = 60

These are all settings that keep the crawl from hitting the site too fast; combined with proxy IPs in the middleware you can crawl at scale. The content I crawled was honestly hilarious...
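One last note on the LinkExtractor rules above: the `allow` arguments are plain regular expressions, so they are easy to sanity-check before running the crawl. The URLs below are made-up examples shaped like the site's pagination and detail links, not scraped data.

# Quick sanity check of the two allow patterns (example URLs)
import re

page_pattern = r'id=1&page=\d+'
detail_pattern = r'politics/index\?id=\d+'   # note the escaped '?'

print(bool(re.search(page_pattern, 'http://wz.sun0769.com/political/index/politicsNewest?id=1&page=2')))  # True
print(bool(re.search(detail_pattern, 'http://wz.sun0769.com/political/politics/index?id=12345')))          # True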
Original post: https://www.cnblogs.com/zzj666/p/14747617.html