Scrapy is a well-known, pre-packaged crawler framework. Its main features include high-performance persistent storage, asynchronous data downloading, high-performance data parsing, and support for distributed crawling.
Environment installation
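A minimal sketch of the install command (on Windows, if pip fails to build Twisted, installing a prebuilt Twisted wheel first may be needed):
pip install scrapy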
Usage
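The typical workflow commands, where the project and spider names are only placeholders:
scrapy startproject xxxPro
cd xxxPro
scrapy genspider spiderName www.xxx.com
scrapy crawl spiderName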
xpath() returns a list, but the elements of that list are always Selector objects.
extract() pulls out the string stored in the Selector object's data attribute; when only one element is expected, extract_first() can be used.
Calling extract() on the whole list extracts the data string of every Selector object in it.
extract()/extract_first() and the newer getall()/get() retrieve the data; get() and extract_first() return None when nothing matches, while indexing an empty result with [0] raises an error.
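A minimal sketch of the newer accessors, assuming it runs inside a spider's parse(self, response) and using an illustrative XPath:
title = response.xpath('//h2/text()').get()      # first match as a string, or None
titles = response.xpath('//h2/text()').getall()  # every match, as a list of strings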
The parse() method is where the parsing is done:
# qiubai.py
import scrapy
class QiubaiSpider(scrapy.Spider):
    name = 'qiubai'
    # allowed_domains = ['www.xxx.com']
    start_urls = ['https://www.qiushibaike.com/text/']

    def parse(self, response):
        # parse the author name and the joke content
        div_list = response.xpath('//div[@id="content-left"]/div')
        for div in div_list:
            # xpath returns a list, but its elements are Selector objects
            # extract() pulls the string stored in the Selector's data attribute
            # author = div.xpath('./div[1]/a[2]/h2/text()')[0].extract()
            author = div.xpath('./div[1]/a[2]/h2/text()').extract_first()
            # calling extract() on the list extracts the data string of every Selector in it
            content = div.xpath('./a[1]/div/span//text()').extract()
            content = ''.join(content)
            print(content)
            break
Terminal-based storage: save the return value of parse() to a local file (the -o option supports formats such as json, csv, and xml):
scrapy crawl qiubai -o ./qiubai.csv
import scrapy
class QiubaiSpider(scrapy.Spider):
    name = 'qiubai'
    # allowed_domains = ['www.xxx.com']
    start_urls = ['https://www.qiushibaike.com/text/']

    def parse(self, response):
        # parse the author name and the joke content
        div_list = response.xpath('//div[@id="content-left"]/div')
        all_data = []  # holds all of the parsed data
        for div in div_list:
            # xpath returns a list, but its elements are Selector objects
            # extract() pulls the string stored in the Selector's data attribute
            # author = div.xpath('./div[1]/a[2]/h2/text()')[0].extract()
            author = div.xpath('./div[1]/a[2]/h2/text()').extract_first()
            # calling extract() on the list extracts the data string of every Selector in it
            content = div.xpath('./a[1]/div/span//text()').extract()
            content = ''.join(content)
            dic = {
                "author": author,
                "content": content
            }
            all_data.append(dic)
        return all_data
Pipeline-based persistence workflow (steps 1-6 below):
When storing to a file, the file handle is opened once in open_spider() and closed once in close_spider(); see step 5.
Hands-on demonstration
1- Data parsing
# sanguo.py
import scrapy
from sanguoyanyi.items import SanguoyanyiItem

class SanguoSpider(scrapy.Spider):
    name = 'sanguo'
    # allowed_domains = ['www.xxx.com']
    start_urls = ['http://www.shicimingju.com/book/sanguoyanyi.html']

    def parse(self, response):
        li_list = response.xpath('//*[@id="main_left"]/div/div[4]/ul/li')
        for li in li_list:
            title = li.xpath('./a/text()').extract_first()
2- Define the corresponding field in the Item class
# items.py
import scrapy

class SanguoyanyiItem(scrapy.Item):
    # define the fields for your item here like:
    title = scrapy.Field()
3- Wrap the parsed data in an item object
4- Submit the item object to the pipeline for persistent storage
yield submits the item object to the pipeline: every loop iteration sends one item and triggers one call to the process_item() method in pipelines.py, which persists the data.
# sanguo.py
import scrapy
from sanguoyanyi.items import SanguoyanyiItem

class SanguoSpider(scrapy.Spider):
    name = 'sanguo'
    # allowed_domains = ['www.xxx.com']
    start_urls = ['http://www.shicimingju.com/book/sanguoyanyi.html']

    def parse(self, response):
        li_list = response.xpath('//*[@id="main_left"]/div/div[4]/ul/li')
        for li in li_list:
            title = li.xpath('./a/text()').extract_first()
            item = SanguoyanyiItem()
            item["title"] = title
            yield item
5- In the pipeline class's process_item() method, persist the data carried by the received item object
process_item() receives the item object, persists its data, and should finally return the item so the next pipeline class can store it somewhere else as well; if there is only one pipeline class, no further storage happens.
open_spider() is called exactly once and is used to open the storage target.
close_spider() is called exactly once and is used to close the storage target.
# pipelines.py
class SanguoyanyiPipeline(object):
    fp = None

    def open_spider(self, spider):
        print('Starting the spider......')
        self.fp = open('./sanguo.txt', 'w', encoding='utf-8')

    def close_spider(self, spider):
        print('Spider finished......')
        self.fp.close()

    def process_item(self, item, spider):
        title = item["title"]
        self.fp.write(title + "\n")
        return item
6- Enable the pipeline in the settings file
ITEM_PIPELINES turns on pipeline-based persistence; each key-value pair in the dict registers one pipeline class, and the value (e.g. 300) is a priority where lower numbers run first. Adding more key-value pairs to ITEM_PIPELINES enables multiple pipeline classes; a sketch of a second pipeline class follows the settings below.
# settings.py
BOT_NAME = 'sanguoyanyi'
SPIDER_MODULES = ['sanguoyanyi.spiders']
NEWSPIDER_MODULE = 'sanguoyanyi.spiders'
# Crawl responsibly by identifying yourself (and your website) on the user-agent
USER_AGENT = "Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/79.0.3945.79 Safari/537.36"
# Obey robots.txt rules
ROBOTSTXT_OBEY = False
# Set the log level so that only error messages are shown
LOG_LEVEL = "ERROR"
# Configure item pipelines
# See https://docs.scrapy.org/en/latest/topics/item-pipeline.html
ITEM_PIPELINES = {
    'sanguoyanyi.pipelines.SanguoyanyiPipeline': 300,
}
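As mentioned above, several pipeline classes can be enabled at once; the item reaches each class in priority order as long as every process_item() returns it. A minimal sketch with a hypothetical second pipeline class (the class name and backup file name are made up for illustration) and its registration:
# pipelines.py (hypothetical second pipeline class)
class SanguoBackupPipeline(object):
    fp = None

    def open_spider(self, spider):
        self.fp = open('./sanguo_backup.txt', 'w', encoding='utf-8')

    def close_spider(self, spider):
        self.fp.close()

    def process_item(self, item, spider):
        # the item only reaches this class because SanguoyanyiPipeline returned it
        self.fp.write(item["title"] + "\n")
        return item
# settings.py
ITEM_PIPELINES = {
    'sanguoyanyi.pipelines.SanguoyanyiPipeline': 300,  # lower number, runs first
    'sanguoyanyi.pipelines.SanguoBackupPipeline': 301,  # runs second
}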
Full-site crawling: crawl and parse the pages behind every page number of a given section of a site.
Crawling approach: manually send a request for each page:
yield scrapy.Request(url, callback)
# Requirement: crawl the 4K anime images
import scrapy

class DongmanSpider(scrapy.Spider):
    name = 'dongman'
    # allowed_domains = ['www.xxx.com']
    start_urls = ['http://pic.netbian.com/4kdongman/']
    url_tmp = 'http://pic.netbian.com/4kdongman/index_%d.html'
    page_num = 2

    def parse(self, response):
        li_list = response.xpath('//*[@id="main"]/div[3]/ul/li')
        for li in li_list:
            # strip the spaces with replace(' ', '')
            img_name = li.xpath('./a/img/@alt').extract_first().replace(' ', '')
            print(img_name)
        # manually request the remaining pages; parse() is reused as the callback
        if self.page_num <= 5:
            print(f"Page {self.page_num}:")
            new_url = self.url_tmp % self.page_num
            self.page_num += 1
            yield scrapy.Request(url=new_url, callback=self.parse)
We can now use scrapy, but one question remains: when writing a scrapy project we only define attributes and methods on the relevant classes, yet we never instantiate those classes or call those methods ourselves. So who does? The five core components do:
Engine (Scrapy Engine): the core of the framework; it drives the data flow between all the other components and triggers events.
Scheduler: receives requests from the engine, queues them, removes duplicates, and hands them back when the engine asks for the next one.
Downloader: downloads the page for each request and returns the response (built on Twisted for asynchronous downloading).
Spiders: parse the responses, extracting items and new requests to follow.
Pipeline (Item Pipeline): processes the items produced by the spiders, for example by persisting them.
Passing data between requests (meta). Use case: the data to be scraped is not all on the same page (deep crawling).
Requirement: scrape the job title and job description from BOSS直聘 (zhipin.com).
Send a request to the detail page to get its page source (manual request sending):
yield scrapy.Request(detail_url, callback=self.parse_detail, meta={'item': item})
# boss.py
import scrapy
from bossPro.items import BossproItem

class BossSpider(scrapy.Spider):
    name = 'boss'
    # allowed_domains = ['www.xxx.com']
    start_urls = ['https://www.zhipin.com/job_detail/?query=python&city=101010100&industry=&position=']
    url = 'https://www.zhipin.com/c101010100/?query=python&page=%d'
    page_num = 2

    # callback that receives the item
    def parse_detail(self, response):
        item = response.meta['item']
        job_desc = response.xpath('//*[@id="main"]/div[3]/div/div[2]/div[2]/div[1]/div//text()').extract()
        job_desc = ''.join(job_desc)
        item['job_desc'] = job_desc
        yield item

    # parse the job titles on the listing page
    def parse(self, response):
        li_list = response.xpath('//*[@id="main"]/div/div[3]/ul/li')
        for li in li_list:
            item = BossproItem()
            job_name = li.xpath('.//div[@class="info-primary"]/h3/a/div[1]/text()').extract_first()
            item['job_name'] = job_name
            detail_url = 'https://www.zhipin.com' + li.xpath('.//div[@class="info-primary"]/h3/a/@href').extract_first()
            # request passing: the meta dict is handed to the callback of this request
            yield scrapy.Request(detail_url, callback=self.parse_detail, meta={'item': item})
        # pagination
        if self.page_num <= 3:
            new_url = self.url % self.page_num
            self.page_num += 1
            yield scrapy.Request(new_url, callback=self.parse)
# items.py
import scrapy

class BossproItem(scrapy.Item):
    job_name = scrapy.Field()
    job_desc = scrapy.Field()
# pipelines.py
# only prints the items
class BossproPipeline(object):
    def process_item(self, item, spider):
        print(item)
        return item
Difference between scraping string data and image data with scrapy: strings can be parsed straight out of the response, whereas images require an extra request to the image URL to obtain the binary data.
Pipeline class for image scraping: ImagesPipeline. Give it the image URL via the item and it sends the request, downloads the binary data, and persists it.
Usage workflow:
Parse the data (the image URLs).
Submit the item holding the image URL to a pipeline class that inherits from ImagesPipeline and overrides three parent methods (get_media_requests, file_path, item_completed).
Configure the storage directory in the settings file:
IMAGES_STORE = './image_files'
Hands-on: scrape high-resolution images from 站长素材 (sc.chinaz.com)
Parse the image URLs and wrap them in item objects
# img.py
import scrapy
from imgsPro.items import ImgsproItem

class ImgSpider(scrapy.Spider):
    name = 'img'
    # allowed_domains = ['www.xxx.com']
    start_urls = ['http://sc.chinaz.com/tupian/']

    def parse(self, response):
        div_list = response.xpath('//div[@id="container"]/div')
        for div in div_list:
            # note: the site lazy-loads its images, so the real URL sits in the src2 pseudo-attribute
            src = div.xpath('./div/a/img/@src2').extract_first()
            # create a fresh item per image so earlier items are not overwritten
            item = ImgsproItem()
            item['src'] = src
            yield item
Inherit from ImagesPipeline and override three methods
# pipelines.py
from scrapy.pipelines.images import ImagesPipeline
import scrapy

class imgsPileLine(ImagesPipeline):
    # send the request for the image data based on the image URL
    def get_media_requests(self, item, info):
        yield scrapy.Request(item['src'])

    # specify the file name the image is stored under
    def file_path(self, request, response=None, info=None):
        imgName = request.url.split('/')[-1]
        return imgName

    # hand the item on to the next pipeline class to be executed
    def item_completed(self, results, item, info):
        return item
Settings file: specify the image storage directory and enable the pipeline class
# settings.py
BOT_NAME = 'imgsPro'
SPIDER_MODULES = ['imgsPro.spiders']
NEWSPIDER_MODULE = 'imgsPro.spiders'
LOG_LEVEL = 'ERROR'
USER_AGENT = 'Mozilla/5.0 (Macintosh; Intel Mac OS X 10_12_0) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/75.0.3770.100 Safari/537.36'
ROBOTSTXT_OBEY = False
ITEM_PIPELINES = {
    'imgsPro.pipelines.imgsPileLine': 300,
}
# directory the images are stored in
IMAGES_STORE = './imgs_bobo'
Scraping other files works the same way as scraping images: scrapy also ships a file-download pipeline class, FilesPipeline, whose usage is almost identical to ImagesPipeline.
Hands-on: scrape résumé templates from 站长素材
Spider file
# jianli.py
import scrapy
from scrapy.linkextractors import LinkExtractor
from scrapy.spiders import CrawlSpider, Rule
from jianliPro.items import JianliproItem

class JianliSpider(CrawlSpider):
    name = 'jianli'
    # allowed_domains = ['www.xxx.com']
    start_urls = ['http://sc.chinaz.com/jianli/free.html']
    rules = (
        Rule(LinkExtractor(allow=r'free_\d+'), callback='parse_item', follow=False),
    )

    def parse_item(self, response):
        item = JianliproItem()
        div_list = response.xpath('//*[@id="container"]/div')
        for div in div_list:
            jianli_page_url = div.xpath('./a/@href').extract_first()
            jianli_name = div.xpath('./a/img/@alt').extract_first()
            item["jianli_name"] = jianli_name
            # the item's fields travel with the request via meta (scrapy copies meta into a plain dict)
            yield scrapy.Request(jianli_page_url, callback=self.parse_jianli_dir, meta=item)

    def parse_jianli_dir(self, response):
        item = response.meta
        jianli_dir = response.xpath('//*[@id="down"]/div[2]/ul/li[1]/a/@href').extract_first()
        item["jianli_dir"] = jianli_dir
        yield item
Pipeline file
# pipelines.py
import scrapy
from scrapy.pipelines.files import FilesPipeline

class MyFilesPiplelin(FilesPipeline):
    def get_media_requests(self, item, info):
        print(f"{item['jianli_name']}: downloading")
        yield scrapy.Request(item["jianli_dir"], meta=item)

    def file_path(self, request, response=None, info=None):
        file_name = request.meta["jianli_name"] + ".rar"
        return file_name

    def item_completed(self, results, item, info):
        return item
Settings file settings.py and items.py
# ############################ ## settings.py
# enable the custom FilesPipeline pipeline class
ITEM_PIPELINES = {
    'jianliPro.pipelines.MyFilesPiplelin': 300,
}
# directory the files are downloaded to
FILES_STORE = 'jianli/'
# ############################ ## items.py
# define the fields to be scraped
import scrapy

class JianliproItem(scrapy.Item):
    # define the fields for your item here like:
    jianli_dir = scrapy.Field()
    jianli_name = scrapy.Field()
Middleware comes in two kinds: spider middleware and downloader middleware.
Downloader middleware sits between the engine and the downloader and can intercept every request and response in the project: requests can be intercepted to spoof the UA (process_request) or to attach a proxy IP (process_exception), and responses can be intercepted to tamper with or replace the response object (process_response).
Requirement: scrape the news titles and full article text of the domestic (国内) and international (国际) sections of NetEase News (news.163.com).
Analysis: the news lists inside each section are loaded dynamically, so the responses for those section pages are intercepted in the downloader middleware and rebuilt with selenium so that they contain the dynamically loaded data.
Spider file
# wangyi.py
import scrapy
from selenium import webdriver
from wangyiPro.items import WangyiproItem

class WangyiSpider(scrapy.Spider):
    name = 'wangyi'
    # allowed_domains = ['www.xx.com']
    start_urls = ['https://news.163.com/']
    models_url = []

    def __init__(self):
        """
        Create a selenium browser object; the downloader middleware uses it to render the dynamically loaded data.
        """
        # with the chromedriver on the PATH there is no need to pass executable_path
        self.bro = webdriver.Chrome()

    def parse(self, response):
        """
        Parse the start url and collect the url of each news section, handing them to self.parse_model.
        """
        li_list = response.xpath('//*[@id="index2016_wrap"]/div[1]/div[2]/div[2]/div[2]/div[2]/div/ul/li')
        index_list = [3, 4]  # the domestic and international sections
        for index in index_list:
            model_url = li_list[index].xpath('./a/@href').extract_first()
            self.models_url.append(model_url)
        for model_url in self.models_url:
            yield scrapy.Request(url=model_url, callback=self.parse_model)

    def parse_model(self, response):
        """
        Parse the news titles and detail-page urls within a section; the title goes into an item,
        and the detail url plus the item are handed to parse_detail.
        """
        div_list = response.xpath('/html/body/div/div[3]/div[4]/div[1]/div/div/ul/li/div/div')
        for div in div_list:
            title = div.xpath('./div/div[1]/h3/a/text()').extract_first()
            news_url = div.xpath('./div/div[1]/h3/a/@href').extract_first()
            item = WangyiproItem()
            item["title"] = title
            yield scrapy.Request(url=news_url, callback=self.parse_detail, meta={"item": item})

    def parse_detail(self, response):
        """
        Parse the article text and finally hand the item to the pipeline.
        """
        content = response.xpath('//*[@id="endText"]/p/text()').extract()
        content = ''.join(content).replace(' ', '')
        item = response.meta["item"]
        item['content'] = content
        yield item

    def closed(self, spider):
        """
        Quit the browser; this method runs only once, when the spider closes.
        """
        self.bro.quit()
Middleware file
# middlewares.py
from scrapy import signals
from scrapy.http import HtmlResponse
from time import sleep
import random

class WangyiproDownloaderMiddleware(object):
    user_agent_list = [
        "Mozilla/5.0 (Windows NT 6.1; WOW64) AppleWebKit/537.1 "
        "(KHTML, like Gecko) Chrome/22.0.1207.1 Safari/537.1",
        "Mozilla/5.0 (X11; CrOS i686 2268.111.0) AppleWebKit/536.11 "
        "(KHTML, like Gecko) Chrome/20.0.1132.57 Safari/536.11",
        "Mozilla/5.0 (Windows NT 6.1; WOW64) AppleWebKit/536.6 "
        "(KHTML, like Gecko) Chrome/20.0.1092.0 Safari/536.6",
        "Mozilla/5.0 (Windows NT 6.2) AppleWebKit/536.6 "
        "(KHTML, like Gecko) Chrome/20.0.1090.0 Safari/536.6",
        "Mozilla/5.0 (Windows NT 6.2; WOW64) AppleWebKit/537.1 "
        "(KHTML, like Gecko) Chrome/19.77.34.5 Safari/537.1",
        "Mozilla/5.0 (X11; Linux x86_64) AppleWebKit/536.5 "
        "(KHTML, like Gecko) Chrome/19.0.1084.9 Safari/536.5",
        "Mozilla/5.0 (Windows NT 6.0) AppleWebKit/536.5 "
        "(KHTML, like Gecko) Chrome/19.0.1084.36 Safari/536.5",
        "Mozilla/5.0 (Windows NT 6.1; WOW64) AppleWebKit/536.3 "
        "(KHTML, like Gecko) Chrome/19.0.1063.0 Safari/536.3",
        "Mozilla/5.0 (Windows NT 5.1) AppleWebKit/536.3 "
        "(KHTML, like Gecko) Chrome/19.0.1063.0 Safari/536.3",
        "Mozilla/5.0 (Macintosh; Intel Mac OS X 10_8_0) AppleWebKit/536.3 "
        "(KHTML, like Gecko) Chrome/19.0.1063.0 Safari/536.3",
        "Mozilla/5.0 (Windows NT 6.2) AppleWebKit/536.3 "
        "(KHTML, like Gecko) Chrome/19.0.1062.0 Safari/536.3",
        "Mozilla/5.0 (Windows NT 6.1; WOW64) AppleWebKit/536.3 "
        "(KHTML, like Gecko) Chrome/19.0.1062.0 Safari/536.3",
        "Mozilla/5.0 (Windows NT 6.2) AppleWebKit/536.3 "
        "(KHTML, like Gecko) Chrome/19.0.1061.1 Safari/536.3",
        "Mozilla/5.0 (Windows NT 6.1; WOW64) AppleWebKit/536.3 "
        "(KHTML, like Gecko) Chrome/19.0.1061.1 Safari/536.3",
        "Mozilla/5.0 (Windows NT 6.1) AppleWebKit/536.3 "
        "(KHTML, like Gecko) Chrome/19.0.1061.1 Safari/536.3",
        "Mozilla/5.0 (Windows NT 6.2) AppleWebKit/536.3 "
        "(KHTML, like Gecko) Chrome/19.0.1061.0 Safari/536.3",
        "Mozilla/5.0 (X11; Linux x86_64) AppleWebKit/535.24 "
        "(KHTML, like Gecko) Chrome/19.0.1055.1 Safari/535.24",
        "Mozilla/5.0 (Windows NT 6.2; WOW64) AppleWebKit/535.24 "
        "(KHTML, like Gecko) Chrome/19.0.1055.1 Safari/535.24"
    ]
    PROXY_http = [
        '153.180.102.104:80',
        '195.208.131.189:56055',
    ]
    PROXY_https = [
        '120.83.49.90:9000',
        '95.189.112.214:35508',
    ]
    def process_request(self, request, spider):
        # UA spoofing
        request.headers['User-Agent'] = random.choice(self.user_agent_list)
        return None

    def process_response(self, request, response, spider):
        """
        Intercept responses: for the dynamically loaded section pages, use selenium to obtain the fully rendered page.
        """
        if request.url in spider.models_url:
            # the section data is loaded dynamically
            bro = spider.bro
            bro.get(request.url)
            sleep(1)
            page_text = bro.page_source  # now contains the dynamically loaded news data
            # build a new response object holding the rendered page and return it
            # in place of the original response for these section urls
            new_response = HtmlResponse(url=request.url, body=page_text, encoding='utf-8', request=request)
            return new_response
        else:
            return response

    def process_exception(self, request, exception, spider):
        # attach a proxy
        if request.url.split(':')[0] == 'http':
            request.meta['proxy'] = 'http://' + random.choice(self.PROXY_http)
        elif request.url.split(':')[0] == 'https':
            request.meta['proxy'] = 'https://' + random.choice(self.PROXY_https)
        return request  # resend the corrected request
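The original post does not show the wangyiPro settings, but the custom middleware and pipeline still have to be enabled there; a minimal sketch (543 and 300 are just the conventional priority values):
# settings.py (sketch)
DOWNLOADER_MIDDLEWARES = {
    'wangyiPro.middlewares.WangyiproDownloaderMiddleware': 543,
}
ITEM_PIPELINES = {
    'wangyiPro.pipelines.WangyiproPipeline': 300,
}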
Pipeline file
# pipelines.py
class WangyiproPipeline(object):
    def open_spider(self, spider):
        """Called only once, when the spider starts: open the output file."""
        print('Starting the spider......')
        self.fp = open('./wangyi.txt', 'w', encoding='utf-8')

    def close_spider(self, spider):
        """Called only once, when the spider closes: close the output file."""
        print('Spider finished......')
        self.fp.close()

    def process_item(self, item, spider):
        """Persist the scraped data."""
        title = item["title"]
        content = item["content"]
        self.fp.write(f"{title}\n{content}\n\n")
        return item
CrawlSpider is a subclass of Spider.
Full-site crawling can be done either by sending the page requests manually from a Spider (as shown earlier) or by using CrawlSpider.
Using CrawlSpider: create the project, then generate the spider with scrapy genspider -t crawl xxx www.xxx.com.
Link extractor (LinkExtractor): extracts from the response every link that satisfies the rule given in the allow parameter (a regular expression).
Rule parser (Rule): sends requests to the links the extractor found and parses the responses with the specified callback function.
Hands-on: scrape the 4K 美女 images from pic.netbian.com
# sun.py
import scrapy
from scrapy.linkextractors import LinkExtractor
from scrapy.spiders import CrawlSpider, Rule
from crawlspiderPro.items import CrawlspiderproItem

class SunSpider(CrawlSpider):
    name = 'sun'
    # allowed_domains = ['www.xxx.com']
    start_urls = ['http://pic.netbian.com/4kmeinv']
    link = LinkExtractor(allow=r'index_\d+.html')
    rules = (
        Rule(link, callback='parse_item', follow=True),
    )

    def parse_item(self, response):
        li_list = response.xpath('//*[@id="main"]/div[3]/ul/li')
        for li in li_list:
            # create a fresh item per image
            item = CrawlspiderproItem()
            img_url = "http://pic.netbian.com" + li.xpath('./a/img/@src').get()
            img_name = ''.join(li.xpath('./a/img/@alt').get().split())
            item["img_url"] = img_url
            item["img_name"] = img_name
            print(img_name)
            yield item
# pipelines.py (hand-written version that does not use the ImagesPipeline download pipeline)
import os, requests

class CrawlspiderproPipeline(object):
    headers = {
        "User-Agent": "Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/79.0.3945.79 Safari/537.36"
    }

    def process_item(self, item, spider):
        file_path = "4k美女图片"
        if not os.path.exists(file_path):
            os.mkdir(file_path)
        img_data = requests.get(url=item["img_url"], headers=self.headers).content
        with open(f"{file_path}/{item['img_name']}", "wb") as fp:
            fp.write(img_data)
        print(f"{item['img_name']}: downloaded")
        return item
Note: when the ImagesPipeline version below is used instead, the storage directory must be set in settings.py:
IMAGES_STORE = '4k美女/'
# pipelines.py (version that uses the ImagesPipeline download pipeline)
import scrapy
from scrapy.pipelines.images import ImagesPipeline

class MyImgPipeline(ImagesPipeline):
    def get_media_requests(self, item, info):
        print(f"{item['img_name']}: downloading")
        yield scrapy.Request(url=item["img_url"], meta=item)

    def file_path(self, request, response=None, info=None):
        # the alt text is used as the file name
        imgName = request.meta["img_name"]
        return imgName

    def item_completed(self, results, item, info):
        return item
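For completeness, a minimal sketch of the crawlspiderPro settings this version relies on (300 is just the conventional priority):
# settings.py (sketch)
ITEM_PIPELINES = {
    'crawlspiderPro.pipelines.MyImgPipeline': 300,
}
IMAGES_STORE = '4k美女/'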
Distributed crawling with scrapy-redis. Implementation flow:
Install the scrapy-redis component: pip install scrapy-redis
Create a project
Create a CrawlSpider-based spider file
Modify the spider file: import RedisCrawlSpider from scrapy_redis.spiders, comment out start_urls and allowed_domains, add a redis_key attribute (the name of the shared scheduler queue), and change the spider's parent class to RedisCrawlSpider
Modify the settings file:
Specify the shareable pipeline:
ITEM_PIPELINES = {
    'scrapy_redis.pipelines.RedisPipeline': 400
}
Specify the scheduler:
# add a dedup container class that uses a Redis set to store request fingerprints, making request dedup persistent
DUPEFILTER_CLASS = "scrapy_redis.dupefilter.RFPDupeFilter"
# use the scheduler shipped with scrapy-redis
SCHEDULER = "scrapy_redis.scheduler.Scheduler"
# whether the scheduler state persists: if True, the request queue and the dedup fingerprint set in Redis are not cleared when the spider finishes
SCHEDULER_PERSIST = True
Specify the redis server:
# redis connection parameters
REDIS_HOST = '127.0.0.1'
REDIS_PORT = 6379
- Redis-related setup:
- Edit the redis configuration file:
- linux or mac: redis.conf
- windows: redis.windows.conf
- Open the configuration file and change two things:
- remove (or comment out) the bind 127.0.0.1 line
- turn off protected mode: change protected-mode yes to no
- Start the redis server with that configuration file:
- redis-server <configuration file>
- Start the client:
- redis-cli
- Run the project:
- scrapy runspider xxx.py
- Push a start url into the scheduler's queue:
- the scheduler's queue lives in the redis client
- lpush xxx www.xxx.com
- The scraped data ends up in the proName:items data structure in redis
Hands-on: scrape every second-hand housing listing for Beijing from 58同城
# bj2sf.py
import scrapy
from scrapy.linkextractors import LinkExtractor
from scrapy.spiders import CrawlSpider, Rule
from bj58Pro.items import Bj58ProItem
from scrapy_redis.spiders import RedisCrawlSpider

class Bj2sfSpider(RedisCrawlSpider):
    name = 'bj2sf'
    # allowed_domains = ['www.xxx.com']
    # start_urls = ['https://bj.58.com/ershoufang/']
    redis_key = 'bj2sf'
    rules = (
        Rule(LinkExtractor(allow=r'/ershoufang/pn\d+'), callback='parse_item', follow=True),
    )

    def parse_item(self, response):
        item = Bj58ProItem()
        li_list = response.xpath('/html/body/div[5]/div[5]/div[1]/ul/li')
        for li in li_list:
            item["title"] = li.xpath('./div[2]/h2/a/text()').get()
            item["desc"] = li.xpath('./div[2]/p[1]/span/text()').getall()
            item["addr"] = li.xpath('./div[2]/p[2]/span/a/text()').getall()
            item["price"] = li.xpath('./div[3]/p//text()').getall()
            yield item
# items.py
import scrapy

class Bj58ProItem(scrapy.Item):
    # define the fields for your item here like:
    title = scrapy.Field()
    desc = scrapy.Field()
    addr = scrapy.Field()
    price = scrapy.Field()
# settings.py
BOT_NAME = 'bj58Pro'
SPIDER_MODULES = ['bj58Pro.spiders']
NEWSPIDER_MODULE = 'bj58Pro.spiders'
# Crawl responsibly by identifying yourself (and your website) on the user-agent
USER_AGENT = 'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/79.0.3945.88 Safari/537.36'
# Obey robots.txt rules
ROBOTSTXT_OBEY = False
# LOG_LEVEL = 'ERROR'
# ############### scrapy-redis settings for distributed crawling
ITEM_PIPELINES = {
    'scrapy_redis.pipelines.RedisPipeline': 400,
}
# dedup container class that uses a Redis set to store request fingerprints, making request dedup persistent
DUPEFILTER_CLASS = "scrapy_redis.dupefilter.RFPDupeFilter"
# use the scheduler shipped with scrapy-redis
SCHEDULER = "scrapy_redis.scheduler.Scheduler"
# whether the scheduler state persists: if True, the request queue and the dedup fingerprint set in Redis are not cleared when the spider finishes
SCHEDULER_PERSIST = True
# redis connection parameters
REDIS_HOST = '127.0.0.1'
REDIS_PORT = 6379
REDIS_ENCODING = "utf-8"
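Putting it together for this example: after starting redis and launching the spider with scrapy runspider bj2sf.py on every participating machine, the queue named by redis_key is seeded from the redis client; a sketch using the start URL that is commented out in the spider:
lpush bj2sf https://bj.58.com/ershoufang/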
Incremental crawler
- Concept: monitor a site for updates and crawl only the data the site has newly published.
- Analysis:
- pick a start url
- use CrawlSpider to obtain the other page-number links
- request those page links via a Rule
- from the source of each page, parse the url of every movie detail page
- core: check whether a detail-page url has been requested before
- store the urls of the detail pages that have been crawled
- store them in a redis set
- request the detail pages and parse out the movie name and synopsis
- persist the data
- # add the detail-page url to the redis set
ex = self.conn.sadd('urls', detail_url)
ex == 1 means the url has not been stored before; ex == 0 means it already was (see the short sketch after this list)
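A minimal standalone sketch of the sadd return values that the dedup logic relies on (it assumes a local redis server; the key and url are just examples):
from redis import Redis

conn = Redis(host='127.0.0.1', port=6379)
print(conn.sadd('urls', 'https://www.example.com/detail/1'))  # 1 -> newly added, crawl it
print(conn.sadd('urls', 'https://www.example.com/detail/1'))  # 0 -> already seen, skip it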
Hands-on example
# movies.py
import scrapy
from scrapy.linkextractors import LinkExtractor
from scrapy.spiders import CrawlSpider, Rule
from redis import Redis
from moviePro.items import MovieproItem

class MovieSpider(CrawlSpider):
    name = 'movie'
    # allowed_domains = ['www.ccc.com']
    start_urls = ['https://www.4567tv.tv/frim/index1.html']
    rules = (
        Rule(LinkExtractor(allow=r'/frim/index1-\d+\.html'), callback='parse_item', follow=True),
    )
    # create the redis connection object
    conn = Redis(host='127.0.0.1', port=6379)

    # parse the detail-page url of every movie listed on each page
    def parse_item(self, response):
        li_list = response.xpath('/html/body/div[1]/div/div/div/div[2]/ul/li')
        for li in li_list:
            # get the detail-page url
            detail_url = 'https://www.4567tv.tv' + li.xpath('./div/a/@href').extract_first()
            # add the detail-page url to the redis set
            ex = self.conn.sadd('urls', detail_url)
            if ex == 1:
                print('This url has not been crawled yet, crawling its data')
                yield scrapy.Request(url=detail_url, callback=self.parse_detail)
            else:
                print('No new data yet, nothing to crawl!')

    # parse the movie name and synopsis from the detail page and persist them
    def parse_detail(self, response):
        item = MovieproItem()
        item['name'] = response.xpath('/html/body/div[1]/div/div/div/div[2]/h1/text()').extract_first()
        item['desc'] = response.xpath('/html/body/div[1]/div/div/div/div[2]/p[5]/span[2]//text()').extract()
        item['desc'] = ''.join(item['desc'])
        yield item
# items.py
import scrapy

class MovieproItem(scrapy.Item):
    # define the fields for your item here like:
    name = scrapy.Field()
    desc = scrapy.Field()
# pipelines.py
from redis import Redis

class MovieproPipeline(object):
    conn = None

    def open_spider(self, spider):
        # reuse the redis connection created in the spider
        self.conn = spider.conn

    def close_spider(self, spider):
        # the connection object belongs to the spider, so just drop the reference here
        self.conn = None

    def process_item(self, item, spider):
        dic = {
            'name': item['name'],
            'desc': item['desc']
        }
        # print(dic)
        # note: recent redis-py versions refuse a dict here; json.dumps(dic) may be needed instead
        self.conn.lpush('movieData', dic)
        return item
Original article: https://www.cnblogs.com/liuxu2019/p/12112708.html