An asynchronous crawling framework.
A framework is a project template that integrates a wide range of features and is highly reusable.
Environment setup:
Linux:
    pip3 install scrapy
Windows:
    a. pip3 install wheel
    b. Download Twisted from http://www.lfd.uci.edu/~gohlke/pythonlibs/#twisted
    c. In the download directory run: pip3 install Twisted-17.1.0-cp35-cp35m-win_amd64.whl
    d. pip3 install pywin32
    e. pip3 install scrapy
Basic usage
1. Create a project: scrapy startproject proName
2. cd proName
3. Generate a spider: scrapy genspider spiderName www.xxx.com
4. Run the project: scrapy crawl spiderName
settings.py:
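A few settings are usually adjusted right after creating the project; a minimal sketch (the UA string below is a placeholder, use any real browser User-Agent):

# settings.py -- typical first edits
ROBOTSTXT_OBEY = False            # do not honor robots.txt while developing
USER_AGENT = 'Mozilla/5.0 ...'    # UA spoofing: pretend to be a normal browser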
Data parsing:
Persistent storage
# -*- coding: utf-8 -*-
import scrapy

class FirstSpider(scrapy.Spider):
    # spider name: the unique identifier of this spider file
    name = 'first'
    # allowed domains
    # allowed_domains = ['www.baidu.com']
    # list of start urls: the list elements may only be urls
    # purpose: every url in the list is automatically requested
    start_urls = ['http://duanziwang.com/category/%E7%BB%8F%E5%85%B8%E6%AE%B5%E5%AD%90/']

    # data parsing
    # called once for every request that was sent
    # def parse(self, response):
    #     article_list = response.xpath('/html/body/section/div/div/main/article')
    #     for article in article_list:
    #         # xpath no longer returns strings but Selector objects; the wanted data sits in the object's data attribute
    #         # title = article.xpath('./div[1]/h1/a/text()')[0].extract()
    #         title = article.xpath('./div[1]/h1/a/text()').extract_first()
    #         content = article.xpath('./div[2]//text()').extract()
    #         content = ''.join(content)
    #         print(title, content)

    # == persistence via terminal command ==
    def parse(self, response):
        all_data = []
        article_list = response.xpath('/html/body/section/div/div/main/article')
        for article in article_list:
            # xpath no longer returns strings but Selector objects; the wanted data sits in the object's data attribute
            # title = article.xpath('./div[1]/h1/a/text()')[0].extract()
            title = article.xpath('./div[1]/h1/a/text()').extract_first()
            content = article.xpath('./div[2]//text()').extract()
            content = ''.join(content)
            dic = {
                'title': title,
                'content': content
            }
            all_data.append(dic)
        return all_data  # return the parsed data
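The returned list is what terminal-command persistence relies on: the -o flag of scrapy crawl serializes whatever parse() returns into a file (csv, json, jsonl and xml are among the supported extensions). For example:

scrapy crawl first -o duanzi.csv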
Coding workflow for pipeline-based persistent storage
Full-site data crawling
The five core components (objects): engine, spider, scheduler, downloader, item pipeline
How to reasonably improve scrapy's crawling efficiency
Increase concurrency:
    By default scrapy handles 16 concurrent requests; this can be raised. In settings.py set CONCURRENT_REQUESTS = 100 to allow 100 concurrent requests.
Lower the log level:
    Running scrapy produces a large amount of log output; to reduce CPU usage, restrict logging to INFO or ERROR messages. In settings.py: LOG_LEVEL = 'ERROR'
Disable cookies:
    If cookies are not actually needed, disable them while crawling to reduce CPU usage and improve efficiency. In settings.py: COOKIES_ENABLED = False
Disable retries:
    Re-sending failed HTTP requests (retrying) slows the crawl down, so retries can be disabled. In settings.py: RETRY_ENABLED = False
Reduce the download timeout:
    When crawling very slow links, a smaller download timeout lets stuck requests be abandoned quickly, which improves efficiency. In settings.py: DOWNLOAD_TIMEOUT = 10 sets the timeout to 10 s.
(A combined settings.py snippet is shown below.)
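Collected in one place, the tweaks above would look like this in settings.py (the values are the ones suggested above):

# settings.py
CONCURRENT_REQUESTS = 100   # raise concurrency from the default of 16
LOG_LEVEL = 'ERROR'         # only log errors
COOKIES_ENABLED = False     # do not process cookies
RETRY_ENABLED = False       # do not retry failed requests
DOWNLOAD_TIMEOUT = 10       # abandon a download after 10 seconds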
Passing data between requests (request meta)
Spider file
# -*- coding: utf-8 -*-
import scrapy
from DuanziPro.items import DuanziproItem

# version that hands the data to the pipeline
# class DuanziSpider(scrapy.Spider):
#     name = 'duanzi'
#     # allowed_domains = ['www.xxx.com']
#     start_urls = ['http://duanziwang.com/category/%E7%BB%8F%E5%85%B8%E6%AE%B5%E5%AD%90/']
#
#     def parse(self, response):
#         all_data = []
#         article_list = response.xpath('/html/body/section/div/div/main/article')
#         for article in article_list:
#             # xpath returns Selector objects rather than strings; the wanted data sits in the object's data attribute
#             # title = article.xpath('./div[1]/h1/a/text()')[0].extract()
#             title = article.xpath('./div[1]/h1/a/text()').extract_first()
#             content = article.xpath('./div[2]//text()').extract()
#             content = ''.join(content)
#             # instantiate an item object and store the parsed record in it
#             item = DuanziproItem()
#             item['title'] = title
#             item['content'] = content
#
#             yield item  # submit the item to the pipeline

# version that crawls every page (full-site crawl)
class DuanziSpider(scrapy.Spider):
    name = 'duanzi'
    # allowed_domains = ['www.xxx.com']
    start_urls = ['http://duanziwang.com/category/%E7%BB%8F%E5%85%B8%E6%AE%B5%E5%AD%90/']
    # generic url template
    url = 'http://duanziwang.com/category/经典段子/%d/'
    pageNum = 1

    # def start_requests(self):
    #     for url in self.start_urls:
    #         yield scrapy.Request(url, callback=self.parse)

    def parse(self, response):
        article_list = response.xpath('/html/body/section/div/div/main/article')
        for article in article_list:
            # xpath returns Selector objects rather than strings; the wanted data sits in the object's data attribute
            # title = article.xpath('./div[1]/h1/a/text()')[0].extract()
            title = article.xpath('./div[1]/h1/a/text()').extract_first()
            content = article.xpath('./div[2]//text()').extract()
            content = ''.join(content)
            # instantiate an item object and store the parsed record in it
            item = DuanziproItem()
            item['title'] = title
            item['content'] = content
            yield item  # submit the item to the pipeline
        # manually send requests for the remaining pages
        if self.pageNum < 5:
            self.pageNum += 1
            print('Downloading page:', self.pageNum)
            new_url = self.url % self.pageNum
            yield scrapy.Request(url=new_url, callback=self.parse)
Defining the item (items.py)
# -*- coding: utf-8 -*-

# Define here the models for your scraped items
#
# See documentation in:
# https://docs.scrapy.org/en/latest/topics/items.html

import scrapy

class DuanziproItem(scrapy.Item):
    # define the fields for your item here like:
    # name = scrapy.Field()
    title = scrapy.Field()
    content = scrapy.Field()
Pipeline storage (pipelines.py)
# -*- coding: utf-8 -*-

# Define your item pipelines here
#
# Don't forget to add your pipeline to the ITEM_PIPELINES setting
# See: https://docs.scrapy.org/en/latest/topics/item-pipeline.html

import pymysql
from redis import Redis

# write the items to a text file
class DuanziproPipeline(object):
    fp = None

    def open_spider(self, spider):
        print('Spider started......')
        self.fp = open('./duanzi.txt', 'w', encoding='utf-8')

    # called once for every item the pipeline receives
    def process_item(self, item, spider):
        # print(item)  # item behaves like a dict
        self.fp.write(item['title'] + ':' + item['content'] + '\n')
        return item  # pass the item on to the next pipeline class to be executed

    def close_spider(self, spider):
        self.fp.close()
        print('Spider finished!!!')

# write the items to mysql
class MysqlPipeLine(object):
    conn = None
    cursor = None

    def open_spider(self, spider):
        self.conn = pymysql.Connect(host='127.0.0.1', port=3306, user='root', password='222', db='spider', charset='utf8')
        print(self.conn)

    def process_item(self, item, spider):
        sql = 'insert into duanzi values ("%s","%s")' % (item['title'], item['content'])
        self.cursor = self.conn.cursor()
        try:
            self.cursor.execute(sql)
            self.conn.commit()
        except Exception as e:
            print(e)
            self.conn.rollback()
        return item

    def close_spider(self, spider):
        self.cursor.close()
        self.conn.close()

# write the items to redis
class RedisPileLine(object):
    conn = None

    def open_spider(self, spider):
        self.conn = Redis(host='127.0.0.1', port=6379)
        print(self.conn)

    def process_item(self, item, spider):
        # note: recent redis-py versions only accept bytes/str/numbers as values,
        # so the item may need to be serialized first (e.g. json.dumps(dict(item)))
        self.conn.lpush('duanziData', item)
        return item
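For the three pipeline classes to actually run they have to be registered in settings.py; a sketch, assuming the project is named DuanziPro (the lower the number, the earlier the class runs, which is why each process_item returns the item for the next one):

# settings.py
ITEM_PIPELINES = {
    'DuanziPro.pipelines.DuanziproPipeline': 300,   # runs first (smallest number)
    'DuanziPro.pipelines.MysqlPipeLine': 301,
    'DuanziPro.pipelines.RedisPileLine': 302,
}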
Spider file
# -*- coding: utf-8 -*-
import scrapy
from moviePro.items import MovieproItem

# deep crawl (detail pages) only
# class MovieSpider(scrapy.Spider):
#     name = 'movie'
#     # allowed_domains = ['www.xxx.com']
#     start_urls = ['https://www.4567tv.tv/index.php/vod/show/class/动作/id/1.html']
#     url = 'https://www.4567tv.tv/index.php/vod/show/class/动作/id/1/page/%d.html'
#
#     def parse(self, response):
#         li_list = response.xpath('/html/body/div[1]/div/div/div/div[2]/ul/li')
#         for li in li_list:
#             title = li.xpath('.//div[@class="stui-vodlist__detail"]/h4/a/text()').extract_first()
#             detail_url = 'https://www.4567tv.tv' + li.xpath('.//div[@class="stui-vodlist__detail"]/h4/a/@href').extract_first()
#
#             item = MovieproItem()
#             item['title'] = title
#
#             # print(title, detail_url)
#             # manually request the detail page
#             # passing data between requests:
#             # the meta parameter is a dict that is handed to the callback
#             yield scrapy.Request(detail_url, callback=self.parse_detail, meta={'item': item})
#
#     # a second, custom parse method (it must take a response parameter)
#     def parse_detail(self, response):
#         # receive the meta that was passed along
#         item = response.meta['item']
#         desc = response.xpath('/html/body/div[1]/div/div/div/div[2]/p[5]/span[2]/text()').extract_first()
#         item['desc'] = desc
#
#         yield item

# deep crawl + full-site crawl
class MovieSpider(scrapy.Spider):
    name = 'movie'
    # allowed_domains = ['www.xxx.com']
    start_urls = ['https://www.4567tv.tv/index.php/vod/show/class/动作/id/1.html']
    url = 'https://www.4567tv.tv/index.php/vod/show/class/动作/id/1/page/%d.html'
    pageNum = 1

    def parse(self, response):
        li_list = response.xpath('/html/body/div[1]/div/div/div/div[2]/ul/li')
        for li in li_list:
            title = li.xpath('.//div[@class="stui-vodlist__detail"]/h4/a/text()').extract_first()
            detail_url = 'https://www.4567tv.tv' + li.xpath('.//div[@class="stui-vodlist__detail"]/h4/a/@href').extract_first()
            item = MovieproItem()
            item['title'] = title
            # print(title, detail_url)
            # manually request the detail page
            # passing data between requests:
            # the meta parameter is a dict that is handed to the callback
            yield scrapy.Request(detail_url, callback=self.parse_detail, meta={'item': item})
        # full-site crawl: request the remaining listing pages
        if self.pageNum < 4:
            self.pageNum += 1
            new_url = self.url % self.pageNum
            yield scrapy.Request(new_url, callback=self.parse)

    # a second, custom parse method (it must take a response parameter)
    def parse_detail(self, response):
        # receive the meta that was passed along
        item = response.meta['item']
        desc = response.xpath('/html/body/div[1]/div/div/div/div[2]/p[5]/span[2]/text()').extract_first()
        item['desc'] = desc
        yield item
items
# -*- coding: utf-8 -*-

# Define here the models for your scraped items
#
# See documentation in:
# https://docs.scrapy.org/en/latest/topics/items.html

import scrapy

class MovieproItem(scrapy.Item):
    # define the fields for your item here like:
    title = scrapy.Field()
    desc = scrapy.Field()
pipelines
# -*- coding: utf-8 -*-

# Define your item pipelines here
#
# Don't forget to add your pipeline to the ITEM_PIPELINES setting
# See: https://docs.scrapy.org/en/latest/topics/item-pipeline.html

class MovieproPipeline(object):
    def process_item(self, item, spider):
        print(item)
        return item
Common anti-crawling mechanisms to deal with (a downloader-middleware sketch for UA spoofing and proxies follows this list):
robots.txt
UA spoofing
dynamically changing request parameters
captchas
cookies
proxies
dynamically loaded data
js encryption
js obfuscation
image lazy loading
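As an illustration of UA spoofing and proxies, a minimal downloader-middleware sketch; the UA list and the proxy address are placeholders, and the middleware still has to be enabled via DOWNLOADER_MIDDLEWARES in settings.py:

# middlewares.py (sketch; user agents and proxy address are illustrative)
import random

class RandomUAProxyMiddleware(object):
    user_agent_list = [
        'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36',
        'Mozilla/5.0 (Macintosh; Intel Mac OS X 10_15_7) AppleWebKit/537.36',
    ]

    def process_request(self, request, spider):
        # UA spoofing: pick a random User-Agent for every outgoing request
        request.headers['User-Agent'] = random.choice(self.user_agent_list)
        # proxy: route the request through a (hypothetical) proxy server
        request.meta['proxy'] = 'http://127.0.0.1:8888'
        return None  # let the request continue through the download chain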
Original post: https://www.cnblogs.com/zhaoganggang/p/13192713.html