Scrapy persistent storage
Terminal-command based: only the return value of parse() gets persisted, and only to a limited set of file types.
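A hedged sketch of the command, assuming the commented-out parse() below (which returns a list of dicts) is the active one; the output file name is just an example, and only formats such as json, jsonlines, csv and xml are supported:

scrapy crawl huya -o ./huya.csv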
Pipeline based (pipelines.py): the spider yields item objects, process_item() persists them, and the pipeline classes are enabled in ITEM_PIPELINES.
# spiders/huya.py -- the spider
import scrapy
from huyaPro.items import HuyaproItem


class HuyaSpider(scrapy.Spider):
    name = 'huya'
    # allowed_domains = ['www.xxx.com']
    start_urls = ['https://www.huya.com/g/wzry']

    # Persistence based on terminal commands
    # def parse(self, response):
    #     li_list = response.xpath('//*[@id="js-live-list"]/li')
    #     all_data = []
    #     for li in li_list:
    #         title = li.xpath('./a[2]/text()').extract_first()
    #         author = li.xpath('./span/span[1]/i/text()').extract_first()
    #         hot = li.xpath('./span/span[2]/i[2]/text()').extract_first()
    #         dic = {
    #             'title': title,
    #             'author': author,
    #             'hot': hot
    #         }
    #         all_data.append(dic)
    #     return all_data

    # Persistence based on pipelines
    def parse(self, response):
        li_list = response.xpath('//*[@id="js-live-list"]/li')
        for li in li_list:
            title = li.xpath('./a[2]/text()').extract_first()
            author = li.xpath('./span/span[1]/i/text()').extract_first()
            hot = li.xpath('./span/span[2]/i[2]/text()').extract_first()
            item = HuyaproItem()
            item['title'] = title
            item['author'] = author
            item['hot'] = hot
            yield item
# items.py
import scrapy


class HuyaproItem(scrapy.Item):
    # define the fields for your item here like:
    title = scrapy.Field()   # one Field per attribute parsed out in the spider
    author = scrapy.Field()
    hot = scrapy.Field()
# pipelines.py
import pymysql

# useful for handling different item types with a single interface
from itemadapter import ItemAdapter


class HuyaproPipeline:
    fp = None

    def open_spider(self, spider):  # runs only once
        print('open')
        self.fp = open('huyazhibo.txt', 'w', encoding='utf-8')

    def process_item(self, item, spider):  # item is the item object submitted by the spider
        self.fp.write(item['title'] + ':' + item['author'] + ':' + item['hot'] + '\n')
        print(item['title'] + ' written successfully')
        return item  # returning the item passes it on to the next pipeline class to be executed

    def close_spider(self, spider):  # runs only once (was misnamed close_item, which Scrapy would never call)
        self.fp.close()
        print('close')
class mysqlPopeLine:
    conn = None
    cursor = None

    def open_spider(self, spider):  # runs only once
        self.conn = pymysql.Connect(host='127.0.0.1', port=3306, user='root', password='', db='Spider', charset='utf8')
        print(self.conn)

    def process_item(self, item, spider):
        sql = 'insert into huya values("%s","%s","%s")' % (item['title'], item['author'], item['hot'])
        self.cursor = self.conn.cursor()
        try:
            self.cursor.execute(sql)
            self.conn.commit()
        except Exception as e:
            print(e)
            self.conn.rollback()
        return item

    def close_spider(self, spider):  # runs only once
        self.cursor.close()
        self.conn.close()
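The MySQL pipeline above assumes a database named Spider already contains a table named huya with three string columns. A minimal one-off sketch for creating it; the column names and types are illustrative assumptions, not taken from the original notes:

import pymysql

# One-off helper: create the table that mysqlPopeLine writes into.
# The database name 'Spider' and table name 'huya' come from the pipeline above;
# the column names/types are assumptions for illustration only.
conn = pymysql.Connect(host='127.0.0.1', port=3306, user='root', password='', db='Spider', charset='utf8')
cur = conn.cursor()
cur.execute('create table if not exists huya (title varchar(200), author varchar(100), hot varchar(50))')
conn.commit()
cur.close()
conn.close()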
# settings.py (excerpt)
BOT_NAME = 'huyaPro'

SPIDER_MODULES = ['huyaPro.spiders']
NEWSPIDER_MODULE = 'huyaPro.spiders'

USER_AGENT = 'Mozilla/5.0 (Windows NT 10.0; WOW64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/85.0.4183.102 Safari/537.36'

LOG_LEVEL = 'ERROR'
ROBOTSTXT_OBEY = False

ITEM_PIPELINES = {
    'huyaPro.pipelines.HuyaproPipeline': 300,  # the lower the number, the higher the priority
    'huyaPro.pipelines.mysqlPopeLine': 301,
}
Increase concurrency: CONCURRENT_REQUESTS (see the settings sketch below for all five options)
Lower the log level: LOG_LEVEL
Disable cookies: COOKIES_ENABLED
Disable retries: RETRY_ENABLED
Reduce the download timeout: DOWNLOAD_TIMEOUT
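These five tweaks correspond to standard Scrapy settings; a hedged settings.py sketch, with illustrative values:

# settings.py (performance excerpt; values are illustrative)
CONCURRENT_REQUESTS = 100   # increase concurrency (the default is 16)
LOG_LEVEL = 'ERROR'         # lower the log level to reduce logging overhead
COOKIES_ENABLED = False     # disable cookies
RETRY_ENABLED = False       # disable retrying failed requests
DOWNLOAD_TIMEOUT = 10       # reduce the download timeout (in seconds)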
import scrapy


class MiddleSpider(scrapy.Spider):
    name = 'middle'
    # allowed_domains = ['www.xxx.com']
    start_urls = ['http://ip.chinaz.com/']

    def parse(self, response):
        page_text = response.text
        with open('iip.html', 'w', encoding='utf-8') as fp:
            fp.write(page_text)
# middlewares.py
from scrapy import signals
import random

user_agent_list = [
    "Mozilla/5.0 (Windows NT 6.1; WOW64) AppleWebKit/537.1 "
    "(KHTML, like Gecko) Chrome/22.0.1207.1 Safari/537.1",
    "Mozilla/5.0 (X11; CrOS i686 2268.111.0) AppleWebKit/536.11 "
    "(KHTML, like Gecko) Chrome/20.0.1132.57 Safari/536.11",
    "Mozilla/5.0 (Windows NT 6.1; WOW64) AppleWebKit/536.6 "
    "(KHTML, like Gecko) Chrome/20.0.1092.0 Safari/536.6",
]
from itemadapter import is_item, ItemAdapter


class MiddleproDownloaderMiddleware:
    # Intercepts normal requests
    def process_request(self, request, spider):
        # UA spoofing
        print('!!!!!!!!!!!!!!!!!!!!!')
        request.headers['User-Agent'] = random.choice(user_agent_list)
        print(request.headers['User-Agent'])
        # Proxy IP
        request.meta['proxy'] = 'http://123.55.114.25:9999'
        print(request.meta['proxy'])
        return None

    # Intercepts all responses
    def process_response(self, request, response, spider):
        print('??????????????????')
        return response

    # Intercepts requests that raised an exception
    def process_exception(self, request, exception, spider):
        # print(request)
        return request  # re-send the corrected request object

    def spider_opened(self, spider):
        spider.logger.info('Spider opened: %s' % spider.name)
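The middleware only takes effect once it is enabled in settings.py. A hedged excerpt, assuming the project is named middlePro and the class lives in the default middlewares.py module:

# settings.py (excerpt) -- module path assumes a project named middlePro
DOWNLOADER_MIDDLEWARES = {
    'middlePro.middlewares.MiddleproDownloaderMiddleware': 543,
}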
# spiders/movie.py
import scrapy
from moviePro.items import MovieproItem


class MovieSpider(scrapy.Spider):
    name = 'movie'
    # allowed_domains = ['www.xxx.com']
    start_urls = ['http://www.4567kan.com/index.php/vod/show/class/喜剧/id/6/page/1.html']
    url = 'http://www.4567kan.com/index.php/vod/show/class/喜剧/id/6/page/%d.html'
    page = 1  # page counter; also the stopping condition for the recursive calls below

    # parses the movie names
    def parse(self, response):
        print(f'Crawling page {self.page}')
        li_list = response.xpath('/html/body/div[1]/div/div/div/div[2]/ul/li')
        for li in li_list:
            item = MovieproItem()
            name = li.xpath('./div/a/@title').extract_first()
            item['name'] = name
            detail_url = 'http://www.4567kan.com' + li.xpath('./div/a/@href').extract_first()
            # Manually send a request to the detail page.
            # Request meta: lets Request pass a value (a dict) to the callback
            yield scrapy.Request(detail_url, callback=self.parse_detail, meta={'item': item})
        if self.page < 5:
            self.page += 1
            new_url = self.url % self.page
            yield scrapy.Request(new_url, callback=self.parse)  # recursive call

    def parse_detail(self, response):
        item = response.meta['item']
        desc = response.xpath('/html/body/div[1]/div/div/div/div[2]/p[5]/span[2]/text()').extract_first()
        item['desc'] = desc
        yield item
import scrapy


class MovieproItem(scrapy.Item):
    # define the fields for your item here like:
    name = scrapy.Field()
    desc = scrapy.Field()
# spiders/wangyi.py -- NetEase News (网易新闻)
import scrapy
from wangyiPro.items import WangyiproItem
from selenium import webdriver


class WangyiSpider(scrapy.Spider):
    name = 'wangyi'
    # allowed_domains = ['www.xxx.com']
    start_urls = ['https://news.163.com/']
    model_urls = []
    bro = webdriver.Chrome(executable_path=r'D:\老男孩python22期代码及笔记\day95\chromedriver.exe')

    def parse(self, response):
        # parse out the URLs of the 5 news sections
        li_list = response.xpath('//*[@id="index2016_wrap"]/div[1]/div[2]/div[2]/div[2]/div[2]/div/ul/li')
        model_index = [3, 4, 6, 7, 8]
        for index in model_index:
            li = li_list[index]
            # URL of one of the 5 sections
            model_url = li.xpath('./a/@href').extract_first()
            self.model_urls.append(model_url)
            # manually send a request for each section URL
            yield scrapy.Request(model_url, callback=self.parse_model)

    def parse_model(self, response):  # parses the news titles and detail-page URLs from each section page
        div_list = response.xpath('/html/body/div/div[3]/div[4]/div[1]/div/div/ul/li/div/div')
        for div in div_list:
            title = div.xpath('./div/div[1]/h3/a/text()').extract_first()
            item = WangyiproItem()
            item['title'] = title
            detail_url = div.xpath('./a/@href').extract_first()  # was misspelled @herf
            yield scrapy.Request(detail_url, callback=self.parse_new_detail, meta={'item': item})

    def parse_new_detail(self, response):
        item = response.meta['item']
        content = response.xpath('//*[@id="content"]/div[2]//text()').extract()
        content = ''.join(content)
        item['content'] = content
        yield item

    # this method runs only once, when the whole program ends
    def closed(self, reason):
        self.bro.quit()
import scrapy


class WangyiproItem(scrapy.Item):
    # define the fields for your item here like:
    title = scrapy.Field()
    content = scrapy.Field()
# middlewares.py
from scrapy import signals
from itemadapter import is_item, ItemAdapter
from scrapy.http import HtmlResponse
from time import sleep


class WangyiproDownloaderMiddleware:
    # Parameters:
    #   request: the request whose response was intercepted
    #   response: every intercepted response object (1 + 5 + n)
    #   spider: the spider instance, which lets the spider class and the middleware exchange data
    def process_response(self, request, response, spider):
        # Intercept the responses of the 5 sections and replace them with 5 new responses that meet our needs.
        # 1. Pick out the 5 unsatisfactory responses that belong to the 5 sections
        if request.url in spider.model_urls:
            # These are the responses of the five sections we care about
            #   url: the URL of the request that produced this response
            #   body: the response data, which can come from selenium's page_source
            bro = spider.bro
            bro.get(request.url)
            sleep(3)
            page_text = bro.page_source  # includes the dynamically loaded news data
            new_response = HtmlResponse(url=request.url, body=page_text, encoding='utf-8', request=request)
            return new_response
        else:
            return response
import pymysql


class WangyiproPipeline:
    conn = None
    cur = None

    def open_spider(self, spider):
        self.conn = pymysql.connect(host='127.0.0.1', port=3306, user='root', password='', database='Spider', charset='utf8')

    def process_item(self, item, spider):
        print(item)
        sql = 'insert into wangyi values("%s","%s")' % (item['title'], item['content'])
        self.cur = self.conn.cursor()
        try:
            self.cur.execute(sql)
            self.conn.commit()
        except Exception as e:
            print(e)
            self.conn.rollback()
        return item

    def close_spider(self, spider):
        self.cur.close()
        self.conn.close()
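For the NetEase News project, both the downloader middleware and the pipeline have to be registered. A hedged settings.py excerpt, assuming the default wangyiPro module layout:

# settings.py (excerpt) -- module paths assume the default wangyiPro layout
DOWNLOADER_MIDDLEWARES = {
    'wangyiPro.middlewares.WangyiproDownloaderMiddleware': 543,
}
ITEM_PIPELINES = {
    'wangyiPro.pipelines.WangyiproPipeline': 300,
}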
Image lazy loading
ImagesPipeline: a pipeline class dedicated to downloading binary data and persisting it
CrawlSpider
Distributed crawling (scrapy-redis)
UA spoofing
Robots
Specify the pipeline:
ITEM_PIPELINES = {
    'scrapy_redis.pipelines.RedisPipeline': 400
}
Specify the scheduler (standard scrapy-redis settings; see the sketch after this list):
Specify the Redis database:
REDIS_HOST = 'IP address of the Redis server'
REDIS_PORT = 6379
Configure Redis through its config file, redis.windows.conf:
Start the Redis server and client:
Run the program:
scrapy runspider xxx.py
Push a start URL into the scheduler's queue (in redis-cli; see the sketch below):
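A hedged sketch of the standard scrapy-redis pieces referred to above (scheduler and dupefilter settings, the Redis connection, and pushing the start URL); the redis_key and the URL are placeholders:

# settings.py (excerpt) -- standard scrapy-redis scheduler configuration
DUPEFILTER_CLASS = 'scrapy_redis.dupefilter.RFPDupeFilter'  # scrapy-redis de-duplication
SCHEDULER = 'scrapy_redis.scheduler.Scheduler'              # scrapy-redis scheduler
SCHEDULER_PERSIST = True                                    # keep the queue and fingerprints when the spider stops

REDIS_HOST = '127.0.0.1'  # IP of the machine running the Redis server (placeholder)
REDIS_PORT = 6379

# In redis.windows.conf: comment out "bind 127.0.0.1" and set "protected-mode no"
# so that other machines can connect; then start redis-server and redis-cli.
# Finally, in redis-cli, push a start URL into the scheduler's queue, e.g.:
#   lpush <the spider's redis_key> <start url>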
import scrapy
from imgPro.items import ImgproItem


class ImgSpider(scrapy.Spider):
    name = 'img'
    # allowed_domains = ['www.xxx.com']
    start_urls = ['http://sc.chinaz.com/tupian/meinvtupian.html']

    def parse(self, response):
        div_list = response.xpath('//*[@id="container"]/div')
        for div in div_list:
            # lazy loading: the real image URL sits in the pseudo attribute @src2 until the page swaps it into @src
            img_src = 'https:' + div.xpath('./div/a/img/@src2').extract_first()
            item = ImgproItem()
            item['img_src'] = img_src
            yield item
import scrapy


class ImgproItem(scrapy.Item):
    img_src = scrapy.Field()
# pipelines.py
import scrapy
from scrapy.pipelines.images import ImagesPipeline


class ImgproPipeline(ImagesPipeline):
    # Sends the requests for the media resources (downloads the data);
    # the item parameter is the item object submitted by the spider
    def get_media_requests(self, item, info):
        yield scrapy.Request(item['img_src'])

    # Returns the file name used for storage; the storage folder itself is set in settings.py
    def file_path(self, request, response=None, info=None):
        return request.url.split('/')[-1]

    def item_completed(self, results, item, info):
        return item

# settings.py: name/path of the folder the images are stored in
IMAGES_STORE = './imgLibs'
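The custom ImagesPipeline subclass also has to be registered in settings.py alongside IMAGES_STORE; a hedged excerpt (the priority number is arbitrary):

# settings.py (excerpt)
ITEM_PIPELINES = {
    'imgPro.pipelines.ImgproPipeline': 300,
}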
# spiders/sunCrawl.py -- CrawlSpider example
import scrapy
from scrapy.linkextractors import LinkExtractor
from scrapy.spiders import CrawlSpider, Rule
from sunCrawlPro.items import SuncrawlproItem, Detail_item


class SuncrawlSpider(CrawlSpider):
    name = 'sunCrawl'
    # allowed_domains = ['www.xxx.com']
    start_urls = ['http://wz.sun0769.com/political/index/politicsNewest']

    # Instantiate a link extractor object: it can only extract links.
    # Purpose: extract links according to the given rule (allow='regular expression')
    link = LinkExtractor(allow=r'id=1&page=\d+')  # extracts the pagination links
    # Extract the detail-page links. Note: special regex characters (here the '?') must be escaped
    link_detail = LinkExtractor(allow=r'/political/politics/index\?id=\d+')

    rules = (
        # The link extractor is passed as the first argument of the Rule constructor.
        # Purpose (request sending plus data parsing): send requests for the links the extractor
        # found and parse the responses with the given callback.
        Rule(link, callback='parse_item', follow=False),
        # follow=True would keep applying the link extractor to the pages reached through the extracted links
        Rule(link_detail, callback='parse_detail', follow=False),
    )

    def parse_item(self, response):
        # Note: tbody must never appear in an XPath expression *****
        li_list = response.xpath('/html/body/div[2]/div[3]/ul[2]/li')
        for li in li_list:
            title = li.xpath('./span[3]/a/text()').extract_first()
            num = li.xpath('./span[1]/text()').extract_first()
            item = SuncrawlproItem()
            item['title'] = title
            item['num'] = num
            yield item

    def parse_detail(self, response):
        num = response.xpath('/html/body/div[3]/div[2]/div[2]/div[1]/span[4]/text()').extract_first()
        num = num.split(':')[-1]
        content = response.xpath('/html/body/div[3]/div[2]/div[2]/div[2]/pre/text()').extract_first()
        item = Detail_item()
        item['num'] = num
        item['content'] = content
        yield item
import scrapy


class SuncrawlproItem(scrapy.Item):
    # define the fields for your item here like:
    title = scrapy.Field()
    num = scrapy.Field()


class Detail_item(scrapy.Item):
    content = scrapy.Field()
    num = scrapy.Field()
class SuncrawlproPipeline:
    def process_item(self, item, spider):
        if item.__class__.__name__ == 'Detail_item':
            content = item['content']
            num = item['num']
            print(item, '!!!!!!!!!!')
        else:
            title = item['title']
            num = item['num']
            print(item, '?????????')
        return item
Original post: https://www.cnblogs.com/zranguai/p/14036876.html