I. Dependencies
virtualenv -p python3.6 xx
pip install scrapy
pip install pymysql
II. Steps
1. Create the project and spider1
scrapy startproject scraw_swagger
scrapy genspider spider1 xxx.com
The following code crawls the first level of the Swagger directory and saves the result to a file named interfaces_path.
# -*- coding: utf-8 -*-
import scrapy
import json
from scraw_swagger import settings


class Spider1Spider(scrapy.Spider):
    name = 'spider1'
    allowed_domains = ['xxx.com']
    scrawl_domain = settings.interface_domain + '/api-docs'
    start_urls = [scrawl_domain]

    def parse(self, response):
        # Debugging: dump the raw response to a file for inspection
        # filename = 'mid_link'
        # open(filename, 'wb').write(response.body)
        response_dict = json.loads(response.body)
        apis = response_dict['apis']
        domain = settings.interface_domain + '/api-docs'
        # Write every sub-API URL into interfaces_path as one
        # comma-separated string; spider2 reads this file later.
        with open('interfaces_path', 'w') as file:
            for subapi in apis:
                path = ',' + domain + subapi['path']
                file.write(path)
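For context, spider1 assumes the /api-docs endpoint returns a Swagger 1.2-style resource listing whose apis array carries a path per sub-API. A minimal sketch of that shape and of the string spider1 writes; the paths and domain below are invented for illustration:

import json

# Illustrative /api-docs payload; only the field spider1 actually reads
# ('path') is shown, and both entries are made-up examples.
sample = json.dumps({'apis': [{'path': '/user'}, {'path': '/order'}]})

apis = json.loads(sample)['apis']
for subapi in apis:
    # spider1 prefixes each URL with a comma, so interfaces_path ends up as
    # one string like ",http://xxx.com/api-docs/user,http://xxx.com/api-docs/order"
    print(',' + 'http://xxx.com/api-docs' + subapi['path'])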
2. Create spider2
scrapy genspider spider2 xxx.com
The following code fetches the content behind each URL listed in the interfaces_path file.
# -*- coding: utf-8 -*-
import scrapy
import json
from scraw_swagger import settings
from scraw_swagger.items import ScrawSwaggerItem


class Spider2Spider(scrapy.Spider):
    name = 'spider2'
    allowed_domains = ['xxx.com']

    # interfaces_path holds one comma-separated string written by spider1;
    # it starts with a comma, so the first (empty) element is skipped.
    file = open('interfaces_path', 'r').read()
    files = file.split(',')
    list_files = []
    for i in range(1, len(files)):
        list_files.append(files[i])
    start_urls = list_files

    def parse(self, response):
        outitem = ScrawSwaggerItem()
        out_interface = []
        out_domain = []
        out_method = []
        out_param_name = []
        out_data_type = []
        out_param_required = []
        # Debugging: dump each response to a file for inspection
        # filename = response.url.split("/")[-1]
        # open('temp/' + filename, 'wb').write(response.body)
        response_dict = json.loads(response.body)
        apis = response_dict['apis']
        for api in apis:
            # Collect the interface path and its first operation
            path = api['path']
            operations = api['operations'][0]
            method = operations['method']
            parameters = operations['parameters']
            param_name = []
            param_required = []
            data_type = []
            for parameter in parameters:
                param_name.append(parameter['name'])
                param_required.append(parameter['required'])
                data_type.append(parameter['type'])
            out_interface.append(path)
            out_domain.append(settings.interface_domain)
            out_method.append(method)
            out_data_type.append(data_type)
            out_param_name.append(param_name)
            out_param_required.append(param_required)
        outitem['interface'] = out_interface
        outitem['domain'] = out_domain
        outitem['method'] = out_method
        outitem['param_name'] = out_param_name
        outitem['param_required'] = out_param_required
        outitem['data_type'] = out_data_type
        yield outitem
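The post never shows scraw_swagger/items.py, but from the six fields spider2 assigns, it would look roughly like the sketch below. The field names are taken from the code above; everything else is an assumption:

# scraw_swagger/items.py -- minimal sketch inferred from the fields spider2
# fills; the original post does not include this file.
import scrapy


class ScrawSwaggerItem(scrapy.Item):
    domain = scrapy.Field()          # base domain of each interface
    interface = scrapy.Field()       # API path, one entry per interface
    method = scrapy.Field()          # HTTP method (GET, POST, ...)
    param_name = scrapy.Field()      # list of parameter names per interface
    param_required = scrapy.Field()  # list of required flags per interface
    data_type = scrapy.Field()       # list of parameter types per interface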
3. Store the data in the database
Extract the returned item and write the corresponding fields to the database.
# -*- coding: utf-8 -*-
import pymysql
from scraw_swagger import settings

# Don't forget to add your pipeline to the ITEM_PIPELINES setting
# See: https://doc.scrapy.org/en/latest/topics/item-pipeline.html


class ScrawSwaggerPipeline(object):

    def __init__(self):
        # Connect to the database once when the pipeline is created
        self.connect = pymysql.connect(
            host=settings.MYSQL_HOST,
            db=settings.MYSQL_DBNAME,
            user=settings.MYSQL_USER,
            passwd=settings.MYSQL_PASSWD,
            port=settings.MYSQL_PORT,
            charset='utf8',
            use_unicode=True)
        # All inserts and commits go through this cursor
        self.cursor = self.connect.cursor()

    def process_item(self, item, spider):
        try:
            # The item holds parallel lists; insert one row per interface
            sql = """
                insert into interfaces
                    (domain, interface, method, param_name, data_type, param_required)
                values (%s, %s, %s, %s, %s, %s)
            """
            for i in range(len(item['domain'])):
                row = (str(item['domain'][i]),
                       str(item['interface'][i]),
                       str(item['method'][i]),
                       str(item['param_name'][i]),
                       str(item['data_type'][i]),
                       str(item['param_required'][i]))
                self.cursor.execute(sql, row)
                self.connect.commit()
        except Exception as error:
            # Log the error instead of crashing the crawl
            print(error)
        return item
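Both spiders and the pipeline read custom values from scraw_swagger/settings.py, which the post does not show. Below is a hedged sketch of the entries the code assumes; every concrete value is a placeholder (your host, credentials, and domain will differ), and it includes the ITEM_PIPELINES registration mentioned in the comment above:

# scraw_swagger/settings.py -- sketch of the custom entries the code reads;
# all concrete values here are placeholders, not from the original post.
interface_domain = 'http://xxx.com'   # base URL of the Swagger service

MYSQL_HOST = '127.0.0.1'
MYSQL_PORT = 3306
MYSQL_DBNAME = 'swagger'
MYSQL_USER = 'root'
MYSQL_PASSWD = 'password'

# Register the pipeline so Scrapy invokes it for every yielded item
ITEM_PIPELINES = {
    'scraw_swagger.pipelines.ScrawSwaggerPipeline': 300,
}

The pipeline also assumes an interfaces table already exists with the six columns named in the insert statement (domain, interface, method, param_name, data_type, param_required), for example all as VARCHAR or TEXT. Once everything is configured, run spider1 first so interfaces_path exists, then spider2:
scrapy crawl spider1
scrapy crawl spider2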
Original post: https://www.cnblogs.com/turbolxq/p/10331342.html