爬虫的编写方法

时间：2020-04-16 16:54:35 阅读：96 评论：0 收藏：0 [点我收藏+]

标签：wax ram 上海 ipy rollback 训练 sklearn ctrl dai

安装MYSQL，创建数据库

sudo apt install mysql-server

sudo mysql

sudo打开mysql后创建名为ClusteringFutures的数据库，并创建和授权一般用户。Ctrl+c退出

CREATE DATABASE ClusteringFutures;

USE ClusteringFutures;

CREATE USER ‘IVIVI_PLUS‘@‘localhost‘ IDENTIFIED BY ‘123456‘;

GRANT ALL PRIVILEGES ON ClusteringFutures.* TO ‘IVIVI_PLUS‘@‘localhost‘;

FLUSH PRIVILEGES;

验证以上操作，以一般用户进入数据库。

mysql -u IVIVI_PLUS -p ClusteringFutures

至此，MYSQL准备工作完成。

Python端

基本流程：

1、从新浪财经获取国内商品期货的日K线数据。

2、数据导入MYSQL

3、清洗数据

4、计算单日涨跌幅，作为聚类模型训练的输入

5、SKLearn聚类

6、Matplotlib进行可视化

import requests

import pymysql

import numpy as np

import matplotlib.pyplot as plt

from matplotlib.collections import LineCollection

from sklearn import cluster, covariance, manifold

# 商品期货代码与名称字典

# symbol_dict = {

# ‘TA0‘: ‘PTA‘,

# ‘OI0‘: ‘菜油‘,

# ‘RS0‘: ‘菜籽‘,

# ‘RM0‘: ‘菜粕‘,

# ‘ZC0‘: ‘动力煤‘,

# ‘WH0‘: ‘强麦‘,

# ‘JR0‘: ‘粳稻‘,

# ‘SR0‘: ‘白糖‘,

# ‘CF0‘: ‘棉花‘,

# ‘RI0‘: ‘早籼稻‘,

# ‘MA0‘: ‘甲醇‘,

# ‘FG0‘: ‘玻璃‘,

# ‘LR0‘: ‘晚籼稻‘,

# ‘SF0‘: ‘硅铁‘,

# ‘SM0‘: ‘锰硅‘,

# ‘CY0‘: ‘棉纱‘,

# ‘AP0‘: ‘苹果‘,

# ‘CJ0‘: ‘红枣‘,

# ‘V0‘: ‘PVC‘,

# ‘P0‘: ‘棕榈油‘,

# ‘B0‘: ‘豆二‘,

# ‘M0‘: ‘豆粕‘,

# ‘I0‘: ‘铁矿石‘,

# ‘JD0‘: ‘鸡蛋‘,

# ‘L0‘: ‘塑料‘,

# ‘PP0‘: ‘聚丙烯‘,

# ‘FB0‘: ‘纤维板‘,

# ‘BB0‘: ‘胶合板‘,

# ‘Y0‘: ‘豆油‘,

# ‘C0‘: ‘玉米‘,

# ‘A0‘: ‘豆一‘,

# ‘J0‘: ‘焦炭‘,

# ‘JM0‘: ‘焦煤‘,

# ‘CS0‘: ‘淀粉‘,

# ‘EG0‘: ‘乙二醇‘,

# ‘FU0‘: ‘燃料油‘,

# ‘SC0‘: ‘上海原油‘,

# ‘AL0‘: ‘铝‘,

# ‘RU0‘: ‘天然橡胶‘,

# ‘ZN0‘: ‘沪锌‘,

# ‘CU0‘: ‘铜‘,

# ‘AU0‘: ‘黄金‘,

# ‘RB0‘: ‘螺纹钢‘,

# ‘WR0‘: ‘线材‘,

# ‘PB0‘: ‘铅‘,

# ‘AG0‘: ‘白银‘,

# ‘BU0‘: ‘沥青‘,

# ‘HC0‘: ‘热轧卷板‘,

# ‘SN0‘: ‘锡‘,

# ‘NI0‘: ‘镍‘,

# ‘SP0‘: ‘纸浆‘}

symbol_dict = {

‘TA0‘: ‘PTA‘,

‘OI0‘: ‘菜油‘,

‘RM0‘: ‘菜粕‘,

‘WH0‘: ‘强麦‘,

‘SR0‘: ‘白糖‘,

‘CF0‘: ‘棉花‘,

‘MA0‘: ‘甲醇‘,

‘FG0‘: ‘玻璃‘,

‘SM0‘: ‘锰硅‘,

‘V0‘: ‘PVC‘,

‘P0‘: ‘棕榈油‘,

‘M0‘: ‘豆粕‘,

‘I0‘: ‘铁矿石‘,

‘JD0‘: ‘鸡蛋‘,

‘L0‘: ‘塑料‘,

‘PP0‘: ‘聚丙烯‘,

‘FB0‘: ‘纤维板‘,

‘BB0‘: ‘胶合板‘,

‘Y0‘: ‘豆油‘,

‘C0‘: ‘玉米‘,

‘A0‘: ‘豆一‘,

‘J0‘: ‘焦炭‘,

‘JM0‘: ‘焦煤‘,

‘CS0‘: ‘淀粉‘,

‘AL0‘: ‘铝‘,

‘RU0‘: ‘天然橡胶‘,

‘ZN0‘: ‘沪锌‘,

‘CU0‘: ‘铜‘,

‘AU0‘: ‘黄金‘,

‘RB0‘: ‘螺纹钢‘,

‘PB0‘: ‘铅‘,

‘AG0‘: ‘白银‘,

‘HC0‘: ‘热轧卷板‘}

# 将期货字典的keys和values排序后，分别存入symbol_list和names备用

symbol_list, names = np.array(sorted(symbol_dict.items())).T

# 将symbol从np.array转为list

symbol_list = list(symbol_list)

# 创建期货数据表的函数

def creat_future_table(symbol):

future_code = symbol

# 向新浪财经api请求历史数据

url_str = (‘http://stock2.finance.sina.com.cn/futures/api/json.php/IndexService.getInnerFuturesDailyKLine?symbol=‘ +

future_code)

r = requests.get(url_str)

daily_klines = r.json()

daily_klines_lists = list(daily_klines)

# 数据写入数据库，表格名为期货代码

future_db = pymysql.connect(‘localhost‘, ‘IVIVI_PLUS‘, ‘123456‘, ‘ClusteringFutures‘)

cursor = future_db.cursor()

# 如果已存在该期货表格，则删除

cursor.execute("DROP TABLE IF EXISTS " + future_code)

# 创建表格，“交易日期”（主键）、开盘价、最高价、最低价、收盘价、成交量

sql = "CREATE TABLE " + future_code + """ (

price_date DATE NOT NULL PRIMARY KEY,

open_price DECIMAL(19, 4),

high_price DECIMAL(19, 4),

low_price DECIMAL(19, 4),

close_price DECIMAL(19, 4),

volume BIGINT)"""

# 运行创建表格的sql语句

cursor.execute(sql)

# 插入数据（ps:新浪api返回数据转字符串时会多余‘[‘和‘]‘，所以进行切片）

for i in daily_klines_lists:

sql = "INSERT INTO " + future_code + """ (price_date, open_price, high_price, low_price, close_price, volume)

VALUES (""" + str(i)[1:-1] + ")"

try:

# 执行sql语句

cursor.execute(sql)

# 提交到数据库执行

future_db.commit()

except:

print(‘rollback table %s‘ %s (future_code))

# 如果发生错误则回滚

future_db.rollback()

# 关闭数据库连接

future_db.close()

# 删除表格中垃圾数据行的函数

def delete_rows(symbol):

future_code = symbol

future_db = pymysql.connect(‘localhost‘, ‘IVIVI_PLUS‘, ‘123456‘, ‘ClusteringFutures‘)

cursor = future_db.cursor()

# 简单认为，开盘价或收盘价或成交量是0的行为垃圾数据

sql = "DELETE FROM " + future_code + " WHERE open_price = 0 OR close_price = 0 OR volume = 0"

cursor.execute(sql)

future_db.commit()

future_db.close()

# 返回一个表格中，全部交易日期的函数

def select_columns(symbol):

future_code = symbol

future_db = pymysql.connect(‘localhost‘, ‘IVIVI_PLUS‘, ‘123456‘, ‘ClusteringFutures‘)

cursor = future_db.cursor()

sql = "SELECT price_date FROM " + future_code

cursor.execute(sql)

r = cursor.fetchall()

future_db.close()

return r

# 查询某期货某一天的某项数据的函数

def select_a_data(symbol, date, data_name):

future_code = symbol

future_db = pymysql.connect(‘localhost‘, ‘IVIVI_PLUS‘, ‘123456‘, ‘ClusteringFutures‘)

cursor = future_db.cursor()

sql = "SELECT %s FROM " % data_name + future_code + " WHERE price_date = \‘" + date + "\‘"

cursor.execute(sql)

r = cursor.fetchall()

future_db.close()

return r

print(‘对 %d 只期货进行聚类‘ % len(symbol_list))

# 对symbol_list中的所有期货创建数据表，并删除垃圾行

for s in symbol_list:

print(symbol_dict[s])

creat_future_table(s)

delete_rows(s)

# 找出symbol_list中的所有期货的共有交易日

date_set_list = [] # 交易日组成的集合的列表（为了方便后续的求交集工作，所以转为集合）

for s in symbol_list:

r = select_columns(s)

date_set_list.append(set(r))

com_date_set = date_set_list[0] # 共有交易日集合，初始等于第一支期货的交易日集合

# 对date_set_list中的全部日期集合求交集，获得共有交易日

for i in range(len(symbol_list)):

com_date_set = com_date_set.intersection(date_set_list[i])

com_date_list = list(com_date_set)

com_date_list.sort()

# 将共有交易日以字符串形式保存，方便后续使用

str_com_date_list = []

for i in com_date_list:

str_com_date_list.append(str(i[0]))

# 打印共有交易日天数

print(‘将要聚类的期货共有 %d 条有效交易数据‘ % len(str_com_date_list))

# 将共有交易日的收盘价和开盘价转为np.array格式

# 转置后每一列是同一个期货每个交易日的收盘价或开盘价

# 行数为交易日天数，列数为期货数量

close_prices_list = []

for s in symbol_list:

tmp_list = []

for date in str_com_date_list:

tmp_list.append(select_a_data(s, date, ‘close_price‘)[0][0])

else:

close_prices_list.append(tmp_list)

close_prices = np.array(close_prices_list).T

open_prices_list = []

for s in symbol_list:

tmp_list = []

for date in str_com_date_list:

tmp_list.append(select_a_data(s, date, ‘open_price‘)[0][0])

else:

open_prices_list.append(tmp_list)

open_prices = np.array(open_prices_list).T

# 计算每个期货的每个单日涨跌幅数组作为模型输入

X = (close_prices - open_prices) / open_prices * 100

# 查看输入形状

print(‘验证输入数据形状‘)

print(X.shape)

# 使用sk_learn预设模型进行聚类

edge_model = covariance.GraphicalLassoCV()

edge_model.fit(X)

_, labels = cluster.affinity_propagation(edge_model.covariance_)

n_labels = labels.max()

# #############################################################################

# Find a low-dimension embedding for visualization: find the best position of

# the nodes (the stocks) on a 2D plane

# We use a dense eigen_solver to achieve reproducibility (arpack is

# initiated with random vectors that we don‘t control). In addition, we

# use a large number of neighbors to capture the large-scale structure.

node_position_model = manifold.LocallyLinearEmbedding(

n_components=2, eigen_solver=‘dense‘, n_neighbors=6)

embedding = node_position_model.fit_transform(X.T).T

# #############################################################################

# Visualization

plt.rcParams[‘font.sans-serif‘] = [‘TW-Sung‘] # 指定中文字体，解决中文乱码

plt.figure(1, facecolor=‘w‘, figsize=(10, 8))

plt.clf()

ax = plt.axes([0., 0., 1., 1.])

plt.axis(‘off‘)

# Display a graph of the partial correlations

partial_correlations = edge_model.precision_.copy()

d = 1 / np.sqrt(np.diag(partial_correlations))

partial_correlations *= d

partial_correlations *= d[:, np.newaxis]

non_zero = (np.abs(np.triu(partial_correlations, k=1)) > 0.02)

# Plot the nodes using the coordinates of our embedding

plt.scatter(embedding[0], embedding[1], s=100 * d ** 2, c=labels,

cmap=plt.cm.nipy_spectral)

爬虫的编写方法

标签：wax ram 上海 ipy rollback 训练 sklearn ctrl dai

原文地址：https://www.cnblogs.com/ihdci/p/12713804.html

踩

(0)

评论一句话评论（0）

分享档案

更多>

2021年07月29日 (22)
2021年07月28日 (40)
2021年07月27日 (32)
2021年07月26日 (79)
2021年07月23日 (29)
2021年07月22日 (30)
2021年07月21日 (42)
2021年07月20日 (16)
2021年07月19日 (90)
2021年07月16日 (35)

周排行