Python K-Means (with a PCA dimensionality-reduction step)
F:\PythonProject\K-Means
import pandas as pd
import numpy as np
from sklearn.preprocessing import StandardScaler
from sklearn.model_selection import train_test_split
import matplotlib.pyplot as plt
from sklearn.cluster import KMeans

threshold_value = 0.85  # cumulative explained-variance threshold for picking the number of PCs


def main():
    # load data from local files
    df_wine = pd.read_csv('d_1.txt', header=None)
    df_wine2 = pd.read_csv('f_1.txt', header=None)

    # pick out the columns: 0-1 are key columns (column 1 doubles as the class label), 2+ are features
    #x, y, z = df_wine.iloc[:, 2:].values, df_wine.iloc[:, 1:2].values, df_wine.iloc[:, 0:1].values
    x = df_wine.iloc[:, 2:].values
    y = df_wine.iloc[:, 1].values
    z_frame = df_wine.iloc[:, 0:2].values        # keys used to look up names in df_wine2
    z_frame_f = df_wine2.iloc[:, 0:2].values
    label_name_f = df_wine2.iloc[:, 2].values    # display names

    # keep only the first 20 samples
    list_len = 20
    x = x[0:list_len]
    y = y[0:list_len]
    z_frame = z_frame[0:list_len]
    print("{0} {1}".format(len(x), len(y)))

    # split the data into training and test sets
    #x_train, x_test, y_train, y_test = train_test_split(x, y, test_size=0.1, stratify=y, random_state=0)
    x_train = x[0:len(x) - 5]
    y_train = y[0:len(y) - 5]
    x_test = x[len(x) - 5:]   # last 5 rows (the original reused the first 5, which overlap the training set)
    y_test = y[len(y) - 5:]
    print(len(x_train))
    print(x_train)
    print("----------------------------------------")

    # standardize the features to zero mean and unit variance
    sc = StandardScaler()
    x_train_std = sc.fit_transform(x_train)
    x_test_std = sc.transform(x_test)  # reuse the scaler fitted on the training set instead of refitting
    print(len(x_train_std))

    # build the covariance matrix and get its eigenvalues and eigenvectors
    cov_matrix = np.cov(x_train_std.T)
    eigen_val, eigen_vec = np.linalg.eig(cov_matrix)
    print(len(eigen_val))
    print(len(eigen_vec))

    # explained variance ratio
    tot = sum(eigen_val)                                            # sum of all eigenvalues
    var_exp = [(i / tot) for i in sorted(eigen_val, reverse=True)]  # ratios in descending order
    # sample var_exp output:
    # [0.3516026271036254, 0.2154102386841404, 0.09449164581680554,
    #  0.0919054990988971, 0.08265939106635344, 0.055431032435754,
    #  0.04012443059852082, 0.028756191609729642, 0.017827639508716207,
    #  0.011781879332959133, 0.008141811912227535, 0.0018676128322704462]
    cum_var_exp = np.cumsum(var_exp)                                # cumulative variance ratio
    print(cum_var_exp)
    # sample cum_var_exp output:
    # [0.35789126 0.56364606 0.66236146 0.7537545  0.83350328 0.88822259
    #  0.93227841 0.96230417 0.9793677  0.99038737 0.9981856  1.        ]

    # index of the first component at which the cumulative ratio exceeds the threshold
    index_x0 = -1
    for i in range(len(cum_var_exp)):
        if cum_var_exp[i] > threshold_value:
            index_x0 = i
            break
    print("PCA:", index_x0)

    # feature transformation: sort (eigenvalue, eigenvector) pairs in descending order
    eigen_pairs = [(np.abs(eigen_val[i]), eigen_vec[:, i]) for i in range(len(eigen_val))]
    eigen_pairs.sort(key=lambda k: k[0], reverse=True)
    eigen_pairs2 = np.array(eigen_pairs, dtype=object)  # dtype=object because the pairs are ragged
    print(type(eigen_pairs))
    print(type(eigen_pairs2))
    print(len(eigen_pairs))
    print("====================================")

    output_matrix = x
    X = np.array(output_matrix)
    print("---------m----------------")

    # projection matrix W built from the two leading eigenvectors
    w = np.hstack((eigen_pairs[0][1][:, np.newaxis], eigen_pairs[1][1][:, np.newaxis]))
    x_train_pca = x_train_std.dot(w)
    print("-------------------------")

    # scatter plot of the training data in the PC1/PC2 plane, one colour/marker per class
    color = ['r', 'g', 'b']
    marker = ['s', 'x', 'o']
    for i, c, m in zip(np.unique(y_train), color, marker):
        plt.scatter(x_train_pca[y_train == i, 0], x_train_pca[y_train == i, 1], c=c, label=i, marker=m)
    plt.title('Result')
    plt.xlabel('PC1')
    plt.ylabel('PC2')
    plt.legend(loc='lower left')
    plt.show()

    # k-means clustering on the raw feature matrix
    estimator = KMeans(n_clusters=3)       # build the clusterer
    estimator.fit(X)                       # cluster
    label_pred = estimator.labels_         # cluster label of each sample
    center_p = estimator.cluster_centers_  # cluster centers
    print("============ cluster centers ================")
    print(center_p)
    print("============================")
    print(label_pred)

    # plot the k-means result: collect the points of each cluster and look up each
    # sample's name in df_wine2 by matching the two key columns
    # (equivalently, the points alone could be taken with x0 = X[label_pred == 0], etc.;
    #  a simpler variant appended y_train[i] instead of the looked-up name)
    x0, x1, x2 = [], [], []
    y0, y1, y2 = [], [], []
    for i in range(len(label_pred)):
        if label_pred[i] == 0:
            x0.append(X[i])
            index_z_1, index_z_2 = z_frame[i][0], z_frame[i][1]
            for m in range(len(z_frame_f)):
                index_z_f_1, index_z_f_2 = z_frame_f[m][0], z_frame_f[m][1]
                if index_z_f_1 == index_z_1 and index_z_2 == index_z_f_2:
                    index_name1 = label_name_f[m]
                    print("1 {0} {1} {2}".format(index_z_f_1, index_z_2, index_name1))
                    y0.append(index_name1)
        elif label_pred[i] == 1:
            x1.append(X[i])
            index_z_1, index_z_2 = z_frame[i][0], z_frame[i][1]
            for m in range(len(z_frame_f)):
                index_z_f_1, index_z_f_2 = z_frame_f[m][0], z_frame_f[m][1]
                if index_z_f_1 == index_z_1 and index_z_2 == index_z_f_2:
                    index_name1 = label_name_f[m]
                    print("2 {0} {1} {2}".format(index_z_f_1, index_z_2, index_name1))
                    y1.append(index_name1)
        elif label_pred[i] == 2:
            x2.append(X[i])
            index_z_1, index_z_2 = z_frame[i][0], z_frame[i][1]
            for m in range(len(z_frame_f)):
                index_z_f_1, index_z_f_2 = z_frame_f[m][0], z_frame_f[m][1]
                if index_z_f_1 == index_z_1 and index_z_2 == index_z_f_2:
                    index_name1 = label_name_f[m]
                    print("3 {0} {1} {2}".format(index_z_f_1, index_z_2, index_name1))
                    y2.append(index_name1)

    print("=========================================")
    print("\n====1===")
    print(y0)
    print("====2===")
    print(y1)
    print("====3===")
    print(y2)

    x0 = np.array(x0)
    x1 = np.array(x1)
    x2 = np.array(x2)

    # interleave true label, predicted cluster and feature row for inspection
    final_matrix = []
    for i in range(len(y_train)):
        final_matrix.append(y_train[i])
        final_matrix.append(label_pred[i])
        final_matrix.append(x_train[i])
    #print(final_matrix)

    print("\n\n\n\n\n============================")
    print(label_pred)
    print(y_train)
    print("============================")

    # scatter plot of the three clusters (first two raw feature columns)
    plt.scatter(x0[:, 0], x0[:, 1], c="red", marker='o', label='label0')
    plt.scatter(x1[:, 0], x1[:, 1], c="green", marker='*', label='label1')
    plt.scatter(x2[:, 0], x2[:, 1], c="blue", marker='+', label='label2')
    plt.legend(loc=2)
    plt.show()


if __name__ == '__main__':
    main()
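The whole dimensionality-reduction step above (covariance matrix, eigendecomposition, cumulative explained variance, projection) can also be done with sklearn.decomposition.PCA. The following is only a minimal sketch under the same assumptions as the script (file name d_1.txt, two key columns followed by the label and then the features); note that here k-means runs on the PCA scores, whereas the script above clusters the raw feature matrix.

import pandas as pd
from sklearn.preprocessing import StandardScaler
from sklearn.decomposition import PCA
from sklearn.cluster import KMeans

df = pd.read_csv('d_1.txt', header=None)   # assumed layout, as in the script above
features = df.iloc[:, 2:].values

scaled = StandardScaler().fit_transform(features)

# a float n_components keeps just enough components to exceed 85% explained variance,
# mirroring the threshold_value loop in the script
pca = PCA(n_components=0.85)
scores = pca.fit_transform(scaled)
print(pca.explained_variance_ratio_.cumsum())

# cluster in the reduced space
labels = KMeans(n_clusters=3, n_init=10, random_state=0).fit_predict(scores)
print(labels)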
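The nested loops that match each sample's two key columns against f_1.txt to recover its display name run in O(n·m). With pandas the same lookup can be written as a single left join; again this is just a sketch under the assumed layout (columns 0 and 1 are the join keys in both files, column 2 of f_1.txt holds the name).

import pandas as pd

d = pd.read_csv('d_1.txt', header=None).head(20)   # first 20 samples, as in the script
f = pd.read_csv('f_1.txt', header=None)

# keep only the key columns and the name column from f_1.txt, then join on the keys
names_tbl = f.iloc[:, 0:3].rename(columns={2: 'name'})
merged = d.merge(names_tbl, on=[0, 1], how='left')

# each row now carries its name alongside the features, so the per-cluster name lists
# can be built with a simple groupby on the predicted cluster labels
print(merged['name'])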
Original post: https://www.cnblogs.com/herd/p/14785086.html