标签:
PCA(主成分分析)的算法步骤如下:
1. 去除平均值
2. 计算协方差矩阵
3. 计算协方差矩阵的特征值和特征向量
4. 将特征值从大到小排序
5. 保留最上面的N个特征向量
6. 将数据转换到上述N个特征向量构建的新空间中
def loadData(filename, delim='\t'):
    """Load a delimited text file of numbers into a NumPy matrix.

    Every non-blank line of the file becomes one row. Each line is stripped
    of surrounding whitespace, split on ``delim`` and every field is
    converted with ``float``.

    Args:
        filename: Path of the text file to read.
        delim: Field separator; defaults to a tab character.

    Returns:
        A ``numpy.matrix`` holding one row per non-blank line.

    Raises:
        ValueError: If a field cannot be converted to ``float``.
    """
    # Imported locally so the function does not depend on a module-level
    # ``from numpy import *``.
    import numpy as np

    # ``with`` closes the file even if reading fails.
    with open(filename) as fr:
        # Blank lines are skipped; they would otherwise fail in float('').
        rows = [line.strip().split(delim) for line in fr if line.strip()]

    # Build real lists: on Python 3 ``map`` returns a lazy iterator, which
    # NumPy would store as an opaque object rather than a row of numbers.
    datArr = [[float(field) for field in row] for row in rows]

    # ``asmatrix`` replaces the ``mat`` alias, which NumPy 2.0 removed.
    return np.asmatrix(datArr)
# =================================
# Input:  dataMat  - the data set, one sample per row
#         topNfeat - optional number of principal components to keep;
#                    when omitted, all components are returned
# Output: the dimension-reduced data and the reconstructed data
# =================================
def pca(dataMat, topNfeat=9999999):
    """Project a data set onto its top principal components.

    The column means are removed, the covariance matrix of the centred data
    is decomposed, and the eigenvectors with the largest eigenvalues are used
    as the new basis.

    Args:
        dataMat: Data set with one sample per row and one feature per
            column. A ``numpy.matrix`` or anything ``numpy.asmatrix``
            accepts.
        topNfeat: Number of leading components to keep. Values larger than
            the number of features keep every component.

    Returns:
        A tuple ``(lowDDataMat, reconMat)`` of ``numpy.matrix`` objects:
        the centred data expressed in the reduced basis, and the data
        mapped back into the original feature space (means added back).

    Raises:
        ValueError: If ``topNfeat`` is negative.
    """
    # Imported locally so the function does not depend on a module-level
    # ``from numpy import *``.
    import numpy as np

    if topNfeat < 0:
        # A negative count would silently slice a meaningless subset.
        raise ValueError("topNfeat must be non-negative, got %r" % (topNfeat,))

    # ``asmatrix`` (the ``mat`` alias was removed in NumPy 2.0) keeps ``*``
    # below a matrix product and lets plain arrays be passed in as well.
    data = np.asmatrix(dataMat)

    meanVals = np.mean(data, axis=0)  # axis=0: mean of every column
    meanRemoved = data - meanVals

    # rowvar=False: columns are variables. atleast_2d covers the
    # single-feature case, where ``cov`` returns a 0-d array.
    covMat = np.atleast_2d(np.cov(meanRemoved, rowvar=False))

    # A covariance matrix is symmetric, so ``eigh`` applies; unlike ``eig``
    # it always yields real eigenvalues and eigenvectors.
    eigVals, eigVects = np.linalg.eigh(covMat)

    # argsort is ascending: reverse it, then keep the largest topNfeat.
    eigValInd = np.argsort(eigVals)[::-1][:topNfeat]
    redEigVects = np.asmatrix(eigVects[:, eigValInd])

    lowDDataMat = meanRemoved * redEigVects  # data in the new basis
    reconMat = (lowDDataMat * redEigVects.T) + meanVals  # back to feature space
    return lowDDataMat, reconMat
# Hard-coded Windows path to the sample data set; adjust it to wherever
# testSet.txt is stored.
filename = r'E:\ml\machinelearninginaction\Ch13\testSet.txt'
dataMat = loadData(filename)
# Reduce the data to its first principal component only.
lowD, reconM = pca(dataMat, 1)
def plotData(dataMat, reconMat):
    """Scatter-plot a data set together with its reconstruction.

    Only the first two columns of each argument are drawn. The original
    data is shown as large triangles and the reconstructed data as small
    red circles on the same axes; the figure is then displayed.

    Args:
        dataMat: Original data. Must support the ``.A`` attribute used
            below (i.e. a ``numpy.matrix``) and have at least two columns.
        reconMat: Reconstructed data with the same requirements.
    """
    # NOTE(review): ``plt`` is presumably matplotlib.pyplot, imported
    # elsewhere in the file; confirm.
    fig = plt.figure()
    ax = fig.add_subplot(111)
    # Original data
    ax.scatter(dataMat[:, 0].flatten().A[0], dataMat[:, 1].flatten().A[0],
               marker='^', s=90)
    # Reconstructed data
    ax.scatter(reconMat[:, 0].flatten().A[0], reconMat[:, 1].flatten().A[0],
               marker='o', s=10, c='red')
    plt.show()
# Re-run PCA keeping two components; this overwrites the lowD / reconM
# produced by the earlier one-component call.
# NOTE(review): plotData is defined above but never called in the visible
# code; presumably plotData(dataMat, reconM) was meant to follow. Confirm.
lowD, reconM = pca(dataMat, 2)
标签:
原文地址:http://www.cnblogs.com/mooba/p/5530577.html