标签:机器学习
KNN算法实现:
提取文本:
import numpy as np


def loadDataSet(fileName):
    """Load a comma-separated numeric data file.

    Every non-blank line holds the feature values followed by the label in
    the last column. The number of columns is taken from the first data
    line; for each row the first ``numFeat - 1`` fields are read as features
    and the last field as the label.

    Args:
        fileName: Path of the text file to read.

    Returns:
        A tuple ``(xMat, labelMat)`` where ``xMat`` is an ``np.matrix`` of
        the feature values and ``labelMat`` is a list of float labels.

    Raises:
        ValueError: If a field cannot be converted to ``float``.
    """
    dataMat = []
    labelMat = []
    numFeat = None
    # A single pass inside a context manager: the file is closed even when
    # a malformed field raises.
    with open(fileName) as fr:
        for line in fr:
            line = line.strip()
            if not line:
                continue  # tolerate blank lines instead of failing on float('')
            curline = line.split(',')
            if numFeat is None:
                numFeat = len(curline)
            dataMat.append([float(curline[i]) for i in range(numFeat - 1)])
            labelMat.append(float(curline[-1]))
    # np.mat was removed in NumPy 2.0; np.asmatrix is the equivalent call.
    xMat = np.asmatrix(dataMat)
    return xMat, labelMat


def autoNorm(dataSet):
    """Min-max normalise every column of ``dataSet``.

    Each value becomes ``(value - column_min) / (column_max - column_min)``.
    A column whose values are all equal has a zero range; it is divided by 1
    instead, so the column becomes all zeros rather than NaN.

    Args:
        dataSet: 2-D ``np.matrix`` or ``np.ndarray`` with one sample per row.

    Returns:
        The normalised data, with the same shape as ``dataSet``.
    """
    minVals = dataSet.min(0)
    maxVals = dataSet.max(0)
    ranges = maxVals - minVals
    # Guard against division by zero for constant columns.
    safeRanges = np.where(ranges == 0, 1, ranges)
    # Broadcasting replaces the explicit np.tile expansion.
    return (dataSet - minVals) / safeRanges


def classifyDataSet(normDataSet, labelMat, testInterval=50):
    """Split the samples into a training set and a hold-out test set.

    Rows whose index is a multiple of ``testInterval`` go to the test set;
    all other rows go to the training set.

    Args:
        normDataSet: 2-D array-like of samples, one per row.
        labelMat: Sequence of labels, indexed like the rows of
            ``normDataSet``.
        testInterval: Every ``testInterval``-th row (starting with row 0) is
            held out for testing. Defaults to 50.

    Returns:
        A tuple ``(trainData, trainLabels, testData, testLabels)``; the data
        entries are lists of 1-D row arrays, the label entries are lists.
    """
    # np.array copies and converts a matrix into a plain ndarray, so each
    # row taken below is a 1-D array.
    samples = np.array(normDataSet)
    trainData, trainLabels = [], []
    testData, testLabels = [], []
    for j in range(samples.shape[0]):
        if j % testInterval == 0:
            testData.append(samples[j])
            testLabels.append(labelMat[j])
        else:
            trainData.append(samples[j])
            trainLabels.append(labelMat[j])
    return trainData, trainLabels, testData, testLabels
KNN
这里传入的是两个数组(ndarray),不是矩阵(matrix)
import numpy as np
import operator as op


def classify(inX, dataSet, labels, k=7):
    """Classify one sample with the k-nearest-neighbours rule.

    The Euclidean distance from ``inX`` to every row of ``dataSet`` is
    computed, the ``k`` closest rows vote with their labels, and the label
    with the most votes is returned. On a tie the label that reached the
    tied count via the nearest neighbour encountered first wins.

    Args:
        inX: Feature vector of the sample to classify.
        dataSet: 2-D array of training samples, one per row.
        labels: Sequence of labels, indexed like the rows of ``dataSet``.
        k: Number of neighbours that vote. Values larger than the number of
            training samples are clamped to that number.

    Returns:
        The winning label.

    Raises:
        ValueError: If ``dataSet`` has no rows or ``k`` is smaller than 1.
    """
    # Plain ndarrays keep the per-row sum 1-D; an np.matrix would keep it
    # 2-D and break the argsort-based ranking below.
    samples = np.asarray(dataSet)
    dataSetSize = samples.shape[0]
    if dataSetSize == 0:
        raise ValueError("dataSet must contain at least one sample")
    if k < 1:
        raise ValueError("k must be at least 1")
    k = min(k, dataSetSize)  # cannot poll more neighbours than exist

    diffMat = samples - np.asarray(inX)  # broadcasting instead of np.tile
    distances = ((diffMat ** 2).sum(axis=1)) ** 0.5
    sortedDistIndices = distances.argsort()

    classCount = {}
    for idx in sortedDistIndices[:k]:
        voteLabel = labels[idx]
        classCount[voteLabel] = classCount.get(voteLabel, 0) + 1

    # dict.iteritems() is Python 2 only; items() works on both versions.
    # max() returns the first entry holding the highest vote count.
    return max(classCount.items(), key=op.itemgetter(1))[0]
main函数
import numpy as np
import matplotlib
import matplotlib.pyplot as plt

import KNN
import fileOp

# Driver script: load 'donate.txt', normalise it, hold out the test samples,
# classify each of them with KNN (k=5), print predictions next to the true
# labels and show a scatter plot of the normalised data.
fig = plt.figure()
ax = fig.add_subplot(111)

dataMat, labelsMat = fileOp.loadDataSet('donate.txt')
normDataSet = fileOp.autoNorm(dataMat)

classfiDataSet, classfiResultSet, testDataSet, testResultSet = \
    fileOp.classifyDataSet(normDataSet, labelsMat)
# KNN.classify is called with ndarrays, so convert the lists of rows first.
testDataSet = np.array(testDataSet)
classfiDataSet = np.array(classfiDataSet)

result = []
for i in range(testDataSet.shape[0]):
    result.append(
        KNN.classify(testDataSet[i, :], classfiDataSet, classfiResultSet, 5))

# print() with a single argument gives the same output on Python 2 and 3.
print(result)
print(testResultSet)

# Convert to ndarray so the column slices handed to scatter are 1-D.
points = np.asarray(normDataSet)
# The same label-derived value is passed as both marker size and colour.
labelScale = 15.0 * (np.array(labelsMat) + 1)
ax.scatter(points[:, 2], points[:, 3], labelScale, labelScale)
plt.show()
注意:
序列(list)可以增加或删除元素,但没有 shape 属性
数组(ndarray)有 shape、转置等操作,这些操作是基于某个轴进行的。数组支持切片,一般的数据处理用数组即可,矩阵(matrix)主要用于矩阵运算。
np.dot(arr.T,arr)可以用于计算内积
numpy
array和matrix之间的区别:参考http://www.aichengxu.com/view/12902
标签:机器学习
原文地址:http://wukong0716.blog.51cto.com/10442773/1694574