使用KNN分类器的手写识别系统,只能识别数字0到9。
需要识别的数字使用图形处理软件,处理成具有同样的色彩和大小:宽高是32像素x32像素的黑白图像。虽然采用文本格式存储图像不能有效地利用内存空间,但为了方便理解,这里已经将图像转换为文本格式。训练数据中每一个数字大概有200个样本。程序中将图像样本格式化处理为向量,即把一个32x32的二进制图像矩阵转换为一个1x1024的向量。
from numpy import *
import operator
from os import listdir
import pdb

try:
    import matplotlib
    import matplotlib.pyplot as plt
except ImportError:
    # Plotting is optional: only plotscattter() needs matplotlib, so the
    # classifier itself stays importable on machines without it.
    matplotlib = None
    plt = None

try:
    _read_input = raw_input  # Python 2
except NameError:
    _read_input = input  # Python 3


def classify0(inX, dataSet, labels, k=3):
    """Classify ``inX`` by majority vote among its k nearest training rows.

    Args:
        inX: feature vector to classify (same width as a row of dataSet).
        dataSet: 2-D numpy array, one training sample per row.
        labels: sequence of class labels, labels[i] belongs to dataSet[i].
        k: number of neighbours that vote; clamped to the number of rows.

    Returns:
        The label with the most votes among the k closest rows
        (Euclidean distance).
    """
    dataSetSize = dataSet.shape[0]
    # Asking for more neighbours than samples used to raise IndexError.
    if k > dataSetSize:
        k = dataSetSize
    # Broadcasting subtracts inX from every row at once.
    diffMat = asarray(inX) - dataSet
    distances = ((diffMat ** 2).sum(axis=1)) ** 0.5
    # argsort is ascending, so the first k indices are the nearest rows.
    sortedDistIndicies = distances.argsort()
    classCount = {}  # label -> number of votes
    for i in range(k):
        voteIlabel = labels[sortedDistIndicies[i]]
        classCount[voteIlabel] = classCount.get(voteIlabel, 0) + 1
    # Sort by vote count, descending; items() works on Python 2 and 3.
    sortedClassCount = sorted(classCount.items(),
                              key=operator.itemgetter(1), reverse=True)
    return sortedClassCount[0][0]


def file2matrix(filename):
    """Load a tab-separated data file into a feature matrix and label list.

    The first three columns of every line are read as float features and
    the last column as an integer class label. Blank lines are skipped.

    Returns:
        (returnMat, classLabelVector): an (n, 3) float array and a list of
        n ints.
    """
    # The with-block closes the file even when a line fails to parse.
    with open(filename) as fr:
        rows = [line.strip().split('\t') for line in fr if line.strip()]
    returnMat = zeros((len(rows), 3))
    classLabelVector = []
    for index, listFromLine in enumerate(rows):
        returnMat[index, :] = [float(v) for v in listFromLine[0:3]]
        classLabelVector.append(int(listFromLine[-1]))
    return returnMat, classLabelVector


def plotscattter():
    """Scatter-plot columns 0 and 1 of datingTestSet2.txt.

    Marker size and colour are both 15 * class label.

    Raises:
        ImportError: if matplotlib is not installed.
    """
    if plt is None:
        raise ImportError('matplotlib is required for plotscattter()')
    datingDataMat, datingLabels = file2matrix('datingTestSet2.txt')
    fig = plt.figure()
    # One axes is enough; the extra axes created on the same 111 slot were
    # never drawn on.
    ax1 = fig.add_subplot(111)
    scaled = 15.0 * array(datingLabels)
    ax1.scatter(datingDataMat[:, 0], datingDataMat[:, 1], scaled, scaled)
    plt.show()


def autoNorm(dataSet):
    """Rescale every column of ``dataSet`` linearly to the range [0, 1].

    Returns:
        (normDataSet, ranges, minVals) where
        normDataSet = (dataSet - minVals) / ranges, column by column.

    Note: a constant column has range 0, so its normalised values are nan.
    """
    minVals = dataSet.min(0)
    maxVals = dataSet.max(0)
    ranges = maxVals - minVals
    # Broadcasting applies the per-column offset and scale to every row.
    normDataSet = (dataSet - minVals) / ranges
    return normDataSet, ranges, minVals


def datingClassTest(hoRatio=0.20):
    """Print the hold-out error rate of classify0 on datingTestSet2.txt.

    The first ``hoRatio`` fraction of rows (20% by default) is used as the
    test set; the remaining rows are the training set.
    """
    datingDataMat, datingLabels = file2matrix('datingTestSet2.txt')
    normMat, ranges, minVals = autoNorm(datingDataMat)
    m = normMat.shape[0]
    numTestVecs = int(m * hoRatio)
    errorCount = 0.0
    for i in range(numTestVecs):
        classifierResult = classify0(normMat[i, :], normMat[numTestVecs:m, :],
                                     datingLabels[numTestVecs:m], 3)
        print("the classifier came back with: %d, the real answer is: %d"
              % (classifierResult, datingLabels[i]))
        if classifierResult != datingLabels[i]:
            errorCount += 1.0
    print("the total error rate is: %.2f%%"
          % (100 * errorCount / float(numTestVecs)))
    print('testcount is %s, errorCount is %s' % (numTestVecs, errorCount))


def classifyPerson():
    """Ask for one person's features, classify them, and record the result.

    Reads three numbers from standard input, classifies them against
    datingTestSet2.txt, prints the verdict and appends the new sample with
    its predicted label to datingTestSet2.txt.
    """
    resultlist = ['not at all', 'little doses', 'large doses']
    percentTats = float(_read_input(
        'input the person\' percentage of time playing video games:'))
    ffMiles = float(_read_input('flier miles in a year:'))
    iceCream = float(_read_input('amount of iceCream consumed per year:'))
    datingDataMat, datingLabels = file2matrix('datingTestSet2.txt')
    normMat, ranges, minVals = autoNorm(datingDataMat)
    # Normalise the new sample with the training set's offset and scale.
    normPerson = (array([ffMiles, percentTats, iceCream]) - minVals) / ranges
    result = classify0(normPerson, normMat, datingLabels, 3)
    print('you will probably like this guy in: %s' % resultlist[result - 1])
    # Append the newly classified sample to the data file.
    print('update dating DB')
    tmp = '\t'.join([repr(ffMiles), repr(percentTats),
                     repr(iceCream), repr(result)]) + '\n'
    with open('datingTestSet2.txt', 'a') as fr:
        fr.write(tmp)


def img2vector(filename):
    """Read a 32x32 text image of digit characters into a 1024-element vector.

    The first 32 characters of each of the first 32 lines are converted to
    numbers, row by row.

    Returns:
        A 1-D float numpy array of length 1024.
    """
    with open(filename) as fr:
        lines = fr.readlines()
    vector = [int(lines[i][j]) for i in range(32) for j in range(32)]
    return array(vector, dtype=float)


# Backward-compatible alias: the function used to be defined under this
# name while handwritingClassTest() called img2vector().
img2file = img2vector


def handwritingClassTest():
    """Print the error rate of classify0 on the handwritten-digit files.

    Training images are read from the ``trainingDigits`` directory and test
    images from ``testDigits``. The true class of a file is the integer
    before the first underscore of its name.
    """
    hwLabels = []
    trainingFileList = listdir('trainingDigits')  # load the training set
    m = len(trainingFileList)
    trainingMat = zeros((m, 1024))
    for i in range(m):
        fileNameStr = trainingFileList[i]
        fileStr = fileNameStr.split('.')[0]  # take off .txt
        classNumStr = int(fileStr.split('_')[0])
        hwLabels.append(classNumStr)
        trainingMat[i, :] = img2vector('trainingDigits/%s' % fileNameStr)
    testFileList = listdir('testDigits')  # iterate through the test set
    errorCount = 0.0
    mTest = len(testFileList)
    for i in range(mTest):
        fileNameStr = testFileList[i]
        fileStr = fileNameStr.split('.')[0]  # take off .txt
        classNumStr = int(fileStr.split('_')[0])
        vectorUnderTest = img2vector('testDigits/%s' % fileNameStr)
        classifierResult = classify0(vectorUnderTest, trainingMat, hwLabels, 3)
        print("the classifier came back with: %d, the real answer is: %d"
              % (classifierResult, classNumStr))
        if classifierResult != classNumStr:
            errorCount += 1.0
    print("\nthe total number of errors is: %d" % errorCount)
    print("\nthe total error rate is: %f" % (errorCount / float(mTest)))


if __name__ == '__main__':
    datingClassTest()
    # handwritingClassTest()
数据点的类别标签是连续值时,应用KNN算法就是回归。其过程与KNN分类算法相同,差别在于对K个邻居的处理上:KNN回归取K个邻居类标签值的加权结果作为新数据点的预测值。加权方法有:K个近邻的属性值的平均值(最差)、以1/d为权重(能有效地衡量邻居的权重,使较近邻居的权重比较远邻居的权重大)、用高斯函数(或者其它适当的减函数)计算权重 = gaussian(distance)(距离越远得到的值就越小),加权可以得到更为准确的估计。K-近邻算法是分类数据最简单最有效的算法,其学习基于实例,使用算法时我们必须有接近实际数据的训练样本数据。K-近邻算法必须保存所有数据集,如果训练数据集非常大,就必须使用大量的存储空间。此外,因为必须对数据集中的每一个数据计算距离值,实际使用时可能非常耗时。
