有监督的kNN近邻算法:
(1)计算已知类别数据集中的点与当前点之间的距离
(2)按照距离递增次序排序
(3)选取与当前点距离最小的k个点
(4)确定前k个点所在类别的出现频率
(5)返回前k个点出现频率最高的类别作为当前点的预测分类
#数据样例
1 2:a
1 3:a
1000 10000:d
#版本0:纯python
"kNN" from math import sqrt from collections import Counter distance=lambda a,b:sqrt(sum(map(lambda ai,bi:pow(ai-bi,2),a,b))) if len(a)==len(b) else "Error0:data length match fail" distance2=lambda a,b:distance([int(i) for i in a.split()],[int(i) for i in b.split()]) # for strings #print(distance2('1 2 4 7 8','2 5 5 6 110')) readData=lambda file:{line.split(':')[0]:line.strip().split(':')[1] for line in open(file)} #print(readData()) def judgeSpot(fileIn='test0.txt',x='1 2',num=5): distanceDict,data={},readData(fileIn) for k in data: distanceDict[str(distance2(x,k))]=data[k] # sortDistance=sorted(distanceDict.items(),key=lambda x:float(x[0]))[:num] # kindDict=[item[1] for item in sortDistance] return sorted(dict(Counter(item[1] for item in sorted(distanceDict.items(),key=lambda x:float(x[0]))[:num])).items(),key=lambda x:x[1],reverse=True)[0][0] #print(judgeSpot('1000 10000','test0.txt'),) def judgeSpot2(dataIn,x='1 2',num=5): distanceDict,data={},dataIn for k in data: distanceDict[str(distance2(x,k))]=data[k] # sortDistance=sorted(distanceDict.items(),key=lambda x:float(x[0]))[:num] # kindDict=[item[1] for item in sortDistance] return sorted(dict(Counter(item[1] for item in sorted(distanceDict.items(),key=lambda x:float(x[0]))[:num])).items(),key=lambda x:x[1],reverse=True)[0][0] print(judgeSpot('test0.txt','1000 10000'),) #Rate of Right def rateRight(fileIn='test0.txt',num=5): countRight,data=0,readData(fileIn) for k in data: if judgeSpot2(data,k,num)==data[k]: countRight+=1 return countRight/float(len(open(fileIn).readlines())) print(rateRight())
搜索
复制
原文地址:http://blog.csdn.net/awsxsa/article/details/45955871