标签:
# -*- coding: utf-8 -*-
import sys
import os
import time
from numpy import *
import numpy as np
import matplotlib.pyplot as plt
import operator
from test1 import *


def cosdist(vector1, vector2):
    """Return the cosine of the angle between two vectors.

    This is a *similarity*, not a distance: the larger the value, the
    closer the two vectors point in the same direction.

    Args:
        vector1: First vector (anything ``dot`` / ``linalg.norm`` accept).
        vector2: Second vector, same length as ``vector1``.

    Returns:
        ``dot(v1, v2) / (|v1| * |v2|)``, or ``0.0`` when either vector has
        zero norm (the cosine is undefined there and the plain division
        would yield ``nan``).
    """
    denominator = linalg.norm(vector1) * linalg.norm(vector2)
    if denominator == 0:
        return 0.0
    return dot(vector1, vector2) / denominator


def classify(testdata, trainSet, listClasses, k):
    """Classify ``testdata`` by majority vote among its k nearest neighbours.

    Nearness is measured with ``cosdist``, i.e. cosine similarity, so the
    neighbours are the training samples with the *highest* scores.

    Args:
        testdata: Feature vector of the sample to classify.
        trainSet: Sized, iterable collection of training feature vectors.
        listClasses: Class labels; ``listClasses[i]`` is the label of
            ``trainSet[i]``.
        k: Number of neighbours that vote. Values larger than the number
            of training samples are clamped to that number.

    Returns:
        The label that received the most votes.

    Raises:
        IndexError: If no vote is cast (empty ``trainSet`` or ``k < 1``).
    """
    dataSetSize = len(trainSet)

    # Score every training sample against the test sample.
    similarities = zeros(dataSetSize)
    for i, sample in enumerate(trainSet):
        similarities[i] = cosdist(testdata, sample)

    # argsort is ascending, so negate the scores to rank the most similar
    # samples first. A stable sort keeps equal scores in training-set
    # order, which makes the result deterministic.
    ranked = argsort(-similarities, kind="mergesort")

    classcount = {}
    for rank in range(min(k, dataSetSize)):
        votelabel = listClasses[ranked[rank]]
        classcount[votelabel] = classcount.get(votelabel, 0) + 1

    sortedclasscount = sorted(
        classcount.items(), key=operator.itemgetter(1), reverse=True
    )
    return sortedclasscount[0][0]


# Demo: classify the 4th training document against the whole training set.
# NOTE(review): loadDataSet and NBayes are presumably provided by the
# star import from test1 -- confirm.
dataSet, listClasses = loadDataSet()
nb = NBayes()
nb.train_set(dataSet, listClasses)
k = 3
print(classify(nb.tf[3], nb.tf, listClasses, k))
标签:
原文地址:http://www.cnblogs.com/caicaihong/p/5769759.html