注意:1、运行前请先删除代码中的注释,否则程序会报错。
2、代码中使用的数据集来源于 http://archive.ics.uci.edu/ml/datasets/Car+Evaluation
3、关于朴素贝叶斯的原理,可以查看我之前的博客。
# Author  : Wenxiang Cui
# Date    : 2015/9/11
# Function: A classifier that uses the naive Bayesian algorithm
import math


class Bayesian:
    """Building blocks of a naive Bayes classifier.

    Holds the training samples and provides the class priors and the
    per-attribute class-conditional estimates (discrete or continuous).
    """

    def __init__(self):
        self.dataS = []       # training samples; each row is a list of string fields
        self.attriList = []   # attribute names
        self.desClass = 0     # column index of the class label inside each row
        self.PriorP = {}      # class label -> prior probability (set by calPriorProb)
        self.classLabel = []  # class labels in order of first appearance

    def loadDataS(self, fileName, decollator):
        """Read the training samples from a delimited text file into ``dataS``.

        Args:
            fileName: Path of the data file.
            decollator: Field separator used in the file (e.g. ``','`` or ``' '``).
        """
        rows = []
        with open(fileName, 'r') as fp:
            for line in fp:
                line = line.strip('\r\n')
                # A trailing empty line must not become a one-field sample.
                if not line.strip():
                    continue
                rows.append(line.split(decollator))
        self.dataS = rows

    def getAttriList(self, attributes):
        """Store a copy of the attribute names of the training data.

        Args:
            attributes: Attribute names, in the same order as the data columns.
        """
        self.attriList = attributes[:]

    def getDesClass(self, loca):
        """Record which column of each sample holds the class label.

        Args:
            loca: Zero-based column index of the class label.
        """
        self.desClass = loca

    def calPriorProb(self):
        """Compute the prior probability of every class found in ``dataS``.

        Sets ``self.PriorP`` (label -> probability) and ``self.classLabel``
        (labels in order of first appearance).
        """
        dictFreq = {}   # class label -> number of samples
        desLabel = []
        for items in self.dataS:
            label = items[self.desClass]
            if label not in dictFreq:
                dictFreq[label] = 0
                desLabel.append(label)
            dictFreq[label] += 1

        sampleNum = len(self.dataS)
        self.PriorP = {
            label: float(dictFreq[label]) / sampleNum for label in desLabel
        }
        self.classLabel = desLabel

    def calProb(self, type, loca):
        """Estimate the class-conditional distribution of one attribute.

        Args:
            type: ``'continuous'`` or ``'discrete'``.
            loca: Column index of the attribute inside each sample.

        Returns:
            For ``'continuous'``: a list with one ``[mean, deviation]`` pair
            per class, ordered like ``self.classLabel``.
            For ``'discrete'``: a dict ``{class label: {value: probability}}``.
            For any other ``type``: prints ``Wrong type!`` and returns ``None``.
        """
        # The class labels are needed below; compute them on first use.
        if not self.classLabel:
            self.calPriorProb()

        if type == 'continuous':
            # One bucket of attribute values per class.
            dictData = [[] for _ in self.classLabel]
            for items in self.dataS:
                dataIndex = self.classLabel.index(items[self.desClass])
                dictData[dataIndex].append(float(items[loca]))

            # Sample mean and standard deviation for each class.
            dictPara = []
            for values in dictData:
                meanVal, deviation = self.calParam(values)
                dictPara.append([meanVal, deviation])
            return dictPara

        elif type == 'discrete':
            dictFreq = {label: {} for label in self.classLabel}
            values = []   # every distinct attribute value, in order of appearance
            for items in self.dataS:
                value = items[loca]
                if value not in values:
                    values.append(value)
                counts = dictFreq[items[self.desClass]]
                # Count per class: a value may be new for this class even
                # though another class has already seen it.
                counts[value] = counts.get(value, 0) + 1

            dictProb = {}
            for label in self.classLabel:
                counts = dictFreq[label]
                needLaplace = False
                for value in values:
                    if value not in counts:
                        counts[value] = 0
                        needLaplace = True
                if needLaplace:
                    # Laplace smoothing removes zero conditional probabilities.
                    counts = self.LaplaceEstimator(counts)
                # Normalise by the (possibly smoothed) class total so the
                # probabilities of one class sum to 1.
                total = sum(counts.values())
                dictProb[label] = {
                    value: float(count) / total
                    for value, count in counts.items()
                }
            return dictProb

        else:
            print('Wrong type!')

    def calParam(self, souList):
        """Return the mean and sample standard deviation of a list of numbers.

        Args:
            souList: Non-empty list of numbers.

        Returns:
            ``(meanVal, deviation)``; the deviation uses the ``n - 1``
            denominator and is ``0.0`` for a single-element list.

        Raises:
            ValueError: If ``souList`` is empty.
        """
        count = len(souList)
        if count == 0:
            raise ValueError('calParam() requires at least one value')

        meanVal = sum(souList) / float(count)
        if count == 1:
            # The n - 1 denominator is undefined for a single observation.
            return meanVal, 0.0

        tempt = 0
        for val in souList:
            tempt += (val - meanVal) ** 2
        deviation = math.sqrt(float(tempt) / (count - 1))
        return meanVal, deviation

    def LaplaceEstimator(self, souDict):
        """Return a copy of a frequency dict with every count increased by 1.

        Args:
            souDict: Mapping of value -> count.

        Returns:
            A new dict with the smoothed counts; ``souDict`` is not modified.
        """
        return {key: count + 1 for key, count in souDict.items()}


class CarBayesian(Bayesian):
    """Naive Bayes classifier for the six discrete attributes of the car data."""

    def __init__(self):
        Bayesian.__init__(self)
        self.buying = {}
        self.maint = {}
        self.doors = {}
        self.persons = {}
        self.lug_boot = {}
        self.safety = {}
        self.Prob = []   # per-attribute conditional tables, in column order

    def tranning(self):
        """Train on ``dataS``: compute the priors and the six conditional tables."""
        # Priors and class labels must exist before the conditional tables.
        self.calPriorProb()
        self.buying = self.calProb('discrete', 0)
        self.maint = self.calProb('discrete', 1)
        self.doors = self.calProb('discrete', 2)
        self.persons = self.calProb('discrete', 3)
        self.lug_boot = self.calProb('discrete', 4)
        self.safety = self.calProb('discrete', 5)
        self.Prob = [
            self.buying,
            self.maint,
            self.doors,
            self.persons,
            self.lug_boot,
            self.safety,
        ]

    def classify(self, sample):
        """Print and return the most probable class of one sample.

        Args:
            sample: Attribute values in column order. A trailing class label
                (a full data row) is tolerated and ignored.

        Returns:
            The class label with the highest posterior score; on ties the
            label that appeared first in the training data wins.

        Raises:
            RuntimeError: If ``tranning()`` has not been called yet.
            KeyError: If the sample holds a value never seen during training.
        """
        if not self.classLabel or not self.Prob:
            raise RuntimeError('classify() called before tranning()')

        bestLabel = None
        bestVal = None
        for label in self.classLabel:
            posterior = self.PriorP[label]
            # zip() stops at the shorter sequence, so every attribute of a
            # label-free sample is used and an extra label field is skipped.
            for table, value in zip(self.Prob, sample):
                posterior *= table[label][value]
            if bestVal is None or posterior > bestVal:
                bestVal = posterior
                bestLabel = label

        print("该样本属于的类别是: %s" % bestLabel)
        return bestLabel


if __name__ == '__main__':
    # Raw string: the backslashes of the Windows path are literal characters.
    filename = r"D:\MyDocuments-HnH\DataMining\DataSets\Car\Car_Data.txt"
    MyCar = CarBayesian()
    MyCar.loadDataS(filename, ',')
    attributes = ['buying', 'maint', 'doors', 'persons', 'lug_boot', 'safety']
    MyCar.getAttriList(attributes)
    MyCar.getDesClass(7 - 1)
    MyCar.tranning()
    sample = ['vhigh', 'vhigh', '2', '2', 'small', 'low']
    MyCar.classify(sample)
本文出自 “路遥” 博客,请务必保留此出处http://cwxfly.blog.51cto.com/6113982/1694356
原文地址:http://cwxfly.blog.51cto.com/6113982/1694356