from math import log
import operator  # needed by majorityCnt for itemgetter

# Compute the Shannon entropy of the dataset, using the class label in the last column
def calcShannonEnt(dataSet):
    numEntries = len(dataSet)
    labelCounts = {}
    for featVec in dataSet:
        currentLabel = featVec[-1]
        if currentLabel not in labelCounts.keys():
            labelCounts[currentLabel] = 0
        labelCounts[currentLabel] += 1
    shannonEnt = 0.0
    for key in labelCounts:
        prob = float(labelCounts[key]) / numEntries
        shannonEnt -= prob * log(prob, 2)
    return shannonEnt

def creatDataSet():
    dataSet = [[1, 1, 'yes'], [1, 1, 'yes'], [1, 0, 'no'], [0, 1, 'no'], [0, 1, 'no']]
    labels = ['no surfacing', 'flippers']
    return dataSet, labels

def splitDataSet(dataSet, axis, value):
    retDataSet = []  # new list of rows matching the given feature value, with that column removed
    for featVec in dataSet:
        if featVec[axis] == value:
            reducedFeatVec = featVec[:axis]
            reducedFeatVec.extend(featVec[axis+1:])
            retDataSet.append(reducedFeatVec)
    return retDataSet

# Choose the best feature: the split that leaves the lowest entropy (largest information gain)
def chooseBestFeatureToSplit(dataSet):
    numFeatures = len(dataSet[0]) - 1
    baseEntropy = calcShannonEnt(dataSet)  # baseline entropy, from the class label in the last column
    bestInfoGain = 0.0; bestFeature = -1
    for i in range(numFeatures):
        featList = [example[i] for example in dataSet]  # all values of feature i
        uniqueVals = set(featList)  # a set removes duplicates
        newEntropy = 0.0
        for value in uniqueVals:
            subDataSet = splitDataSet(dataSet, i, value)
            prob = len(subDataSet) / float(len(dataSet))  # probability that feature i equals value
            newEntropy += prob * calcShannonEnt(subDataSet)
        infoGain = baseEntropy - newEntropy
        if infoGain > bestInfoGain:
            bestInfoGain = infoGain
            bestFeature = i
    return bestFeature

# Return the class label that occurs most often (majority vote)
def majorityCnt(classList):
    classCount = {}
    for vote in classList:
        if vote not in classCount.keys():
            classCount[vote] = 0
        classCount[vote] += 1
    sortedClassCount = sorted(classCount.items(), key=operator.itemgetter(1), reverse=True)
    return sortedClassCount[0][0]

# Build the decision tree recursively
def createTree(dataSet, labels):
    classList = [example[-1] for example in dataSet]
    if classList.count(classList[0]) == len(classList):  # all classes identical: stop splitting
        return classList[0]
    if len(dataSet[0]) == 1:  # no features left: return the majority class
        return majorityCnt(classList)
    bestFeat = chooseBestFeatureToSplit(dataSet)  # index of the best feature to split on
    bestFeatLabel = labels[bestFeat]
    myTree = {bestFeatLabel: {}}
    del(labels[bestFeat])  # remove this feature's label
    featValues = [example[bestFeat] for example in dataSet]
    uniqueVals = set(featValues)  # set of values the best feature takes
    for value in uniqueVals:
        subLabels = labels[:]
        myTree[bestFeatLabel][value] = createTree(splitDataSet(dataSet, bestFeat, value), subLabels)
    return myTree
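A quick usage check (a minimal sketch that uses only the functions defined above; the values in the comments are what this toy dataset produces):

myDat, labels = creatDataSet()
print(calcShannonEnt(myDat))            # entropy of the toy dataset, about 0.971
print(chooseBestFeatureToSplit(myDat))  # 0, i.e. 'no surfacing' gives the largest information gain
myTree = createTree(myDat, labels[:])   # pass a copy: createTree deletes entries from the labels list
print(myTree)  # {'no surfacing': {0: 'no', 1: {'flippers': {0: 'no', 1: 'yes'}}}}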
Original article: http://blog.csdn.net/li_chihang/article/details/44965279