标签:机器学习
ID3算法
#coding=utf-8 from math import log import operator #这里定义个样本集 def createDataSet(): dataSet = [[1, 1, ‘yes‘], [1, 1, ‘yes‘], [1, 0, ‘no‘], [0, 1, ‘no‘], [0, 1, ‘no‘]] labels = [‘no surfacing‘,‘flippers‘] #change to discrete values return dataSet, labels #这里计算该样本的期望值 def calcShannonEnt(dataSet): numEntries = len(dataSet) labelCounts = {} for featVec in dataSet: #the the number of unique elements and their occurance currentLabel = featVec[-1] if currentLabel not in labelCounts.keys(): labelCounts[currentLabel] = 0 labelCounts[currentLabel] += 1 shannonEnt = 0.0 for key in labelCounts: prob = float(labelCounts[key])/numEntries shannonEnt -= prob * log(prob,2) #log base 2 return shannonEnt #这里计算数据集中某列满足相同条件下的其他数值组成的矩阵,用于计算该特征值的期望值 def splitDataSet(dataSet, axis, value): retDataSet = [] for featVec in dataSet: if featVec[axis] == value: reducedFeatVec = featVec[:axis] #chop out axis used for splitting reducedFeatVec.extend(featVec[axis+1:]) retDataSet.append(reducedFeatVec) return retDataSet #针对上个函数算出的矩阵,计算出所有特征值的期望值,然后得出最大的信息增量 def chooseBestFeatureToSplit(dataSet): numFeatures = len(dataSet[0]) - 1 #the last column is used for the labels baseEntropy = calcShannonEnt(dataSet) bestInfoGain = 0.0; bestFeature = -1 for i in range(numFeatures): #iterate over all the features featList = [example[i] for example in dataSet]#create a list of all the examples of this feature uniqueVals = set(featList) #get a set of unique values newEntropy = 0.0 for value in uniqueVals: #这里选取某一特征值得所有可能计算期望值 subDataSet = splitDataSet(dataSet, i, value) prob = len(subDataSet)/float(len(dataSet)) newEntropy += prob * calcShannonEnt(subDataSet) infoGain = baseEntropy - newEntropy #calculate the info gain; ie reduction in entropy if (infoGain > bestInfoGain): #compare this to the best gain so far bestInfoGain = infoGain #if better than current best, set to best bestFeature = i return bestFeature #returns an integer #如果最后的结果有多个,投票机制取最大的 def majorityCnt(classList): classCount={} for vote in classList: if vote not in classCount.keys(): classCount[vote] = 0 classCount[vote] += 1 sortedClassCount = sorted(classCount.iteritems(), key=operator.itemgetter(1), reverse=True) return sortedClassCount[0][0] #创建树,使用递归 def createTree(dataSet,labels): #计算dataset里是否只有单值 classList = [example[-1] for example in dataSet] #如果只有单值,且唯一,则返回单值 if classList.count(classList[0]) == len(classList): return classList[0] #如果是最后结果,但有多值,取最多的 if len(dataSet[0]) == 1: return majorityCnt(classList) #取最佳的特征值 bestFeat = chooseBestFeatureToSplit(dataSet) bestFeatLabel = labels[bestFeat] #根据这个特征值制定字典 myTree = {bestFeatLabel:{}} #删除这个特征值 del(labels[bestFeat]) #找出这个特征值下有几个选择 featValues = [example[bestFeat] for example in dataSet] uniqueVals = set(featValues) for value in uniqueVals: subLabels = labels[:] #针对每个选择,建立不同的分支 myTree[bestFeatLabel][value] = createTree(splitDataSet(dataSet, bestFeat, value),subLabels) return myTree #决策树分类器,inputtree是决策树,fearlabel是列种类,testVec是要分类的向量 def classify(inputTree,featLabels,testVec): firstStr = inputTree.keys()[0] secondDict = inputTree[firstStr] featIndex = featLabels.index(firstStr) key = testVec[featIndex] valueOfFeat = secondDict[key] if isinstance(valueOfFeat, dict): classLabel = classify(valueOfFeat, featLabels, testVec) else: classLabel = valueOfFeat return classLabel #序列化决策树 def storeTree(inputTree,filename): import pickle fw = open(filename,‘w‘) pickle.dump(inputTree,fw) fw.close() #提取决策树 def grabTree(filename): import pickle fr = open(filename) return pickle.load(fr)
输入矩阵是一张表,最后一列是结果。
ID3算法是比较粗糙的算法,对标志性变量分类可以,但是无法分析数值型数据。
而且在选择特征值时,倾向于选择种类较多的特征值,因此需要改进。
标签:机器学习
原文地址:http://wukong0716.blog.51cto.com/10442773/1695340