def chooseBestFeatureToSplit(dataSet):
    """Pick the index of the feature whose split yields the largest
    information gain over dataSet (the last column holds the class labels).

    Returns -1 when no split improves on the base entropy.
    """
    featureCount = len(dataSet[0]) - 1            # last column = label
    baseEntropy = calcShannonEnt(dataSet)
    bestGain, bestIndex = 0.0, -1
    for index in range(featureCount):
        # Distinct values this feature takes across the data set.
        values = {sample[index] for sample in dataSet}
        # Weighted entropy after partitioning on this feature.
        splitEntropy = 0.0
        for value in values:
            subset = splitDataSet(dataSet, index, value)
            weight = len(subset) / float(len(dataSet))
            splitEntropy += weight * calcShannonEnt(subset)
        gain = baseEntropy - splitEntropy         # reduction in entropy
        if gain > bestGain:
            bestGain, bestIndex = gain, index
    return bestIndex
from __future__ import division  # Python 2 compat: make '/' true division


def GetAverage(mat):
    """Return the per-column mean of mat (a list of equal-length rows)."""
    n = len(mat)
    m = width(mat)
    means = [0] * m
    for j in range(m):
        for row in mat:
            means[j] += row[j]
        means[j] /= n
    return means


def width(lst):
    """Return the number of columns in lst (the length of its first row)."""
    # Was a hand-rolled counting loop; len() is the idiomatic equivalent.
    return len(lst[0])


def GetVar(average, mat):
    """Return the per-column population variance of mat around average."""
    # Per-row deviations from the column means.
    deviations = [[x - a for a, x in zip(average, row)] for row in mat]
    n = len(deviations)
    m = width(deviations)
    variances = [0] * m
    for j in range(m):
        for row in deviations:
            variances[j] += row[j] * row[j]
        variances[j] /= n
    return variances


def DenoisMat(mat):
    """Clip every value above (column mean + column variance) down to that
    threshold and return the denoised matrix.

    Fix: works on copies of the rows, so the caller's mat is no longer
    mutated in place (the original assigned into mat's rows directly).
    Also removes the dead `num` accumulator the original allocated.
    """
    average = GetAverage(mat)
    variance = GetVar(average, mat)
    # Per-column upper bound: mean + variance.
    section = [a + v for a, v in zip(average, variance)]
    m = width(mat)
    denoisMat = []
    for row in mat:
        clipped = list(row)  # copy: never mutate the input matrix
        for j in range(m):
            if clipped[j] > section[j]:
                clipped[j] = section[j]
        denoisMat.append(clipped)
    return denoisMat
'''
Sampling archive

@author: Garvin Li
'''
import random


def RandomSampling(dataMat, number):
    """Return a simple random sample of `number` rows from dataMat.

    Prints a message and returns None when `number` exceeds the population
    size (random.sample raises ValueError in that case).
    """
    try:
        # Renamed from `slice`, which shadowed the builtin.
        return random.sample(dataMat, number)
    except ValueError:
        # Was a bare `except:` with a Python-2 print statement; catch only
        # the error random.sample actually raises.
        print('sample larger than population')


def SystematicSampling(dataMat, number):
    """Return a systematic sample: every k-th row starting at index 0,
    where k = len(dataMat) // number.

    Falls back to RandomSampling when dataMat has fewer than `number`
    rows (k == 0).
    """
    length = len(dataMat)
    # Integer step; plain '/' yields a float on Python 3 and breaks indexing.
    k = length // number
    if k > 0:
        # Indices 0, k, 2k, ..., (number-1)*k are all strictly < length.
        return [dataMat[i * k] for i in range(number)]
    return RandomSampling(dataMat, number)
/********************************
* 本文来自博客 "李博Garvin"
* 转载请标明出处:http://blog.csdn.net/buptgshengod
******************************************/
原文地址:http://blog.csdn.net/buptgshengod/article/details/37992719