码迷,mamicode.com
首页 > 编程语言 > 详细

吴裕雄 python 机器学习-DMT(1)

时间:2018-12-20 14:24:57      阅读:191      评论:0      收藏:0      [点我收藏+]

标签:str   mys   col   oat   uniq   技术   机器学习   ase   ssl   

import numpy as np
import operator as op

from math import log

def createDataSet():
    """Return the toy data set and its feature names.

    Returns:
        A ``(dataSet, labels)`` pair. Each row of ``dataSet`` holds two
        0/1 feature values followed by the class label ('yes' or 'no');
        ``labels`` names the two feature columns in order.
    """
    # The class labels and feature names are string literals; they must be
    # quoted or the module fails to parse.
    dataSet = [[1, 1, 'yes'],
               [1, 1, 'yes'],
               [1, 0, 'no'],
               [0, 1, 'no'],
               [0, 1, 'no']]
    labels = ['no surfacing', 'flippers']
    return dataSet, labels

# Build the toy data set and its feature names, then print both.
dataSet,labels = createDataSet()
print(dataSet)
print(labels)

def calcShannonEnt(dataSet):
    """Return the base-2 Shannon entropy of the class labels in dataSet.

    The class label of each row is taken from its last element. An empty
    data set yields 0.0.
    """
    # Count how many rows carry each class label.
    tally = {}
    for row in dataSet:
        cls = row[-1]
        tally[cls] = tally.get(cls, 0) + 1

    total = len(dataSet)
    entropy = 0.0
    for count in tally.values():
        p = float(count) / total
        entropy -= p * log(p, 2)
    return entropy

# Entropy of the class labels over the full toy data set.
shannonEnt = calcShannonEnt(dataSet)
print(shannonEnt)

def splitDataSet(dataSet, axis, value):
    """Select the rows whose column ``axis`` equals ``value``.

    Each selected row is returned as a new list with column ``axis``
    removed; the rows of ``dataSet`` themselves are left untouched.
    """
    return [
        row[:axis] + row[axis + 1:]
        for row in dataSet
        if row[axis] == value
    ]

# Rows whose column 1 equals 1, with that column removed (printed as a NumPy array).
retDataSet = splitDataSet(dataSet,1,1)
print(np.array(retDataSet))
# The same split for value 0, printed as a plain list of lists.
retDataSet = splitDataSet(dataSet,1,0)
print(retDataSet)

def chooseBestFeatureToSplit(dataSet):
    """Return the index of the feature with the largest information gain.

    Every column except the last (the class label) is a candidate. For each
    candidate the data set is partitioned by the distinct values in that
    column, and the size-weighted entropy of the partitions is subtracted
    from the entropy of the whole set. Returns -1 when no feature yields a
    gain strictly greater than zero.
    """
    featureCount = np.shape(dataSet)[1] - 1
    parentEntropy = calcShannonEnt(dataSet)
    rowCount = float(len(dataSet))

    winner, winnerGain = -1, 0.0
    for col in range(featureCount):
        # Weighted entropy after splitting on this column.
        splitEntropy = 0.0
        for v in {row[col] for row in dataSet}:
            part = splitDataSet(dataSet, col, v)
            splitEntropy += (len(part) / rowCount) * calcShannonEnt(part)

        gain = parentEntropy - splitEntropy
        if gain > winnerGain:
            winner, winnerGain = col, gain
    return winner

# Column index of the feature with the highest information gain
# (-1 when no feature improves on the base entropy).
bestFeature = chooseBestFeatureToSplit(dataSet)
print(bestFeature)

def majorityCnt(classList):
    """Return the most frequent value in classList.

    When several values share the highest count, the one that appears
    first in classList wins (the sort is stable).
    """
    votes = {}
    for label in classList:
        votes[label] = votes.get(label, 0) + 1

    ranked = sorted(votes.items(), key=lambda pair: pair[1], reverse=True)
    topLabel, _ = ranked[0]
    return topLabel

def createTree(dataSet,labels):
    """Recursively build a decision tree as nested dicts.

    Args:
        dataSet: list of rows; the last element of each row is the class label.
        labels: feature names, one per feature column, in column order.
            This list is not modified.

    Returns:
        A class label when the rows cannot or need not be split further,
        otherwise ``{feature_name: {feature_value: subtree_or_label, ...}}``.
    """
    classList = [example[-1] for example in dataSet]
    # All rows share one class: this branch is a leaf.
    if classList.count(classList[0]) == len(classList):
        return classList[0]
    # Only the class column is left: fall back to a majority vote.
    if len(dataSet[0]) == 1:
        return majorityCnt(classList)

    bestFeat = chooseBestFeatureToSplit(dataSet)
    if bestFeat == -1:
        # No feature gives a positive information gain. Using -1 as an index
        # would split on the class-label column itself, so stop here with a
        # majority vote instead.
        return majorityCnt(classList)

    bestFeatLabel = labels[bestFeat]
    # Build the reduced label list as a new list rather than deleting from
    # `labels`, so the caller's list stays intact for later use.
    subLabels = labels[:bestFeat] + labels[bestFeat + 1:]

    myTree = {bestFeatLabel: {}}
    for value in {example[bestFeat] for example in dataSet}:
        myTree[bestFeatLabel][value] = createTree(
            splitDataSet(dataSet, bestFeat, value), subLabels)
    return myTree

# Build the decision tree from the toy data and print its nested-dict form.
myTree = createTree(dataSet,labels)
print(myTree)

def classify(inputTree,featLabels,testVec):
    """Walk the decision tree and return the class label for testVec.

    Args:
        inputTree: nested dict of the form ``{feature_name: {value: subtree_or_label}}``.
        featLabels: feature names; the position of a name gives the index
            of that feature inside testVec.
        testVec: feature values of the sample to classify.

    Raises:
        KeyError: if testVec holds a value the tree has no branch for.
        ValueError: if a feature name in the tree is missing from featLabels.
    """
    # The tree has a single root key: the feature this node tests.
    for rootFeature in inputTree:
        break
    branches = inputTree[rootFeature]
    observed = testVec[featLabels.index(rootFeature)]
    outcome = branches[observed]
    if not isinstance(outcome, dict):
        return outcome
    return classify(outcome, featLabels, testVec)

# Feature names in column order; they are string literals and must be quoted.
featLabels = ['no surfacing', 'flippers']
# Classify a sample with both features set to 1.
classLabel = classify(myTree,featLabels,[1,1])
print(classLabel)

import pickle

def storeTree(inputTree,filename):
    """Serialize inputTree to the file at filename using pickle.

    The file is opened in binary write mode and is closed even if
    pickling raises.
    """
    with open(filename, 'wb') as fw:
        pickle.dump(inputTree, fw)
    
def grabTree(filename):
    """Load and return the object pickled in the file at filename.

    The file is opened in binary read mode and closed once loading finishes.
    """
    # NOTE: pickle.load can execute arbitrary code; only load files that
    # this program (or another trusted source) wrote.
    with open(filename, 'rb') as fr:
        return pickle.load(fr)

# Round-trip the tree through a pickle file and print the reloaded copy.
filename = "D:\\mytree.txt"
storeTree(myTree,filename)
mySecTree = grabTree(filename)
print(mySecTree)

# Feature names in column order; they are string literals and must be quoted.
featLabels = ['no surfacing', 'flippers']
# Classify a sample with both features set to 0 using the reloaded tree.
classLabel = classify(mySecTree,featLabels,[0,0])
print(classLabel)

技术分享图片

 

吴裕雄 python 机器学习-DMT(1)

标签:str   mys   col   oat   uniq   技术   机器学习   ase   ssl   

原文地址:https://www.cnblogs.com/tszr/p/10148597.html

(0)
(0)
   
举报
评论 一句话评论(0)
登录后才能评论!
© 2014 mamicode.com 版权所有  联系我们:gaon5@hotmail.com
迷上了代码!