标签:划分数 ems extc art 图像 null function enc comm
原始数据集:
变换后的数据集直接体现在下面的程序代码中,这里就不再截图了。
构建决策树的代码如下:
?
# coding: utf-8
'''
2017.6.25 author: Erin
function: "decision tree" ID3
'''
import numpy as np
import pandas as pd
from math import log
import operator
def load_data():
    """Return the built-in toy training set and its feature names.

    Returns:
        A ``(data, features)`` tuple.  ``data`` is a list of 14 rows; each
        row holds four categorical attribute values followed by the class
        label ('yes' or 'no') in the last position.  ``features`` lists the
        names of the four attribute columns, in column order.

    A fresh list is built on every call, so callers may mutate the result.
    """
    data = [
        ['teenager', 'high', 'no', 'same', 'no'],
        ['teenager', 'high', 'no', 'good', 'no'],
        ['middle_aged', 'high', 'no', 'same', 'yes'],
        ['old_aged', 'middle', 'no', 'same', 'yes'],
        ['old_aged', 'low', 'yes', 'same', 'yes'],
        ['old_aged', 'low', 'yes', 'good', 'no'],
        ['middle_aged', 'low', 'yes', 'good', 'yes'],
        ['teenager', 'middle', 'no', 'same', 'no'],
        ['teenager', 'low', 'yes', 'same', 'yes'],
        ['old_aged', 'middle', 'yes', 'same', 'yes'],
        ['teenager', 'middle', 'yes', 'good', 'yes'],
        ['middle_aged', 'middle', 'no', 'good', 'yes'],
        ['middle_aged', 'high', 'yes', 'same', 'yes'],
        ['old_aged', 'middle', 'no', 'good', 'no'],
    ]
    features = ['age', 'input', 'student', 'level']
    return data, features
def cal_entropy(dataSet):
    """Compute the base-2 Shannon entropy of the class labels in a data set.

    Args:
        dataSet: sequence of rows; the last element of each row is taken
            as that row's class label.

    Returns:
        The entropy in bits as a float.  An empty data set yields 0.0.

    Example:
        A data set with 9 rows labelled 'yes' and 5 labelled 'no' has an
        entropy of roughly 0.940.
    """
    numEntries = len(dataSet)

    # Tally how many rows carry each label.
    labelCounts = {}
    for featVec in dataSet:
        label = featVec[-1]
        labelCounts[label] = labelCounts.get(label, 0) + 1

    entropy = 0.0
    for count in labelCounts.values():
        # Convert before dividing so the ratio is a true fraction even
        # where `/` on two ints performs integer division.
        p_i = float(count) / numEntries
        entropy -= p_i * log(p_i, 2)  # log(x, 2) is the base-2 logarithm
    return entropy
def split_data(data,feature_index,value):
‘‘‘
划分数据集
feature_index:用于划分特征的列数,例如“年龄”
value:划分后的属性值:例如“青少年”
‘‘‘
data_split=[]#划分后的数据集
for feature in data:
if feature[feature_index]==value:
reFeature=feature[:feature_index]
reFeature.extend(feature[feature_index