您的位置:首页 > 编程语言 > Python开发

【风马一族_Python】 决策树

2016-06-10 19:26 363 查看
《机器学习实战》第三章 决策树

-------------------------------------

#1 trees.py  计算给定数据集的香农熵

-------------------------------------

from math import log

# Compute the Shannon entropy of the given data set.
def calcShannonEnt(dataSet):
    """Return the Shannon entropy of dataSet.

    Each record in dataSet is a feature vector whose LAST element is the
    class label; the entropy is computed over the label distribution.
    """
    numEntries = len(dataSet)
    # Tally how often each class label appears (last column of each record).
    labelCounts = {}
    for featVec in dataSet:
        currentLabel = featVec[-1]
        labelCounts[currentLabel] = labelCounts.get(currentLabel, 0) + 1
    shannonEnt = 0.0
    for key in labelCounts:
        prob = float(labelCounts[key]) / numEntries
        shannonEnt -= prob * log(prob, 2)  # base-2 logarithm
    return shannonEnt

# Build the toy fish-identification data set used in the examples.
def createDataSet():
    """Return (dataSet, labels) for the simple fish example.

    dataSet: records of two binary features plus a 'yes'/'no' class label.
    labels:  the human-readable names of the two features.
    """
    dataSet = [[1, 1, 'yes'],
               [1, 1, 'yes'],
               [1, 0, 'no'],
               [0, 1, 'no'],
               [0, 1, 'no']]
    labels = ['no surfacing', 'flippers']
    return dataSet, labels




-------------------------------------

#2 trees.py  划分数据集 待划分的数据集、划分数据集的特征、需要返回的特征的值

-------------------------------------

# Split the data set: args are the data set to split, the feature index,
# and the feature value to keep.
def splitDataSet(dataSet, axis, value):
    """Return the records of dataSet whose axis-th feature equals value,
    each with that feature column removed. dataSet is not mutated."""
    retDataSet = []
    for featVec in dataSet:
        if featVec[axis] == value:
            # Build a new record minus the axis column (avoid mutating input).
            reducedFeatVec = featVec[:axis]
            reducedFeatVec.extend(featVec[axis + 1:])
            retDataSet.append(reducedFeatVec)
    return retDataSet




-------------------------------------

#3 trees.py  选择最好的数据集划分方式

-------------------------------------

# Split the data set: args are the data set to split, the feature index,
# and the feature value to keep. (Repeated in the original post.)
def splitDataSet(dataSet, axis, value):
    """Return the records of dataSet whose axis-th feature equals value,
    each with that feature column removed. dataSet is not mutated."""
    retDataSet = []
    for featVec in dataSet:
        if featVec[axis] == value:
            # New list: slice before the column, then extend past it.
            reducedFeatVec = featVec[:axis]
            reducedFeatVec.extend(featVec[axis + 1:])
            retDataSet.append(reducedFeatVec)
    return retDataSet

# Choose the best feature to split the data set on (max information gain).
def chooseBestFeatureToSplit(dataSet):
    """Return the index of the feature giving the highest information gain,
    or -1 if no split improves on the base entropy."""
    numFeatures = len(dataSet[0]) - 1  # last column is the class label
    baseEntropy = calcShannonEnt(dataSet)
    bestInfoGain = 0.0
    bestFeature = -1
    for i in range(numFeatures):
        # Unique values taken by feature i across the data set.
        featList = [example[i] for example in dataSet]
        uniqueVals = set(featList)
        # Expected entropy after splitting on feature i.
        newEntropy = 0.0
        for value in uniqueVals:
            subDataSet = splitDataSet(dataSet, i, value)
            prob = len(subDataSet) / float(len(dataSet))
            newEntropy += prob * calcShannonEnt(subDataSet)
        infoGain = baseEntropy - newEntropy
        if infoGain > bestInfoGain:
            bestInfoGain = infoGain
            bestFeature = i
    return bestFeature




-------------------------------------

#4 trees.py  创建树的函数代码 两个参数:数据集、标签列表

-------------------------------------

import operator

# Build a decision tree. Two parameters: the data set and the feature labels.
def createTree(dataSet, labels):
    """Recursively build a decision tree as nested dicts.

    dataSet: records whose last element is the class label.
    labels:  feature names aligned with the feature columns. The caller's
             list is NOT mutated (a copy is taken before deleting entries).
    Returns either a class label (leaf) or {featureName: {value: subtree}}.
    """
    classList = [example[-1] for example in dataSet]

    # Stop splitting when every record shares the same class.
    if classList.count(classList[0]) == len(classList):
        return classList[0]

    # All features consumed: return the majority class.
    # NOTE(review): majorityCnt is not defined in this excerpt — it must be
    # provided elsewhere (expected to return the most common label).
    if len(dataSet[0]) == 1:
        return majorityCnt(classList)

    bestFeat = chooseBestFeatureToSplit(dataSet)
    bestFeatLabel = labels[bestFeat]
    myTree = {bestFeatLabel: {}}
    # Delete from a copy so the caller's label list survives the call
    # (the original `del(labels[bestFeat])` clobbered the caller's list).
    labels = labels[:]
    del labels[bestFeat]

    # All values the chosen feature takes in this data set.
    featValues = [example[bestFeat] for example in dataSet]
    uniqueVals = set(featValues)

    # Recurse into each partition induced by the chosen feature's values.
    for value in uniqueVals:
        subLabels = labels[:]
        myTree[bestFeatLabel][value] = createTree(
            splitDataSet(dataSet, bestFeat, value), subLabels)

    return myTree


内容来自用户分享和网络整理,不保证内容的准确性,如有侵权内容,可联系管理员处理 点击这里给我发消息
标签: