您的位置:首页 > 编程语言

统计学习方法第五章决策树的ID3算法代码实践 例5.3

2018-01-18 11:33 381 查看
# 统计学习方法第五章决策树的ID3算法代码实践 例5.3
# 1. Plain-function implementation of ID3 (a class-based version follows below).
from numpy import *
import math


def loadDataSet():
    """Return the textbook data set and its feature names.

    Each row is [age, has-job, owns-house, credit] plus the class
    label ('是'/'否') as the last element.
    """
    dataset = [['青年', '否', '否', '一般', '否'],
               ['青年', '否', '否', '好', '否'],
               ['青年', '是', '否', '好', '是'],
               ['青年', '是', '是', '一般', '是'],
               ['青年', '否', '否', '一般', '否'],
               ['中年', '否', '否', '一般', '否'],
               ['中年', '否', '否', '好', '否'],
               ['中年', '是', '是', '好', '是'],
               ['中年', '否', '是', '非常好', '是'],
               ['中年', '否', '是', '非常好', '是'],
               # fix: a stray '[b]' BBCode artifact corrupted this row in the original
               ['老年', '否', '是', '非常好', '是'],
               ['老年', '否', '是', '好', '是'],
               ['老年', '是', '否', '好', '是'],
               ['老年', '是', '否', '非常好', '是'],
               ['老年', '否', '否', '一般', '否']]
    label = ['年龄', '有工作', '有自己的房子', '信贷情况']
    return dataset, label


def calculateEntopy(dataSet):  # (sic: "Entropy"; name kept for compatibility)
    """Shannon entropy H(D) of the class labels (last column) of dataSet."""
    resultList = [data[-1] for data in dataSet]
    lenOfResult = float(len(resultList))
    curEntropy = 0.0
    for label in set(resultList):
        prob = resultList.count(label) / lenOfResult
        curEntropy -= prob * math.log(prob, 2)
    return curEntropy


def splitDataSet(dataSet, i, value):
    """Rows of dataSet whose i-th feature equals value, with column i removed."""
    returnDataSet = []
    for data in dataSet:
        if data[i] == value:
            reduced = data[:i]
            reduced.extend(data[i + 1:])
            returnDataSet.append(reduced)
    return returnDataSet


def calculateGain(dataSet, i):
    """Information gain g(D, A_i) obtained by splitting dataSet on feature i."""
    m = float(len(dataSet))
    baseEntropy = calculateEntopy(dataSet)
    newEntropy = 0.0
    for value in set(data[i] for data in dataSet):
        subset = splitDataSet(dataSet, i, value)
        # Conditional entropy term, weighted by the subset's share of D.
        newEntropy += (len(subset) / m) * calculateEntopy(subset)
    return baseEntropy - newEntropy


def chooseBestValueToSplit(dataSet):
    """Index of the feature with the largest information gain."""
    m, n = shape(dataSet)  # last of the n columns is the class label
    bestInfoGain = 0
    bestValue = 0  # fix: was 0.0 in the original — this is an integer column index
    for i in range(n - 1):
        curInfoGain = calculateGain(dataSet, i)
        if curInfoGain > bestInfoGain:
            bestInfoGain = curInfoGain
            bestValue = i
    return bestValue


def maxResult(resultList):
    """Most frequent class label (ties resolved by dict insertion order)."""
    calcNumDict = dict([(resultList.count(result), result) for result in resultList])
    return calcNumDict[max(calcNumDict.keys())]


def createTree(dataSet, labels):
    """Recursively build the ID3 tree as nested dicts.

    NOTE: mutates `labels` (deletes the name of the chosen feature).
    """
    resultList = [data[-1] for data in dataSet]
    if len(dataSet[0]) == 1:  # no features left: majority vote
        return maxResult(resultList)
    if resultList.count(resultList[0]) == len(resultList):
        return resultList[0]  # pure node: all rows share one label
    bestValue = chooseBestValueToSplit(dataSet)
    bestLabel = labels[bestValue]
    tree = {bestLabel: {}}
    del labels[bestValue]
    for value in set(data[bestValue] for data in dataSet):
        subLabels = labels[:]  # copy so sibling branches see the same label list
        tree[bestLabel][value] = createTree(
            splitDataSet(dataSet, bestValue, value), subLabels)
    return tree


dataSet, label = loadDataSet()
print(createTree(dataSet, label))
2.对该决策树进行封装
from numpy import *
import math
import copy

class ID3Tree(object):
    """ID3 decision tree for example 5.3 of "Statistical Learning Methods", ch. 5.

    Usage:
        t = ID3Tree()
        t.loadDataSet()   # fill self.dataSet / self.labels
        t.returnTree()    # build self.tree (nested dicts)
    """

    def __init__(self):
        # Training rows; the last element of each row is the class label.
        self.dataSet = []
        # Feature names aligned with the feature columns of self.dataSet.
        self.labels = []
        # Learned tree (nested dicts), filled in by returnTree().
        # Fix: the original initialised `self.Tree` (capital T) here while
        # returnTree() assigns `self.tree` — one consistent name is used now.
        self.tree = {}

    def loadDataSet(self):
        """Load the textbook loan-approval table into self.dataSet / self.labels."""
        dataset = [['青年', '否', '否', '一般', '否'],
                   ['青年', '否', '否', '好', '否'],
                   ['青年', '是', '否', '好', '是'],
                   ['青年', '是', '是', '一般', '是'],
                   ['青年', '否', '否', '一般', '否'],
                   ['中年', '否', '否', '一般', '否'],
                   ['中年', '否', '否', '好', '否'],
                   ['中年', '是', '是', '好', '是'],
                   ['中年', '否', '是', '非常好', '是'],
                   ['中年', '否', '是', '非常好', '是'],
                   ['老年', '否', '是', '非常好', '是'],
                   ['老年', '否', '是', '好', '是'],
                   ['老年', '是', '否', '好', '是'],
                   ['老年', '是', '否', '非常好', '是'],
                   ['老年', '否', '否', '一般', '否']]
        label = ['年龄', '有工作', '有自己的房子', '信贷情况']
        self.dataSet = dataset
        self.labels = label

    def calculateEntopy(self, dataSet):  # (sic: "Entropy"; name kept for callers)
        """Shannon entropy H(D) of the class labels (last column) of dataSet."""
        resultList = [data[-1] for data in dataSet]
        total = float(len(resultList))
        entropy = 0.0
        for value in set(resultList):
            prob = resultList.count(value) / total
            entropy -= prob * math.log(prob, 2)
        return entropy

    def splitDataSet(self, dataSet, i, value):
        """Rows of dataSet whose i-th feature equals value, with column i removed."""
        subset = []
        for row in dataSet:
            if row[i] == value:
                reduced = row[:i]
                reduced.extend(row[i + 1:])
                subset.append(reduced)
        return subset

    def calculateGain(self, dataSet, i):
        """Information gain g(D, A_i) obtained by splitting dataSet on feature i."""
        m = float(len(dataSet))
        baseEntropy = self.calculateEntopy(dataSet)
        newEntropy = 0.0
        for value in set(row[i] for row in dataSet):
            subset = self.splitDataSet(dataSet, i, value)
            # Conditional entropy term, weighted by the subset's share of D.
            newEntropy += (len(subset) / m) * self.calculateEntopy(subset)
        return baseEntropy - newEntropy

    def chooseBestValueToSplit(self, dataSet):
        """Index of the feature with the largest information gain."""
        m, n = shape(dataSet)  # last of the n columns is the class label
        bestInfoGain = 0
        bestIndex = 0  # fix: was 0.0 in the original — this is an integer index
        for i in range(n - 1):
            gain = self.calculateGain(dataSet, i)
            if gain > bestInfoGain:
                bestInfoGain = gain
                bestIndex = i
        return bestIndex

    def maxResult(self, resultList):
        """Most frequent class label (ties resolved by dict insertion order)."""
        countToLabel = dict([(resultList.count(r), r) for r in resultList])
        return countToLabel[max(countToLabel.keys())]

    def createTree(self, dataSet, labels):
        """Recursively build the tree as nested dicts.

        NOTE: mutates `labels` (deletes the name of the chosen feature).
        """
        resultList = [row[-1] for row in dataSet]
        if len(dataSet[0]) == 1:  # no features left: majority vote
            return self.maxResult(resultList)
        if resultList.count(resultList[0]) == len(resultList):
            return resultList[0]  # pure node: all rows share one label
        best = self.chooseBestValueToSplit(dataSet)
        bestLabel = labels[best]
        tree = {bestLabel: {}}
        del labels[best]
        for value in set(row[best] for row in dataSet):
            subLabels = labels[:]  # copy so sibling branches see the same labels
            tree[bestLabel][value] = self.createTree(
                self.splitDataSet(dataSet, best, value), subLabels)
        return tree

    def returnTree(self):
        """Build the tree from self.dataSet and store it on self.tree."""
        labels = copy.deepcopy(self.labels)  # protect self.labels from mutation
        self.tree = self.createTree(self.dataSet, labels)

# Driver: build the example-5.3 tree and print it as nested dicts.
id3tree=ID3Tree()
# Populate id3tree.dataSet / id3tree.labels with the textbook table.
id3tree.loadDataSet()
# Fit the tree; the result is stored on id3tree.tree.
id3tree.returnTree()
print(id3tree.tree)
结果为:
{'有自己的房子': {'是': '是', '否': {'有工作': {'是': '是', '否': '否'}}}}
made by zcl at CUMT
I know I can because I have a heart that beats
内容来自用户分享和网络整理,不保证内容的准确性,如有侵权内容,可联系管理员处理 点击这里给我发消息
标签:  AI ML