[Data Mining] The ID3 Decision Tree Algorithm

2013-08-25 09:19
In a decision tree, measuring how ordered a dataset is matters a great deal. The guiding rule for splitting a dataset is to make disordered data more ordered; the change in information before and after a split is called the information gain. Shannon entropy is the expected value of the information in the set: the larger the entropy, the more mixed the data. Another common measure of a set's disorder is the Gini impurity.
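For reference, the standard definitions (not spelled out in the original post): for a dataset D with K classes, where p_k is the fraction of samples in class k, the Shannon entropy and the information gain of a feature A are

H(D) = -\sum_{k=1}^{K} p_k \log_2 p_k

\mathrm{Gain}(D, A) = H(D) - \sum_{v \in \mathrm{Values}(A)} \frac{|D_v|}{|D|}\, H(D_v)

where D_v is the subset of D whose feature A takes the value v. ID3 greedily picks the feature with the largest gain at each node.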

import math

def calc_entropy(dataset):  # compute the Shannon entropy of a dataset
    lines = len(dataset)
    labels = {}
    for curvect in dataset:
        curlabel = curvect[-1]  # the class label sits in the last column
        labels.setdefault(curlabel, 0)
        labels[curlabel] += 1
    shannonEntropy = 0.0
    for key in labels:
        prob = float(labels[key]) / lines
        shannonEntropy -= prob * math.log(prob, 2)
    return shannonEntropy
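A quick sanity check on a small toy dataset (the data here is my own illustration, not from the original post; the last column is the class label):

dataset = [[1, 1, 'yes'],
           [1, 1, 'yes'],
           [1, 0, 'no'],
           [0, 1, 'no'],
           [0, 1, 'no']]
print(calc_entropy(dataset))  # 2 'yes' vs 3 'no' -> about 0.971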

# Split the dataset: keep the rows of dataset whose feature at cIndex
# equals cValue, and strip the cIndex column from each returned row
def splitDataset(dataset, cIndex, cValue):
    rset = []
    for featureset in dataset:
        if featureset[cIndex] == cValue:
            tmpset = featureset[:cIndex]
            tmpset.extend(featureset[cIndex+1:])  # extend, not append: keep the row flat
            rset.append(tmpset)  # append, not extend: keep rset a list of rows
    return rset
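Continuing the toy dataset above, splitting on feature 0 with value 1 keeps the matching rows and drops that column:

print(splitDataset(dataset, 0, 1))  # [[1, 'yes'], [1, 'yes'], [0, 'no']]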

# Core idea of ID3: try every feature and choose the one with the
# largest information gain as the split
def choose_best_feature(dataset):
    features = len(dataset[0]) - 1
    baseEntropy = calc_entropy(dataset)
    bestfeature = -1
    bestGain = 0.0
    for i in range(features):
        # collect the distinct values taken by the i-th feature
        featurelist = [example[i] for example in dataset]
        uniqfeatureval = set(featurelist)
        newEntropy = 0.0
        # weighted entropy of the splits on each value of the i-th
        # feature, then the information gain relative to the whole set
        for val in uniqfeatureval:
            subdataset = splitDataset(dataset, i, val)
            prob = len(subdataset) / float(len(dataset))
            newEntropy += prob * calc_entropy(subdataset)
        infoGain = baseEntropy - newEntropy
        # keep the best feature seen so far
        if infoGain > bestGain:
            bestGain = infoGain
            bestfeature = i
    return bestfeature
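On the toy dataset, feature 0 yields a gain of about 0.420 against about 0.171 for feature 1, so it wins:

print(choose_best_feature(dataset))  # 0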

import operator

# Majority vote: return the most frequent class label
def majorityCnt(classList):
    classCount = {}
    for vote in classList:
        classCount.setdefault(vote, 0)
        classCount[vote] += 1
    sortedClassCount = sorted(classCount.items(),  # items(), not Python 2's iteritems()
                              key=operator.itemgetter(1), reverse=True)
    return sortedClassCount[0][0]
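For example:

print(majorityCnt(['yes', 'no', 'no']))  # 'no'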
# Build the decision tree recursively. Recursion stops when:
# 1) all samples share the same class, or 2) no features remain
def createTree(dataset, labels):
    classList = [example[-1] for example in dataset]
    # case 1: all samples belong to one class
    if classList.count(classList[0]) == len(classList):
        return classList[0]
    # case 2: only the class column is left, fall back to a majority vote
    if len(dataset[0]) == 1:
        return majorityCnt(classList)
    bestfeature = choose_best_feature(dataset)
    bestlabel = labels[bestfeature]
    myTree = {bestlabel: {}}
    del labels[bestfeature]  # this feature is consumed at this node
    # recurse: build a subtree for every value of the best feature
    featureVals = [example[bestfeature] for example in dataset]
    uniqfeature = set(featureVals)
    for value in uniqfeature:
        sublabels = labels[:]  # copy, so sibling branches don't share mutations
        subdataset = splitDataset(dataset, bestfeature, value)
        myTree[bestlabel][value] = createTree(subdataset, sublabels)
    return myTree
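Putting it all together on the toy dataset (the feature names here are my own illustration):

labels = ['no surfacing', 'flippers']
print(createTree(dataset, labels))
# {'no surfacing': {0: 'no', 1: {'flippers': {0: 'no', 1: 'yes'}}}}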