Python实现决策树算法ID3
2017-07-04 17:23
537 查看
# coding=utf-8
"""ID3 decision tree built as nested dicts (classic "fish" toy example)."""
import operator
from math import log


def createDataSet():
    """Return the toy dataset (two binary features + class label) and feature names."""
    dataSet = [[1, 1, 'yes'], [1, 1, 'yes'], [1, 0, 'no'], [0, 1, 'no'], [0, 1, 'no']]
    features = ['no surfacing', 'flippers']
    return dataSet, features


def treeGrowth(dataSet, features):
    """Recursively grow an ID3 tree as nested dicts.

    Returns a class label (leaf) or ``{featureName: {featureValue: subtree}}``.
    Unlike the original version, the caller's ``features`` list is NOT
    mutated: the original ``del features[bestFeat]`` corrupted the label
    list shared by sibling branches during recursion.
    """
    classList = [example[-1] for example in dataSet]
    # All samples agree on the class: leaf node.
    if classList.count(classList[0]) == len(classList):
        return classList[0]
    # Only the label column remains: fall back to majority vote.
    if len(dataSet[0]) == 1:
        return classify(classList)
    bestFeat = findBestSplit(dataSet)  # index of the best feature
    bestFeatLabel = features[bestFeat]
    myTree = {bestFeatLabel: {}}
    uniqueFeatValues = set(example[bestFeat] for example in dataSet)
    # Reduced copy: every recursive branch sees a consistent feature list.
    subFeatures = features[:bestFeat] + features[bestFeat + 1:]
    for value in uniqueFeatValues:
        subDataSet = splitDataSet(dataSet, bestFeat, value)
        myTree[bestFeatLabel][value] = treeGrowth(subDataSet, subFeatures)
    return myTree


def classify(classList):
    """Majority vote: return the label occurring most often in classList.

    Used when no features remain but the leftover samples still mix classes.
    """
    classCount = {}
    for vote in classList:
        classCount[vote] = classCount.get(vote, 0) + 1
    # Py3 fix: dict.iteritems() no longer exists; items() works on both.
    sortedClassCount = sorted(classCount.items(),
                              key=operator.itemgetter(1), reverse=True)
    return sortedClassCount[0][0]


def findBestSplit(dataset):
    """Return the index of the feature with the highest information gain.

    Iterates every feature, computes the weighted entropy of the split,
    and keeps the one that reduces entropy the most (-1 if no gain).
    """
    numFeatures = len(dataset[0]) - 1
    baseEntropy = calcShannonEnt(dataset)
    bestInfoGain = 0.0
    bestFeat = -1
    for i in range(numFeatures):
        uniqueFeatValues = set(example[i] for example in dataset)
        newEntropy = 0.0
        for val in uniqueFeatValues:
            subDataSet = splitDataSet(dataset, i, val)
            prob = len(subDataSet) / float(len(dataset))
            newEntropy += prob * calcShannonEnt(subDataSet)
        if (baseEntropy - newEntropy) > bestInfoGain:
            bestInfoGain = baseEntropy - newEntropy
            bestFeat = i
    return bestFeat


def splitDataSet(dataset, feat, values):
    """Return the rows whose column `feat` equals `values`, with that column removed."""
    retDataSet = []
    for featVec in dataset:
        if featVec[feat] == values:
            reducedFeatVec = featVec[:feat]
            reducedFeatVec.extend(featVec[feat + 1:])
            retDataSet.append(reducedFeatVec)
    return retDataSet


def calcShannonEnt(dataset):
    """Compute the Shannon entropy of the class-label column (last column)."""
    numEntries = len(dataset)
    labelCounts = {}
    for featVec in dataset:
        currentLabel = featVec[-1]
        labelCounts[currentLabel] = labelCounts.get(currentLabel, 0) + 1
    shannonEnt = 0.0
    for key in labelCounts:
        prob = float(labelCounts[key]) / numEntries
        if prob != 0:
            shannonEnt -= prob * log(prob, 2)
    return shannonEnt


def predict(tree, newObject):
    """Classify `newObject` (a {featureName: value} dict) by walking the tree."""
    while isinstance(tree, dict):
        # Py3 fix: dict.keys() is a view and cannot be indexed; take the
        # single feature name stored at this node.
        key = next(iter(tree))
        tree = tree[key][newObject[key]]
    return tree


if __name__ == '__main__':
    dataset, features = createDataSet()
    tree = treeGrowth(dataset, features)
    # Py3 fix: print is a function, not a statement.
    print(tree)
    print(predict(tree, {'no surfacing': 1, 'flippers': 1}))
    print(predict(tree, {'no surfacing': 1, 'flippers': 0}))
    print(predict(tree, {'no surfacing': 0, 'flippers': 1}))
    print(predict(tree, {'no surfacing': 0, 'flippers': 0}))
相关文章推荐
- python代码实现ID3决策树算法
- 基于ID3决策树算法的实现(Python版)
- python实现ID3决策树算法
- python实现ID3决策树算法
- 基于python的sklearn库的决策树算法基本实现
- 决策树算法Python代码实现
- 【用python实现《统计学习方法》】之决策树C4.5/ID3
- python实现决策树C4.5算法(在ID3基础上改进)
- 决策树算法实现(python)
- 决策树算法及python实现
- python实现决策树C4.5算法(ID3基础上改进)
- ID3决策树算法原理及C++实现
- 机器学习之决策树(ID3)算法与Python实现
- 决策树ID3和C4.5算法Python实现源码
- ID3决策树的算法原理与python实现
- 决策树算法实现应用【基于Python语言实现】
- 决策树算法原理及JAVA实现(ID3)
- 决策树算法原理及JAVA实现(ID3)
- Python实现决策树算法 C4.5和ID3算法
- C4.5决策树算法(Python实现)