机器学习实战 决策树
2017-07-01 17:47
363 查看
#!/usr/bin/env python
# -*- coding: utf-8 -*-
"""ID3 decision tree (Machine Learning in Action, ch. 3), Python 3 port.

Fixes applied to the scraped original:
- removed the garbled token ``a22e`` inside the ``featValues`` list
  comprehension in ``createTree`` (a copy/paste artifact that made the
  file a SyntaxError);
- Python 2 -> 3: ``print`` statements -> ``print()`` calls,
  ``dict.iteritems()`` -> ``dict.items()``, and ``inputTree.keys()[0]``
  (views are not subscriptable in Python 3) -> ``next(iter(inputTree))``.
"""
from math import log
import operator


def createDataSet():
    """Return the toy "is it a fish?" dataset and its feature labels.

    Each row is [can-survive-without-surfacing, has-flippers, class].
    """
    dataSet = [[1, 1, 'yes'],
               [1, 1, 'yes'],
               [1, 0, 'no'],
               [0, 1, 'no'],
               [0, 1, 'no']]
    labels = ['no surfacing', 'flippers']  # feature names for columns 0 and 1
    return dataSet, labels


def calcShannonEnt(dataSet):
    """Compute the Shannon entropy of the class labels (last column).

    H = -sum(p * log2(p)) over the class-label frequencies.
    """
    numEntries = len(dataSet)
    labelCounts = {}
    for featVec in dataSet:
        currentLabel = featVec[-1]  # class label is the last element
        labelCounts[currentLabel] = labelCounts.get(currentLabel, 0) + 1
    shannonEnt = 0.0
    for count in labelCounts.values():
        prob = count / numEntries
        shannonEnt -= prob * log(prob, 2)
    return shannonEnt


def splitDataSet(dataSet, axis, value):
    """Return the rows whose column ``axis`` equals ``value``.

    The matched column is removed from each returned row; input rows are
    not mutated.
    """
    retDataSet = []
    for featVec in dataSet:
        if featVec[axis] == value:
            # chop out the axis used for splitting
            retDataSet.append(featVec[:axis] + featVec[axis + 1:])
    return retDataSet


def chooseBestFeatureToSplit(dataSet):
    """Return the index of the feature with the highest information gain.

    Returns -1 when no feature yields a positive gain.
    """
    numFeatures = len(dataSet[0]) - 1  # last column is the class label
    baseEntropy = calcShannonEnt(dataSet)
    bestInfoGain = 0.0
    bestFeature = -1
    for i in range(numFeatures):
        uniqueVals = set(example[i] for example in dataSet)
        newEntropy = 0.0
        for value in uniqueVals:
            subDataSet = splitDataSet(dataSet, i, value)
            prob = len(subDataSet) / len(dataSet)
            newEntropy += prob * calcShannonEnt(subDataSet)
        infoGain = baseEntropy - newEntropy  # reduction in entropy
        if infoGain > bestInfoGain:
            bestInfoGain = infoGain
            bestFeature = i
    return bestFeature


def majorityCnt(classList):
    """Return the most frequent class label in ``classList`` (majority vote)."""
    classCount = {}
    for vote in classList:
        classCount[vote] = classCount.get(vote, 0) + 1
    sortedClassCount = sorted(classCount.items(),
                              key=operator.itemgetter(1), reverse=True)
    return sortedClassCount[0][0]


def createTree(dataSet, labels):
    """Recursively build an ID3 decision tree as nested dicts.

    Tree shape: {featureLabel: {featureValue: subtree-or-classLabel}}.

    NOTE: mutates ``labels`` (deletes the chosen feature name), matching
    the original book code — pass a copy if the caller still needs it.
    """
    classList = [example[-1] for example in dataSet]
    if classList.count(classList[0]) == len(classList):
        return classList[0]  # all samples share one class: leaf
    if len(dataSet[0]) == 1:
        return majorityCnt(classList)  # no features left: majority-vote leaf
    bestFeat = chooseBestFeatureToSplit(dataSet)
    bestFeatLabel = labels[bestFeat]
    myTree = {bestFeatLabel: {}}
    del labels[bestFeat]
    uniqueVals = set(example[bestFeat] for example in dataSet)
    for value in uniqueVals:
        subLabels = labels[:]  # copy so sibling branches don't share mutations
        myTree[bestFeatLabel][value] = createTree(
            splitDataSet(dataSet, bestFeat, value), subLabels)
    return myTree


def classify(inputTree, featLabels, testVec):
    """Walk ``inputTree`` using ``testVec``'s feature values.

    ``featLabels`` maps feature names to positions in ``testVec``.
    Returns the predicted class label.
    """
    firstStr = next(iter(inputTree))  # feature name at this node
    secondDict = inputTree[firstStr]
    featIndex = featLabels.index(firstStr)
    valueOfFeat = secondDict[testVec[featIndex]]
    if isinstance(valueOfFeat, dict):
        return classify(valueOfFeat, featLabels, testVec)  # descend
    return valueOfFeat  # leaf: class label


if __name__ == '__main__':
    myDat, labels = createDataSet()
    myTree = createTree(myDat, labels[:])  # copy: createTree mutates labels
    print("myTree:", myTree)
    print("class_label0:", classify(myTree, labels, [1, 0]))
    print("class_label1:", classify(myTree, labels, [1, 1]))
相关文章推荐
- 机器学习实战—决策树(二)
- 机器学习实战笔记3(决策树与随机森林)
- 机器学习实战-第三章(决策树)
- 机器学习实战笔记-决策树
- 机器学习实战--决策树分类
- 机器学习实战——第三章:决策树ID3/C4.5
- 机器学习实战 - 读书笔记(03) - 决策树
- 机器学习实战笔记--决策树
- 机器学习实战学习笔记(二):决策树
- 机器学习实战_03-决策树
- 机器学习实战python版第三章决策树代码理解
- 机器学习实战笔记——微软小冰的读心术与决策树
- 机器学习实战之决策树ID3算法
- 机器学习实战-决策树
- 机器学习实战——决策树
- 机器学习实战3--决策树
- 机器学习实战—ch03 .决策树(ID3算法)
- 机器学习实战-决策树
- 机器学习实战:决策树
- 机器学习实战-决策树