机器学习-决策树算法代码详解
2017-11-20 14:57
411 查看
from math import log
import operator
# NOTE(review): the original also did `from imp import reload`; it was unused
# and the `imp` module is deprecated (removed in Python 3.12), so it is
# dropped as part of the Python-3 fix.


def createDataSet():
    """Return a toy data set and the names of its two features.

    Each row is [feature0, feature1, classLabel]; the last column is the
    class label ('yes' / 'no').
    """
    dataSet = [[1, 1, 'yes'],
               [1, 1, 'yes'],
               [1, 0, 'no'],
               [0, 1, 'no'],
               [0, 1, 'no']]
    labels = ['no surfacing', 'flippers']  # change to discrete values
    return dataSet, labels


def calcShannonEnt(dataSet):
    """Compute the Shannon entropy of the class labels in *dataSet*.

    dataSet is an n*m list of rows; the class label is the last element of
    each row.  Returns -sum(p * log2(p)) over the distinct labels.
    """
    numEntries = len(dataSet)          # n, the number of rows
    labelCounts = {}                   # label -> number of occurrences
    for featVec in dataSet:
        currentLabel = featVec[-1]     # class label is the last column
        if currentLabel not in labelCounts:
            labelCounts[currentLabel] = 0
        labelCounts[currentLabel] += 1
    shannonEnt = 0.0
    for key in labelCounts:
        # relative frequency of each distinct label = probability estimate
        prob = float(labelCounts[key]) / numEntries
        shannonEnt -= prob * log(prob, 2)   # log base 2
    return shannonEnt


def splitDataSet(dataSet, axis, value):
    """Return the rows whose feature at index *axis* equals *value*,
    with that feature column removed.

    Rows are rebuilt via slice + extend, so the input dataSet is not
    mutated ([1,2,3].extend([4,5]) -> [1,2,3,4,5], unlike append which
    would nest the list).
    """
    retDataSet = []
    for featVec in dataSet:
        if featVec[axis] == value:
            reducedFeatVec = featVec[:axis]          # chop out the split axis
            reducedFeatVec.extend(featVec[axis + 1:])
            retDataSet.append(reducedFeatVec)
    return retDataSet


def chooseBestFeatureToSplit(dataSet):
    """Return the index of the feature whose split yields the largest
    information gain (largest reduction in Shannon entropy).

    The last column of dataSet is the class label and is never considered
    as a split candidate.  Returns -1 if no split improves on zero gain.
    """
    numFeatures = len(dataSet[0]) - 1        # last column is the label
    baseEntropy = calcShannonEnt(dataSet)    # entropy before any split
    bestInfoGain = 0.0
    bestFeature = -1
    for i in range(numFeatures):
        # all values feature i takes, one entry per row (a whole column)
        featList = [example[i] for example in dataSet]
        uniqueVals = set(featList)
        # expected entropy after splitting on feature i: weighted sum of
        # the entropies of each value-subset
        newEntropy = 0.0
        for value in uniqueVals:
            subDataSet = splitDataSet(dataSet, i, value)
            prob = len(subDataSet) / float(len(dataSet))
            newEntropy += prob * calcShannonEnt(subDataSet)
        infoGain = baseEntropy - newEntropy  # reduction in entropy
        if infoGain > bestInfoGain:
            bestInfoGain = infoGain
            bestFeature = i                  # feature i is the best so far
    return bestFeature                       # returns an integer index


def majorityCnt(classList):
    """Return the most common class label in *classList* (majority vote)."""
    classCount = {}
    for vote in classList:
        if vote not in classCount:
            classCount[vote] = 0
        classCount[vote] += 1
    # BUG FIX: dict.iteritems() is Python 2 only; dict.items() is the
    # Python 3 equivalent.
    sortedClassCount = sorted(classCount.items(),
                              key=operator.itemgetter(1), reverse=True)
    return sortedClassCount[0][0]


def createTree(dataSet, labels):
    """Recursively build a decision tree as nested dicts.

    The tree's shape is {featureName: {featureValue: subtree_or_label}},
    e.g. {'sunshine': {'rain': ..., 'mud': ..., 'wind': ...}}.

    NOTE: *labels* is mutated (the chosen feature's name is deleted);
    pass a copy if the caller needs to keep it.
    """
    classList = [example[-1] for example in dataSet]
    # stop condition 1: every remaining row has the same class label
    if classList.count(classList[0]) == len(classList):
        return classList[0]
    # stop condition 2: no features left to split on -> majority vote
    if len(dataSet[0]) == 1:
        return majorityCnt(classList)
    bestFeat = chooseBestFeatureToSplit(dataSet)  # int: best column index
    bestFeatLabel = labels[bestFeat]              # its human-readable name
    myTree = {bestFeatLabel: {}}                  # the tree node dict
    del labels[bestFeat]
    # every value the chosen feature takes in the current data
    featValues = [example[bestFeat] for example in dataSet]
    uniqueVals = set(featValues)
    for value in uniqueVals:
        # copy labels: lists are passed by reference in Python, so each
        # recursive branch must get its own copy to avoid clobbering
        subLabels = labels[:]
        myTree[bestFeatLabel][value] = createTree(
            splitDataSet(dataSet, bestFeat, value), subLabels)
    return myTree


def classify(inputTree, featLabels, testVec):
    """Classify *testVec* by walking the generated decision tree.

    featLabels gives the feature name for each position of testVec, so the
    tree's root key can be mapped back to an index into testVec.
    """
    # BUG FIX: in Python 3, dict.keys() returns a view that does not
    # support indexing (`inputTree.keys()[0]` raises TypeError);
    # next(iter(...)) fetches the single root key instead.
    firstStr = next(iter(inputTree))
    secondDict = inputTree[firstStr]
    featIndex = featLabels.index(firstStr)
    key = testVec[featIndex]
    valueOfFeat = secondDict[key]
    if isinstance(valueOfFeat, dict):   # internal node: keep recursing
        classLabel = classify(valueOfFeat, featLabels, testVec)
    else:                               # leaf node: the final class label
        classLabel = valueOfFeat
    return classLabel
相关文章推荐
- 机器学习----K-近邻算法(Python代码详解)
- 机器学习实战代码详解(五)Logistic回归
- 机器学习-朴素贝叶斯分类代码详解
- 决策树算法代码实现及注释(代码来自于机器学习实战)
- 机器学习实战代码详解(九)树回归
- 算法代码[置顶] 机器学习实战之KNN算法详解
- python机器学习4—2代码详解及修改
- 机器学习实战代码详解(七)利用AdaBoost元算法提高分类性能
- [机器学习]详解分类算法--决策树算法
- 机器学习-KNN算法代码详解
- 机器学习经典算法-logistic回归代码详解
- 机器学习实战学习,代码详解(K-近邻算法)
- 静态代码块(static{})详解
- Java 代码编译和执行的整个过程详解
- [置顶] 【算法 机器学习】R语言做朴素贝叶斯和决策树算法
- mysql 字段as详解及实例代码
- Java语言实现数据结构栈代码详解
- Struts2单选按钮详解及枚举类型的转换代码示例
- java 实现 stack详解及实例代码
- Java 两种延时thread和timer详解及实例代码