
[Machine Learning][Source Code] Machine Learning in Action, Ch. 7: AdaBoost

2018-03-09 20:53
The code is saved here as a Python 3 implementation; for the detailed explanation, see Machine Learning in Action (Peter Harrington).
boost.py : 
from numpy import *

def loadSimpData():
    datMat=matrix([[1. ,2.1],
                   [2. ,1.1],
                   [1.3,1. ],
                   [1. ,1. ],
                   [2. ,1. ]])
    classLabels=[1.0,1.0,-1.0,-1.0,1.0]
    return datMat,classLabels

#7-1 Decision stump generating functions
def stumpClassify(dataMatrix,dimen,threshVal,threshIneq):  # data set, feature index, threshold, inequality direction
    retArray=ones((shape(dataMatrix)[0],1))
    if threshIneq=='lt':
        retArray[dataMatrix[:,dimen]<=threshVal]=-1.0
    else:
        retArray[dataMatrix[:,dimen]>threshVal]=-1.0
    return retArray

def buildStump(dataArr,classLabels,D):  # data set, class labels, weight vector
    dataMatrix=mat(dataArr)
    labelMat=mat(classLabels).T
    m,n=shape(dataMatrix)  # samples, features
    numSteps=10.0  # 10 steps across each feature's value range
    bestStump={}
    bestClasEst=mat(zeros((m,1)))
    minError=inf

    # note: three nested for loops
    for i in range(n):
        # loop 1 -- over each feature (dimension)
        rangeMin=dataMatrix[:,i].min();rangeMax=dataMatrix[:,i].max()
        stepSize=(rangeMax-rangeMin)/numSteps
        for j in range(-1,int(numSteps)+1):
            # loop 2 -- over each step (threshold)
            for inequal in ['lt','gt']:
                # loop 3 -- over each inequality direction
                threshVal=rangeMin+float(j)*stepSize  # threshold
                predictedVals=stumpClassify(dataMatrix,i,threshVal,inequal)
                errArr=mat(ones((m,1)))  # error vector, all entries start at 1
                errArr[predictedVals==labelMat]=0  # set to 0 where a sample is classified correctly
                weightedError=D.T*errArr  # weight vector * error vector = weighted error
                #print("split: dim %d, thresh %.2f, thresh ineqal: %s, the weighted error is %.3f"%(i,threshVal,inequal,weightedError))

                if weightedError<minError:
                    minError=weightedError
                    bestClasEst=predictedVals.copy()  # class estimates for the smallest error
                    bestStump['dim']=i  # feature index for the smallest error
                    bestStump['thresh']=threshVal  # threshold for the smallest error
                    bestStump['ineq']=inequal  # inequality direction for the smallest error

    return bestStump,minError,bestClasEst  # return: bestStump dict {dim, thresh, ineq}, weighted error, class estimates
'''
d:
cd pythonwp
cd ch07
python
import boost
from importlib import reload
reload(boost)
datMat,classLabels=boost.loadSimpData()
from numpy import *
D=mat(ones((5,1))/5)  # initial weight vector
boost.buildStump(datMat,classLabels,D)
'''
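# Note: the quantity buildStump minimizes over (dim, thresh, ineq) is the weighted 0/1 error
#     epsilon = sum_i D_i * 1(h(x_i) != y_i)  =  D.T * errArr
# which is exactly the weightedError computed inside the triple loop above.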

#7-2 AdaBoost training based on decision stumps
def adaBoostTrainDS(dataArr,classLabels,numIt=40):  # data set, labels, number of iterations
    weakClassArr=[]  # collection of weak classifiers
    m=shape(dataArr)[0]  # number of samples
    D=mat(ones((m,1))/m)  # initial weight (probability) vector
    aggClassEst=mat(zeros((m,1)))  # aggregated class estimates

    for i in range(numIt):
        bestStump,error,classEst=buildStump(dataArr,classLabels,D)  # 7-1: best stump dict, weighted error, class estimates for round i
        #print("D:",D.T)  # weight vector of round i
        alpha=float( 0.5*log( (1.0-error)/max(error,1e-16) ) )  # coefficient of stump i
        bestStump['alpha']=alpha  # the 7-1 bestStump dict already holds dim, thresh, ineq
        weakClassArr.append(bestStump)
        #print("classEst:",classEst)  # class estimates of stump i

        expon=multiply(-1*alpha*mat(classLabels).T,classEst)  # update the weight vector for round i+1
        D=multiply(D,exp(expon))
        D=D/D.sum()

        aggClassEst+=alpha*classEst  # additive model of the first i stumps -- aggregated class estimates
        #print("aggClassEst:",aggClassEst.T)
        aggErrors=multiply( sign(aggClassEst)!=mat(classLabels).T, ones((m,1)) )  # misclassified samples
        errorRate=aggErrors.sum()/m  # training error rate
        #print("total error:",errorRate,"\n")
        if errorRate==0.0: break
    return weakClassArr,aggClassEst  # return: weak classifiers, aggregated class estimates
'''
classifierArray,aggClassEst=boost.adaBoostTrainDS(datMat,classLabels,9)
'''
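# For reference, the two updates computed in each iteration above are, in the code's notation:
#     alpha = 0.5 * ln((1 - error) / error)        (error floored at 1e-16 to avoid division by zero)
#     D_i  <-  D_i * exp(-alpha * y_i * h(x_i)),   then D is renormalized so it sums to 1
# so samples the current stump misclassifies (y_i * h(x_i) = -1) get more weight in the next round.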

#7-3 AdaBoost classification function
def adaClassify(datToClass,classifierArr):  # data to classify, collection of weak classifiers (from 7-2)
    dataMatrix=mat(datToClass)
    m=shape(dataMatrix)[0]  # number of samples
    aggClassEst=mat(zeros((m,1)))
    for i in range(len(classifierArr)):
        classEst=stumpClassify(dataMatrix,classifierArr[i]['dim'],\
                               classifierArr[i]['thresh'],classifierArr[i]['ineq'])  # classify with the i-th weak classifier (from 7-1)
        aggClassEst += classifierArr[i]['alpha'] * classEst  # weight by alpha to accumulate the class estimates
        print( aggClassEst )
    return sign(aggClassEst)
'''
reload(boost)
datArr,labelArr=boost.loadSimpData()
classifierArr,aggClassEst=boost.adaBoostTrainDS(datArr,labelArr,30) #train
boost.adaClassify( [[5,5],[0,0]],classifierArr) #test
'''
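# For reference, the classifier applied above is the additive model
#     H(x) = sign( sum_t alpha_t * h_t(x) )
# where aggClassEst accumulates alpha_t * classEst_t over every stump in classifierArr.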

# 7-4 Adaptive data loading function
def loadDataSet(fileName):
    numFeat=len(open(fileName).readline().split('\t'))  # number of columns
    dataMat=[];labelMat=[]
    fr=open(fileName)

    for line in fr.readlines():
        lineArr=[]
        curLine=line.strip().split('\t')  # read and split line by line; all columns but the last are features
        for i in range(numFeat-1):
            lineArr.append(float(curLine[i]))
        dataMat.append(lineArr)
        labelMat.append( float(curLine[-1]) )
    return dataMat,labelMat
'''
reload(boost)
datArr,labelArr=boost.loadDataSet('horseColicTraining2.txt')
classifierArr,aggClassEst=boost.adaBoostTrainDS(datArr,labelArr,10) #train the model
testArr,testLabelArr=boost.loadDataSet('horseColicTest2.txt')
prediction10=boost.adaClassify(testArr,classifierArr)
errArr=mat(ones((67,1)))  # the horse colic test set has 67 samples
error=errArr[prediction10!=mat(testLabelArr).T].sum()  # number of misclassified test samples
errorRate=error/67  # test error rate
print(errorRate)
'''
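# A small variant (not from the book): the same test error computed without hard-coding the
# test-set size 67, assuming testArr, prediction10 and testLabelArr are defined as above.
'''
m_test=len(testArr)
errArr=mat(ones((m_test,1)))
errorRate=errArr[prediction10!=mat(testLabelArr).T].sum()/m_test
print(errorRate)
'''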

#7-5 ROC curve and AUC computation
def plotROC(predStrengths,classLabels):  # aggregated class estimates from AdaBoost training (from 7-2), true class labels
    import matplotlib.pyplot as plt
    cur = (1.0,1.0)  # cursor, starting coordinates (x, y)
    ySum = 0.0  # accumulator for the AUC: each small rectangle has width xStep, and ySum sums their heights
    numPosClas = sum(array(classLabels)==1.0)  # number of positive examples
    yStep = 1/float(numPosClas)  # y-axis step -- 1 / number of positives
    xStep = 1/float(len(classLabels)-numPosClas)  # x-axis step -- 1 / number of negatives

    sortedIndicies = predStrengths.argsort()  # indices that sort predStrengths in ascending order (like R's order())

    fig = plt.figure()
    fig.clf()
    ax = plt.subplot(111)

    #loop through all the values, drawing a line segment at each point
    for index in sortedIndicies.tolist()[0]:  # sortedIndicies is a 1 x m matrix; take its row as a list
        if classLabels[index] == 1.0:  # actual positive: step down along the y-axis
            delX = 0; delY = yStep
        else:  # actual negative: step left along the x-axis
            delX = xStep; delY = 0
            ySum += cur[1]
        #draw line from cur to (cur[0]-delX,cur[1]-delY)
        ax.plot([cur[0],cur[0]-delX],[cur[1],cur[1]-delY], c='b')
        cur = (cur[0]-delX,cur[1]-delY)

    ax.plot([0,1],[0,1],'b--')  # diagonal dashed reference line from (0,0) to (1,1)
    plt.xlabel('False positive rate'); plt.ylabel('True positive rate')
    plt.title('ROC curve for AdaBoost horse colic detection system')
    ax.axis([0,1,0,1])
    plt.show()
    print("the Area Under the Curve is: ",ySum*xStep)  # sum of the rectangle areas, i.e. the AUC
'''
from importlib import reload
reload(boost)
datArr,labelArr=boost.loadDataSet('horseColicTraining2.txt')
classifierArr,aggClassEst=boost.adaBoostTrainDS(datArr,labelArr,50) #train the model
boost.plotROC(aggClassEst.T,labelArr)
'''
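# Optional cross-check (not from the book), assuming scikit-learn is installed: roc_auc_score
# on the same aggregated class estimates should roughly match the AUC printed by plotROC.
'''
from numpy import array
from sklearn.metrics import roc_auc_score
print( roc_auc_score(array(labelArr), array(aggClassEst).ravel()) )
'''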