
Machine Learning Foundations, Homework 4: Linear Regression Classifier with a Regularizer and Cross Validation

2015-10-30 21:09
#!/usr/bin/env python
# -*- coding: utf-8 -*-
"""
__title__ = 'main.py'
__author__ = 'w1d2s'
__mtime__ = '2015/10/30'
"""
from numpy import *
from RidgeReg import *
from Validation import *


def Data_Pretreatment(path):
    """Read a whitespace-separated data file: each line is a feature
    vector followed by the +1/-1 label in the last column."""
    rawData = open(path).readlines()
    dataNum = len(rawData)
    dataDim = len(rawData[0].strip().split()) - 1
    X = zeros([dataNum, dataDim])
    Y = zeros(dataNum)
    print(dataNum, dataDim)                          # number of samples and input dimension
    for dataIdx, line in enumerate(rawData):
        tempList = line.strip().split()
        Y[dataIdx] = int(tempList[dataDim])          # label is the last column
        X[dataIdx, :] = [float(v) for v in tempList[0: dataDim]]
    return (X, Y)


if __name__ == '__main__':
    Xtrain, Ytrain = Data_Pretreatment('train.dat')
    Xtest, Ytest = Data_Pretreatment('test.dat')
    # Single validation split (first 120 samples for training, the rest for validation):
    #(Wt, p) = Validate(Xtrain, Ytrain, 120, False)
    # 5-fold cross validation to pick log10(lambda):
    (Wt, p) = Cross_Validate(Xtrain, Ytrain, 5)

    rate = 10 ** p                                   # best lambda found by cross validation
    W = Ridge_Regression(Xtrain, Ytrain, rate)       # retrain on the full training set
    Ein = Err_Counter(Xtrain, Ytrain, W)
    Eout = Err_Counter(Xtest, Ytest, W)
    print('** Ein  : ' + str(float(Ein) / len(Ytrain)))
    print('** Eout : ' + str(float(Eout) / len(Ytest)))
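
Data_Pretreatment assumes each line of train.dat / test.dat is a whitespace-separated feature vector with a +1/-1 label in the last column. Under that same assumption, here is a minimal sketch of an equivalent loader built on numpy.loadtxt (the name load_data is made up for illustration, not part of the original code):

import numpy as np

def load_data(path):
    data = np.loadtxt(path)   # one sample per row, label in the last column
    X = data[:, :-1]          # feature columns
    Y = data[:, -1]           # +1 / -1 labels
    return X, Y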


#!/usr/bin/env python
# -*- coding: utf-8 -*-
"""
__title__ = 'RidgeReg.py'
__author__ = 'w1d2s'
__mtime__ = '2015/10/30'
"""
from numpy import *
from scipy import linalg


def Err_Counter(X, Y, W):
    """Count 0/1 classification errors of sign(z . W) against the labels Y."""
    (dataSize, dataDim) = X.shape
    Z = ones([dataSize, dataDim + 1])      # prepend the constant feature x0 = 1
    Z[:, 1: dataDim + 1] = X
    ErrCnt = 0
    for i in range(0, dataSize):
        if Y[i] * dot(Z[i, :], W) <= 0:    # wrong sign (or on the boundary) counts as an error
            ErrCnt = ErrCnt + 1
    return ErrCnt


def Ridge_Regression(X, Y, rate):
    """Closed-form ridge regression on the augmented inputs Z = [1, x];
    rate plays the role of lambda."""
    (dataSize, dataDim) = X.shape
    Z = ones([dataSize, dataDim + 1])
    Z[:, 1: dataDim + 1] = X
    Zt = transpose(Z)
    ZtZ = dot(Zt, Z)
    I = identity(len(ZtZ))
    P = linalg.inv(ZtZ + rate * I)
    W = dot(dot(P, Zt), Y)
    return W
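
Written out, Ridge_Regression computes the standard closed-form solution of L2-regularized linear regression on the augmented inputs Z = [1, x], with rate as the regularization parameter lambda:

\[
w_{\mathrm{reg}} \;=\; \arg\min_{w}\; \lVert Zw - y\rVert^{2} + \lambda\lVert w\rVert^{2}
\;=\; \bigl(Z^{T}Z + \lambda I\bigr)^{-1} Z^{T} y .
\]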


#!/usr/bin/env python
# -*- coding: utf-8 -*-
"""
__title__ = 'Validation.py'
__author__ = 'w1d2s'
__mtime__ = '2015/10/30'
"""
from numpy import *
from RidgeReg import *


def Data_Spliter(X, Y, Num4Train):
    """Split the data in order: the first Num4Train samples for training,
    the rest for validation."""
    Xtrain = X[0: Num4Train, :]
    Ytrain = Y[0: Num4Train]
    Xval = X[Num4Train:, :]
    Yval = Y[Num4Train:]
    return [Xtrain, Ytrain, Xval, Yval]


def Validate(X, Y, Num4Train, IsEt):
    """Pick log10(lambda) on a single train/validation split.
    If IsEt is True the training error Et is minimized (for comparison),
    otherwise the validation error Ev is minimized."""
    [Xt, Yt, Xv, Yv] = Data_Spliter(X, Y, Num4Train)
    minEt = len(Yt)                    # error counts can never exceed these
    minEv = len(Yv)
    Wt = zeros(Xt.shape[1] + 1)
    p = 0
    for logLambda in range(-10, 3):
        rate = 10 ** logLambda
        W = Ridge_Regression(Xt, Yt, rate)
        Et = Err_Counter(Xt, Yt, W)
        Ev = Err_Counter(Xv, Yv, W)
        if IsEt == True:
            if Et <= minEt:
                [Wt, minEt, p] = [W, Et, logLambda]
            print('== Et : ' + str(float(Et) / len(Yt)))
            print('== log lambda : ' + str(logLambda))
        else:
            if Ev <= minEv:
                [Wt, minEv, p] = [W, Ev, logLambda]
            print('== Ev : ' + str(float(Ev) / len(Yv)))
            print('== log lambda : ' + str(logLambda))
    Et = Err_Counter(Xt, Yt, Wt)
    Ev = Err_Counter(Xv, Yv, Wt)
    print('log lambda : ' + str(p))
    print('Et : ' + str(float(Et) / len(Yt)))
    print('Ev : ' + str(float(Ev) / len(Yv)))
    return (Wt, p)


def Data_Spliter2(X, Y, folds):
    """Split the data in order into `folds` equal chunks
    (assumes len(Y) is divisible by folds)."""
    dataSize = len(Y)
    inc = dataSize // folds
    Xlist = []
    Ylist = []
    for idx in range(0, dataSize, inc):
        Xlist.append(X[idx: idx + inc, :])
        Ylist.append(Y[idx: idx + inc])
    return (Xlist, Ylist)


def Cross_Validate(X, Y, folds):
    """V-fold cross validation: for each candidate lambda, train on
    folds-1 chunks, validate on the held-out chunk, and keep the lambda
    with the smallest average validation error count."""
    (Xlist, Ylist) = Data_Spliter2(X, Y, folds)
    (foldSize, foldDim) = Xlist[0].shape
    Xt = zeros([foldSize * (folds - 1), foldDim])
    Yt = zeros(foldSize * (folds - 1))
    Wt = zeros(foldDim + 1)
    p = 0
    minEcv = 10000
    for logLambda in range(-10, 3):
        rate = 10 ** logLambda
        EcvSum = 0
        for V in range(0, folds):
            beg = 0
            for idx in range(0, folds):
                if idx == V:                     # held-out validation fold
                    Xv = Xlist[idx]
                    Yv = Ylist[idx]
                else:                            # stack the remaining folds for training
                    Xt[beg: beg + foldSize, :] = Xlist[idx]
                    Yt[beg: beg + foldSize] = Ylist[idx]
                    beg = beg + foldSize
            W = Ridge_Regression(Xt, Yt, rate)
            Ecv = Err_Counter(Xv, Yv, W)
            EcvSum = EcvSum + Ecv
        if float(EcvSum) / folds <= minEcv:
            minEcv = float(EcvSum) / folds
            (Wt, p) = (W, logLambda)             # W is the last fold's model; main.py retrains on all data
    print('log lambda : ' + str(p))
    print('Ecv : ' + str(minEcv))
    return (Wt, p)
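
What Cross_Validate computes, written out: for each candidate lambda = 10^q with q in {-10, -9, ..., 2}, the V-fold cross-validation error is

\[
E_{cv}(\lambda) \;=\; \frac{1}{V} \sum_{v=1}^{V} E_{val}^{(v)}\!\bigl(w_{\lambda}^{-(v)}\bigr),
\]

where w_lambda^{-(v)} is the ridge solution trained on all folds except the v-th and E_val^{(v)} is its 0/1 error count on the held-out v-th fold. The lambda with the smallest E_cv (here V = 5) is returned, and main.py retrains on the full training set with it before reporting Ein and Eout.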