您的位置:首页 > 编程语言

代码注释:机器学习实战第8章 预测数值型数据:回归

2017-04-03 10:24 676 查看
写在开头的话:在学习《机器学习实战》的过程中发现书中很多代码并没有注释,这对新入门的同学是一个挑战,特此贴出我对代码做出的注释,仅供参考,欢迎指正。

#coding:gbk
from numpy import *

def loadDataSet(fileName):
    """Load a tab-separated numeric data file.

    Each non-blank line holds float fields separated by tabs: the last field
    is the target value, the fields before it are the features. The number of
    features is taken from the first non-blank line.

    Args:
        fileName: Path of the text file to read.

    Returns:
        A tuple ``(dataMat, labelMat)``: a list of per-row feature lists and
        the list of matching target values. Both are empty for an empty file.

    Raises:
        ValueError: If a field cannot be parsed as a float.
        IndexError: If a row has fewer fields than the first non-blank line.
    """
    dataMat = []
    labelMat = []
    numFeat = None
    # One ``with`` block: the file is read in a single pass and always closed.
    with open(fileName) as fr:
        for line in fr:
            if not line.strip():
                # Blank (e.g. trailing) lines carry no data.
                continue
            if numFeat is None:
                numFeat = len(line.split('\t')) - 1
            curLine = line.strip().split('\t')
            dataMat.append([float(curLine[i]) for i in range(numFeat)])
            labelMat.append(float(curLine[-1]))
    return dataMat, labelMat

def standRegres(xArr, yArr):
    """Fit ordinary least-squares regression coefficients.

    Solves the normal equations ``(X^T X) ws = X^T y``.

    Args:
        xArr: Samples, one row of feature values per sample.
        yArr: Target values, one per sample.

    Returns:
        The regression coefficients as a column matrix, or ``None`` (after
        printing a message) when the determinant of ``X^T X`` is exactly zero.
    """
    # ``asmatrix`` is the function the removed ``mat`` alias used to point to.
    xMat = asmatrix(xArr)
    yMat = asmatrix(yArr).T
    xTx = xMat.T * xMat
    # A zero determinant means the matrix has no inverse.
    if linalg.det(xTx) == 0.0:
        print("This matrix is singular, cannot do inverse")
        return None
    # Solving the linear system avoids forming the explicit inverse.
    return asmatrix(linalg.solve(xTx, xMat.T * yMat))

def lwlr(testPoint, xArr, yArr, k=1.0):
    """Predict one point with locally weighted linear regression.

    Training sample ``j`` gets the Gaussian-kernel weight
    ``exp(-|x_j - testPoint|^2 / (2 * k^2))``, and a weighted least-squares
    fit is then evaluated at ``testPoint``.

    Args:
        testPoint: Feature values of the point to predict.
        xArr: Training samples, one row of feature values per sample.
        yArr: Training target values, one per sample.
        k: Kernel width; smaller values make the fit more local.

    Returns:
        The prediction as a 1x1 matrix, or ``None`` (after printing a message)
        when the determinant of the weighted ``X^T W X`` is exactly zero.
    """
    testMat = asmatrix(testPoint)
    xMat = asmatrix(xArr)
    yMat = asmatrix(yArr).T
    # Squared distance from every training row to the test point, as (m, 1).
    diffMat = xMat - testMat
    sqDist = multiply(diffMat, diffMat).sum(axis=1)
    # Only the diagonal of the weight matrix is non-zero, so keep it as a
    # column and scale rows by broadcasting instead of building an m x m matrix.
    weights = exp(sqDist / (-2.0 * k ** 2))
    xTx = xMat.T * multiply(weights, xMat)
    if linalg.det(xTx) == 0.0:
        print("This matrix is singular, cannot do inverse")
        return None
    ws = asmatrix(linalg.solve(xTx, xMat.T * multiply(weights, yMat)))
    return testMat * ws

def lwlrTest(testArr, xArr, yArr, k=1.0):
    """Run locally weighted linear regression on every row of ``testArr``.

    Args:
        testArr: Points to predict, one row of feature values per point.
        xArr: Training samples, one row of feature values per sample.
        yArr: Training target values, one per sample.
        k: Kernel width passed through to ``lwlr``.

    Returns:
        A 1-D array with one prediction per test row. An entry is NaN when
        ``lwlr`` returned ``None`` for that row (singular system).
    """
    m = shape(testArr)[0]
    yHat = zeros(m)
    for i in range(m):
        pred = lwlr(testArr[i], xArr, yArr, k)
        # ``lwlr`` yields a 1x1 matrix; pull out the scalar explicitly rather
        # than relying on implicit array-to-scalar conversion.
        yHat[i] = nan if pred is None else asarray(pred).ravel()[0]
    return yHat

def rssError(yArr, yHatArr):
    """Return the residual sum of squares between actual and predicted values.

    Args:
        yArr: Actual values (list, array or matrix).
        yHatArr: Predicted values, broadcast-compatible with ``yArr``.

    Returns:
        The sum of the element-wise squared differences.
    """
    # ``asarray`` lets plain lists be subtracted and makes ``** 2`` element-wise
    # even for matrix inputs, where ``**`` would otherwise be a matrix power.
    diff = asarray(yArr) - asarray(yHatArr)
    return (diff ** 2).sum()

def ridgeRegres(xMat, yMat, lam=0.2):
    """Compute ridge-regression coefficients.

    Solves ``(X^T X + lam * I) ws = X^T y``.

    Args:
        xMat: Sample matrix, one row per sample.
        yMat: Target values as a column with one row per sample.
        lam: Ridge penalty added to the diagonal of ``X^T X``.

    Returns:
        The regression coefficients as a column matrix, or ``None`` (after
        printing a message) when the determinant of ``X^T X + lam * I`` is
        exactly zero.
    """
    # No copy is made when the arguments are already matrices.
    xMat = asmatrix(xMat)
    yMat = asmatrix(yMat)
    xTx = xMat.T * xMat
    denom = xTx + eye(shape(xMat)[1]) * lam
    # Can still be singular, e.g. when lam is 0.
    if linalg.det(denom) == 0.0:
        print("This matrix is singular, cannot do inverse")
        return None
    # Solving the linear system avoids forming the explicit inverse.
    return asmatrix(linalg.solve(denom, xMat.T * yMat))

def ridgeTest(xArr, yArr, numTestPts=30):
    """Compute ridge-regression coefficients over a range of penalties.

    The targets are mean-centred and the features are standardised with
    ``regularize``. ``ridgeRegres`` is then run once for each penalty
    ``exp(i - 10)``, ``i = 0 .. numTestPts - 1``.

    Args:
        xArr: Samples, one row of feature values per sample.
        yArr: Target values, one per sample.
        numTestPts: Number of penalty values to evaluate.

    Returns:
        An array of shape ``(numTestPts, n_features)``; row ``i`` holds the
        coefficients for penalty ``exp(i - 10)``. A row is NaN when
        ``ridgeRegres`` returned ``None`` for that penalty.
    """
    xMat = asmatrix(xArr)
    yMat = asmatrix(yArr).T
    yMat = yMat - mean(yMat, 0)
    # Same standardisation (subtract column mean, divide by column variance)
    # that stageWise uses, via the shared helper.
    xMat = regularize(xMat)
    wMat = zeros((numTestPts, shape(xMat)[1]))
    for i in range(numTestPts):
        ws = ridgeRegres(xMat, yMat, exp(i - 10))
        if ws is None:
            wMat[i, :] = nan
            continue
        wMat[i, :] = ws.T
    return wMat

def regularize(xMat):
    """Standardise the columns of a matrix.

    The column mean is subtracted from each column and the result is divided
    by the column variance (not the standard deviation). The input is not
    modified.

    Args:
        xMat: Matrix or array with one row per sample.

    Returns:
        The standardised data. A column with zero variance comes back as all
        zeros instead of NaN.
    """
    inMeans = mean(xMat, 0)
    inVar = var(xMat, 0)
    # A constant column is all zeros after centring; dividing by 1 instead of
    # its zero variance avoids 0/0.
    safeVar = where(inVar == 0, 1.0, inVar)
    return (xMat - inMeans) / safeVar

def stageWise(xArr, yArr, eps=0.01, numIt=100):
    """Run forward stagewise linear regression.

    Starting from all-zero coefficients, each iteration tries changing every
    single coefficient by ``+eps`` and ``-eps`` and keeps the one change that
    gives the lowest residual sum of squares. The targets are mean-centred and
    the features are standardised with ``regularize`` first.

    Args:
        xArr: Samples, one row of feature values per sample.
        yArr: Target values, one per sample.
        eps: Step size applied to a coefficient per iteration.
        numIt: Number of iterations.

    Returns:
        An array of shape ``(numIt, n_features)``; row ``i`` holds the
        coefficients after iteration ``i``.
    """
    xMat = regularize(asmatrix(xArr))
    yMat = asmatrix(yArr).T
    yMat = yMat - mean(yMat, 0)
    n = shape(xMat)[1]
    returnMat = zeros((numIt, n))
    ws = zeros((n, 1))
    wsMax = ws.copy()
    for i in range(numIt):
        # Progress output: the coefficients at the start of this iteration.
        print(ws.T)
        lowestError = inf
        for j in range(n):
            for sign in [-1, 1]:
                wsTest = ws.copy()
                wsTest[j] += eps * sign
                yTest = xMat * wsTest
                rssE = rssError(yMat.A, yTest.A)
                if rssE < lowestError:
                    lowestError = rssE
                    wsMax = wsTest
        ws = wsMax.copy()
        returnMat[i, :] = ws.T
    return returnMat

from time import sleep
import json
import urllib2

def searchForSet(retX, retY, setNum, yr, numPce, origPrc):
    """Query the Google Shopping search API for one LEGO set and collect prices.

    Every inventory entry priced above half the original price is printed and
    appended to the output lists.

    NOTE: the original author's comment says this endpoint has been
    discontinued and the request fails. ``myAPIstr`` is a placeholder that has
    to be replaced with a real API key.

    Args:
        retX: List that receives ``[yr, numPce, newFlag, origPrc]`` rows.
        retY: List that receives the matching selling prices.
        setNum: LEGO set number used in the search query.
        yr: Release year of the set.
        numPce: Number of pieces in the set.
        origPrc: Original retail price of the set.
    """
    # Presumably a pause between requests to avoid rate limiting - confirm.
    sleep(10)
    myAPIstr = 'get from code.google.com'
    searchURL = ('https://www.googleapis.com/shopping/search/v1/public/products?'
                 'key=%s&country=US&q=lego+%d&alt=json' % (myAPIstr, setNum))
    pg = urllib2.urlopen(searchURL)
    try:
        retDict = json.loads(pg.read())
    finally:
        # Release the connection even if the body is not valid JSON.
        pg.close()
    # A response without an 'items' list contributes no rows.
    for i, currItem in enumerate(retDict.get('items', [])):
        try:
            newFlag = 1 if currItem['product']['condition'] == 'new' else 0
            for item in currItem['product']['inventories']:
                sellingPrice = item['price']
                if sellingPrice > origPrc * 0.5:
                    print("%d\t%d\t%d\t%f\t%f" %
                          (yr, numPce, newFlag, origPrc, sellingPrice))
                    retX.append([yr, numPce, newFlag, origPrc])
                    retY.append(sellingPrice)
        except Exception:
            # Best effort: report the malformed item and continue.
            print('problem with item %d' % i)

def setDataCollect(retX, retY):
    """Collect price data for a fixed list of LEGO sets.

    Calls ``searchForSet`` once per set, in the order listed, appending the
    results to ``retX`` and ``retY``.

    Args:
        retX: List that receives the feature rows.
        retY: List that receives the selling prices.
    """
    # (set number, release year, piece count, original price)
    legoSets = (
        (8288, 2006, 800, 49.99),
        (10030, 2002, 3096, 269.99),
        (10179, 2007, 5195, 499.99),
        (10181, 2007, 3428, 199.99),
        (10189, 2008, 5922, 299.99),
        (10196, 2009, 3263, 249.99),
    )
    for setNum, yr, numPce, origPrc in legoSets:
        searchForSet(retX, retY, setNum, yr, numPce, origPrc)
内容来自用户分享和网络整理,不保证内容的准确性,如有侵权内容,可联系管理员处理 点击这里给我发消息
标签:  机器学习 注释