
Logistic Regression: Implementing a Non-linear Decision Boundary in Python

2018-01-29 10:05
First, an earlier post covered the logistic regression algorithm in detail; if you haven't read it, follow the link or search for it on my blog.

Second, the previous post walked through a Python implementation of logistic regression with a linear decision boundary, working through an example so you can build a classifier yourself. Again, follow the link or search my blog if you missed it.

Today, building on the theory and practice of those two posts, we implement logistic regression where the decision boundary is a non-linear function.

(1) First, prepare the dataset and save it as data2.txt. The data is listed below so you can put it into a text file yourself, or you can download it from my GitHub.

[[ 0.051267   0.69956    1.       ]
[-0.092742   0.68494    1.       ]
[-0.21371    0.69225    1.       ]
[-0.375      0.50219    1.       ]
[-0.51325    0.46564    1.       ]
[-0.52477    0.2098     1.       ]
[-0.39804    0.034357   1.       ]
[-0.30588   -0.19225    1.       ]
[ 0.016705  -0.40424    1.       ]
[ 0.13191   -0.51389    1.       ]
[ 0.38537   -0.56506    1.       ]
[ 0.52938   -0.5212     1.       ]
[ 0.63882   -0.24342    1.       ]
[ 0.73675   -0.18494    1.       ]
[ 0.54666    0.48757    1.       ]
[ 0.322      0.5826     1.       ]
[ 0.16647    0.53874    1.       ]
[-0.046659   0.81652    1.       ]
[-0.17339    0.69956    1.       ]
[-0.47869    0.63377    1.       ]
[-0.60541    0.59722    1.       ]
[-0.62846    0.33406    1.       ]
[-0.59389    0.005117   1.       ]
[-0.42108   -0.27266    1.       ]
[-0.11578   -0.39693    1.       ]
[ 0.20104   -0.60161    1.       ]
[ 0.46601   -0.53582    1.       ]
[ 0.67339   -0.53582    1.       ]
[-0.13882    0.54605    1.       ]
[-0.29435    0.77997    1.       ]
[-0.26555    0.96272    1.       ]
[-0.16187    0.8019     1.       ]
[-0.17339    0.64839    1.       ]
[-0.28283    0.47295    1.       ]
[-0.36348    0.31213    1.       ]
[-0.30012    0.027047   1.       ]
[-0.23675   -0.21418    1.       ]
[-0.06394   -0.18494    1.       ]
[ 0.062788  -0.16301    1.       ]
[ 0.22984   -0.41155    1.       ]
[ 0.2932    -0.2288     1.       ]
[ 0.48329   -0.18494    1.       ]
[ 0.64459   -0.14108    1.       ]
[ 0.46025    0.012427   1.       ]
[ 0.6273     0.15863    1.       ]
[ 0.57546    0.26827    1.       ]
[ 0.72523    0.44371    1.       ]
[ 0.22408    0.52412    1.       ]
[ 0.44297    0.67032    1.       ]
[ 0.322      0.69225    1.       ]
[ 0.13767    0.57529    1.       ]
[-0.0063364  0.39985    1.       ]
[-0.092742   0.55336    1.       ]
[-0.20795    0.35599    1.       ]
[-0.20795    0.17325    1.       ]
[-0.43836    0.21711    1.       ]
[-0.21947   -0.016813   1.       ]
[-0.13882   -0.27266    1.       ]
[ 0.18376    0.93348    0.       ]
[ 0.22408    0.77997    0.       ]
[ 0.29896    0.61915    0.       ]
[ 0.50634    0.75804    0.       ]
[ 0.61578    0.7288     0.       ]
[ 0.60426    0.59722    0.       ]
[ 0.76555    0.50219    0.       ]
[ 0.92684    0.3633     0.       ]
[ 0.82316    0.27558    0.       ]
[ 0.96141    0.085526   0.       ]
[ 0.93836    0.012427   0.       ]
[ 0.86348   -0.082602   0.       ]
[ 0.89804   -0.20687    0.       ]
[ 0.85196   -0.36769    0.       ]
[ 0.82892   -0.5212     0.       ]
[ 0.79435   -0.55775    0.       ]
[ 0.59274   -0.7405     0.       ]
[ 0.51786   -0.5943     0.       ]
[ 0.46601   -0.41886    0.       ]
[ 0.35081   -0.57968    0.       ]
[ 0.28744   -0.76974    0.       ]
[ 0.085829  -0.75512    0.       ]
[ 0.14919   -0.57968    0.       ]
[-0.13306   -0.4481     0.       ]
[-0.40956   -0.41155    0.       ]
[-0.39228   -0.25804    0.       ]
[-0.74366   -0.25804    0.       ]
[-0.69758    0.041667   0.       ]
[-0.75518    0.2902     0.       ]
[-0.69758    0.68494    0.       ]
[-0.4038     0.70687    0.       ]
[-0.38076    0.91886    0.       ]
[-0.50749    0.90424    0.       ]
[-0.54781    0.70687    0.       ]
[ 0.10311    0.77997    0.       ]
[ 0.057028   0.91886    0.       ]
[-0.10426    0.99196    0.       ]
[-0.081221   1.1089     0.       ]
[ 0.28744    1.087      0.       ]
[ 0.39689    0.82383    0.       ]
[ 0.63882    0.88962    0.       ]
[ 0.82316    0.66301    0.       ]
[ 0.67339    0.64108    0.       ]
[ 1.0709     0.10015    0.       ]
[-0.046659  -0.57968    0.       ]
[-0.23675   -0.63816    0.       ]
[-0.15035   -0.36769    0.       ]
[-0.49021   -0.3019     0.       ]
[-0.46717   -0.13377    0.       ]
[-0.28859   -0.060673   0.       ]
[-0.61118   -0.067982   0.       ]
[-0.66302   -0.21418    0.       ]
[-0.59965   -0.41886    0.       ]
[-0.72638   -0.082602   0.       ]
[-0.83007    0.31213    0.       ]
[-0.72062    0.53874    0.       ]
[-0.59389    0.49488    0.       ]
[-0.48445    0.99927    0.       ]
[-0.0063364  0.99927    0.       ]
[ 0.63265   -0.030612   0.       ]]
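
Note that the listing above is the array as printed by NumPy; data2.txt itself should be plain comma-separated text with one sample per line, since the loading code below uses delimiter=",". For example, the first few lines of the file would look like:

0.051267,0.69956,1
-0.092742,0.68494,1
-0.21371,0.69225,1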
(2) With the dataset in hand, let's look at how it is distributed in two dimensions.

from numpy import loadtxt, where
from pylab import scatter, show, legend, xlabel, ylabel

# load the dataset
data = loadtxt("F:/PythonCode/LogisticRegression/data2.txt", delimiter=",")
# data is a 2-D array of shape 118*3
print(data)

X = data[:, 0:2]
# X holds the features, shape 118*2
# print(X.shape)
y = data[:, 2]
# y holds the labels, shape 118*1
# print(y)

pos = where(y == 1)
# pos: indices of the samples whose label is 1
# print(pos)
neg = where(y == 0)
# neg: indices of the samples whose label is 0
# print(neg)

# scatter(x coordinates, y coordinates, marker='o' draws circular points, c='b' colors them blue)
scatter(X[pos, 0], X[pos, 1], marker='o', c='b')
scatter(X[neg, 0], X[neg, 1], marker='x', c='r')

# in the plot, o marks the positive class (y==1) and x marks the negative class (y==0)
legend(["y==1", "y==0"])
show()


The resulting scatter plot:



(3) Train a logistic regression model with regularization.

Let's first look at the cost function and gradient after adding regularization.
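
In matrix form, with h = g(Xθ) denoting the sigmoid of the linear scores, they can be written as follows (the bias parameter θ₀ is not regularized); this is exactly what costFunctionReg and compute_grad implement below:

J(\theta) = -\frac{1}{m}\Big[\, y^{T}\log(h) + (1-y)^{T}\log(1-h) \,\Big] + \frac{\lambda}{2m}\sum_{j=1}^{n}\theta_j^{2}

\frac{\partial J}{\partial \theta_0} = \frac{1}{m}\sum_{i=1}^{m}\big(h^{(i)}-y^{(i)}\big)x_0^{(i)}, \qquad
\frac{\partial J}{\partial \theta_j} = \frac{1}{m}\sum_{i=1}^{m}\big(h^{(i)}-y^{(i)}\big)x_j^{(i)} + \frac{\lambda}{m}\theta_j \quad (j \ge 1)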





The code to train the model and compute its accuracy:

from numpy import *
import matplotlib.pyplot as plt
import numpy as np
from scipy.optimize import minimize

filename = "F:/PythonCode/LogisticRegression/data2.txt"

def loadDataSet():
    # load the dataset
    data = loadtxt(filename, delimiter=",")
    # split out X (features) and y (labels, kept as a column vector)
    y = np.c_[data[:, 2]]
    X = data[:, 0:2]
    return data, X, y

def map_feature(x1, x2):
    '''
    Maps the two input features to polynomial features.
    Returns a new feature array with more features of
    X1, X2, X1**2, X2**2, X1*X2, X1*X2**2, etc. (28 columns for degree 6)
    '''
    x1.shape = (x1.size, 1)
    x2.shape = (x2.size, 1)
    degree = 6
    # the first column is the bias term (all ones)
    mapped_fea = ones(shape=(x1[:, 0].size, 1))
    for i in range(1, degree + 1):
        for j in range(i + 1):
            r = (x1 ** (i - j)) * (x2 ** j)
            mapped_fea = append(mapped_fea, r, axis=1)
    return mapped_fea

# the sigmoid (logistic) function
def sigmoid(X):
    '''Compute sigmoid function'''
    den = 1.0 + exp(-1.0 * X)
    gz = 1.0 / den
    return gz

# regularized cost function (theta[0], the bias term, is not regularized)
def costFunctionReg(theta, X, y, l):
    m = y.size
    h = sigmoid(X.dot(theta))

    J = -1.0 * (1.0 / m) * (np.log(h).T.dot(y) + np.log(1 - h).T.dot(1 - y)) \
        + (l / (2.0 * m)) * np.sum(np.square(theta[1:]))

    if np.isnan(J[0]):
        return np.inf
    return J[0]

# gradient of the regularized cost (the regularization term skips theta[0])
def compute_grad(theta, X, y, l):
    m = y.size
    h = sigmoid(X.dot(theta.reshape(-1, 1)))

    grad = (1.0 / m) * X.T.dot(h - y) + (l / m) * np.r_[[[0]], theta[1:].reshape(-1, 1)]

    return grad.flatten()

# train the model by minimizing the regularized cost
# (despite the name, this uses scipy's minimize rather than hand-written gradient descent)
def gradAscent(XX, y, l):
    initial_theta = np.zeros(XX.shape[1])
    cost = costFunctionReg(initial_theta, XX, y, l)
    print('Cost: \n', cost)
    # optimize costFunctionReg, using compute_grad as the analytic gradient
    res2 = minimize(costFunctionReg, initial_theta, args=(XX, y, l), jac=compute_grad, options={'maxiter': 3000})
    return res2

def plotBestFit(data, res2, X, accuracy, l, axes):  # plot the final classification result
    # scatter plot of the samples
    plotData(data, 'Microchip Test 1', 'Microchip Test 2', 'y = 1', 'y = 0', axes=None)
    # draw the decision boundary: evaluate the model on a grid and contour at h = 0.5
    x1_min, x1_max = X[:, 0].min(), X[:, 0].max()
    x2_min, x2_max = X[:, 1].min(), X[:, 1].max()
    xx1, xx2 = np.meshgrid(np.linspace(x1_min, x1_max), np.linspace(x2_min, x2_max))
    h = sigmoid(map_feature(xx1.ravel(), xx2.ravel()).dot(res2.x))
    h = h.reshape(xx1.shape)
    if axes is None:
        axes = plt.gca()
    axes.contour(xx1, xx2, h, [0.5], linewidths=1, colors='g')
    axes.set_title('Train accuracy {}% with Lambda = {}'.format(np.round(accuracy, decimals=2), l))
    plt.show()

def plotData(data, label_x, label_y, label_pos, label_neg, axes):
    # boolean masks for the positive and negative samples
    neg = data[:, 2] == 0
    pos = data[:, 2] == 1
    if axes is None:
        axes = plt.gca()
    axes.scatter(data[pos][:, 0], data[pos][:, 1], marker='+', c='k', s=60, linewidth=2, label=label_pos)
    axes.scatter(data[neg][:, 0], data[neg][:, 1], c='y', s=60, label=label_neg)
    axes.set_xlabel(label_x)
    axes.set_ylabel(label_y)
    axes.legend(frameon=True, fancybox=True)

def predict(theta, X):
    '''Predict whether the label is 0 or 1
    using the learned logistic regression parameters'''
    m, n = X.shape
    p = zeros(shape=(m, 1))
    h = sigmoid(X.dot(theta.T))
    # threshold the predicted probability at 0.5
    for it in range(0, h.shape[0]):
        if h[it] > 0.5:
            p[it, 0] = 1
        else:
            p[it, 0] = 0
    return p

def main():
    data, X, y = loadDataSet()
    # map the two original features to degree-6 polynomial features
    mapped_fea = map_feature(X[:, 0], X[:, 1])

    # Decision boundary: let's see what happens when the regularization coefficient lambda is too large or too small
    # Lambda = 0   : no regularization at all, so the model overfits
    # Lambda = 1   : about right
    # Lambda = 100 : the regularization term is far too aggressive, so essentially no decision boundary is fitted
    l = 1

    res = gradAscent(mapped_fea, y, l)
    print(res)

    # training accuracy
    accuracy = y[where(predict(res.x, mapped_fea) == y)].size / float(y.size) * 100.0
    # plot the decision boundary
    plotBestFit(data, res, X, accuracy, l, axes=None)

if __name__ == '__main__':
    main()
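
As an optional sanity check, the same model can be fit with scikit-learn (this is only a sketch under the assumption that scikit-learn is installed; it is not part of the original script). PolynomialFeatures(degree=6) mirrors map_feature, and the C parameter roughly plays the role of 1/λ:

# Minimal cross-check with scikit-learn (assumption: scikit-learn is available)
from sklearn.preprocessing import PolynomialFeatures
from sklearn.linear_model import LogisticRegression

data, X, y = loadDataSet()
poly = PolynomialFeatures(degree=6, include_bias=False)  # sklearn adds its own intercept term
XX = poly.fit_transform(X)
clf = LogisticRegression(C=1.0, max_iter=3000).fit(XX, y.ravel())
print('sklearn train accuracy: {:.2f}%'.format(clf.score(XX, y.ravel()) * 100))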

Here the regularization parameter is λ = 1; let's look at the resulting decision boundary:



Next, look at λ = 0, i.e. the cost function with no regularization term, which overfits:



Finally, look at λ = 100: the penalty on the trained parameters in the cost function is so heavy that the model underfits.
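
To reproduce all three cases in one run, a minimal sketch: inside main(), replace the fixed l = 1 with a loop over the three values, reusing gradAscent, predict, and plotBestFit from above.

    # Sketch: compare lambda = 0, 1 and 100 with the functions defined above
    for l in (0.0, 1.0, 100.0):
        res = gradAscent(mapped_fea, y, l)
        accuracy = y[where(predict(res.x, mapped_fea) == y)].size / float(y.size) * 100.0
        plotBestFit(data, res, X, accuracy, l, axes=None)  # one figure window per lambda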



This concludes the fundamentals and hands-on practice of the logistic regression series. The corresponding code and dataset can be downloaded from my GitHub.

GitHub code and dataset: https://github.com/Microstrong0305/machine_learning/tree/master/noLineLogisticRegression

Reference:
http://blog.csdn.net/han_xiaoyang/article/details/49123419