
Andrew Ng Deep Learning Notes, Part 1: Neural Networks and Deep Learning

2018-03-01 16:05

1. Some computations in gradient descent



First box: the derivative of the logistic regression cost with respect to z, dJ/dz, can be taken as a - y.
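This follows from the chain rule applied to the single-sample loss L(a, y) = -[y log(a) + (1 - y) log(1 - a)] with a = σ(z); a short derivation added here for reference:

\frac{\partial L}{\partial a} = -\frac{y}{a} + \frac{1-y}{1-a}, \qquad \frac{\partial a}{\partial z} = a(1-a)

\frac{\partial L}{\partial z} = \frac{\partial L}{\partial a}\cdot\frac{\partial a}{\partial z} = -y(1-a) + (1-y)a = a - y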


Second box: the gradient of an earlier variable is the gradient of the later variable multiplied by its coefficient.

Also, computing dZ and dA does not require the "/m" step.

# dA/dA_pre = (dA/dZ * dZ/dA_pre) = (dA/dZ * W); dropping the "dA/" prefix for brevity, the product stays the same
dA_pre = np.dot(W.T, dZ)


In other words, an earlier variable "amplifies" its influence on the final result through its coefficient. For example:
y = 2a + b; let v = 2a + b, so y = v, dy/dv = 1, and dy/da = (dy/dv) * (dv/da) = 1 * 2 = 2
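A tiny numeric check of this (an illustrative sketch, not part of the original notes):

import numpy as np

# y = 2a + b; verify dy/da = 2 with a central finite difference
def y(a, b):
    return 2 * a + b

a0, b0, eps = 1.0, 3.0, 1e-6
dy_da = (y(a0 + eps, b0) - y(a0 - eps, b0)) / (2 * eps)
print(np.isclose(dy_da, 2.0))  # True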


Regarding the dimension bookkeeping, suppose the network is [5, 3, 2, 1] and there are m samples. Then:

W1 is [3,5], W2 is [2,3], W3 is [1,2]; Z1 and A1 are [3,m], Z2 and A2 are [2,m], Z3 and A3 are [1,m], so:

dZ3 = A3 - Y -> [1,m]
dW3[1,2] = dZ3[1,m] * A2[2,m].T
dA2[2,m] = W3[1,2].T * dZ3[1,m]

Notice how the position and transposition of dZ differ between the two formulas:
1. The rows and columns of W are the unit counts of the current layer and the previous layer, independent of m, while the rows of A equal the previous layer's unit count; so when computing dW, A is transposed so that its rows become columns and its m-dimension is summed away.
2. The columns of A and of Z are always m, so when computing dA, Z is left untransposed and supplies the columns.
3. In short: Z is the link and is never transposed.
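A minimal shape check of this bookkeeping for the [5, 3, 2, 1] network (random placeholder values with m = 10; biases omitted since they don't affect the shapes):

import numpy as np

m = 10
X = np.random.randn(5, m)                                  # A0: [5, m]
W1, W2, W3 = np.random.randn(3, 5), np.random.randn(2, 3), np.random.randn(1, 2)
A1 = np.tanh(np.dot(W1, X))                                # [3, m]
A2 = np.tanh(np.dot(W2, A1))                               # [2, m]
A3 = 1 / (1 + np.exp(-np.dot(W3, A2)))                     # [1, m]
Y = np.random.randint(0, 2, (1, m))

dZ3 = A3 - Y                                               # [1, m]
dW3 = np.dot(dZ3, A2.T) / m                                # [1, 2]
dA2 = np.dot(W3.T, dZ3)                                    # [2, m]
print(dZ3.shape, dW3.shape, dA2.shape)                     # (1, 10) (1, 2) (2, 10)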

2. Choosing an activation function



The sigmoid function is generally used only in the output layer of a binary classifier; tanh behaves similarly. Their drawback is that when |z| is large the derivative becomes very small, so training is very slow.

ReLU is the most widely used activation function in neural networks. Its advantage is that for positive z the derivative stays large (constant at 1), so training is faster, and in practice z rarely sits in the negative region, where the gradient is zero (ReLU is non-differentiable only at z = 0).
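The difference is easy to see by evaluating the gradients directly (a small illustrative sketch, not code from the course):

import numpy as np

z = np.array([-10.0, -1.0, 0.5, 10.0])

sigmoid = 1 / (1 + np.exp(-z))
d_sigmoid = sigmoid * (1 - sigmoid)   # about 4.5e-5 at |z| = 10: almost no gradient, so learning stalls
d_tanh = 1 - np.tanh(z) ** 2          # saturates the same way for large |z|
d_relu = (z > 0).astype(float)        # 0 or 1: a constant slope of 1 whenever z > 0

print(d_sigmoid)
print(d_tanh)
print(d_relu)                         # [0. 0. 1. 1.]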

3. Computing the loss and the accuracy

Generally, the loss function L refers to a single sample, while the cost function J extends the loss over the whole training set; for convenience, "loss function" is often used loosely to mean the cost as well.

Cross-entropy cost function:

J(\theta) = -\frac{1}{m}\sum_{i=1}^{m}\left[\, y^{(i)}\log\big(h_\theta(x^{(i)})\big) + \big(1-y^{(i)}\big)\log\big(1-h_\theta(x^{(i)})\big) \right]

m = Y.shape[1]
cost = -1 / m * np.sum(np.dot(Y, np.log(AL).T) + np.dot(1 - Y, np.log(1 - AL).T))
cost = np.squeeze(cost)


Accuracy, where AL is the output of the last layer:

acc = (np.dot(y_train, AL.T) + np.dot(1 - y_train, 1 - AL.T)) / y_train.shape[1] * 100
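Note that this formula scores the raw probabilities in AL; an alternative (my own variant, not from the notes) is to threshold AL at 0.5 first and count exact label matches:

# AL and y_train as above; hypothetical thresholded-accuracy variant
predictions = (AL > 0.5).astype(int)          # hard 0/1 predictions
acc = np.mean(predictions == y_train) * 100   # percentage of correctly classified samples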


4. Building the neural network model



Code example:

import numpy as np
import dnn_utils_v2
import testCases_v2


def initialize_deep_parameters(layer_dims=np.array([5, 4, 1])):
    """
    Initialize the parameters of a network with an arbitrary number of layers.
    :param layer_dims: array of unit counts per layer, input and output included.
        For example, [2, 4, 1] gives an input layer of size 2, W1 [4,2], b1 [4,1], W2 [1,4], b2 [1,1];
        the final 1 means a single output unit, i.e. binary classification.
    :return: dictionary of parameters W1, b1, ..., WL, bL
    """
    parameters = {}
    num_layers = len(layer_dims)
    for l in range(1, num_layers):
        parameters['W' + str(l)] = np.random.rand(layer_dims[l], layer_dims[l - 1]) * 0.01
        parameters['b' + str(l)] = np.zeros((layer_dims[l], 1))

    # Sanity check
    # for key in parameters.keys():
    #     print(key, parameters[key].shape)

    return parameters

def linear_forward(A, W, b):
    """
    Linear step of one layer.
    :param A: activations of the previous layer (X for the first layer)
    :param W: weight matrix of the current layer
    :param b: bias vector of the current layer
    :return: Z and cache; cache keeps a copy of (A, W, b) for back propagation,
        while Z and A carry the forward propagation itself
    """
    Z = np.dot(W, A) + b
    assert (Z.shape == (W.shape[0], A.shape[1]))
    cache = (A, W, b)
    return Z, cache

def linear_activation_forward(A_pre, W, b, activation='relu'):
    """
    Linear step followed by the activation function.
    :param A_pre: output of the previous layer
    :param W: weight matrix of the current layer
    :param b: bias vector of the current layer
    :param activation: 'sigmoid' or 'relu'
    :return: A and cache (linear cache plus activation cache)
    """
    Z, linear_cache = linear_forward(A_pre, W, b)

    if activation == 'sigmoid':
        A, activation_cache = dnn_utils_v2.sigmoid(Z)
        assert (A.shape == (W.shape[0], A_pre.shape[1]))
        cache = (linear_cache, activation_cache)
        return A, cache

    elif activation == 'relu':
        A, activation_cache = dnn_utils_v2.relu(Z)
        assert (A.shape == (W.shape[0], A_pre.shape[1]))
        cache = (linear_cache, activation_cache)
        return A, cache

def L_model_forward(X, parameters):
    """
    Forward propagation through an L-layer model.
    :param X: input data of shape [n_x, m]
    :param parameters: output of initialize_deep_parameters
    :return: AL and the list of caches
    """
    caches = []
    A = X
    L = len(parameters) // 2  # number of layers: each layer has two parameters, Wl and bl
    for l in range(1, L):
        A_pre = A
        A, cache = linear_activation_forward(A_pre, parameters['W' + str(l)], parameters['b' + str(l)],
                                             activation='relu')
        caches.append(cache)
    # last layer
    AL, cache = linear_activation_forward(A, parameters['W' + str(L)], parameters['b' + str(L)], activation='sigmoid')
    caches.append(cache)

    assert (AL.shape == (1, X.shape[1]))
    return AL, caches

def compute_cost(AL, Y):
    """
    Compute the cost.
    :param AL: output of the last layer's activation, i.e. the final prediction
    :param Y: true labels of shape [1, m]
    :return: cross-entropy cost
    """
    m = Y.shape[1]
    cost = -1 / m * np.sum(np.dot(Y, np.log(AL).T) + np.dot(1 - Y, np.log(1 - AL).T))
    cost = np.squeeze(cost)
    return cost

def linear_backward(dZ, cache):
    """
    Back-propagate the gradients through the linear step.
    :param dZ: gradient of the cost with respect to Z of the current layer
    :param cache: (A_pre, W, b) stored during the forward pass
    :return: dA_pre, dW, db
    """
    A_pre, W, b = cache
    m = A_pre.shape[1]

    dW = np.dot(dZ, A_pre.T) / m
    db = np.sum(dZ, axis=1, keepdims=True) / m
    # dA/dA_pre = (dA/dZ * dZ/dA_pre) = (dA/dZ * W); dropping the "dA/" prefix for brevity, the product stays the same
    dA_pre = np.dot(W.T, dZ)
    return dA_pre, dW, db

def linear_activation_backward(dA, cache, activation='relu'):
    """
    Gradient computation through the activation function.
    :param dA: gradient of the cost with respect to the current layer's activation
    :param cache: (linear_cache, activation_cache) from the forward pass
    :param activation: 'sigmoid' or 'relu'
    :return: dA_pre, dW, db
    """
    linear_cache, activation_cache = cache
    if activation == 'relu':
        dZ = dnn_utils_v2.relu_backward(dA, activation_cache)
        dA_pre, dW, db = linear_backward(dZ, linear_cache)
        return dA_pre, dW, db
    elif activation == 'sigmoid':
        dZ = dnn_utils_v2.sigmoid_backward(dA, activation_cache)
        dA_pre, dW, db = linear_backward(dZ, linear_cache)
        return dA_pre, dW, db

def L_model_backward(AL, Y, caches):
    """
    Full backward propagation.
    :param AL: output of the forward pass
    :param Y: true labels
    :param caches: list of caches from L_model_forward
    :return: dictionary of gradients
    """
    grads = {}
    L = len(caches)  # number of layers
    m = AL.shape[1]
    Y = Y.reshape(AL.shape)

    dAL = - (np.divide(Y, AL) - np.divide(1 - Y, 1 - AL))
    current_cache = caches[L - 1]
    grads['dA' + str(L)], grads['dW' + str(L)], grads['db' + str(L)] = linear_activation_backward(
        dAL, current_cache, activation='sigmoid')

    for l in reversed(range(L - 1)):
        current_cache = caches[l]
        dA_pre, dW, db = linear_activation_backward(grads['dA' + str(l + 2)], current_cache, activation='relu')
        grads['dA' + str(l + 1)] = dA_pre
        grads['dW' + str(l + 1)] = dW
        grads['db' + str(l + 1)] = db

    return grads

def update_parameters(parametes, grads, learning_rate=0.1):
    """
    Gradient-descent parameter update.
    :param parametes: current parameters
    :param grads: gradients from L_model_backward
    :param learning_rate: step size
    :return: updated parameters
    """
    L = len(parametes) // 2  # each layer has two parameters, W and b
    for l in range(L):
        parametes['W' + str(l + 1)] = parametes['W' + str(l + 1)] - learning_rate * grads['dW' + str(l + 1)]
        parametes['b' + str(l + 1)] = parametes['b' + str(l + 1)] - learning_rate * grads['db' + str(l + 1)]
    return parametes

if __name__ == '__main__':
    np.random.seed(1)

    # Linear unit test
    # A, W, b = testCases_v2.linear_forward_test_case()
    # Z, linear_cache = linear_forward(A, W, b)
    # print(Z)

    # Activation function test
    # A_pre, W, b = testCases_v2.linear_activation_forward_test_case()
    # A, cache = linear_activation_forward(A_pre, W, b, activation='sigmoid')
    # print('sigmoid: ' + str(A))
    # A, cache = linear_activation_forward(A_pre, W, b, activation='relu')
    # print('relu: ' + str(A))

    # Forward propagation test
    # X, parameters = testCases_v2.L_model_forward_test_case()
    # AL, caches = L_model_forward(X, parameters)
    # print(str(AL), str(len(caches)))

    # Cost function test
    # Y, AL = testCases_v2.compute_cost_test_case()
    # print(compute_cost(AL, Y))

    # Gradient test
    # dZ, linear_cache = testCases_v2.linear_backward_test_case()
    # print(linear_backward(dZ, linear_cache))
    # AL, cache = testCases_v2.linear_activation_backward_test_case()
    # print(linear_activation_backward(AL, cache, activation='sigmoid'))
    # print(linear_activation_backward(AL, cache, activation='relu'))

    # Full backward propagation test
    # AL, Y, caches = testCases_v2.L_model_backward_test_case()
    # grads = L_model_backward(AL, Y, caches)
    # print(grads)

    # Parameter update test
    parameters, grads = testCases_v2.update_parameters_test_case()
    parameters = update_parameters(parameters, grads, learning_rate=0.1)
    print(parameters)


Usage:

import h5py
import numpy as np
import matplotlib.pyplot as plt

import dnn_app_utils_v2
import deep_neural_network


def load_data():
    train_dataset = h5py.File('datasets/train_catvnoncat.h5', "r")
    train_set_x_orig = np.array(train_dataset["train_set_x"][:])  # your train set features
    train_set_y_orig = np.array(train_dataset["train_set_y"][:])  # your train set labels

    test_dataset = h5py.File('datasets/test_catvnoncat.h5', "r")
    test_set_x_orig = np.array(test_dataset["test_set_x"][:])  # your test set features
    test_set_y_orig = np.array(test_dataset["test_set_y"][:])  # your test set labels

    classes = np.array(test_dataset["list_classes"][:])  # the list of classes

    train_set_y_orig = train_set_y_orig.reshape((1, train_set_y_orig.shape[0]))
    test_set_y_orig = test_set_y_orig.reshape((1, test_set_y_orig.shape[0]))

    return train_set_x_orig, train_set_y_orig, test_set_x_orig, test_set_y_orig, classes

def L_layer_model_train(x_train, y_train, layers_dims, train_num=2000, learning_rate=0.02, print_cost=True):
    """
    Train the multi-layer model.
    :param x_train: training features of shape [n_x, m]
    :param y_train: training labels of shape [1, m]
    :param layers_dims: array with the dimensions of the whole network
    :param train_num: number of gradient-descent iterations
    :param learning_rate: step size
    :param print_cost: whether to print and plot the cost
    :return: trained parameters
    """
    parameters = dnn_app_utils_v2.initialize_parameters_deep(layers_dims)
    costs = []
    for i in range(1, train_num + 1):
        AL, cache = deep_neural_network.L_model_forward(x_train, parameters)
        cost = deep_neural_network.compute_cost(AL, y_train)
        grads = deep_neural_network.L_model_backward(AL, y_train, cache)
        parameters = deep_neural_network.update_parameters(parameters, grads, learning_rate)

        if print_cost and i % 100 == 0:
            costs.append(cost)
            acc = (np.dot(y_train, AL.T) + np.dot(1 - y_train, 1 - AL.T)) / y_train.shape[1] * 100
            print("num {} cost is: {} and acc is : {}".format(i, cost, np.squeeze(acc)))

    if print_cost:
        plt.plot(np.squeeze(costs))
        plt.ylabel('cost')
        plt.xlabel('iterations (per hundreds)')
        plt.title("Learning rate = " + str(learning_rate))
        plt.show()

    return parameters

def L_layers_model_test(x_test, y_test, parameters):
    """
    Evaluate the multi-layer model.
    :param x_test: test features
    :param y_test: test labels
    :param parameters: trained parameters
    :return: cost and accuracy
    """
    y_pre, cache = deep_neural_network.L_model_forward(x_test, parameters)
    cost = deep_neural_network.compute_cost(y_pre, y_test)
    acc = (np.dot(y_test, y_pre.T) + np.dot(1 - y_test, 1 - y_pre.T)) / y_test.shape[1] * 100
    print(cost, np.squeeze(acc))
    return cost, acc

if __name__ == '__main__':
    np.random.seed(1)

    # Load the data from file
    train_x_orig, train_y, test_x_orig, test_y, classes = deep_neural_network.load_data()
    print(train_x_orig.shape, test_x_orig.shape)

    # Reshape [m, px, px, 3] into [px * px * 3, m] and normalize
    x_train = train_x_orig.reshape(train_x_orig.shape[0], -1).T
    # the sample count ends up as the column dimension and stays unchanged
    x_test = test_x_orig.reshape(test_x_orig.shape[0], -1).T
    x_train = x_train / 255
    x_test = x_test / 255
    print(x_train.shape)

    # Two-layer model
    # parameters = two_layer_model_train(x_train, train_y)
    # two_layer_model_test(x_train, train_y, parameters)
    # two_layer_model_test(x_test, test_y, parameters)

    # Arbitrary-depth model
    layer_dims = np.array([12288, 20, 10, 5, 1])
    parameters = L_layer_model_train(x_train, train_y, layer_dims, learning_rate=0.01)
    L_layers_model_test(x_test, test_y, parameters)