
Deep Learning Notes (1): Adding a New Attention LSTM Layer to Caffe, and a Close Reading of the LSTM Layer Code

2017-03-08 14:33
A summary of recent work: a close reading of the LSTM layer code, and adding a new Attention LSTM layer to Caffe.

LSTM layer

The key code is listed below; it helps to read it alongside Figure 1, which is taken from another blog post.

namespace caffe {

template <typename Dtype>
void LSTMLayer<Dtype>::RecurrentInputBlobNames(vector<string>* names) const {
names->resize(2);
(*names)[0] = "h_0";
(*names)[1] = "c_0";   //定义h_0,c_0 的输入
}

template <typename Dtype>
void LSTMLayer<Dtype>::RecurrentOutputBlobNames(vector<string>* names) const {
names->resize(2);
(*names)[0] = "h_" + this->int_to_str(this->T_);
(*names)[1] = "c_T";  // 定义输出,不同时刻的h_t
}

template <typename Dtype>
void LSTMLayer<Dtype>::OutputBlobNames(vector<string>* names) const {
names->resize(1);
(*names)[0] = "h";  // 最终输出h
}

template <typename Dtype>
void LSTMLayer<Dtype>::FillUnrolledNet(NetParameter* net_param) const {
const int num_output = this->layer_param_.recurrent_param().num_output();
CHECK_GT(num_output, 0) << "num_output must be positive";
const FillerParameter& weight_filler =
this->layer_param_.recurrent_param().weight_filler();
const FillerParameter& bias_filler =
this->layer_param_.recurrent_param().bias_filler(); // fillers for the weights W and the biases b

// Add generic LayerParameter's (without bottoms/tops) of layer types we'll
// use to save redundant code.
LayerParameter hidden_param;
hidden_param.set_type("InnerProduct");
hidden_param.mutable_inner_product_param()->set_num_output(num_output * 4);
hidden_param.mutable_inner_product_param()->set_bias_term(false);
hidden_param.mutable_inner_product_param()->set_axis(2);
hidden_param.mutable_inner_product_param()->
mutable_weight_filler()->CopyFrom(weight_filler);

LayerParameter biased_hidden_param(hidden_param);
biased_hidden_param.mutable_inner_product_param()->set_bias_term(true);
biased_hidden_param.mutable_inner_product_param()->
mutable_bias_filler()->CopyFrom(bias_filler);

LayerParameter sum_param;
sum_param.set_type("Eltwise");
sum_param.mutable_eltwise_param()->set_operation(
EltwiseParameter_EltwiseOp_SUM);

LayerParameter slice_param;
slice_param.set_type("Slice");
slice_param.mutable_slice_param()->set_axis(0);

LayerParameter split_param;
split_param.set_type("Split");

BlobShape input_shape;
input_shape.add_dim(1);  // e.g. 1 x 3 x 256: one time step, N_ = 3 parallel sequences (streams) in the batch, num_output = 256 set on the LSTM layer
input_shape.add_dim(this->N_);
input_shape.add_dim(num_output);

net_param->add_input("c_0");
net_param->add_input_shape()->CopyFrom(input_shape);

net_param->add_input("h_0");
net_param->add_input_shape()->CopyFrom(input_shape);

LayerParameter* cont_slice_param = net_param->add_layer();
cont_slice_param->CopyFrom(slice_param);
cont_slice_param->set_name("cont_slice");
cont_slice_param->add_bottom("cont");
cont_slice_param->mutable_slice_param()->set_axis(1);  // cont_t takes values 0 or 1; slice the continuation indicators into one blob per time step

// Add layer to transform all timesteps of x to the hidden state dimension.
//     W_xc_x = W_xc * x + b_c
{
LayerParameter* x_transform_param = net_param->add_layer();
x_transform_param->CopyFrom(biased_hidden_param);
x_transform_param->set_name("x_transform");
x_transform_param->add_param()->set_name("W_xc");
x_transform_param->add_param()->set_name("b_c");
x_transform_param->add_bottom("x");
x_transform_param->add_top("W_xc_x");//全连接层,W x X+ b ,实际上对应着下图的维度的变化,最后一维必须要变成1024
}                                  //这样才能和4个gate对应,因为W x X+ b 包含了4个gate,可以看做[[],[],[],[]].因此维度需要为1024,!

LayerParameter* x_slice_param = net_param->add_layer();
x_slice_param->CopyFrom(slice_param);
x_slice_param->add_bottom("W_xc_x");
x_slice_param->set_name("W_xc_x_slice");  // once the transform is done, slice W_xc_x along the time axis so that
// each time step gets its own blob; e.g. a 10-frame video is fed in one frame per time step
LayerParameter output_concat_layer;
output_concat_layer.set_name("h_concat");
output_concat_layer.set_type("Concat");
output_concat_layer.add_top("h");
output_concat_layer.mutable_concat_param()->set_axis(0); // the LSTM emits an h_t at every time step; whether you keep one of them or all of them depends on how you write your prototxt (see the sketch at the end of this post)

for (int t = 1; t <= this->T_; ++t) {    // unroll the LSTM over the time steps t = 1..T
string tm1s = this->int_to_str(t - 1);
string ts = this->int_to_str(t);

cont_slice_param->add_top("cont_" + ts);
x_slice_param->add_top("W_xc_x_" + ts);

// Add layers to flush the hidden state when beginning a new
// sequence, as indicated by cont_t.
//     h_conted_{t-1} := cont_t * h_{t-1}
//
// Normally, cont_t is binary (i.e., 0 or 1), so:
//     h_conted_{t-1} := h_{t-1} if cont_t == 1
//                       0   otherwise
{
LayerParameter* cont_h_param = net_param->add_layer();
cont_h_param->CopyFrom(sum_param);
cont_h_param->mutable_eltwise_param()->set_coeff_blob(true);
cont_h_param->set_name("h_conted_" + tm1s);
cont_h_param->add_bottom("h_" + tm1s);
cont_h_param->add_bottom("cont_" + ts);
cont_h_param->add_top("h_conted_" + tm1s); // h_conted_{t-1} is h_{t-1} gated by the sliced indicator cont_t (elementwise scaling by 0 or 1)

}

// Add layer to compute
//     W_hc_h_{t-1} := W_hc * h_conted_{t-1}
{
LayerParameter* w_param = net_param->add_layer();
w_param->CopyFrom(hidden_param);
w_param->set_name("transform_" + ts);
w_param->add_param()->set_name("W_hc");
w_param->add_bottom("h_conted_" + tm1s);
w_param->add_top("W_hc_h_" + tm1s);
w_param->mutable_inner_product_param()->set_axis(2); // compute W_hc * h_conted_{t-1}
}

// Add the outputs of the linear transformations to compute the gate input.
//     gate_input_t := W_hc * h_conted_{t-1} + W_xc * x_t + b_c
//                   = W_hc_h_{t-1} + W_xc_x_t + b_c
{
LayerParameter* input_sum_layer = net_param->add_layer();
input_sum_layer->CopyFrom(sum_param);
input_sum_layer->set_name("gate_input_" + ts);
input_sum_layer->add_bottom("W_hc_h_" + tm1s);
input_sum_layer->add_bottom("W_xc_x_" + ts);
if (this->static_input_) {
input_sum_layer->add_bottom("W_xc_x_static");
}
input_sum_layer->add_top("gate_input_" + ts);     // sum the linear terms to obtain the pre-activations of the 4 gates
}

// Add LSTMUnit layer to compute the cell & hidden vectors c_t and h_t.
// Inputs: c_{t-1}, gate_input_t = (i_t, f_t, o_t, g_t), cont_t
// Outputs: c_t, h_t
//     [ i_t' ]
//     [ f_t' ] := gate_input_t
//     [ o_t' ]
//     [ g_t' ]
//         i_t := \sigmoid[i_t']
//         f_t := \sigmoid[f_t']
//         o_t := \sigmoid[o_t']
//         g_t := \tanh[g_t']
//         c_t := cont_t * (f_t .* c_{t-1}) + (i_t .* g_t)
//         h_t := o_t .* \tanh[c_t]
{
LayerParameter* lstm_unit_param = net_param->add_layer();
lstm_unit_param->set_type("LSTMUnit");
lstm_unit_param->add_bottom("c_" + tm1s);
lstm_unit_param->add_bottom("gate_input_" + ts);
lstm_unit_param->add_bottom("cont_" + ts);
lstm_unit_param->add_top("c_" + ts);
lstm_unit_param->add_top("h_" + ts);
lstm_unit_param->set_name("unit_" + ts);  // the LSTMUnit applies the gate nonlinearities (sigmoid / tanh) and produces c_t and h_t
}
output_concat_layer.add_bottom("h_" + ts);  // collect the hidden state h_t of this time step for the final concat
}  // for (int t = 1; t <= this->T_; ++t)

{
LayerParameter* c_T_copy_param = net_param->add_layer();
c_T_copy_param->CopyFrom(split_param);
c_T_copy_param->add_bottom("c_" + this->int_to_str(this->T_));
c_T_copy_param->add_top("c_T");  // Split copies c_T so that several higher layers can share this blob
}
net_param->add_layer()->CopyFrom(output_concat_layer);  // finally add the Concat layer that stacks all h_t into the output blob h
}

INSTANTIATE_CLASS(LSTMLayer);
REGISTER_LAYER_CLASS(LSTM);  // register the LSTM layer with the layer factory

}  // namespace caffe
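
To make the blob bookkeeping above concrete, here is a worked shape example under the assumed values T_ = 10, N_ = 3 and num_output = 256 (the same 1x3x256 and 1024 figures used in the comments); D stands for the input feature dimension and depends on whatever layer feeds the LSTM:

x            10 x 3 x D       input sequence (one D-dimensional feature vector per time step and stream)
W_xc_x       10 x 3 x 1024    InnerProduct output; 4 * num_output, the packed gate pre-activations
W_xc_x_t      1 x 3 x 1024    one slice per time step t
h_0, c_0      1 x 3 x 256     initial recurrent state
h_t, c_t      1 x 3 x 256     LSTMUnit outputs at time step t
h            10 x 3 x 256     all h_t concatenated along axis 0, the layer's top blob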






That concludes the code analysis of the LSTM layer; the overall flow is fairly clear. However, with the rise of attention models, many architectures need an attention LSTM, and Caffe does not ship an ALSTM layer, so I wrote my own ALSTM layer. Since that write-up is fairly long, it will go in the next post.
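
For completeness, here is a minimal prototxt sketch of how an LSTM layer like the one above is typically declared in a network; the layer name, blob names and filler settings are placeholders rather than values taken from any particular model:

layer {
  name: "lstm1"
  type: "LSTM"
  bottom: "data"   # the input sequence, time-major: T steps x N streams x features
  bottom: "cont"   # 0/1 sequence-continuation indicators: 0 at the first step of a sequence, 1 otherwise
  top: "h"         # hidden states of all time steps, concatenated along the time axis
  recurrent_param {
    num_output: 256
    weight_filler { type: "uniform" min: -0.08 max: 0.08 }
    bias_filler { type: "constant" value: 0 }
  }
}

If only the final time step's hidden state is needed, the top blob h can be sliced along axis 0 in the prototxt; this is exactly the choice that the comment on the Concat layer above leaves to the network definition.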