
Walking through the Caffe code: the adadelta, adagrad, adam, nesterov, and rmsprop solvers (Part 16)

There is a good blog post describing gradient-descent methods, and I covered the solver's optimization methods in an earlier post.

This part covers the five update methods in the new version of the Caffe solver:

adagrad_solver.cpp:

#include <vector>

#include "caffe/sgd_solvers.hpp"

namespace caffe {

#ifndef CPU_ONLY
template <typename Dtype>
void adagrad_update_gpu(int N, Dtype* g, Dtype* h, Dtype delta,
Dtype local_rate);
#endif

template <typename Dtype>
void AdaGradSolver<Dtype>::ComputeUpdateValue(int param_id, Dtype rate) {
CHECK(Caffe::root_solver());
const vector<Blob<Dtype>*>& net_params = this->net_->learnable_params();
const vector<float>& net_params_lr = this->net_->params_lr();
Dtype delta = this->param_.delta();
Dtype local_rate = rate * net_params_lr[param_id];
//The update is built from the routines in math_functions; the new and old versions of Caffe differ slightly here, but the underlying functions are much the same.
switch (Caffe::mode()) {
case Caffe::CPU: {
// compute square of gradient in update
caffe_powx(net_params[param_id]->count(),
net_params[param_id]->cpu_diff(), Dtype(2),
this->update_[param_id]->mutable_cpu_data());//update[i] =cpu_diff[i] ^ 2

// update history
caffe_add(net_params[param_id]->count(),
this->update_[param_id]->cpu_data(),
this->history_[param_id]->cpu_data(),
this->history_[param_id]->mutable_cpu_data());//history[i] = history[i] + update[i]

// prepare update
caffe_powx(net_params[param_id]->count(),
this->history_[param_id]->cpu_data(), Dtype(0.5),
this->update_[param_id]->mutable_cpu_data());//update[i] = sqrt(history[i])

caffe_add_scalar(net_params[param_id]->count(),
delta, this->update_[param_id]->mutable_cpu_data());//update[i] += delta

caffe_div(net_params[param_id]->count(),
net_params[param_id]->cpu_diff(),
this->update_[param_id]->cpu_data(),
this->update_[param_id]->mutable_cpu_data());//update[i] = cpu_diff[i] / update[i]

// scale and copy
//y = alpha*x + beta*y: here cpu_diff[i] = local_rate * update[i]
caffe_cpu_axpby(net_params[param_id]->count(), local_rate,
this->update_[param_id]->cpu_data(), Dtype(0),
net_params[param_id]->mutable_cpu_diff());
break;
}
case Caffe::GPU: {
#ifndef CPU_ONLY
adagrad_update_gpu(net_params[param_id]->count(),
net_params[param_id]->mutable_gpu_diff(),
this->history_[param_id]->mutable_gpu_data(), delta, local_rate);
#else
NO_GPU;
#endif
break;
}
default:
LOG(FATAL) << "Unknown caffe mode: " << Caffe::mode();
}
}

INSTANTIATE_CLASS(AdaGradSolver);
REGISTER_SOLVER_CLASS(AdaGrad);

}  // namespace caffe
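To see the arithmetic without the caffe_* wrappers, here is a minimal per-element sketch of the same step (adagrad_step, g, and h are hypothetical names standing in for the diff and history blobs, not part of Caffe):

#include <cmath>
#include <cstddef>

// AdaGrad, element-wise: accumulate the squared gradient, then scale
// each gradient by its running root-mean-square.
void adagrad_step(std::size_t n, float* g, float* h,
                  float delta, float local_rate) {
  for (std::size_t i = 0; i < n; ++i) {
    h[i] += g[i] * g[i];                                   // history += diff^2
    g[i] = local_rate * g[i] / (std::sqrt(h[i]) + delta);  // scaled update
  }
}
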
adadelta_solver.cpp:

#include <vector>

#include "caffe/sgd_solvers.hpp"

namespace caffe {

template <typename Dtype>
void AdaDeltaSolver<Dtype>::AdaDeltaPreSolve() {
// Add the extra history entries for AdaDelta after those from
// SGDSolver::PreSolve
const vector<Blob<Dtype>*>& net_params = this->net_->learnable_params();
for (int i = 0; i < net_params.size(); ++i) {
const vector<int>& shape = net_params[i]->shape();
this->history_.push_back(
shared_ptr<Blob<Dtype> >(new Blob<Dtype>(shape)));
}
}

#ifndef CPU_ONLY
template <typename Dtype>
void adadelta_update_gpu(int N, Dtype* g, Dtype* h, Dtype* h2, Dtype momentum,
Dtype delta, Dtype local_rate);
#endif

template <typename Dtype>
void AdaDeltaSolver<Dtype>::ComputeUpdateValue(int param_id, Dtype rate) {
const vector<Blob<Dtype>*>& net_params = this->net_->learnable_params();
const vector<float>& net_params_lr = this->net_->params_lr();
Dtype delta = this->param_.delta();
Dtype momentum = this->param_.momentum();
Dtype local_rate = rate * net_params_lr[param_id];
size_t update_history_offset = net_params.size();
switch (Caffe::mode()) {
case Caffe::CPU: {
caffe_powx(net_params[param_id]->count(),
net_params[param_id]->cpu_diff(), Dtype(2),
this->update_[param_id]->mutable_cpu_data());//update[i] =cpu_diff[i] ^ 2

caffe_cpu_axpby(net_params[param_id]->count(), Dtype(1) - momentum,
this->update_[param_id]->cpu_data(), momentum,
this->history_[param_id]->mutable_cpu_data());//history[i]=(1 - momentum)*update[i]+momentum*history[i]

caffe_set(net_params[param_id]->count(), delta,
this->temp_[param_id]->mutable_cpu_data());//temp=delta

caffe_add(net_params[param_id]->count(),
this->temp_[param_id]->cpu_data(),
this->history_[update_history_offset + param_id]->cpu_data(),
this->update_[param_id]->mutable_cpu_data());//update[i] = delta + history[update_history_offset + i] (squared-update history)

caffe_add(net_params[param_id]->count(),
this->temp_[param_id]->cpu_data(),
this->history_[param_id]->cpu_data(),
this->temp_[param_id]->mutable_cpu_data());//temp[i] = delta + history[i] (squared-gradient history)

caffe_div(net_params[param_id]->count(),
this->update_[param_id]->cpu_data(),
this->temp_[param_id]->cpu_data(),
this->update_[param_id]->mutable_cpu_data());//update[i]=update[i]/temp

caffe_powx(net_params[param_id]->count(),
this->update_[param_id]->cpu_data(), Dtype(0.5),
this->update_[param_id]->mutable_cpu_data());//update[i] = sqrt(update[i])

caffe_mul(net_params[param_id]->count(),
net_params[param_id]->cpu_diff(),
this->update_[param_id]->cpu_data(),
net_params[param_id]->mutable_cpu_diff());//cpu_diff[i]=cpu_diff[i]*update[i]

caffe_powx(net_params[param_id]->count(),
net_params[param_id]->cpu_diff(), Dtype(2),
this->update_[param_id]->mutable_cpu_data());//update[i]=cpu_diff[i]^ 2

//history[update_history_offset + i] = (1 - momentum)*update[i] + momentum*history[update_history_offset + i]
caffe_cpu_axpby(net_params[param_id]->count(), Dtype(1) - momentum,
this->update_[param_id]->cpu_data(), momentum,
this->history_[update_history_offset + param_id]->mutable_cpu_data());

caffe_cpu_scale(net_params[param_id]->count(), local_rate,
net_params[param_id]->cpu_diff(),
net_params[param_id]->mutable_cpu_diff());//cpu_diff[i]=local_rate*cpu_diff[i]
break;
}
case Caffe::GPU: {
#ifndef CPU_ONLY
adadelta_update_gpu(net_params[param_id]->count(),
net_params[param_id]->mutable_gpu_diff(),
this->history_[param_id]->mutable_gpu_data(),
this->history_[update_history_offset + param_id]->mutable_gpu_data(),
momentum, delta, local_rate);
#else
NO_GPU;
#endif
break;
}
default:
LOG(FATAL) << "Unknown caffe mode: " << Caffe::mode();
}
}

INSTANTIATE_CLASS(AdaDeltaSolver);
REGISTER_SOLVER_CLASS(AdaDelta);

}  // namespace caffe
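Element-wise, the whole CPU branch above reduces to a few lines; a minimal sketch (adadelta_step, h, and h2 are hypothetical names for the two history banks):

#include <cmath>
#include <cstddef>

// AdaDelta, element-wise: h tracks the decaying average of squared
// gradients, h2 the decaying average of squared updates.
void adadelta_step(std::size_t n, float* g, float* h, float* h2,
                   float momentum, float delta, float local_rate) {
  for (std::size_t i = 0; i < n; ++i) {
    h[i] = momentum * h[i] + (1 - momentum) * g[i] * g[i];    // E[g^2]
    float dx = g[i] * std::sqrt((h2[i] + delta) / (h[i] + delta));
    h2[i] = momentum * h2[i] + (1 - momentum) * dx * dx;      // E[dx^2]
    g[i] = local_rate * dx;                                   // final diff
  }
}
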
adam_solver.cpp:



The implementation follows the update rule from the Adam paper (the original post showed the algorithm as a figure here):
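In the notation the code comments below also use, one step is:

m_t <- \beta_1 m_{t-1} + (1 - \beta_1) g_t
v_t <- \beta_2 v_{t-1} + (1 - \beta_2) g_t^2
\theta_t <- \theta_{t-1} - \alpha \sqrt{1 - \beta_2^t} / (1 - \beta_1^t) * m_t / (\sqrt{v_t} + \hat{\epsilon})

where \alpha is the local learning rate, \beta_1 and \beta_2 come from the momentum and momentum2 solver parameters, and \hat{\epsilon} is the delta parameter.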

#include <vector>

#include "caffe/sgd_solvers.hpp"

namespace caffe {

template <typename Dtype>
void AdamSolver<Dtype>::AdamPreSolve() {
// Add the extra history entries for Adam after those from
// SGDSolver::PreSolve
const vector<Blob<Dtype>*>& net_params = this->net_->learnable_params();
for (int i = 0; i < net_params.size(); ++i) {
const vector<int>& shape = net_params[i]->shape();
this->history_.push_back(
shared_ptr<Blob<Dtype> >(new Blob<Dtype>(shape)));
}
}

#ifndef CPU_ONLY
template <typename Dtype>
void adam_update_gpu(int N, Dtype* g, Dtype* m, Dtype* v, Dtype beta1,
Dtype beta2, Dtype eps_hat, Dtype corrected_local_rate);
#endif

template <typename Dtype>
void AdamSolver<Dtype>::ComputeUpdateValue(int param_id, Dtype rate) {
const vector<Blob<Dtype>*>& net_params = this->net_->learnable_params();
const vector<float>& net_params_lr = this->net_->params_lr();
Dtype local_rate = rate * net_params_lr[param_id];
const Dtype beta1 = this->param_.momentum();
const Dtype beta2 = this->param_.momentum2();

// we create aliases for convenience
size_t update_history_offset = net_params.size();
Blob<Dtype>* val_m = this->history_[param_id].get();
Blob<Dtype>* val_v = this->history_[param_id + update_history_offset].get();
Blob<Dtype>* val_t = this->temp_[param_id].get();

const int t = this->iter_ + 1;
const Dtype correction = std::sqrt(Dtype(1) - pow(beta2, t)) /
(Dtype(1.) - pow(beta1, t));//correction = sqrt(1 - beta2^t) / (1 - beta1^t)
const int N = net_params[param_id]->count();
const Dtype eps_hat = this->param_.delta();

switch (Caffe::mode()) {
case Caffe::CPU: {
// update m <- \beta_1 m_{t-1} + (1-\beta_1)g_t
caffe_cpu_axpby(N, Dtype(1)-beta1,
net_params[param_id]->cpu_diff(), beta1,
val_m->mutable_cpu_data());

// update v <- \beta_2 v_{t-1} + (1-\beta_2)g_t^2
caffe_mul(N,
net_params[param_id]->cpu_diff(),
net_params[param_id]->cpu_diff(),
val_t->mutable_cpu_data());
caffe_cpu_axpby(N, Dtype(1)-beta2,
val_t->cpu_data(), beta2,
val_v->mutable_cpu_data());

// set update
//temp = sqrt(v)
caffe_powx(N,
val_v->cpu_data(), Dtype(0.5),
val_t->mutable_cpu_data());
//temp = temp + eps_hat
caffe_add_scalar(N, eps_hat, val_t->mutable_cpu_data());
//temp = m / temp
caffe_div(N,
val_m->cpu_data(),
val_t->cpu_data(),
val_t->mutable_cpu_data());
//cpu_diff[i] = local_rate * correction * temp[i]
caffe_cpu_scale(N, local_rate*correction,
val_t->cpu_data(),
net_params[param_id]->mutable_cpu_diff());
break;
}
case Caffe::GPU: {
#ifndef CPU_ONLY
adam_update_gpu(N, net_params[param_id]->mutable_gpu_diff(),
val_m->mutable_gpu_data(), val_v->mutable_gpu_data(), beta1, beta2,
eps_hat, local_rate*correction);
#else
NO_GPU;
#endif
break;
}
default:
LOG(FATAL) << "Unknown caffe mode: " << Caffe::mode();
}
}

INSTANTIATE_CLASS(AdamSolver);
REGISTER_SOLVER_CLASS(Adam);

}  // namespace caffe
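Element-wise, the Adam step is again short; a minimal sketch (adam_step, m, and v are hypothetical names standing in for val_m and val_v):

#include <cmath>
#include <cstddef>

// Adam, element-wise: first and second moment estimates plus the
// bias correction computed from the 1-based iteration count t.
void adam_step(std::size_t n, float* g, float* m, float* v, int t,
               float beta1, float beta2, float eps_hat, float local_rate) {
  float correction = std::sqrt(1 - std::pow(beta2, t))
      / (1 - std::pow(beta1, t));
  for (std::size_t i = 0; i < n; ++i) {
    m[i] = beta1 * m[i] + (1 - beta1) * g[i];          // first moment
    v[i] = beta2 * v[i] + (1 - beta2) * g[i] * g[i];   // second moment
    g[i] = local_rate * correction * m[i] / (std::sqrt(v[i]) + eps_hat);
  }
}
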
nesterov_solver.cpp:

This method is nicely illustrated in a Zhihu post: here

Implementation:

#include <vector>

#include "caffe/sgd_solvers.hpp"

namespace caffe {

#ifndef CPU_ONLY
template <typename Dtype>
void nesterov_update_gpu(int N, Dtype* g, Dtype* h, Dtype momentum,
Dtype local_rate);
#endif

template <typename Dtype>
void NesterovSolver<Dtype>::ComputeUpdateValue(int param_id, Dtype rate) {
CHECK(Caffe::root_solver());
const vector<Blob<Dtype>*>& net_params = this->net_->learnable_params();
const vector<float>& net_params_lr = this->net_->params_lr();
Dtype momentum = this->param_.momentum();
Dtype local_rate = rate * net_params_lr[param_id];
switch (Caffe::mode()) {
case Caffe::CPU: {
// save history momentum for stepping back
//Nesterov first steps back, so the current momentum (history) is saved here.
//Momentum is easiest to understand physically: each descent step carries
//some velocity forward from the previous steps.
caffe_copy(net_params[param_id]->count(),
this->history_[param_id]->cpu_data(),
this->update_[param_id]->mutable_cpu_data());

// update history
//The algorithm itself is not complicated, so the code stays fairly short.
//history=local_rate*diff+momentum*history
caffe_cpu_axpby(net_params[param_id]->count(), local_rate,
net_params[param_id]->cpu_diff(), momentum,
this->history_[param_id]->mutable_cpu_data());

// compute update: step back then over step
//update=(1+ momentum)*history-momentum*update
caffe_cpu_axpby(net_params[param_id]->count(), Dtype(1) + momentum,
this->history_[param_id]->cpu_data(), -momentum,
this->update_[param_id]->mutable_cpu_data());

// copy
caffe_copy(net_params[param_id]->count(),
this->update_[param_id]->cpu_data(),
net_params[param_id]->mutable_cpu_diff());
break;
}
case Caffe::GPU: {
#ifndef CPU_ONLY
nesterov_update_gpu(net_params[param_id]->count(),
net_params[param_id]->mutable_gpu_diff(),
this->history_[param_id]->mutable_gpu_data(),
momentum, local_rate);
#else
NO_GPU;
#endif
break;
}
default:
LOG(FATAL) << "Unknown caffe mode: " << Caffe::mode();
}
}

INSTANTIATE_CLASS(NesterovSolver);
REGISTER_SOLVER_CLASS(Nesterov);

}  // namespace caffe
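The "step back then over step" trick is easiest to read element-wise; a minimal sketch (nesterov_step is a hypothetical name):

#include <cstddef>

// Nesterov momentum, element-wise: keep the old history, advance it,
// then combine the two ("step back then over step").
void nesterov_step(std::size_t n, float* g, float* h,
                   float momentum, float local_rate) {
  for (std::size_t i = 0; i < n; ++i) {
    float h_prev = h[i];                                // saved old history
    h[i] = local_rate * g[i] + momentum * h[i];         // new history
    g[i] = (1 + momentum) * h[i] - momentum * h_prev;   // over-step minus step back
  }
}
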
rmsprop_solver.cpp:
There is an introduction on the official site: here

A derivation of the rmsprop algorithm: here

Implementation:

#include <vector>

#include "caffe/sgd_solvers.hpp"

namespace caffe {

#ifndef CPU_ONLY
template <typename Dtype>
void rmsprop_update_gpu(int N, Dtype* g, Dtype* h, Dtype rms_decay,
Dtype delta, Dtype local_rate);
#endif

template <typename Dtype>
void RMSPropSolver<Dtype>::ComputeUpdateValue(int param_id, Dtype rate) {
const vector<Blob<Dtype>*>& net_params = this->net_->learnable_params();
const vector<float>& net_params_lr = this->net_->params_lr();

// get the learning rate
Dtype delta = this->param_.delta();
Dtype rms_decay = this->param_.rms_decay();
Dtype local_rate = rate * net_params_lr[param_id];

switch (Caffe::mode()) {
case Caffe::CPU:
// compute square of gradient in update
//update=diff^2
caffe_powx(net_params[param_id]->count(),
net_params[param_id]->cpu_diff(), Dtype(2),
this->update_[param_id]->mutable_cpu_data());

// update history
//history=(1-rms_decay)*update+rms_decay*history
caffe_cpu_axpby(net_params[param_id]->count(),
Dtype(1 - rms_decay), this->update_[param_id]->cpu_data(),
rms_decay, this->history_[param_id]->mutable_cpu_data());

// prepare update
//update=history^0.5
caffe_powx(net_params[param_id]->count(),
this->history_[param_id]->cpu_data(), Dtype(0.5),
this->update_[param_id]->mutable_cpu_data());
//update = update + delta
caffe_add_scalar(net_params[param_id]->count(),
delta, this->update_[param_id]->mutable_cpu_data());
// update=diff/update
caffe_div(net_params[param_id]->count(),
net_params[param_id]->cpu_diff(), this->update_[param_id]->cpu_data(),
this->update_[param_id]->mutable_cpu_data());

// scale and copy
//diff=local_rate*update
caffe_cpu_axpby(net_params[param_id]->count(), local_rate,
this->update_[param_id]->cpu_data(), Dtype(0),
net_params[param_id]->mutable_cpu_diff());
break;
case Caffe::GPU:
#ifndef CPU_ONLY
rmsprop_update_gpu(net_params[param_id]->count(),
net_params[param_id]->mutable_gpu_diff(),
this->history_[param_id]->mutable_gpu_data(),
rms_decay, delta, local_rate);
#else
NO_GPU;
#endif
break;
default:
LOG(FATAL) << "Unknown caffe mode: " << Caffe::mode();
}
}

INSTANTIATE_CLASS(RMSPropSolver);
REGISTER_SOLVER_CLASS(RMSProp);

}  // namespace caffe
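Element-wise, RMSProp is nearly AdaGrad with the accumulated squared gradient replaced by a decaying average; a minimal sketch (rmsprop_step is a hypothetical name):

#include <cmath>
#include <cstddef>

// RMSProp, element-wise: a decaying mean square of the gradient
// normalizes each step.
void rmsprop_step(std::size_t n, float* g, float* h,
                  float rms_decay, float delta, float local_rate) {
  for (std::size_t i = 0; i < n; ++i) {
    h[i] = rms_decay * h[i] + (1 - rms_decay) * g[i] * g[i];  // mean square
    g[i] = local_rate * g[i] / (std::sqrt(h[i]) + delta);     // normalized step
  }
}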