Understanding the Caffe code: solver.cpp && sgd_solver.cpp
The Solver workflow:
• 1. Set up the objective to optimize, together with the training net used for learning and the test net(s) used for evaluation.
• 2. Optimize iteratively, updating the parameters through forward and backward passes.
• 3. Periodically evaluate the test net(s).
• 4. Display the model and solver state throughout the optimization.
• What each iteration does (implemented in the solvers folder):
• 1. Compute the network output and loss via forward.
• 2. Compute the network gradients via backward.
• 3. Update the parameters from the gradients, according to the solver method.
• 4. Update the solver state according to the learning rate, history, and method (a toy sketch of one such iteration follows below).
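Before diving into the sources, here is a minimal, self-contained sketch of that per-iteration loop. It is illustrative only: a toy scalar problem f(w) = (w - 3)^2 stands in for the net, and the four comments map onto the four steps above; this is not the Caffe API.

#include <cstdio>

// Toy illustration of one solver iteration: minimize f(w) = (w - 3)^2
// with plain gradient descent. Control flow only, not Caffe code.
int main() {
  double w = 0.0;
  const double lr = 0.1;
  for (int iter = 0; iter < 50; ++iter) {
    double loss = (w - 3.0) * (w - 3.0);  // 1. forward: output and loss
    double grad = 2.0 * (w - 3.0);        // 2. backward: gradient
    w -= lr * grad;                       // 3. update parameters from gradient
    if (iter % 10 == 0) {                 // 4. display model/solver state
      std::printf("iter %d, loss = %f\n", iter, loss);
    }
  }
  return 0;
}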
solver.cpp:

#include <cstdio>
#include <string>
#include <vector>

#include "caffe/solver.hpp"
#include "caffe/util/hdf5.hpp"
#include "caffe/util/io.hpp"
#include "caffe/util/upgrade_proto.hpp"

namespace caffe {

// Configure how the solver reacts to external action requests:
template<typename Dtype>
void Solver<Dtype>::SetActionFunction(ActionCallback func) {
  action_request_function_ = func;
}

template<typename Dtype>
SolverAction::Enum Solver<Dtype>::GetRequestedAction() {
  if (action_request_function_) {
    // If the external request function has been set, call it.
    return action_request_function_();
  }
  return SolverAction::NONE;
}

// Set up the objective to optimize, plus the training net and the test nets.
// Both constructors initialize net_ and call Init(); they differ in how the
// parameters are passed:
// 1. a SolverParameter object (param)
// 2. a string path to a solver prototxt file (param_file)
template <typename Dtype>
Solver<Dtype>::Solver(const SolverParameter& param, const Solver* root_solver)
    : net_(), callbacks_(), root_solver_(root_solver),
      requested_early_exit_(false) {
  // Constructor taking a SolverParameter directly.
  Init(param);
}

template <typename Dtype>
Solver<Dtype>::Solver(const string& param_file, const Solver* root_solver)
    : net_(), callbacks_(), root_solver_(root_solver),
      requested_early_exit_(false) {
  SolverParameter param;
  // Constructor taking a path to a prototxt file.
  ReadSolverParamsFromTextFileOrDie(param_file, &param);
  Init(param);
}

template <typename Dtype>
void Solver<Dtype>::Init(const SolverParameter& param) {
  CHECK(Caffe::root_solver() || root_solver_)
      << "root_solver_ needs to be set for all non-root solvers";
  LOG_IF(INFO, Caffe::root_solver()) << "Initializing solver from parameters: "
      << std::endl << param.DebugString();
  param_ = param;
  CHECK_GE(param_.average_loss(), 1) << "average_loss should be non-negative.";
  CheckSnapshotWritePermissions();
  if (Caffe::root_solver() && param_.random_seed() >= 0) {
    // Set the random seed.
    Caffe::set_random_seed(param_.random_seed());
  }
  // Scaffolding code
  InitTrainNet();  // initialize the train net; net_ points to it
  if (Caffe::root_solver()) {
    InitTestNets();  // initialize the test nets
    LOG(INFO) << "Solver scaffolding done.";
  }
  iter_ = 0;
  current_step_ = 0;
}

template <typename Dtype>
void Solver<Dtype>::InitTrainNet() {
  const int num_train_nets = param_.has_net() + param_.has_net_param() +
      param_.has_train_net() + param_.has_train_net_param();  // number of specified train nets
  const string& field_names = "net, net_param, train_net, train_net_param";
  CHECK_GE(num_train_nets, 1) << "SolverParameter must specify a train net "
      << "using one of these fields: " << field_names;
  CHECK_LE(num_train_nets, 1) << "SolverParameter must not contain more than "
      << "one of these fields specifying a train_net: " << field_names;
      // error out if more than one train net source is given
  NetParameter net_param;  // the network parameters
  if (param_.has_train_net_param()) {
    LOG_IF(INFO, Caffe::root_solver())
        << "Creating training net specified in train_net_param.";
    net_param.CopyFrom(param_.train_net_param());  // copy the inline train net parameters
  } else if (param_.has_train_net()) {
    LOG_IF(INFO, Caffe::root_solver())
        << "Creating training net from train_net file: " << param_.train_net();
    ReadNetParamsFromTextFileOrDie(param_.train_net(), &net_param);  // read from the train_net file
  }
  if (param_.has_net_param()) {
    LOG_IF(INFO, Caffe::root_solver())
        << "Creating training net specified in net_param.";
    net_param.CopyFrom(param_.net_param());  // copy the generic net_param
  }
  if (param_.has_net()) {
    LOG_IF(INFO, Caffe::root_solver())
        << "Creating training net from net file: " << param_.net();
    ReadNetParamsFromTextFileOrDie(param_.net(), &net_param);  // read from the generic net file
  }
  // Set the correct NetState: start from the TRAIN defaults, merge in any
  // state specified by the net itself, and finally merge in the train_state
  // (highest precedence).
  NetState net_state;
  net_state.set_phase(TRAIN);
  net_state.MergeFrom(net_param.state());
  net_state.MergeFrom(param_.train_state());
  net_param.mutable_state()->CopyFrom(net_state);
  if (Caffe::root_solver()) {
    net_.reset(new Net<Dtype>(net_param));  // construct the Net, which initializes it
  } else {
    net_.reset(new Net<Dtype>(net_param, root_solver_->net_.get()));
  }
  merge_step_ = param_.refresh_step_size();
  need_merge_weight_ = merge_step_ ? true : false;
  if (need_merge_weight_) {
    CHECK_LT(merge_step_, param_.max_iter());
  }
}

// InitTestNets() is analogous to InitTrainNet(), except that there may be
// multiple test nets while there can be only one train net.
template <typename Dtype>
void Solver<Dtype>::InitTestNets() {
  CHECK(Caffe::root_solver());
  const bool has_net_param = param_.has_net_param();
  const bool has_net_file = param_.has_net();
  const int num_generic_nets = has_net_param + has_net_file;  // number of generic nets
  CHECK_LE(num_generic_nets, 1)
      << "Both net_param and net_file may not be specified.";
  const int num_test_net_params = param_.test_net_param_size();
  const int num_test_net_files = param_.test_net_size();
  const int num_test_nets = num_test_net_params + num_test_net_files;  // number of explicit test nets
  if (num_generic_nets) {
    CHECK_GE(param_.test_iter_size(), num_test_nets)
        << "test_iter must be specified for each test network.";
  } else {
    CHECK_EQ(param_.test_iter_size(), num_test_nets)
        << "test_iter must be specified for each test network.";
  }
  // If we have a generic net (specified by net or net_param, rather than
  // test_net or test_net_param), we may have an unlimited number of actual
  // test networks -- the actual number is given by the number of remaining
  // test_iters after any test nets specified by test_net_param and/or
  // test_net are evaluated.
  const int num_generic_net_instances =
      param_.test_iter_size() - num_test_nets;  // leftover test_iter entries
  const int num_test_net_instances =
      num_test_nets + num_generic_net_instances;  // total test net instances
  if (param_.test_state_size()) {
    CHECK_EQ(param_.test_state_size(), num_test_net_instances)
        << "test_state must be unspecified or specified once per test net.";
  }
  if (num_test_net_instances) {
    CHECK_GT(param_.test_interval(), 0);
  }
  int test_net_id = 0;
  vector<string> sources(num_test_net_instances);
  vector<NetParameter> net_params(num_test_net_instances);
  for (int i = 0; i < num_test_net_params; ++i, ++test_net_id) {
    sources[test_net_id] = "test_net_param";  // record where each net came from
    net_params[test_net_id].CopyFrom(param_.test_net_param(i));  // copy the net parameters
  }
  for (int i = 0; i < num_test_net_files; ++i, ++test_net_id) {
    sources[test_net_id] = "test_net file: " + param_.test_net(i);
    ReadNetParamsFromTextFileOrDie(param_.test_net(i),
        &net_params[test_net_id]);
  }
  const int remaining_test_nets = param_.test_iter_size() - test_net_id;
  if (has_net_param) {
    for (int i = 0; i < remaining_test_nets; ++i, ++test_net_id) {
      sources[test_net_id] = "net_param";
      net_params[test_net_id].CopyFrom(param_.net_param());
    }
  }
  if (has_net_file) {
    for (int i = 0; i < remaining_test_nets; ++i, ++test_net_id) {
      sources[test_net_id] = "net file: " + param_.net();
      ReadNetParamsFromTextFileOrDie(param_.net(), &net_params[test_net_id]);
    }
  }
  test_nets_.resize(num_test_net_instances);
  for (int i = 0; i < num_test_net_instances; ++i) {
    // Set the correct NetState: start from the TEST defaults, merge in the
    // state from the net itself, then merge in the test_state (highest
    // precedence).
    NetState net_state;
    net_state.set_phase(TEST);
    net_state.MergeFrom(net_params[i].state());
    if (param_.test_state_size()) {
      net_state.MergeFrom(param_.test_state(i));
    }
    net_params[i].mutable_state()->CopyFrom(net_state);
    LOG(INFO) << "Creating test net (#" << i << ") specified by " << sources[i];
    if (Caffe::root_solver()) {
      test_nets_[i].reset(new Net<Dtype>(net_params[i]));
    } else {
      test_nets_[i].reset(new Net<Dtype>(net_params[i],
          root_solver_->test_nets_[i].get()));
    }
    test_nets_[i]->set_debug_info(param_.debug_info());
  }
}

template <typename Dtype>
void Solver<Dtype>::Step(int iters) {
  vector<Blob<Dtype>*> bottom_vec;
  const int start_iter = iter_;
  const int stop_iter = iter_ + iters;
  int average_loss = this->param_.average_loss();
  vector<Dtype> losses;
  Dtype smoothed_loss = 0;
  while (iter_ < stop_iter) {
    // Zero-init the parameter diffs.
    net_->ClearParamDiffs();
    if (param_.test_interval() && iter_ % param_.test_interval() == 0
        && (iter_ > 0 || param_.test_initialization())
        && Caffe::root_solver()) {
      // A test interval is set, iter_ is a multiple of it, and either we are
      // past iteration 0 or test_initialization is enabled.
      TestAll();
      if (requested_early_exit_) {
        // Break out of the while loop because stop was requested while testing.
        break;
      }
    }
    for (int i = 0; i < callbacks_.size(); ++i) {
      callbacks_[i]->on_start();
    }
    const bool display = param_.display() && iter_ % param_.display() == 0;
        // display is on and iter_ is a multiple of the display interval
    net_->set_debug_info(display && param_.debug_info());
    // accumulate the loss and gradient
    Dtype loss = 0;
    for (int i = 0; i < param_.iter_size(); ++i) {
      loss += net_->ForwardBackward(bottom_vec);
    }
    loss /= param_.iter_size();  // average the accumulated loss over iter_size
    if (isnan(loss)) {
      // NaN loss: log it, skip the update, and continue.
      LOG_IF(INFO, Caffe::root_solver()) << "Iteration " << iter_
          << ", loss = " << loss << ", ignore and continue.";
      ++iter_;
      continue;
    }
    if (losses.size() < average_loss) {
      // Fewer than average_loss samples so far: grow the window and update
      // the running mean until average_loss values have accumulated.
      losses.push_back(loss);
      int size = losses.size();
      smoothed_loss = (smoothed_loss * (size - 1) + loss) / size;
    } else {
      // Sliding window: replace the oldest loss and update the mean in place.
      int idx = (iter_ - start_iter) % average_loss;
      smoothed_loss += (loss - losses[idx]) / average_loss;
      losses[idx] = loss;
    }
    if (display) {
      LOG_IF(INFO, Caffe::root_solver()) << "Iteration " << iter_
          << ", loss = " << smoothed_loss;
      const vector<Blob<Dtype>*>& result = net_->output_blobs();  // the output blobs
      int score_index = 0;
      for (int j = 0; j < result.size(); ++j) {
        const Dtype* result_vec = result[j]->cpu_data();
        const string& output_name =
            net_->blob_names()[net_->output_blob_indices()[j]];  // output blob name
        const Dtype loss_weight =
            net_->blob_loss_weights()[net_->output_blob_indices()[j]];  // its loss weight
        for (int k = 0; k < result[j]->count(); ++k) {
          ostringstream loss_msg_stream;
          if (loss_weight) {
            loss_msg_stream << " (* " << loss_weight
                << " = " << loss_weight * result_vec[k] << " loss)";
          }
          LOG_IF(INFO, Caffe::root_solver()) << "    Train net output #"
              << score_index++ << ": " << output_name << " = "
              << result_vec[k] << loss_msg_stream.str();
        }
      }
    }
    for (int i = 0; i < callbacks_.size(); ++i) {
      callbacks_[i]->on_gradients_ready();
    }
    ApplyUpdate();
    // Increment the internal iter_ counter -- its value should always indicate
    // the number of times the weights have been updated.
    ++iter_;
    SolverAction::Enum request = GetRequestedAction();
    // Take a snapshot if one is scheduled or requested.
    if ((param_.snapshot() && iter_ % param_.snapshot() == 0
         && Caffe::root_solver()) || (request == SolverAction::SNAPSHOT)) {
      Snapshot();
    }
    if (SolverAction::STOP == request) {
      requested_early_exit_ = true;
      // Break out of the training loop.
      break;
    }
  }
}

template <typename Dtype>
void Solver<Dtype>::Solve(const char* resume_file) {
  CHECK(Caffe::root_solver());
  LOG(INFO) << "Solving " << net_->name();
  LOG(INFO) << "Learning Rate Policy: " << param_.lr_policy();  // log basic info
  // Initialize to false every time we start solving.
  requested_early_exit_ = false;
  if (resume_file) {
    LOG(INFO) << "Restoring previous solver status from " << resume_file;
    Restore(resume_file);
  }
  // The actual training happens here: Step() iterates
  // param_.max_iter() - iter_ times.
  Step(param_.max_iter() - iter_);
  // If training did not end at a snapshot iteration, save one final snapshot,
  // unless snapshot_after_train is set to false.
  if (param_.snapshot_after_train()
      && (!param_.snapshot() || iter_ % param_.snapshot() != 0)) {
    Snapshot();
  }
  if (requested_early_exit_) {
    LOG(INFO) << "Optimization stopped early.";
    return;
  }
  // After the optimization is done, run an additional train and test pass to
  // display the train and test loss/outputs.
  if (param_.display() && iter_ % param_.display() == 0) {
    Dtype loss;
    net_->ForwardPrefilled(&loss);
    LOG(INFO) << "Iteration " << iter_ << ", loss = " << loss;
  }
  if (param_.test_interval() && iter_ % param_.test_interval() == 0) {
    TestAll();
  }
  LOG(INFO) << "Optimization Done.";
}

// Periodically evaluate the test networks.
template <typename Dtype>
void Solver<Dtype>::TestAll() {
  // TestAll() runs Test() on every test net.
  for (int test_net_id = 0;
       test_net_id < test_nets_.size() && !requested_early_exit_;
       ++test_net_id) {
    Test(test_net_id);
  }
}

template <typename Dtype>
void Solver<Dtype>::Test(const int test_net_id) {
  CHECK(Caffe::root_solver());
  LOG(INFO) << "Iteration " << iter_
            << ", Testing net (#" << test_net_id << ")";
  CHECK_NOTNULL(test_nets_[test_net_id].get())->
      ShareTrainedLayersWith(net_.get());
  vector<Dtype> test_score;
  vector<int> test_score_output_id;
  vector<Blob<Dtype>*> bottom_vec;
  const shared_ptr<Net<Dtype> >& test_net = test_nets_[test_net_id];
  Dtype loss = 0;
  for (int i = 0; i < param_.test_iter(test_net_id); ++i) {
    SolverAction::Enum request = GetRequestedAction();
    // Keep polling for action requests; if a snapshot is requested while
    // training or testing, take it immediately.
    while (request != SolverAction::NONE) {
      if (SolverAction::SNAPSHOT == request) {
        Snapshot();
      } else if (SolverAction::STOP == request) {
        requested_early_exit_ = true;
      }
      request = GetRequestedAction();
    }
    if (requested_early_exit_) {
      // Break out of the test loop.
      break;
    }
    Dtype iter_loss;
    const vector<Blob<Dtype>*>& result =
        test_net->Forward(bottom_vec, &iter_loss);
    if (param_.test_compute_loss()) {
      loss += iter_loss;
    }
    if (i == 0) {
      // First test iteration: take each output blob's data, flatten it to one
      // dimension, and store it in the vector test_score.
      for (int j = 0; j < result.size(); ++j) {
        const Dtype* result_vec = result[j]->cpu_data();
        for (int k = 0; k < result[j]->count(); ++k) {
          test_score.push_back(result_vec[k]);
          test_score_output_id.push_back(j);
        }
      }
    } else {
      // Subsequent iterations: accumulate each output blob's values at the
      // corresponding positions of test_score.
      int idx = 0;
      for (int j = 0; j < result.size(); ++j) {
        const Dtype* result_vec = result[j]->cpu_data();
        for (int k = 0; k < result[j]->count(); ++k) {
          test_score[idx++] += result_vec[k];
        }
      }
    }
  }
  if (requested_early_exit_) {
    LOG(INFO) << "Test interrupted.";
    return;
  }
  if (param_.test_compute_loss()) {
    loss /= param_.test_iter(test_net_id);  // average the loss and log it
    LOG(INFO) << "Test loss: " << loss;
  }
  // Log the mean score of each test output.
  for (int i = 0; i < test_score.size(); ++i) {
    const int output_blob_index =
        test_net->output_blob_indices()[test_score_output_id[i]];
    const string& output_name = test_net->blob_names()[output_blob_index];
    const Dtype loss_weight = test_net->blob_loss_weights()[output_blob_index];
    ostringstream loss_msg_stream;
    const Dtype mean_score = test_score[i] / param_.test_iter(test_net_id);
    if (loss_weight) {
      loss_msg_stream << " (* " << loss_weight
                      << " = " << loss_weight * mean_score << " loss)";
    }
    LOG(INFO) << "    Test net output #" << i << ": " << output_name << " = "
              << mean_score << loss_msg_stream.str();
  }
}

// Choose the appropriate snapshot format and save the snapshot.
template <typename Dtype>
void Solver<Dtype>::Snapshot() {
  CHECK(Caffe::root_solver());
  string model_filename;
  switch (param_.snapshot_format()) {
  case caffe::SolverParameter_SnapshotFormat_BINARYPROTO:
    model_filename = SnapshotToBinaryProto();
    break;
  case caffe::SolverParameter_SnapshotFormat_HDF5:
    model_filename = SnapshotToHDF5();
    break;
  default:
    LOG(FATAL) << "Unsupported snapshot format.";
  }
  SnapshotSolverState(model_filename);
}

// Check whether we are allowed to write snapshots.
template <typename Dtype>
void Solver<Dtype>::CheckSnapshotWritePermissions() {
  if (Caffe::root_solver() && param_.snapshot()) {
    CHECK(param_.has_snapshot_prefix())
        << "In solver params, snapshot is specified but snapshot_prefix is not";
    string probe_filename = SnapshotFilename(".tempfile");
    std::ofstream probe_ofs(probe_filename.c_str());
    if (probe_ofs.good()) {
      probe_ofs.close();
      std::remove(probe_filename.c_str());
    } else {
      LOG(FATAL) << "Cannot write to snapshot prefix '"
          << param_.snapshot_prefix() << "'. Make sure "
          << "that the directory exists and is writeable.";
    }
  }
}

// Build the snapshot file name.
template <typename Dtype>
string Solver<Dtype>::SnapshotFilename(const string extension) {
  string filename(param_.snapshot_prefix());
  const int kBufferSize = 20;
  char iter_str_buffer[kBufferSize];
  sprintf_s(iter_str_buffer, kBufferSize, "_iter_%d", iter_);
  return filename + iter_str_buffer + extension;
}

template <typename Dtype>
string Solver<Dtype>::SnapshotToBinaryProto() {
  string model_filename = SnapshotFilename(".caffemodel");
  LOG(INFO) << "Snapshotting to binary proto file " << model_filename;
  NetParameter net_param;
  net_->ToProto(&net_param, param_.snapshot_diff());
  WriteProtoToBinaryFile(net_param, model_filename);
  return model_filename;
}

template <typename Dtype>
string Solver<Dtype>::SnapshotToHDF5() {
  string model_filename = SnapshotFilename(".caffemodel.h5");
  LOG(INFO) << "Snapshotting to HDF5 file " << model_filename;
  net_->ToHDF5(model_filename, param_.snapshot_diff());
  return model_filename;
}

template <typename Dtype>
void Solver<Dtype>::Restore(const char* state_file) {
  CHECK(Caffe::root_solver());
  string state_filename(state_file);
  if (state_filename.size() >= 3 &&
      state_filename.compare(state_filename.size() - 3, 3, ".h5") == 0) {
    RestoreSolverStateFromHDF5(state_filename);
  } else {
    RestoreSolverStateFromBinaryProto(state_filename);
  }
}

INSTANTIATE_CLASS(Solver);

}  // namespace caffe
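The smoothed-loss bookkeeping in Step() is easy to miss: losses is a circular window of the last average_loss values, and smoothed_loss is maintained incrementally rather than re-averaged each time. Below is a standalone sketch of just that logic; the variable names mirror the Caffe code, while the loss sequence is made-up sample data.

#include <cstdio>
#include <vector>

// Standalone sketch of the smoothed-loss window from Solver<Dtype>::Step().
int main() {
  const int average_loss = 3;
  std::vector<double> losses;
  double smoothed_loss = 0.0;
  const double stream[] = {2.0, 1.5, 1.0, 0.8, 0.7, 0.65};  // fake per-iter losses
  for (int iter = 0; iter < 6; ++iter) {
    double loss = stream[iter];
    if (losses.size() < static_cast<size_t>(average_loss)) {
      losses.push_back(loss);  // window still growing
      int size = losses.size();
      smoothed_loss = (smoothed_loss * (size - 1) + loss) / size;
    } else {
      int idx = iter % average_loss;  // overwrite the oldest entry
      smoothed_loss += (loss - losses[idx]) / average_loss;
      losses[idx] = loss;
    }
    std::printf("iter %d, smoothed loss = %f\n", iter, smoothed_loss);
  }
  return 0;
}

At iteration 3 the window holds {0.8, 1.5, 1.0}, whose mean 1.1 matches the incremental update 1.5 + (0.8 - 2.0) / 3, confirming the two forms agree.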
sgd_solver.cpp:

#include <string>
#include <vector>

#include "caffe/sgd_solvers.hpp"
#include "caffe/util/hdf5.hpp"
#include "caffe/util/io.hpp"
#include "caffe/util/upgrade_proto.hpp"

namespace caffe {

// Return the current learning rate. The currently implemented learning rate
// policies are as follows:
//    - fixed: always return base_lr.
//    - step: return base_lr * gamma ^ (floor(iter / step))
//    - exp: return base_lr * gamma ^ iter
//    - inv: return base_lr * (1 + gamma * iter) ^ (- power)
//    - multistep: similar to step but it allows non uniform steps defined by
//      stepvalue
//    - poly: the effective learning rate follows a polynomial decay, to be
//      zero by the max_iter. return base_lr (1 - iter/max_iter) ^ (power)
//    - sigmoid: the effective learning rate follows a sigmoid decay
//      return base_lr ( 1/(1 + exp(-gamma * (iter - stepsize))))
//
// where base_lr, max_iter, gamma, step, stepvalue and power are defined
// in the solver parameter protocol buffer, and iter is the current iteration.
template <typename Dtype>
Dtype SGDSolver<Dtype>::GetLearningRate() {
  Dtype rate;
  const string& lr_policy = this->param_.lr_policy();
  if (lr_policy == "fixed") {  // dispatch on the policy name
    rate = this->param_.base_lr();
  } else if (lr_policy == "step") {
    this->current_step_ = this->iter_ / this->param_.stepsize();
    rate = this->param_.base_lr() *
        pow(this->param_.gamma(), this->current_step_);
  } else if (lr_policy == "exp") {
    rate = this->param_.base_lr() * pow(this->param_.gamma(), this->iter_);
  } else if (lr_policy == "inv") {
    rate = this->param_.base_lr() *
        pow(Dtype(1) + this->param_.gamma() * this->iter_,
            - this->param_.power());
  } else if (lr_policy == "multistep") {
    if (this->current_step_ < this->param_.stepvalue_size() &&
        this->iter_ >= this->param_.stepvalue(this->current_step_)) {
      this->current_step_++;
      LOG(INFO) << "MultiStep Status: Iteration " << this->iter_
          << ", step = " << this->current_step_;
    }
    rate = this->param_.base_lr() *
        pow(this->param_.gamma(), this->current_step_);
  } else if (lr_policy == "poly") {
    rate = this->param_.base_lr() * pow(Dtype(1.) -
        (Dtype(this->iter_) / Dtype(this->param_.max_iter())),
        this->param_.power());
  } else if (lr_policy == "sigmoid") {
    rate = this->param_.base_lr() * (Dtype(1.) /
        (Dtype(1.) + exp(-this->param_.gamma() *
            (Dtype(this->iter_) - Dtype(this->param_.stepsize())))));
  } else {
    LOG(FATAL) << "Unknown learning rate policy: " << lr_policy;
  }
  return rate;  // the current learning rate
}

template <typename Dtype>
void SGDSolver<Dtype>::PreSolve() {
  // Initialize the history
  const vector<Blob<Dtype>*>& net_params = this->net_->learnable_params();
  history_.clear();
  update_.clear();
  temp_.clear();
  for (int i = 0; i < net_params.size(); ++i) {
    const vector<int>& shape = net_params[i]->shape();
    history_.push_back(shared_ptr<Blob<Dtype> >(new Blob<Dtype>(shape)));
    update_.push_back(shared_ptr<Blob<Dtype> >(new Blob<Dtype>(shape)));
    temp_.push_back(shared_ptr<Blob<Dtype> >(new Blob<Dtype>(shape)));
  }
}
// history_ maintains the old momentum data. update_ maintains data related to
// the current update and is not needed in snapshots. temp_ maintains any other
// information that may be needed while computing gradients or updates, and is
// likewise not needed in snapshots. All three are vectors of blobs shaped like
// the learnable parameters, used by the update code below.

template <typename Dtype>
void SGDSolver<Dtype>::ClipGradients() {
  const Dtype clip_gradients = this->param_.clip_gradients();
  if (clip_gradients < 0) { return; }
  const vector<Blob<Dtype>*>& net_params = this->net_->learnable_params();
  Dtype sumsq_diff = 0;
  for (int i = 0; i < net_params.size(); ++i) {
    sumsq_diff += net_params[i]->sumsq_diff();
  }
  const Dtype l2norm_diff = std::sqrt(sumsq_diff);
  if (l2norm_diff > clip_gradients) {
    Dtype scale_factor = clip_gradients / l2norm_diff;
    LOG(INFO) << "Gradient clipping: scaling down gradients (L2 norm "
        << l2norm_diff << " > " << clip_gradients << ") "
        << "by scale factor " << scale_factor;
    for (int i = 0; i < net_params.size(); ++i) {
      net_params[i]->scale_diff(scale_factor);
    }
    // When the L2 norm of the gradients exceeds the configured threshold,
    // all gradients are scaled down by scale_factor.
  }
}

template <typename Dtype>
void SGDSolver<Dtype>::ApplyUpdate() {
  CHECK(Caffe::root_solver());
  Dtype rate = GetLearningRate();
  if (this->param_.display() && this->iter_ % this->param_.display() == 0) {
    LOG(INFO) << "Iteration " << this->iter_ << ", lr = " << rate;
  }
  ClipGradients();
  for (int param_id = 0; param_id < this->net_->learnable_params().size();
       ++param_id) {
    Normalize(param_id);    // normalize across iter_size sub-batches
    Regularize(param_id);   // apply weight decay
    ComputeUpdateValue(param_id, rate);
  }
  this->net_->Update();
}

template <typename Dtype>
void SGDSolver<Dtype>::Normalize(int param_id) {
  // Normalize across iter_size. When GPU memory is too small for the desired
  // batch size, iter_size can be set so that the effective batch size is
  // batch_size * iter_size; the accumulated gradients are then scaled by
  // 1 / iter_size.
  if (this->param_.iter_size() == 1) { return; }
  // Scale gradient to counterbalance accumulation.
  const vector<Blob<Dtype>*>& net_params = this->net_->learnable_params();
  const Dtype accum_normalization = Dtype(1.) / this->param_.iter_size();
  switch (Caffe::mode()) {
  case Caffe::CPU: {
    caffe_scal(net_params[param_id]->count(), accum_normalization,
        net_params[param_id]->mutable_cpu_diff());  // diff *= 1 / iter_size
    break;
  }
  case Caffe::GPU: {
#ifndef CPU_ONLY
    caffe_gpu_scal(net_params[param_id]->count(), accum_normalization,
        net_params[param_id]->mutable_gpu_diff());
#else
    NO_GPU;
#endif
    break;
  }
  default:
    LOG(FATAL) << "Unknown caffe mode: " << Caffe::mode();
  }
}

template <typename Dtype>
void SGDSolver<Dtype>::Regularize(int param_id) {
  const vector<Blob<Dtype>*>& net_params = this->net_->learnable_params();
  const vector<float>& net_params_weight_decay =
      this->net_->params_weight_decay();  // per-parameter decay multipliers
  Dtype weight_decay = this->param_.weight_decay();  // global weight decay
  string regularization_type = this->param_.regularization_type();  // norm type
  Dtype local_decay = weight_decay * net_params_weight_decay[param_id];
  switch (Caffe::mode()) {
  case Caffe::CPU: {
    if (local_decay) {
      if (regularization_type == "L2") {
        // add weight decay; for the L2 norm the derivative is the data itself
        caffe_axpy(net_params[param_id]->count(),
            local_decay,
            net_params[param_id]->cpu_data(),
            net_params[param_id]->mutable_cpu_diff());
      } else if (regularization_type == "L1") {
        // for the L1 norm the derivative is the sign of the data
        caffe_cpu_sign(net_params[param_id]->count(),
            net_params[param_id]->cpu_data(),
            temp_[param_id]->mutable_cpu_data());
        caffe_axpy(net_params[param_id]->count(),
            local_decay,
            temp_[param_id]->cpu_data(),
            net_params[param_id]->mutable_cpu_diff());
      } else {
        LOG(FATAL) << "Unknown regularization type: " << regularization_type;
      }
    }
    break;
  }
  case Caffe::GPU: {
#ifndef CPU_ONLY
    if (local_decay) {
      if (regularization_type == "L2") {
        // add weight decay
        caffe_gpu_axpy(net_params[param_id]->count(),
            local_decay,
            net_params[param_id]->gpu_data(),
            net_params[param_id]->mutable_gpu_diff());
      } else if (regularization_type == "L1") {
        caffe_gpu_sign(net_params[param_id]->count(),
            net_params[param_id]->gpu_data(),
            temp_[param_id]->mutable_gpu_data());
        caffe_gpu_axpy(net_params[param_id]->count(),
            local_decay,
            temp_[param_id]->gpu_data(),
            net_params[param_id]->mutable_gpu_diff());
      } else {
        LOG(FATAL) << "Unknown regularization type: " << regularization_type;
      }
    }
#else
    NO_GPU;
#endif
    break;
  }
  default:
    LOG(FATAL) << "Unknown caffe mode: " << Caffe::mode();
  }
}

template <typename Dtype>
void SGDSolver<Dtype>::ComputeUpdateValue(int param_id, Dtype rate) {
  const vector<Blob<Dtype>*>& net_params = this->net_->learnable_params();
  const vector<float>& net_params_lr = this->net_->params_lr();
  Dtype momentum = this->param_.momentum();
  Dtype local_rate = rate * net_params_lr[param_id];
  // Compute the update to history, then copy it to the parameter diff.
  switch (Caffe::mode()) {
  case Caffe::CPU: {
    // history = local_rate * diff + momentum * history
    caffe_cpu_axpby(net_params[param_id]->count(), local_rate,
        net_params[param_id]->cpu_diff(), momentum,
        history_[param_id]->mutable_cpu_data());
    // copy the updated value back into the diff so net_->Update() applies it
    caffe_copy(net_params[param_id]->count(),
        history_[param_id]->cpu_data(),
        net_params[param_id]->mutable_cpu_diff());
    break;
  }
  case Caffe::GPU: {
#ifndef CPU_ONLY
    caffe_gpu_axpby(net_params[param_id]->count(), local_rate,
        net_params[param_id]->gpu_diff(), momentum,
        history_[param_id]->mutable_gpu_data());
    caffe_copy(net_params[param_id]->count(),
        history_[param_id]->gpu_data(),
        net_params[param_id]->mutable_gpu_diff());
#else
    NO_GPU;
#endif
    break;
  }
  default:
    LOG(FATAL) << "Unknown caffe mode: " << Caffe::mode();
  }
}

// The remaining functions are the snapshot operations, analogous to solver.cpp.
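To see how GetLearningRate(), Regularize(), and ComputeUpdateValue() fit together inside ApplyUpdate(), here is a self-contained scalar sketch: one parameter under the "step" policy with L2 weight decay and momentum. The toy objective f(w) = w^2 and all the constants are assumptions for illustration only; Caffe applies the same arithmetic elementwise over blobs via caffe_axpy/caffe_cpu_axpby.

#include <cmath>
#include <cstdio>

// Scalar sketch of ApplyUpdate(): "step" learning rate policy, L2 weight
// decay (Regularize), and momentum (ComputeUpdateValue). Toy values only.
int main() {
  const double base_lr = 0.1, gamma = 0.5, momentum = 0.9, weight_decay = 0.01;
  const int stepsize = 20;
  double param = 1.0, history = 0.0;
  for (int iter = 0; iter < 60; ++iter) {
    // "step" policy: rate = base_lr * gamma ^ floor(iter / stepsize)
    const double rate = base_lr * std::pow(gamma, iter / stepsize);
    double diff = 2.0 * param;           // gradient of f(w) = w^2
    diff += weight_decay * param;        // Regularize(): L2 adds decay * data
    history = momentum * history + rate * diff;  // ComputeUpdateValue()
    param -= history;                    // net_->Update() subtracts the diff
    if (iter % 20 == 0) {
      std::printf("iter %d, lr = %f, param = %f\n", iter, rate, param);
    }
  }
  return 0;
}

Note the design mirrored from the Caffe code: the momentum accumulator (history_) is computed first, then written back over the gradient, so the generic net_->Update() step never needs to know which solver produced the diff.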