
Caffe (v1) source code walkthrough, from beginner to giving up (1): memory management, syncedmem.hpp / syncedmem.cpp

/* All of this code was typed out by hand in a linux-nsight-eclipse environment.
 * Each post ends with a few questions I have not yet solved; comments and
 * corrections from readers are very welcome. These posts are for learning and
 * exchange, and will keep being revised as my own understanding improves.
 *
 * syncedmem.hpp
 *
 *  Created on: Jun 4, 2017
 *      Author: pan
 */
#ifndef SYNCEDMEM_HPP_
#define SYNCEDMEM_HPP_
#include <cstdlib>
#include "caffe/common.hpp"
/* The caffe namespace wraps all of Caffe's classes and functions, and is used
 * just like the std namespace (e.g. using namespace caffe / using namespace std). */
namespace caffe
{
// If CUDA is available and in GPU mode, host memory will be allocated pinned,
// using cudaMallocHost. It avoids dynamic pinning for transfers (DMA).
// The improvement in performance seems negligible in the single GPU case,
// but might be more significant for parallel training. Most importantly,
// it improved stability for large models on many GPUs.
/* If CUDA is available and Caffe runs in GPU mode, host memory is allocated
 * pinned, using cudaMallocHost(). This avoids dynamic pinning for DMA
 * transfers. In the single-GPU case the performance gain looks negligible,
 * but it may matter more for multi-GPU parallel training; most importantly,
 * it improves stability for large models on many GPUs. In short, in GPU mode
 * allocating host memory with cudaMallocHost() rather than malloc() buys both
 * performance and stability.
 */
/* Host-side allocation: CaffeMallocHost(&cpu_ptr_, size_, &cpu_malloc_use_cuda_)
 * writes the new block's address through the double pointer cpu_ptr_. */
inline void CaffeMallocHost(void** ptr, size_t size, bool* use_cuda)
{
#ifndef CPU_ONLY
if(Caffe::mode() == Caffe::GPU)
{
// CUDA_CHECK aborts with a fatal log message if the wrapped CUDA call fails.
CUDA_CHECK(cudaMallocHost(ptr, size));
*use_cuda = true;
return;  // in a void function a bare return just exits; it does not return 0
}
#endif
/* Allocate size bytes on the host. Because the result is a void*, callers must
 * cast it to a concrete pointer type, e.g. static_cast<int*>(cpu_ptr_);
 * this is discussed in more detail in the Blob post. */
*ptr = malloc(size);
*use_cuda = false;
// CHECK is a glog macro: if *ptr is NULL it logs the message and aborts.
CHECK(*ptr) << "host allocation of size " << size << " failed";
}
/* Host memory release. Since a CUDA build has two ways of allocating host
 * memory, the matching release call is selected here: cudaFreeHost() for
 * pinned memory, free() otherwise. */
inline void CaffeFreeHost(void* ptr, bool use_cuda)
{
#ifndef CPU_ONLY
if(use_cuda)
{
CUDA_CHECK(cudaFreeHost(ptr));
return ;
}
#endif
free(ptr);
}
/**
* @brief Manages memory allocation and synchronization between the host (CPU)
*        and device (GPU).
*
* TODO(dox): more thorough description.
*/
/* The SyncedMemory class simply manages memory allocation and synchronization
 * between the host (CPU) and the device (GPU), i.e. it manages the memory that
 * is shared between CPU and GPU.
 * TODO(dox): more thorough description. */
class SyncedMemory
{
public:
/* The default constructor initializes all pointers and flags. */
SyncedMemory()
: cpu_ptr_(NULL), gpu_ptr_(NULL), size_(0), head_(UNINITIALIZED),
own_cpu_data_(false), cpu_malloc_use_cuda_(false),own_gpu_data_(false),
gpu_device_(-1){}
/* This constructor records the requested size; allocation itself is deferred.
 * explicit forbids implicit conversions: SyncedMemory mem(10); compiles,
 * while SyncedMemory mem = 10; does not. */
explicit SyncedMemory(size_t size)
: cpu_ptr_(NULL), gpu_ptr_(NULL), size_(size), head_(UNINITIALIZED),
own_cpu_data_(false), cpu_malloc_use_cuda_(false),
own_gpu_data_(false), gpu_device_(-1){}
/* The destructor (defined in syncedmem.cpp) releases the heap memory. Caffe's
 * data container Blob holds shared_ptr<SyncedMemory> data_ and
 * shared_ptr<SyncedMemory> diff_, and controls deallocation through reset().
 * Because nvcc's C++11 support was still poor, unique_ptr is not used;
 * Caffe relies on boost::shared_ptr instead. */
~SyncedMemory();

public:
/* cpu_data() and gpu_data() return const void*, meaning the memory pointed to
 * by cpu_ptr_ / gpu_ptr_ must not be modified through them. In contrast,
 * mutable_cpu_data() and mutable_gpu_data() return plain void*, i.e. the
 * returned memory may be written to. */
const void* cpu_data();
void set_cpu_data(void* data);
const void* gpu_data();
void set_gpu_data(void* data);
void* mutable_cpu_data();
void* mutable_gpu_data();
/* The SyncedHead enum records where the freshest copy of the data lives;
 * SYNCED means the host and device copies are identical. */
enum SyncedHead {UNINITIALIZED, HEAD_AT_CPU, HEAD_AT_GPU, SYNCED};

SyncedHead head()  {return head_;}
// size_t is defined by the standard C library as an unsigned integer type;
// on 64-bit systems it is typically long unsigned int
size_t size()  {return size_;}

#ifndef CPU_ONLY
void async_gpu_push(const cudaStream_t& stream);  // asynchronous host-to-device copy on the given stream
#endif

private:
SyncedHead head_;  // where the freshest copy of the data currently lives
/* to_cpu() and to_gpu() drive the synchronization. If the head is already on
 * the CPU, to_cpu() has nothing to copy; otherwise it calls caffe_gpu_memcpy(),
 * which is just a thin wrapper around cudaMemcpy(Y, X, N, cudaMemcpyDefault).
 * to_gpu() works the same way in the other direction. */
void to_cpu();
void to_gpu();

void* cpu_ptr_;
void* gpu_ptr_;
size_t size_;

bool own_cpu_data_;
bool cpu_malloc_use_cuda_;
bool own_gpu_data_;
int gpu_device_;
// DISABLE_COPY_AND_ASSIGN is a Caffe macro that declares the copy constructor
// and assignment operator private, so SyncedMemory objects cannot be copied.
DISABLE_COPY_AND_ASSIGN(SyncedMemory);
};// class SyncedMemory

}  // namespace caffe

#endif /* SYNCEDMEM_HPP_ */
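
Before moving on to the .cpp file, here is a small usage sketch that makes the head-state machine easier to picture (it assumes a GPU build of Caffe to compile and link against; the sizes and values are made up):

#include "caffe/syncedmem.hpp"

void example()
{
  caffe::SyncedMemory mem(16 * sizeof(float));
  // First access: to_cpu() allocates the host buffer, zero-fills it and
  // sets head_ = HEAD_AT_CPU.
  float* host = static_cast<float*>(mem.mutable_cpu_data());
  host[0] = 1.0f;
  // gpu_data() triggers to_gpu(): device memory is allocated, the host buffer
  // is copied over with caffe_gpu_memcpy(), and head_ becomes SYNCED.
  const void* dev = mem.gpu_data();
  // mutable_cpu_data() moves the head back to HEAD_AT_CPU, so the next
  // gpu_data() call will copy host -> device again.
  host = static_cast<float*>(mem.mutable_cpu_data());
  host[1] = 2.0f;
  (void)dev;
}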
---------------------------------------------
/*
* syncedmem.cpp
*
*  Created on: Jun 4, 2017
*      Author: pan
*/
#include "common.hpp"
#include "syncedmem.hpp"
#include "util/math_functions.hpp"

namespace caffe
{
SyncedMemory::~SyncedMemory()
{
/* Only free cpu_ptr_ when it is non-NULL (never free a NULL pointer) and
 * own_cpu_data_ is true. own_cpu_data_ means this SyncedMemory allocated the
 * buffer itself in to_cpu(); a buffer handed in through set_cpu_data() is not
 * owned here and must be freed by whoever provided it. */
if(cpu_ptr_ && own_cpu_data_)
{
CaffeFreeHost(cpu_ptr_, cpu_malloc_use_cuda_);
}
#ifndef CPU_ONLY
if(gpu_ptr_ && own_gpu_data_)
{
int initial_device;
cudaGetDevice(&initial_device);
if (gpu_device_ != -1)
{
// switch to the device on which gpu_ptr_ was allocated before freeing it
CUDA_CHECK(cudaSetDevice(gpu_device_));
}
CUDA_CHECK(cudaFree(gpu_ptr_));
cudaSetDevice(initial_device);  // restore whichever device was current before
}
#endif

}
// Synchronize the data to the CPU, i.e. make sure cpu_ptr_ is valid and up to date.
inline void SyncedMemory::to_cpu()
{
switch (head_)
{
case UNINITIALIZED:
CaffeMallocHost(&cpu_ptr_, size_, &cpu_malloc_use_cuda_);
caffe_memset(size_, 0, cpu_ptr_);
head_ = HEAD_AT_CPU;
own_cpu_data_ = true;
break;

case HEAD_AT_GPU:
#ifndef CPU_ONLY  // CPU_ONLY is defined in Makefile.config
if (cpu_ptr_ == NULL)
{
CaffeMallocHost(&cpu_ptr_, size_, &cpu_malloc_use_cuda_);
own_cpu_data_ = true;
}
caffe_gpu_memcpy(size_, gpu_ptr_, cpu_ptr_);
head_ = SYNCED;
#else
NO_GPU;  // Caffe macro that logs a fatal error: this build is CPU-only
#endif
break;

case HEAD_AT_CPU:  // the data is already current on the CPU
case SYNCED:
break;
}
}//to_cpu()
// Synchronize the data to the GPU, i.e. make sure gpu_ptr_ is valid and up to date.
inline void SyncedMemory::to_gpu()
{
#ifndef CPU_ONLY
switch (head_)
{
case UNINITIALIZED:
// nothing has been allocated yet: grab device memory directly and zero it
CUDA_CHECK(cudaGetDevice(&gpu_device_));
CUDA_CHECK(cudaMalloc(&gpu_ptr_, size_));
caffe_gpu_memset(size_, 0, gpu_ptr_);
head_ = HEAD_AT_GPU;
own_gpu_data_ = true;
break;

case HEAD_AT_CPU:
if(gpu_ptr_ == NULL)
{
CUDA_CHECK(cudaGetDevice(&gpu_device_));
CUDA_CHECK(cudaMalloc(&gpu_ptr_, size_));
own_gpu_data_ = true;
}
caffe_gpu_memcpy(size_, cpu_ptr_, gpu_ptr_);
head_ = SYNCED;
break;
case HEAD_AT_GPU:
case SYNCED:
break;
}
#else
NO_GPU;
#endif
}//to_gpu()
// Return a read-only pointer to the (synchronized) CPU buffer.
const void* SyncedMemory::cpu_data()
{
to_cpu();
return (const void*)cpu_ptr_;
}

void SyncedMemory::set_cpu_data(void* data)
{
// CHECK(data) is a glog assertion: if data is NULL it logs a fatal error and
// aborts. The function returns void, so it could not "return -1" on failure.
CHECK(data);
if(own_cpu_data_)
{
CaffeFreeHost(cpu_ptr_, cpu_malloc_use_cuda_);
}
cpu_ptr_ = data;
head_ = HEAD_AT_CPU;
own_cpu_data_ = false;
}

const void* SyncedMemory::gpu_data()
{
#ifndef CPU_ONLY
to_gpu();
return (const void*)gpu_ptr_;
#else
NO_GPU;
return NULL;
#endif
}

void SyncedMemory::set_gpu_data(void* data)
{
#ifndef CPU_ONLY
CHECK(data);
if (own_gpu_data_)
{
int initial_device;
cudaGetDevice(&initial_device);
if (gpu_device_ != -1)
{
CUDA_CHECK(cudaSetDevice(gpu_device_));
}
CUDA_CHECK(cudaFree(gpu_ptr_));
cudaSetDevice(initial_device);
}
gpu_ptr_ = data;
head_ = HEAD_AT_GPU;
own_gpu_data_ = false;
#else
NO_GPU;
#endif
}

void* SyncedMemory::mutable_cpu_data()
{
to_cpu();
head_ = HEAD_AT_CPU;
return cpu_ptr_;
}

void* SyncedMemory::mutable_gpu_data()
{
#ifndef CPU_ONLY
to_gpu();
head_ = HEAD_AT_GPU;
return gpu_ptr_;
#else
NO_GPU;
return NULL;
#endif
}

#ifndef CPU_ONLY
void SyncedMemory::async_gpu_push(const cudaStream_t& stream)
{
CHECK(head_ == HEAD_AT_CPU);
if (gpu_ptr_ == NULL)
{
CUDA_CHECK(cudaGetDevice(&gpu_device_));
CUDA_CHECK(cudaMalloc(&gpu_ptr_, size_));
own_gpu_data_ = true;
}
const cudaMemcpyKind put = cudaMemcpyHostToDevice;
CUDA_CHECK(cudaMemcpyAsync(gpu_ptr_, cpu_ptr_, size_, put, stream));
// Assume caller will synchronize on the stream before use
head_ = SYNCED;
}
#endif
}  // namespace caffe
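
As the header comment above pointed out, Caffe's Blob holds its storage through boost::shared_ptr<SyncedMemory> and releases it with reset(). The following is only a simplified sketch of that ownership pattern, not the real Blob code (which adds shape handling, reshaping and typed accessors on top):

#include <boost/shared_ptr.hpp>
#include "caffe/syncedmem.hpp"

class TinyBlob
{
public:
  explicit TinyBlob(size_t count)
  : data_(new caffe::SyncedMemory(count * sizeof(float))),
    diff_(new caffe::SyncedMemory(count * sizeof(float))) {}

  const float* cpu_data()
  {
    return static_cast<const float*>(data_->cpu_data());
  }
  float* mutable_cpu_diff()
  {
    return static_cast<float*>(diff_->mutable_cpu_data());
  }
private:
  // When a shared_ptr is reset or goes out of scope, ~SyncedMemory() frees
  // whatever host/device buffers the object owns.
  boost::shared_ptr<caffe::SyncedMemory> data_;
  boost::shared_ptr<caffe::SyncedMemory> diff_;
};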


Test code I wrote myself to examine the behavior of the constructor and destructor:
/*
* caffe.cpp
*
*  Created on: Jun 5, 2017
*      Author: pan
*/
#include <iostream>
#include <climits>
#include <cstdlib>
#include <boost/shared_ptr.hpp>
using namespace std;
using boost::shared_ptr;
// inline void CaffeMallocHost(void** ptr, size_t size, bool* use_cuda)

inline void CaffeMallocHost(void** ptr, size_t size)
{
// #ifndef CPU_ONLY
//    cudaMallocHost(ptr, size);
//      void* ptr = (void*)(new char[size]);
*ptr = malloc(size);
if(*ptr == NULL)  // check the allocated block, not the double pointer itself
{
cout<<"malloc error in function CaffeMallocHost !"<<endl;
}
}

inline void CaffeFreeHost(void* ptr)
{
cout<<">>>>>>>>>>>>>CaffeFreeHost";
if(ptr != NULL)
{
free(ptr);
cout<<">>>>>>>>>>>now free cpu_ptr_ "<<endl;
}
}

class synced
{
public:
synced(size_t size, int num) : cpu_ptr_(NULL),
gpu_ptr_(NULL), own_cpu_data_(false),own_gpu_data_(false),
size_(size), cpu_malloc_use_cuda_(0),num_(num){cout<<"constructor "<< num_<<" called !\n";}

~synced()
{

CaffeFreeHost(cpu_ptr_);

cout<<"destructor "<<num_<<" called!\n";

}

void to_cpu()
{
CaffeMallocHost(&cpu_ptr_, size_);
}
void* cpu_data()
{
to_cpu();
return cpu_ptr_;
}
private:
void* cpu_ptr_;
bool own_cpu_data_;
void* gpu_ptr_;
bool own_gpu_data_;
bool cpu_malloc_use_cuda_;
size_t size_;

int num_;
};

int main()
{
shared_ptr<synced> data;
data.reset(new synced(10 * sizeof(int), 1));
int* ptr = static_cast<int*>(data->cpu_data());
ptr[9] = 10;
// cout<< INT_MAX <<endl;

return 0;
}


Open question: how could Caffe's memory management rewrite CaffeMallocHost() and CaffeFreeHost() in new/delete form? The main difficulty is how, in C++, to allocate storage behind a void* and later delete through a void*.
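
One possible direction (just a sketch, not Caffe's actual implementation): you cannot new or delete a void directly, but you can allocate the underlying bytes as a char array and cast, for example:

// Hypothetical new/delete based replacement for the plain-malloc path only
// (the cudaMallocHost()/cudaFreeHost() branches would stay unchanged).
inline void CaffeMallocHost(void** ptr, size_t size)
{
  *ptr = static_cast<void*>(new char[size]);  // raw bytes, no constructors run
}

inline void CaffeFreeHost(void* ptr)
{
  delete[] static_cast<char*>(ptr);  // must mirror the new char[] above
}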