您的位置：首页 > 其它

CUDA核函数share memory

2015-10-14 19:07 246 查看

标签： CUDAExample

CUDA核函数运行参数

核函数是GPU每个thread上运行的程序。必须通过gloabl函数类型限定符定义。形式如下：

__global__ void kernel(param list){  }

核函数只能在主机端调用，调用时必须申明执行参数。调用形式如下：

Kernel<<<Dg,Db, Ns, S>>>(param list);

其中，参数Ns是一个可选参数，用于设置kernel函数中动态分配shared memory大小，动态分配shared memory应小于每个块允许的最大share memory大小，单位为byte。不需要动态分配时该值为0或省略不写。

share memory

share memory的分配方式分为静态分配和动态分配，静态分配指的是在核函数中申请固定大小的share memory，动态分配指的是在核函数运行过程中设置share memory的大小。两者可以单独存在与核函数中，也可以共同存在核函数中，但是必须保证总大小不大于每一块中share memory的大小。在核函数调用是在<<<>>>中第三个参数就是用来设置动态申请share memory最大尺寸的允许值，也就是说，核函数中使用的动态share memory应该小于设置的大小，即第三个参数的大小。以下例子说明此问题：

测试例子硬件配置

cpu Intel 至强四核E3-1231 V3 @ 3.40GHz

显卡 NVIDIA GeForce GTX 980

vs2013 x64

测试代码

// This example shows how to use the share memory

// System includes
#include <stdio.h>
#include <assert.h>

/*
* gtx 980 每一块中允许最大share memory是48k
* 12288 = 48 * 1024 / 4;
* 11776 = 46 * 1024 / 4;
* 程序中改变静态share mem的大小测试，静态，动态，最大share memory之间的关系
*/
__global__ static void timedReduction(const float *input, float *output)
{
const int staticShareMem = 11776; //测试值 11776, 11778, 12288, 12289
__shared__ float staticShared[staticShareMem];
extern __shared__ float shared[];

const int tid = threadIdx.x;
const int bid = blockIdx.x;
if (tid == 1) //给静态内存赋值，防止被优化掉
{
for (int i = 0; i < staticShareMem ; i++)
{
staticShared[i] = i * 2.0f;
}
}

// 动态显存进行简单计算
shared[tid] = input[tid]*3.0f + input[tid + blockDim.x]*2.0f;
shared[tid + blockDim.x] = input[tid + blockDim.x] + input[tid]*2;

// Write result.每个block输出一个参数
if (tid == 0) output[bid] = shared[0];
}

#define NUM_BLOCKS    64
#define NUM_THREADS   256

// Start the main CUDA Sample here
int main(int argc, char **argv)
{

float *dinput = NULL;
float *doutput = NULL;

float input[NUM_THREADS * 2];
float output[NUM_BLOCKS];

//动态share mem输入值大小为2kB
for (int i = 0; i < NUM_THREADS * 2; i++)
{
input[i] = (float)i;
}

cudaMalloc((void **)&dinput, sizeof(float) * NUM_THREADS * 2);
cudaMalloc((void **)&doutput, sizeof(float) * NUM_BLOCKS);

cudaMemcpy(dinput, input, sizeof(float) * NUM_THREADS * 2, cudaMemcpyHostToDevice);

timedReduction << <NUM_BLOCKS, NUM_THREADS, sizeof(float) * 2 * NUM_THREADS >> >(dinput, doutput);

cudaMemcpy(output, doutput, sizeof(float) * NUM_BLOCKS, cudaMemcpyDeviceToHost);

cudaFree(dinput);
cudaFree(doutput);

//cpu端准备好测试结果数据
float temp, sumall = 0;
temp = input[0]*3.0 + input[256]*2.0;
printf("the set result = %f\n", temp);

for (int i = 1; i < NUM_BLOCKS; i++)
{
sumall += temp - output[i];
//printf("out = %f\n", output[i]);
}
//输出对比结果，0 - 核函数运行正确，其他值 - 错误
printf("result = %f\n", sumall);

cudaDeviceReset();

return 1;
}

运行结果

const int staticShareMem = 11775;

the set result = 512.000000

result = 0.000000

const int staticShareMem = 11776;

the set result = 512.000000

result = 0.000000

const int staticShareMem = 11778;

the set result = 512.000000

result = 30240.000000

const int staticShareMem = 12288;

the set result = 512.000000

result = 30240.000000

const int staticShareMem = 12289;

报错，超出share memory总尺寸

结论

share memory的静态分配和动态分配可以单独存在于核函数中，也可以共同存在核函数中，但是必须保证总大小不大于每一块中share memory的总大小。

内容来自用户分享和网络整理，不保证内容的准确性，如有侵权内容，可联系管理员处理

标签：

相关文章推荐

新的分享

章节导航