您的位置:首页 > 其它

CUDA(16)之内存对齐

2016-10-04 16:51 218 查看
摘要
本文主要讲述CUDA内存对齐。

1. 背景

CUDA内存对齐的背景就不说了。

2. 采用SoA设计/构造并行的数据结构

array of structures(AoS)和structure of arrays(SoA)是C语言中组织数据的两种基本方式。SoA的内存布局更适合用来设计并行计算的数据结构。SoA在并行计算上的具体实现见下面的过程分析。

// Length of every member array of T below.
#define threads 16

// Structure-of-arrays record: sixteen int arrays (s0 .. s15) declared back to
// back, each holding `threads` elements. The surrounding article uses it with
// thread i accessing element [i] of each array.
struct T {
    int s0[threads];
    int s1[threads];
    int s2[threads];
    int s3[threads];
    int s4[threads];
    int s5[threads];
    int s6[threads];
    int s7[threads];
    int s8[threads];
    int s9[threads];
    int s10[threads];
    int s11[threads];
    int s12[threads];
    int s13[threads];
    int s14[threads];
    int s15[threads];
};

SoA结构的数据类型T如上所示,T在CUDA的global memory中的存储方式是理解并行数据结构设计的关键:thread0读取s0[0],s1[0],...,s15[0]; thread1读取s0[1],s1[1],...,s15[1]; thread15读取s0[15],s1[15],...,s15[15]。

任何一次并发的16个threads访问的内存总是连续且对齐的;比如,并发的thread0,thread1,......,thread15第一次访问的global memory地址依次为s0[0],s0[1],s0[2],......,s0[15]。

3. global memory内存对齐代码测试

#include "cuda_runtime.h"
#include "device_launch_parameters.h"
#include <stdio.h>
#include <stdlib.h>
#include <iostream>

using namespace std;

// Length of every member array of T; main() also uses it as the block size
// and as the element count passed to the kernel.
#define threads 160

// Structure-of-arrays record: sixteen int arrays (s1 .. s16) declared back to
// back, each holding `threads` elements. The kernel writes element [i] of
// every array from the thread whose global index is i.
struct T {
    int s1[threads];
    int s2[threads];
    int s3[threads];
    int s4[threads];
    int s5[threads];
    int s6[threads];
    int s7[threads];
    int s8[threads];
    int s9[threads];
    int s10[threads];
    int s11[threads];
    int s12[threads];
    int s13[threads];
    int s14[threads];
    int s15[threads];
    int s16[threads];
};

// Fills element i of each of the sixteen arrays s1 .. s16 in *data with that
// array's ordinal (s1[i] = 1, s2[i] = 2, ..., s16[i] = 16).
//
// Launch layout: 1D grid of 1D blocks; each thread handles the single element
// whose index is blockIdx.x * blockDim.x + threadIdx.x, so adjacent threads
// write adjacent ints of the same array.
//
// data       - structure the kernel writes through; must be dereferenceable
//              from device code.
// threadsNum - number of leading elements to initialise. Values <= 0 write
//              nothing; values larger than the array length are clamped to it.
__global__ void initStruct(T *data, const int threadsNum){
    // A negative count would turn into a huge value in the unsigned
    // comparison below and let every thread through, so reject it first.
    if (threadsNum <= 0) {
        return;
    }

    // Element count of s1; the guard below relies on s2 .. s16 being declared
    // with the same length.
    const unsigned int capacity = sizeof(data->s1) / sizeof(data->s1[0]);
    const unsigned int requested = static_cast<unsigned int>(threadsNum);
    const unsigned int limit = requested < capacity ? requested : capacity;

    const unsigned int i = blockIdx.x * blockDim.x + threadIdx.x;
    if (i >= limit) {
        return;  // surplus thread: nothing to write
    }

    data->s1[i] = 1;
    data->s2[i] = 2;
    data->s3[i] = 3;
    data->s4[i] = 4;
    data->s5[i] = 5;
    data->s6[i] = 6;
    data->s7[i] = 7;
    data->s8[i] = 8;
    data->s9[i] = 9;
    data->s10[i] = 10;
    data->s11[i] = 11;
    data->s12[i] = 12;
    data->s13[i] = 13;
    data->s14[i] = 14;
    data->s15[i] = 15;
    data->s16[i] = 16;
}

// Terminates the program with EXIT_FAILURE after printing the failing step
// and the CUDA error string to stderr when status is not cudaSuccess.
static void checkCuda(cudaError_t status, const char *what) {
    if (status != cudaSuccess) {
        fprintf(stderr, "CUDA error during %s: %s\n", what, cudaGetErrorString(status));
        exit(EXIT_FAILURE);
    }
}

// Allocates one T on the device, fills it with initStruct, copies it back to
// the host and prints one line per element index containing the sixteen
// values s1[i] .. s16[i], each followed by a space.
// Returns EXIT_SUCCESS on success; any CUDA or host allocation failure is
// reported on stderr and ends the program with EXIT_FAILURE.
int main(int argc, char **argv) {
    (void)argc;  // command-line arguments are not used
    (void)argv;

    int dev = 0;
    checkCuda(cudaSetDevice(dev), "cudaSetDevice");

    // host buffer that receives the result
    T *res = (T *)malloc(sizeof(T));
    if (res == NULL) {
        fprintf(stderr, "host allocation of %lu bytes failed\n", (unsigned long)sizeof(T));
        return EXIT_FAILURE;
    }

    // device buffer the kernel writes into
    T *data = NULL;
    checkCuda(cudaMalloc((void **)&data, sizeof(T)), "cudaMalloc");

    // execution configuration: a single block with one thread per element
    dim3 block (threads,1);
    dim3 grid (1,1);

    // kernel; a launch reports nothing by itself, so query the launch status
    initStruct<<< grid, block >>> (data, threads);
    checkCuda(cudaGetLastError(), "initStruct launch");

    // copy back; this blocking copy also surfaces errors raised while the
    // kernel was executing
    checkCuda(cudaMemcpy(res, data, sizeof(T), cudaMemcpyDeviceToHost), "cudaMemcpy (device to host)");

    // print: table of the sixteen arrays in declaration order
    const int *columns[] = {
        res->s1,  res->s2,  res->s3,  res->s4,
        res->s5,  res->s6,  res->s7,  res->s8,
        res->s9,  res->s10, res->s11, res->s12,
        res->s13, res->s14, res->s15, res->s16
    };
    const int columnCount = (int)(sizeof(columns) / sizeof(columns[0]));
    for (int i = 0; i < threads; i++) {
        for (int c = 0; c < columnCount; c++) {
            cout << columns[c][i] << " ";
        }
        cout << endl;
    }

    // free memories both host and device
    checkCuda(cudaFree(data), "cudaFree");
    free(res);

    // reset device
    checkCuda(cudaDeviceReset(), "cudaDeviceReset");

    return EXIT_SUCCESS;
}
内容来自用户分享和网络整理,不保证内容的准确性,如有侵权内容,可联系管理员处理 点击这里给我发消息
标签: