CUDA之thread、block、grid详解
2015-08-25 16:50
513 查看
本文将通过一个程序帮助了解线程块的分配,以及线程束,线程全局标号等
可以看到总线程数为0~127,共有2个线程块,每个线程块包含64个 线程,
每个线程块内部线程的索引为0~63.一个线程块包含2个线程束(warp)
(1个warp包括32个线程)
#include <cuda_runtime.h>
#include <device_launch_parameters.h>
#include <stdio.h>
#include <stdlib.h>

#define ARRAY_SIZE 128
#define ARRAY_SIZE_IN_BYTES (sizeof(unsigned int) * (ARRAY_SIZE))

/* Abort with file/line and the CUDA error string if a runtime call fails. */
#define CUDA_CHECK(call)                                                     \
    do {                                                                     \
        cudaError_t err_ = (call);                                           \
        if (err_ != cudaSuccess) {                                           \
            fprintf(stderr, "CUDA error %s:%d: %s\n", __FILE__, __LINE__,    \
                    cudaGetErrorString(err_));                               \
            exit(EXIT_FAILURE);                                              \
        }                                                                    \
    } while (0)

/*
 * For every thread of a 1-D launch, record its block index, its thread
 * index inside the block, its warp index within the block, and its flat
 * global thread index.
 *
 * Expects a 1-D grid of 1-D blocks; each output array must hold at least
 * ARRAY_SIZE elements.  The guard below protects the arrays when the
 * launch supplies more than ARRAY_SIZE threads.
 */
__global__ void what_is_my_id(unsigned int *const block,
                              unsigned int *const thread,
                              unsigned int *const warp,
                              unsigned int *const calc_thread)
{
    const unsigned int thread_idx = blockIdx.x * blockDim.x + threadIdx.x;
    if (thread_idx >= ARRAY_SIZE)  /* bounds guard: grid may overshoot */
        return;

    block[thread_idx] = blockIdx.x;
    thread[thread_idx] = threadIdx.x;           /* index inside the block */
    warp[thread_idx] = threadIdx.x / warpSize;  /* warp number inside the block */
    calc_thread[thread_idx] = thread_idx;       /* flat global thread index */
}

int main(void)
{
    /* Host-side arrays that receive the per-thread results. */
    unsigned int cpu_block[ARRAY_SIZE];
    unsigned int cpu_thread[ARRAY_SIZE];
    unsigned int cpu_warp[ARRAY_SIZE];
    unsigned int cpu_calc_thread[ARRAY_SIZE];

    /* Launch configuration: 2 blocks * 64 threads = 128 threads total. */
    const unsigned int num_blocks = 2;
    const unsigned int num_threads = 64;

    /* Device-side arrays of the same size as the host arrays. */
    unsigned int *gpu_block;
    unsigned int *gpu_thread;
    unsigned int *gpu_warp;
    unsigned int *gpu_calc_thread;
    CUDA_CHECK(cudaMalloc((void **)&gpu_block, ARRAY_SIZE_IN_BYTES));
    CUDA_CHECK(cudaMalloc((void **)&gpu_thread, ARRAY_SIZE_IN_BYTES));
    CUDA_CHECK(cudaMalloc((void **)&gpu_warp, ARRAY_SIZE_IN_BYTES));
    CUDA_CHECK(cudaMalloc((void **)&gpu_calc_thread, ARRAY_SIZE_IN_BYTES));

    /* Launch the kernel; check for launch-configuration errors right away. */
    what_is_my_id<<<num_blocks, num_threads>>>(gpu_block, gpu_thread,
                                               gpu_warp, gpu_calc_thread);
    CUDA_CHECK(cudaGetLastError());

    /* Copy the results back to the host (a blocking cudaMemcpy also
     * synchronizes with the kernel, surfacing any execution error). */
    CUDA_CHECK(cudaMemcpy(cpu_block, gpu_block, ARRAY_SIZE_IN_BYTES,
                          cudaMemcpyDeviceToHost));
    CUDA_CHECK(cudaMemcpy(cpu_thread, gpu_thread, ARRAY_SIZE_IN_BYTES,
                          cudaMemcpyDeviceToHost));
    CUDA_CHECK(cudaMemcpy(cpu_warp, gpu_warp, ARRAY_SIZE_IN_BYTES,
                          cudaMemcpyDeviceToHost));
    CUDA_CHECK(cudaMemcpy(cpu_calc_thread, gpu_calc_thread, ARRAY_SIZE_IN_BYTES,
                          cudaMemcpyDeviceToHost));

    CUDA_CHECK(cudaFree(gpu_block));
    CUDA_CHECK(cudaFree(gpu_thread));
    CUDA_CHECK(cudaFree(gpu_warp));
    CUDA_CHECK(cudaFree(gpu_calc_thread));

    /* Print one line per thread. */
    for (unsigned int i = 0; i < ARRAY_SIZE; i++) {
        printf("总线程数%3u-Blocks:%2u-Warp%2u-内部线程数%3u\n",
               cpu_calc_thread[i], cpu_block[i], cpu_warp[i], cpu_thread[i]);
    }
    return 0;
}
下面的程序演示二维线程网格与二维线程块的布局:数组大小为 32×16,共 512 个线程。
程序给出两种等价的启动配置——“条纹式”布局(1×4 个线程块,每块 32×4 个线程)
与“方块式”布局(2×2 个线程块,每块 16×8 个线程),并输出每个线程在 X/Y 维度
上的线程块索引、块内线程索引、全局线程编号以及它所看到的 gridDim/blockDim。
#include <cuda_runtime.h>
#include <device_launch_parameters.h>
#include <stdio.h>
#include <stdlib.h>

#define ARRAY_SIZE_X 32
#define ARRAY_SIZE_Y 16
#define ARRAY_SIZE_IN_BYTES (sizeof(unsigned int) * (ARRAY_SIZE_X) * (ARRAY_SIZE_Y))

/* Abort with file/line and the CUDA error string if a runtime call fails. */
#define CUDA_CHECK(call)                                                     \
    do {                                                                     \
        cudaError_t err_ = (call);                                           \
        if (err_ != cudaSuccess) {                                           \
            fprintf(stderr, "CUDA error %s:%d: %s\n", __FILE__, __LINE__,    \
                    cudaGetErrorString(err_));                               \
            exit(EXIT_FAILURE);                                              \
        }                                                                    \
    } while (0)

/*
 * 2-D variant: every thread of a 2-D launch records its block indices,
 * its per-block X thread index, its global x/y indices, its flat global
 * index, and the grid/block dimensions it observes.
 *
 * Precondition: gridDim.x * blockDim.x == ARRAY_SIZE_X — the flat index
 * below uses that product as the row width, so a launch with a different
 * row width would scatter results.  The bounds guard keeps all writes
 * inside the ARRAY_SIZE_X x ARRAY_SIZE_Y output arrays.
 */
__global__ void what_is_my_id_2d_A(unsigned int *const block_x,
                                   unsigned int *const block_y,
                                   unsigned int *const thread,
                                   unsigned int *const calc_thread,
                                   unsigned int *const x_thread,
                                   unsigned int *const y_thread,
                                   unsigned int *const grid_dimx,
                                   unsigned int *const block_dimx,
                                   unsigned int *const grid_dimy,
                                   unsigned int *const block_dimy)
{
    const unsigned int idx = blockIdx.x * blockDim.x + threadIdx.x;
    const unsigned int idy = blockIdx.y * blockDim.y + threadIdx.y;
    if (idx >= ARRAY_SIZE_X || idy >= ARRAY_SIZE_Y)  /* bounds guard */
        return;

    /* Flatten (idx, idy) using the launch's X extent as the row width. */
    const unsigned int thread_idx = ((gridDim.x * blockDim.x) * idy) + idx;

    block_x[thread_idx] = blockIdx.x;       /* block index in X */
    block_y[thread_idx] = blockIdx.y;       /* block index in Y */
    thread[thread_idx] = threadIdx.x;       /* X thread index inside the block */
    calc_thread[thread_idx] = thread_idx;   /* flat global thread index */
    x_thread[thread_idx] = idx;             /* global X index */
    y_thread[thread_idx] = idy;             /* global Y index */
    grid_dimx[thread_idx] = gridDim.x;      /* number of blocks in X */
    block_dimx[thread_idx] = blockDim.x;    /* threads per block in X */
    grid_dimy[thread_idx] = gridDim.y;      /* number of blocks in Y */
    block_dimy[thread_idx] = blockDim.y;    /* threads per block in Y */
}

int main(void)
{
    /* Host-side 2-D arrays that receive the per-thread results. */
    unsigned int cpu_block_x[ARRAY_SIZE_Y][ARRAY_SIZE_X];
    unsigned int cpu_block_y[ARRAY_SIZE_Y][ARRAY_SIZE_X];
    unsigned int cpu_thread[ARRAY_SIZE_Y][ARRAY_SIZE_X];
    unsigned int cpu_calc_thread[ARRAY_SIZE_Y][ARRAY_SIZE_X];
    unsigned int cpu_xthread[ARRAY_SIZE_Y][ARRAY_SIZE_X];
    unsigned int cpu_ythread[ARRAY_SIZE_Y][ARRAY_SIZE_X];
    unsigned int cpu_grid_dimx[ARRAY_SIZE_Y][ARRAY_SIZE_X];
    unsigned int cpu_block_dimx[ARRAY_SIZE_Y][ARRAY_SIZE_X];
    unsigned int cpu_grid_dimy[ARRAY_SIZE_Y][ARRAY_SIZE_X];
    unsigned int cpu_block_dimy[ARRAY_SIZE_Y][ARRAY_SIZE_X];

    /* Two equivalent 512-thread layouts over the 32x16 array:
     * "striped"  — 1x4 blocks of 32x4 threads (used by the commented launch),
     * "square"   — 2x2 blocks of 16x8 threads (the launch actually run). */
    const dim3 threads_rect(32, 4);
    const dim3 blocks_rect(1, 4);
    const dim3 threads_square(16, 8);
    const dim3 blocks_square(2, 2);

    /* Device-side arrays of the same total size. */
    unsigned int *gpu_block_x;
    unsigned int *gpu_block_y;
    unsigned int *gpu_thread;
    unsigned int *gpu_calc_thread;
    unsigned int *gpu_xthread;
    unsigned int *gpu_ythread;
    unsigned int *gpu_grid_dimx;
    unsigned int *gpu_block_dimx;
    unsigned int *gpu_grid_dimy;
    unsigned int *gpu_block_dimy;
    CUDA_CHECK(cudaMalloc((void **)&gpu_block_x, ARRAY_SIZE_IN_BYTES));
    CUDA_CHECK(cudaMalloc((void **)&gpu_block_y, ARRAY_SIZE_IN_BYTES));
    CUDA_CHECK(cudaMalloc((void **)&gpu_thread, ARRAY_SIZE_IN_BYTES));
    CUDA_CHECK(cudaMalloc((void **)&gpu_calc_thread, ARRAY_SIZE_IN_BYTES));
    CUDA_CHECK(cudaMalloc((void **)&gpu_xthread, ARRAY_SIZE_IN_BYTES));
    CUDA_CHECK(cudaMalloc((void **)&gpu_ythread, ARRAY_SIZE_IN_BYTES));
    CUDA_CHECK(cudaMalloc((void **)&gpu_grid_dimx, ARRAY_SIZE_IN_BYTES));
    CUDA_CHECK(cudaMalloc((void **)&gpu_block_dimx, ARRAY_SIZE_IN_BYTES));
    CUDA_CHECK(cudaMalloc((void **)&gpu_grid_dimy, ARRAY_SIZE_IN_BYTES));
    CUDA_CHECK(cudaMalloc((void **)&gpu_block_dimy, ARRAY_SIZE_IN_BYTES));

    /* Alternative "striped" layout (left figure in the article):
     * what_is_my_id_2d_A<<<blocks_rect, threads_rect>>>(gpu_block_x,
     *     gpu_block_y, gpu_thread, gpu_calc_thread, gpu_xthread, gpu_ythread,
     *     gpu_grid_dimx, gpu_block_dimx, gpu_grid_dimy, gpu_block_dimy);
     */
    (void)threads_rect;
    (void)blocks_rect;

    /* "Square" layout (right figure in the article). */
    what_is_my_id_2d_A<<<blocks_square, threads_square>>>(
        gpu_block_x, gpu_block_y, gpu_thread, gpu_calc_thread,
        gpu_xthread, gpu_ythread, gpu_grid_dimx, gpu_block_dimx,
        gpu_grid_dimy, gpu_block_dimy);
    CUDA_CHECK(cudaGetLastError());

    /* Copy the results back to the host (blocking copies also synchronize
     * with the kernel, surfacing any execution error). */
    CUDA_CHECK(cudaMemcpy(cpu_block_x, gpu_block_x, ARRAY_SIZE_IN_BYTES,
                          cudaMemcpyDeviceToHost));
    CUDA_CHECK(cudaMemcpy(cpu_block_y, gpu_block_y, ARRAY_SIZE_IN_BYTES,
                          cudaMemcpyDeviceToHost));
    CUDA_CHECK(cudaMemcpy(cpu_thread, gpu_thread, ARRAY_SIZE_IN_BYTES,
                          cudaMemcpyDeviceToHost));
    CUDA_CHECK(cudaMemcpy(cpu_calc_thread, gpu_calc_thread, ARRAY_SIZE_IN_BYTES,
                          cudaMemcpyDeviceToHost));
    CUDA_CHECK(cudaMemcpy(cpu_xthread, gpu_xthread, ARRAY_SIZE_IN_BYTES,
                          cudaMemcpyDeviceToHost));
    CUDA_CHECK(cudaMemcpy(cpu_ythread, gpu_ythread, ARRAY_SIZE_IN_BYTES,
                          cudaMemcpyDeviceToHost));
    CUDA_CHECK(cudaMemcpy(cpu_grid_dimx, gpu_grid_dimx, ARRAY_SIZE_IN_BYTES,
                          cudaMemcpyDeviceToHost));
    CUDA_CHECK(cudaMemcpy(cpu_block_dimx, gpu_block_dimx, ARRAY_SIZE_IN_BYTES,
                          cudaMemcpyDeviceToHost));
    CUDA_CHECK(cudaMemcpy(cpu_grid_dimy, gpu_grid_dimy, ARRAY_SIZE_IN_BYTES,
                          cudaMemcpyDeviceToHost));
    CUDA_CHECK(cudaMemcpy(cpu_block_dimy, gpu_block_dimy, ARRAY_SIZE_IN_BYTES,
                          cudaMemcpyDeviceToHost));

    /* Print one line per thread, row by row. */
    for (int y = 0; y < ARRAY_SIZE_Y; y++) {
        for (int x = 0; x < ARRAY_SIZE_X; x++) {
            printf("总%3u X维度block索引:%1u Y维度block索引:%1u TID:%2u YTID:%2u XTID:%2uGridX维度上block数量%1u BDX:%1u GridY维度上block数量%1u blockY维度线程数量%1u\n",
                   cpu_calc_thread[y][x], cpu_block_x[y][x], cpu_block_y[y][x],
                   cpu_thread[y][x], cpu_ythread[y][x], cpu_xthread[y][x],
                   cpu_grid_dimx[y][x], cpu_block_dimx[y][x],
                   cpu_grid_dimy[y][x], cpu_block_dimy[y][x]);
        }
    }

    CUDA_CHECK(cudaFree(gpu_block_x));
    CUDA_CHECK(cudaFree(gpu_block_y));
    CUDA_CHECK(cudaFree(gpu_thread));
    CUDA_CHECK(cudaFree(gpu_calc_thread));
    CUDA_CHECK(cudaFree(gpu_xthread));
    CUDA_CHECK(cudaFree(gpu_ythread));
    CUDA_CHECK(cudaFree(gpu_grid_dimx));
    CUDA_CHECK(cudaFree(gpu_block_dimx));
    CUDA_CHECK(cudaFree(gpu_grid_dimy));
    CUDA_CHECK(cudaFree(gpu_block_dimy));
    return 0;
}
相关文章推荐
- 用selenium实现对微博搜索数据的抓取
- OpenRTSP的使用
- SVG学习笔录(一)
- Java反射机制
- Win10系统中svn不显示小图标怎么办?svn图标不显示的解决办法
- linux sed command record
- Linux的文件系统
- 内存操作
- 假定CSomething 是一个类,执行下面这些语句之后,内存中创建了多少个CSomething 对象。
- 黑马程序员——C语言中的源程序
- CentOS7.1下生产环境Keepalived+Nginx配置
- Android 实现个性的ViewPager切换动画 实战PageTransformer(兼容Android3.0以下)
- sencha touch 学习笔记- 安装与环境
- 动态查找树比较——BST、AVL、RBT、B、B+
- C Intro - Two unsigned int subtract
- 使用 Eclipse 插件提高代码质量,让开发自动化。
- Struts2_2_第一Struts2应用
- 简洁的文字加图片应用:“深蓝”,源代码分享
- YII修改Exception返回值为任意格式
- Android训练课程(Android Training) - 高效的显示图片