炼数成金CUDA视频教程——第三课1——学习笔记
2017-11-07 21:24
543 查看
/***
 * gputimer.h -- source from the Dataguru CUDA video tutorial
 *
 * Small stopwatch built on a pair of CUDA events.
 ***/

#ifndef __GPU_TIMER_H__
#define __GPU_TIMER_H__

// Needed for cudaEvent_t and the cudaEvent* calls so the header can be
// included on its own.
#include <cuda_runtime.h>

/*
 * GpuTimer
 *
 * Owns two CUDA events, `start` and `stop`, created in the constructor and
 * destroyed in the destructor. Typical use:
 *
 *     GpuTimer timer;
 *     timer.Start();
 *     kernel<<<grid, block>>>(...);
 *     timer.Stop();
 *     float ms = timer.Elapsed();
 *
 * Both events are recorded on stream 0.
 */
struct GpuTimer
{
    cudaEvent_t start;
    cudaEvent_t stop;

    GpuTimer()
    {
        cudaEventCreate(&start);
        cudaEventCreate(&stop);
    }

    ~GpuTimer()
    {
        cudaEventDestroy(start);
        cudaEventDestroy(stop);
    }

    // Record the start event on stream 0.
    void Start()
    {
        cudaEventRecord(start, 0);
    }

    // Record the stop event on stream 0.
    void Stop()
    {
        cudaEventRecord(stop, 0);
    }

    // Wait for the stop event to complete, then return the time between the
    // start and stop events as reported by cudaEventElapsedTime
    // (milliseconds per the CUDA runtime documentation).
    // Returns 0.0f if the elapsed time could not be queried.
    float Elapsed()
    {
        // Initialised so a failing cudaEventElapsedTime does not hand back
        // an indeterminate value.
        float elapsed = 0.0f;
        cudaEventSynchronize(stop);
        cudaEventElapsedTime(&elapsed, start, stop);
        return elapsed;
    }

private:
    // Non-copyable: a copy would hold the same event handles and destroy
    // them a second time. Declared but intentionally never defined.
    GpuTimer(const GpuTimer &);
    GpuTimer & operator=(const GpuTimer &);
};

#endif /* __GPU_TIMER_H__ */
////////////////////////////////////////////////////////////////////////////////////
////////////////////////////////////////////////////////////////////////////////////
/****
 * reduce.cu -- source from the Dataguru CUDA video tutorial
 *
 * Sums an array of floats on the GPU with a two-pass, per-block tree
 * reduction and reports the average time of 100 runs. The reduction can work
 * either directly in global memory or in shared memory.
 *
 * Usage: reduce [0|1]
 *     0 (default)  global-memory kernel
 *     1            shared-memory kernel
 ****/

#include <stdio.h>
#include <stdlib.h>
#include <cuda_runtime.h>

/* Abort with a readable message when a CUDA runtime call fails. */
#define CUDA_CHECK(call)                                                     \
    do {                                                                     \
        cudaError_t cuda_check_err_ = (call);                                \
        if (cuda_check_err_ != cudaSuccess) {                                \
            fprintf(stderr, "CUDA error %s:%d: %s\n", __FILE__, __LINE__,    \
                    cudaGetErrorString(cuda_check_err_));                    \
            exit(EXIT_FAILURE);                                              \
        }                                                                    \
    } while (0)

/*
 * Per-block sum performed in place in global memory.
 *
 * Each block reduces its own blockDim.x consecutive elements of d_in and
 * thread 0 stores the block total in d_out[blockIdx.x].
 *
 * Preconditions (there is no bounds check in the kernel):
 *   - d_in holds at least gridDim.x * blockDim.x elements;
 *   - d_out holds at least gridDim.x elements;
 *   - blockDim.x is a power of two; with any other size the halving loop
 *     skips some elements.
 *
 * Side effect: d_in is overwritten with partial sums.
 */
__global__ void global_reduce_kernel(float * d_out, float * d_in)
{
    int myId = threadIdx.x + blockDim.x * blockIdx.x;
    int tid  = threadIdx.x;

    // do reduction in global mem
    for (unsigned int s = blockDim.x / 2; s > 0; s >>= 1)
    {
        if (tid < s)
        {
            d_in[myId] += d_in[myId + s];
        }
        __syncthreads();        // make sure all adds at one stage are done!
    }

    // only thread 0 writes result for this block back to global mem
    if (tid == 0)
    {
        d_out[blockIdx.x] = d_in[myId];
    }
}

/*
 * Per-block sum performed in dynamically sized shared memory.
 *
 * Each block copies its blockDim.x consecutive elements of d_in into shared
 * memory, reduces them there, and thread 0 stores the block total in
 * d_out[blockIdx.x]. d_in is left unmodified.
 *
 * Launch requirement: the third <<<...>>> argument must provide
 * blockDim.x * sizeof(float) bytes of shared memory.
 *
 * Preconditions (there is no bounds check in the kernel):
 *   - d_in holds at least gridDim.x * blockDim.x elements;
 *   - d_out holds at least gridDim.x elements;
 *   - blockDim.x is a power of two; with any other size the halving loop
 *     skips some elements.
 */
__global__ void shmem_reduce_kernel(float * d_out, const float * d_in)
{
    // sdata is allocated in the kernel call: 3rd arg to <<<b, t, shmem>>>
    extern __shared__ float sdata[];

    int myId = threadIdx.x + blockDim.x * blockIdx.x;
    int tid  = threadIdx.x;

    // load shared mem from global mem
    sdata[tid] = d_in[myId];
    __syncthreads();            // make sure entire block is loaded!

    // do reduction in shared mem
    for (unsigned int s = blockDim.x / 2; s > 0; s >>= 1)
    {
        if (tid < s)
        {
            sdata[tid] += sdata[tid + s];
        }
        __syncthreads();        // make sure all adds at one stage are done!
    }

    // only thread 0 writes result for this block back to global mem
    if (tid == 0)
    {
        d_out[blockIdx.x] = sdata[0];
    }
}

/*
 * Two-pass sum of `size` floats.
 *
 * Pass 1 reduces each group of 1024 elements of d_in to one value in
 * d_intermediate; pass 2 reduces those values with a single block and stores
 * the result in d_out[0].
 *
 *   d_out              device buffer, at least 1 float
 *   d_intermediate     device buffer, at least size / 1024 floats
 *   d_in               device buffer, at least `size` floats; overwritten
 *                      when usesSharedMemory is false
 *   size               number of input elements
 *   usesSharedMemory   true: shmem_reduce_kernel, false: global_reduce_kernel
 *
 * Assumes that size is not greater than maxThreadsPerBlock^2 and that size is
 * a multiple of maxThreadsPerBlock. Because pass 2 runs one thread per pass-1
 * block, size / maxThreadsPerBlock must also be a power of two for the result
 * to include every partial sum.
 *
 * Exits the process if a kernel launch is rejected.
 */
void reduce(float * d_out, float * d_intermediate, float * d_in,
            int size, bool usesSharedMemory)
{
    const int maxThreadsPerBlock = 1024;
    int threads = maxThreadsPerBlock;
    int blocks  = size / maxThreadsPerBlock;

    if (usesSharedMemory)
    {
        shmem_reduce_kernel<<<blocks, threads, threads * sizeof(float)>>>
            (d_intermediate, d_in);
    }
    else
    {
        global_reduce_kernel<<<blocks, threads>>>
            (d_intermediate, d_in);
    }
    // launches report configuration errors only through cudaGetLastError
    CUDA_CHECK(cudaGetLastError());

    // now we're down to one block left, so reduce it
    threads = blocks;           // launch one thread for each block in prev step
    blocks  = 1;
    if (usesSharedMemory)
    {
        shmem_reduce_kernel<<<blocks, threads, threads * sizeof(float)>>>
            (d_out, d_intermediate);
    }
    else
    {
        global_reduce_kernel<<<blocks, threads>>>
            (d_out, d_intermediate);
    }
    CUDA_CHECK(cudaGetLastError());
}

/*
 * Entry point: picks device 0, fills a 2^20-element array with random floats,
 * runs the selected reduction 100 times and prints the average time per run.
 */
int main(int argc, char **argv)
{
    // deviceCount starts at 0 so a failing query is reported as "no devices"
    int deviceCount = 0;
    if (cudaGetDeviceCount(&deviceCount) != cudaSuccess || deviceCount == 0)
    {
        fprintf(stderr, "error: no devices supporting CUDA.\n");
        exit(EXIT_FAILURE);
    }

    int dev = 0;
    CUDA_CHECK(cudaSetDevice(dev));

    cudaDeviceProp devProps;
    if (cudaGetDeviceProperties(&devProps, dev) == 0)
    {
        printf("Using device %d:\n", dev);
        // totalGlobalMem is a size_t; print it at full width instead of
        // truncating it to int, which is wrong above 2 GiB
        printf("%s; global mem: %lluB; compute v%d.%d; clock: %d kHz\n",
               devProps.name, (unsigned long long)devProps.totalGlobalMem,
               (int)devProps.major, (int)devProps.minor,
               (int)devProps.clockRate);
    }

    const int ARRAY_SIZE = 1 << 20;
    const size_t ARRAY_BYTES = (size_t)ARRAY_SIZE * sizeof(float);

    // generate the input array on the host; 4 MiB is too large to place
    // safely on the stack, so it lives on the heap
    float * h_in = (float *)malloc(ARRAY_BYTES);
    if (h_in == NULL)
    {
        fprintf(stderr, "error: could not allocate host input array.\n");
        exit(EXIT_FAILURE);
    }

    // host-side sum of the input, kept as a reference value; this program
    // does not print it
    float sum = 0.0f;
    for (int i = 0; i < ARRAY_SIZE; i++)
    {
        // generate random float in [-1.0f, 1.0f]; rand() is the generator
        // whose range RAND_MAX describes
        h_in[i] = -1.0f + (float)rand() / ((float)RAND_MAX / 2.0f);
        sum += h_in[i];
    }

    // declare GPU memory pointers
    float * d_in, * d_intermediate, * d_out;

    // allocate GPU memory
    CUDA_CHECK(cudaMalloc((void **) &d_in, ARRAY_BYTES));
    CUDA_CHECK(cudaMalloc((void **) &d_intermediate, ARRAY_BYTES)); // overallocated
    CUDA_CHECK(cudaMalloc((void **) &d_out, sizeof(float)));

    // transfer the input array to the GPU
    CUDA_CHECK(cudaMemcpy(d_in, h_in, ARRAY_BYTES, cudaMemcpyHostToDevice));

    int whichKernel = 0;
    if (argc == 2)
    {
        whichKernel = atoi(argv[1]);
    }

    cudaEvent_t start, stop;
    CUDA_CHECK(cudaEventCreate(&start));
    CUDA_CHECK(cudaEventCreate(&stop));

    // launch the kernel
    // NOTE: the global-memory variant overwrites d_in, so every trial after
    // the first reduces already-modified data; only the shared-memory variant
    // leaves the true sum in d_out after all 100 trials.
    switch (whichKernel)
    {
    case 0:
        printf("Running global reduce\n");
        CUDA_CHECK(cudaEventRecord(start, 0));
        for (int i = 0; i < 100; i++)
        {
            reduce(d_out, d_intermediate, d_in, ARRAY_SIZE, false);
        }
        CUDA_CHECK(cudaEventRecord(stop, 0));
        break;
    case 1:
        printf("Running reduce with shared mem\n");
        CUDA_CHECK(cudaEventRecord(start, 0));
        for (int i = 0; i < 100; i++)
        {
            reduce(d_out, d_intermediate, d_in, ARRAY_SIZE, true);
        }
        CUDA_CHECK(cudaEventRecord(stop, 0));
        break;
    default:
        fprintf(stderr, "error: ran no kernel\n");
        exit(EXIT_FAILURE);
    }

    // wait for the last trial; errors raised while the kernels ran surface here
    CUDA_CHECK(cudaEventSynchronize(stop));
    float elapsedTime;
    CUDA_CHECK(cudaEventElapsedTime(&elapsedTime, start, stop));
    elapsedTime /= 100.0f;      // 100 trials

    // copy back the sum from GPU
    float h_out;
    CUDA_CHECK(cudaMemcpy(&h_out, d_out, sizeof(float), cudaMemcpyDeviceToHost));

    printf("average time elapsed: %f\n", elapsedTime);

    // release the timing events
    CUDA_CHECK(cudaEventDestroy(start));
    CUDA_CHECK(cudaEventDestroy(stop));

    // free GPU memory allocation
    CUDA_CHECK(cudaFree(d_in));
    CUDA_CHECK(cudaFree(d_intermediate));
    CUDA_CHECK(cudaFree(d_out));

    free(h_in);

    return 0;
}
相关文章推荐
- 炼数成金CUDA视频教程——第三课2——学习笔记
- 孙鑫视频教程《Java从入门到精通》学习笔记
- VC视频教程笔记(第三课)
- 视频专辑:炼数成金 hadoop 视频教程
- 看cuda初级教程视频笔记(周斌讲的)--CUDA、GPU编程模型
- 看cuda初级教程视频笔记(周斌讲的)--CUDA编程1
- 看cuda初级教程视频笔记-GPU体系架构概述
- 微软官方windows phone开发视频教程第三/四天视频(附下载地址)
- 视频专辑:炼数成金 hadoop 视频教程
- SSH-Struts第三弹:传智播客视频教程第一天上午的笔记
- 【备忘】 2017年5月炼数成金《MySQL DBA从小白到大神实战》视频教程
- 从零开始学习C语言开发视频教程在线完整版
- 视频教程/软件892
- 陈广C#程序设计入门视频教程全集下载
- 【office培训】【王佩丰】Excel2010视频教程第3讲:查找、替换及定位
- SEO视频教程集合在线观看
- 【备忘】中国移动大型分布式redis,solr,Linux,nginx,springmvc,mybatis电商项目视频教程
- ExtJs 4.0 视频教程代码
- 【C++程序设计语言A视频教程 全12讲 中科院】【下载链接】
- reactjs 视频教程