您的位置：首页 > 其它

cuda stream处理

2016-06-24 17:46 363 查看

需要通过某种方式一次性地执行完读取、修改写入这三个操作，并且执行过程中不被其他线程中断，这种操作称为原子操作。

#include "cuda_runtime.h"
#include "device_launch_parameters.h"

#include <stdio.h>
#include <math.h>
#include <stdlib.h>

#define SIZE    (100*1024*1024)

__global__ void histo_kernel( unsigned char *buffer,long size,unsigned int *histo ) {
__shared__  unsigned int temp[256];  //用来保存字母出现的个数,每个block中均有一个
temp[threadIdx.x] = 0;
__syncthreads();

// 第i个线程处理字母buffer[i]，对应的temp加1
int i=threadIdx.x+blockIdx.x*blockDim.x;
int stride = blockDim.x * gridDim.x;
while (i < size) {
atomicAdd( &temp[buffer[i]], 1 );
i += stride;
}
// 等数据都写入到temp后，将每个block中的shared变量temp加到global变量histo中
// 因为每个block启动的线程数是256，所以可以刚好与disto、temp对应
__syncthreads();
atomicAdd( &(histo[threadIdx.x]), temp[threadIdx.x] );
}

void* big_random_block( int size ) {
unsigned char *data = (unsigned char*)malloc( size );
for (int i=0; i<size; i++)
data[i] = rand();
return data;
}

int main( void ) {
unsigned char *buffer =(unsigned char*)big_random_block( SIZE );//buffer数组分配值

//开始计时
cudaEvent_t     start, stop;
cudaEventCreate( &start ) ;
cudaEventCreate( &stop ) ;
cudaEventRecord( start, 0 ) ;

unsigned char *dev_buffer;//设备上的buffer
unsigned int *dev_histo;//设备上的变量，存储每个字母出现的个数
cudaMalloc( (void**)&dev_buffer, SIZE ) ;
cudaMemcpy( dev_buffer, buffer, SIZE,cudaMemcpyHostToDevice ) ;
cudaMalloc( (void**)&dev_histo,256 * sizeof( int ) );
cudaMemset( dev_histo, 0,256 * sizeof( int ) ) ;//初始化为0

// 调用kernel采用“2x mps个数” 会得到最好性能
cudaDeviceProp  prop;
cudaGetDeviceProperties( &prop, 0 ) ;
int blocks = prop.multiProcessorCount;
histo_kernel<<<blocks*2,256>>>( dev_buffer,SIZE, dev_histo );

unsigned int    histo[256];
cudaMemcpy( histo, dev_histo,256 * sizeof( int ),cudaMemcpyDeviceToHost ) ;

// 计时结束
cudaEventRecord( stop, 0 ) ;
cudaEventSynchronize( stop ) ;
float   elapsedTime;
cudaEventElapsedTime( &elapsedTime,start, stop ) ;
printf( "Time to generate:  %3.1f ms\n", elapsedTime );

long histoCount = 0;
for (int i=0; i<256; i++) {
histoCount += histo[i];
}
printf( "Histogram Sum:  %ld\n", histoCount );

// 检测是否与CPU版本一样
for (int i=0; i<SIZE; i++)
histo[buffer[i]]--;
for (int i=0; i<256; i++) {
if (histo[i] != 0)
printf( "Failure at %d!\n", i );
}

cudaEventDestroy( start ) ;
cudaEventDestroy( stop ) ;
cudaFree( dev_histo );
cudaFree( dev_buffer );
free( buffer );
return 0;
}

CUDA流在加速应用程序方面起着重要的作用。cuda流表示一个GPU队列，并且队列中的操作将以制定顺序执行。我们能够在流中添加一些操作，例如核函数启动、内存复制，以及事件的启动和结束等。这些操作添加的顺序就是流的执行顺序。可以将每个流视为GPU上的一个任务，并且这些任务可以并行执行。第0个流执行核函数的同时，第1个流执行复制。。。

虽然逻辑上每个流之间是相互独立的，然而，硬件中并没有流的概念，例如，内存复制操作在硬件是是必须排队的。CUDA驱动程序负责对用户和硬件进行协调。操作被添加的顺序包含的依赖性，进入硬件后进行内存复制和核函数执行的排队时，这些依赖性会丢失，CUDA驱动程序需要确保不破坏流内部的依赖性。

例如：硬件上内存复制引擎的队列以及核函数执行引擎的队列如图

如果 stream0:memcpy C 必须等待stream0:kernel执行完，这时候stream1：memcpy A以及后续的copy工作被阻塞了。将操作放入流中的顺序影响CUDA驱动程序调度这些操作及执行方式。

这个调度应该进行如下修改：

第0个流复制A，B后，第0个流的kernel就开始执行，这时候第一个流可以复制A，B。这样使得GPU并行的执行复制操作和核函数。

#include "cuda_runtime.h"
#include "device_launch_parameters.h"

#include <stdio.h>
#include<math.h>
#define N   (1024*1024)
#define FULL_DATA_SIZE   (N*20)

__global__ void kernel( int *a, int *b, int *c ) {
int idx = threadIdx.x + blockIdx.x * blockDim.x;
if (idx < N) {
int idx1 = (idx + 1) % 256;
int idx2 = (idx + 2) % 256;
float   as = (a[idx] + a[idx1] + a[idx2]) / 3.0f;  //从index开始的三个值的平均值
float   bs = (b[idx] + b[idx1] + b[idx2]) / 3.0f;
c[idx] = (as + bs) / 2;  //写入到缓冲区c
}
}

int main( void ) {
cudaDeviceProp  prop;
int whichDevice;
cudaGetDevice( &whichDevice ) ;
cudaGetDeviceProperties( &prop, whichDevice ) ;
if (!prop.deviceOverlap) {
printf( "Device will not handle overlaps, so no speed up from streams\n" );
return 0;
}

cudaEvent_t     start, stop;
float     elapsedTime;

cudaStream_t    stream0, stream1;   //主函数中定义两个流
int *host_a, *host_b, *host_c;    //主机上的a,b,c
int *dev_a0, *dev_b0, *dev_c0;
int *dev_a1, *dev_b1, *dev_c1;

cudaEventCreate( &start ) ;
cudaEventCreate( &stop ) ;

//初始化流
cudaStreamCreate( &stream0 ) ;
cudaStreamCreate( &stream1 ) ;

// allocate the memory on the GPU
cudaMalloc( (void**)&dev_a0,N * sizeof(int) ) ;
cudaMalloc( (void**)&dev_b0,N * sizeof(int) ) ;
cudaMalloc( (void**)&dev_c0,N * sizeof(int) ) ;
cudaMalloc( (void**)&dev_a1,N * sizeof(int) ) ;
cudaMalloc( (void**)&dev_b1,N * sizeof(int) ) ;
cudaMalloc( (void**)&dev_c1,N * sizeof(int) ) ;

// allocate host locked memory, used to stream
//cudaHostAlloc是CUDA运行时在主机上分配内存，这个内存是不可分页内存（malloc分配可以分页内存）
//操作系统不会对这块内存分页并交换到磁盘上，从而确保了该内存始终驻留在物理内存中，因此它能被
//安全的访问，因为它不会被破坏会重定位。从而可以采用DMA技术在GPU和主机之间复制数据，这个过程
//无需CPU介入，这种操作会比分页内存性能高约2倍。但使用固定内存会丧失虚拟内存的所以功能。
//这使得固定内存跟容易耗尽内存
cudaHostAlloc( (void**)&host_a,FULL_DATA_SIZE * sizeof(int),cudaHostAllocDefault ) ;
cudaHostAlloc( (void**)&host_b,FULL_DATA_SIZE * sizeof(int),cudaHostAllocDefault ) ;
cudaHostAlloc( (void**)&host_c,FULL_DATA_SIZE * sizeof(int),cudaHostAllocDefault ) ;

for (int i=0; i<FULL_DATA_SIZE; i++) {
host_a[i] = rand();
host_b[i] = rand();
}

cudaEventRecord( start, 0 ) ;

for (int i=0; i<FULL_DATA_SIZE; i+= N*2) {
// enqueue copies of a in stream0 and stream1
cudaMemcpyAsync( dev_a0, host_a+i,N * sizeof(int),cudaMemcpyHostToDevice,stream0 ) ;
cudaMemcpyAsync( dev_a1, host_a+i+N,N * sizeof(int),cudaMemcpyHostToDevice,stream1 ) ;
// enqueue copies of b in stream0 and stream1
cudaMemcpyAsync( dev_b0, host_b+i,N * sizeof(int),cudaMemcpyHostToDevice,stream0 ) ;
cudaMemcpyAsync( dev_b1, host_b+i+N,N * sizeof(int),cudaMemcpyHostToDevice,stream1 ) ;

// enqueue kernels in stream0 and stream1
kernel<<<N/256,256,0,stream0>>>( dev_a0, dev_b0, dev_c0 );
kernel<<<N/256,256,0,stream1>>>( dev_a1, dev_b1, dev_c1 );

// enqueue copies of c from device to locked memory
cudaMemcpyAsync( host_c+i, dev_c0,N * sizeof(int),cudaMemcpyDeviceToHost,stream0 ) ;
cudaMemcpyAsync( host_c+i+N, dev_c1,N * sizeof(int),cudaMemcpyDeviceToHost,stream1 ) ;
}
cudaStreamSynchronize( stream0 ) ;
cudaStreamSynchronize( stream1 ) ;

cudaEventRecord( stop, 0 ) ;

cudaEventSynchronize( stop ) ;
cudaEventElapsedTime( &elapsedTime,start, stop ) ;
printf( "Time taken:  %3.1f ms\n", elapsedTime );

// 要释放掉cudaHostAlloc分配的内存
cudaFreeHost( host_a ) ;
cudaFreeHost( host_b ) ;
cudaFreeHost( host_c ) ;
cudaFree( dev_a0 ) ;
cudaFree( dev_b0 ) ;
cudaFree( dev_c0 ) ;
cudaFree( dev_a1 ) ;
cudaFree( dev_b1 ) ;
cudaFree( dev_c1 ) ;
cudaStreamDestroy( stream0 ) ;
cudaStreamDestroy( stream1 ) ;

return 0;
}

转载于http://812991616.iteye.com/blog/1880683

内容来自用户分享和网络整理，不保证内容的准确性，如有侵权内容，可联系管理员处理

标签： gpu CUDA

相关文章推荐

新的分享

章节导航