 SIMT:单指令多线程(Single Instruction, Multiple Threads)的简写。顾名思义,相同的代码在不同线程中并行执行,每个线程使用不同的数据来执行同一段代码。

 Work-item(工作项):Work-item与CUDA Threads是一样的,是最小的执行单元。每次一个Kernel开始执行,很多(程序员定义数量)的Work-item就开始运行,每个都执行同样的代码。每个work-item有一个ID,这个ID在kernel中是可以访问的,每个运行在work-item上的kernel通过这个ID来找出work-item需要处理的数据。

 Work-group(工作组):work-group的存在是为了允许work-item之间的通信和协作。它反映出work-item的组织形式(work-group是以N维网格形式组织的,N=1,2或3)。Work-group等价于CUDA thread blocks。像work-items一样,work-groups也有一个kernel可以读取的唯一的ID。 




/*
 * Host-side reference implementation of vector addition.
 * Writes res[k] = src_a[k] + src_b[k] for every k in [0, num).
 * When num is zero or negative nothing is written.
 */
void vector_add_cpu (const float* src_a, const float* src_b, float* res, const int num)
{
    int k = 0;

    while (k < num) {
        res[k] = src_a[k] + src_b[k];
        k++;
    }
}


/*
 * OpenCL kernel: element-wise sum of two float vectors; each work-item
 * handles the single element selected by its global ID.
 *   src_a, src_b  input vectors (__global, read-only)
 *   res           output vector (__global)
 *   num           upper bound (exclusive) for the element index
 */
__kernel void vector_add_gpu (__global const float* src_a, __global const float* src_b, __global float* res, const int num)
{
    /* get_global_id(0) returns the ID of the thread that is executing.
       Many threads start running the same kernel at the same time; each one
       receives a different ID and therefore performs a different computation. */
    const int gid = get_global_id(0);

    /* A work-item whose ID is not below num has nothing to compute. */
    if (gid >= num)
        return;

    /* Otherwise the work-item performs the corresponding computation. */
    res[gid] = src_a[gid] + src_b[gid];
}


1. __kernel关键字定义了一个函数是kernel函数。Kernel函数必须返回void。

2. __global关键字位于参数前面。它定义了参数内存的存放位置。




// Returns the error codecl_int oclGetPlatformID (cl_platform_id *platforms) // Pointer to the platform



// Returns the error codecl_int clGetDeviceIDs (cl_platform_id platform,cl_device_type device_type, // Bitfield identifying the type. For the GPU we use CL_DEVICE_TYPE_GPUcl_uint num_entries, // Number of devices, typically 1cl_device_id
*devices, // Pointer to the device objectcl_uint *num_devices) // Puts here the number of devices matching the device_type

Context(上下文):定义了整个OpenCL环境,包括OpenCL kernel、设备、内存管理、命令队列等。上下文使用cl_context来表示。使用以下代码初始化:

// Returs the contextcl_context clCreateContext (const cl_context_properties *properties, // Bitwise with the properties (see specification)cl_uint num_devices, // Number of devicesconst cl_device_id *devices, // Pointer to the
devices objectvoid (*pfn_notify)(const char *errinfo, const void *private_info, size_t cb, void *user_data), // (don't worry about this)void *user_data, // (don't worry about this)cl_int *errcode_ret) // error code result


”cl_command_queue clCreateCommandQueue (cl_context context,cl_device_id device,cl_command_queue_properties properties, // Bitwise with the propertiescl_int *errcode_ret) // error code result


cl_int error = 0; // Used to handle error codescl_platform_id platform;cl_context context;cl_command_queue queue;cl_device_id device;// 

Platformerror = oclGetPlatformID(&platform);

if (error != CL_SUCCESS) { cout << "Error getting platform id: " << errorMessage(error) << endl; exit(error);}// Deviceerror = clGetDeviceIDs(platform, CL_DEVICE_TYPE_GPU, 1, &device, NULL);if (err != CL_SUCCESS) { cout << "Error
getting device ids: " << errorMessage(error) << endl; exit(error);}//

 Contextcontext = clCreateContext(0, 1, &device, NULL, NULL, &error);if (error != CL_SUCCESS) { cout << "Error creating context: " << errorMessage(error) << endl; exit(error);}// 

Command-queuequeue = clCreateCommandQueue(context, device, 0, &error);

if (error != CL_SUCCESS) { cout << "Error creating command queue: " << errorMessage(error) << endl; exit(error);}


const int size = 1234567float* src_a_h = new float[size];

float* src_b_h = new float[size];

float* res_h = new float[size];// Initialize both vectorsfor (int i = 0; i < size; i++) { src_a_h = src_b_h = (float) i;}


// Returns the cl_mem object referencing the memory allocated on the devicecl_mem clCreateBuffer (cl_context context, 

// The context where the memory will be allocatedcl_mem_flags flags,size_t size,

 // The size in bytesvoid *host_ptr,cl_int *errcode_ret)flags是逐位的,选项如下:







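CL_MEM_READ_WRITE:kernel可读写该缓冲区
CL_MEM_WRITE_ONLY:kernel只能写入该缓冲区
CL_MEM_READ_ONLY:kernel只能读取该缓冲区
CL_MEM_USE_HOST_PTR:直接使用host_ptr指向的内存
CL_MEM_ALLOC_HOST_PTR:分配主机可访问的内存
CL_MEM_COPY_HOST_PTR:从host_ptr处拷贝数据

我们通过下面的代码使用这个函数: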

const int mem_size = sizeof(float)*size;

// Allocates a buffer of size mem_size and copies mem_size bytes from src_a_hcl_mem src_a_d = clCreateBuffer(context, CL_MEM_READ_ONLY | CL_MEM_COPY_HOST_PTR, mem_size, src_a_h, &error);cl_mem src_b_d = clCreateBuffer(context,
CL_MEM_READ_ONLY | CL_MEM_COPY_HOST_PTR, mem_size, src_b_h, &error);cl_mem res_d = clCreateBuffer(context, CL_MEM_WRITE_ONLY, mem_size, NULL, &error);

程序和kernel

到现在为止,你可能会问自己一些问题,比如:我们怎么调用kernel?编译器怎么知道如何将代码放到设备上?我们怎么编译kernel?下面是我们在对比OpenCL程序和OpenCL kernel时的一些容易混淆的概念:


Kernel:可以从主机端调用、并在设备上运行的函数。

Program:OpenCL Program由kernel函数、其他函数和声明组成。它通过cl_program表示。当创建一个program时,你必须指定它是由哪些文件组成的,然后编译它。

你需要用到下面的函数来建立一个Program:// Returns the OpenCL programcl_program clCreateProgramWithSource (cl_context context, cl_uint count, // number of files const char **strings, // array of strings, each one is a file const size_t *lengths,
// array specifying the file lengths cl_int *errcode_ret) // error code to be returned当我们创建了Program我们可以用下面的函数执行编译操作:cl_int clBuildProgram (cl_program program, cl_uint num_devices, const cl_device_id *device_list, const char *options, // Compiler options, see
the specifications for more details void (*pfn_notify)(cl_program, void *user_data), void *user_data)查看编译log,必须使用下面的函数:cl_int clGetProgramBuildInfo (cl_program program, cl_device_id device, cl_program_build_info param_name, // The parameter we want to know
size_t param_value_size, void *param_value, // The answer size_t *param_value_size_ret)最后,我们需要“提取”program的入口点。使用cl_kernel:cl_kernel clCreateKernel (cl_program program, // The program where the kernel isconst char *kernel_name, // The name of the kernel, i.e.
the name of the kernel function as it's declared in the codecl_int *errcode_ret)注意我们可以创建多个OpenCL program,每个pro
gram可以拥有多个kernel。以下是这一章节的代码:// Creates the program// Uses NVIDIA helper functions to get the code string and it's size (in bytes)size_t src_size =
0;const char* path = shrFindFilePath("vector_add_gpu.cl", NULL);const char* source = oclLoadProgSource(path, "", &src_size);cl_program program = clCreateProgramWithSource(context, 1, &source, &src_size, &error);assert(error == CL_SUCCESS);// Builds the programerror
= clBuildProgram(program, 1, &device, NULL, NULL, NULL);assert(error == CL_SUCCESS);// Shows the logchar* build_log;size_t log_size;// First call to know the proper sizeclGetProgramBuildInfo(program, device, CL_PROGRAM_BUILD_LOG, 0, NULL, &log_size);build_log
= new char[log_size+1];// Second call to get the logclGetProgramBuildInfo(program, device, CL_PROGRAM_BUILD_LOG, log_size, build_log, NULL);build_log[log_size] = '\0';cout << build_log << endl;delete[] build_log;// Extracting the kernelcl_kernel vector_add_kernel
= clCreateKernel(program, "vector_add_gpu", &error);assert(error == CL_SUCCESS);运行kernel一旦我们的kernel建立好,我们就可以运行它。首先,我们必须设置kernel的参数:cl_int clSetKernelArg (cl_kernel kernel, // Which kernel cl_uint arg_index, // Which argument size_t arg_size, // Size of the
next argument (not of the value pointed by it!) const void *arg_value) // Value每个参数都需要调用一次这个函数。当所有参数设置完毕,我们就可以调用这个kernel:cl_int clEnqueueNDRangeKernel (cl_command_queue command_queue, cl_kernel kernel, cl_uint work_dim, // Choose if we are using 1D, 2D or
3D work-items and work-groups const size_t *global_work_offset, const size_t *global_work_size, // The total number of work-items (must have work_dim dimensions) const size_t *local_work_size, // The number of work-items per work-group (must have work_dim
dimensions) cl_uint num_events_in_wait_list, const cl_event *event_wait_list, cl_event *event)下面是这一章节的代码:// Enqueuing parameters// Note that we inform the size of the cl_mem object, not the size of the memory pointed by iterror = clSetKernelArg(vector_add_k,
0, sizeof(cl_mem), &src_a_d);error |= clSetKernelArg(vector_add_k, 1, sizeof(cl_mem), &src_b_d);error |= clSetKernelArg(vector_add_k, 2, sizeof(cl_mem), &res_d);error |= clSetKernelArg(vector_add_k, 3, sizeof(size_t), &size);assert(error == CL_SUCCESS);//
Launching kernelconst size_t local_ws = 512; // Number of work-items per work-group// shrRoundUp returns the smallest multiple of local_ws bigger than sizeconst size_t global_ws = shrRoundUp(local_ws, size); // Total number of work-itemserror = clEnqueueNDRangeKernel(queue,
vector_add_k, 1, NULL, &global_ws, &local_ws, 0, NULL, NULL);assert(error == CL_SUCCESS);读取结果读取结果非常简单。与之前讲到的写入内存(设备内存)的操作相似,现在我们需要存入队列一个读取缓冲区的操作:cl_int clEnqueueReadBuffer (cl_command_queue command_queue, cl_mem buffer, // from which buffer cl_bool blocking_read,
// whether is a blocking or non-blocking read size_t offset, // offset from the beginning size_t cb, // size to be read (in bytes) void *ptr, // pointer to the host memory cl_uint num_events_in_wait_list, const cl_event *event_wait_list, cl_event *event)使用方法如下://
Reading backfloat* check = new float[size];clEnqueueReadBuffer(queue, res_d, CL_TRUE, 0, mem_size, check, 0, NULL, NULL);清理作为一名牛X的程序员我们肯定要考虑如何清理内存!你需要知道最基本东西:使用clCreate申请的(缓冲区、kernel、队列)必须使用clRelease释放。代码如下:// Cleaning updelete[] src_a_h;delete[] src_b_h;delete[]
res_h;delete[] check;clReleaseKernel(vector_add_k);clReleaseCommandQueue(queue);clReleaseContext(context);clReleaseMemObject(src_a_d);clReleaseMemObject(src_b_d);clReleaseMemObject(res_d);
