【转载】OpenCL实现矩阵相乘
2017-01-13 14:02
393 查看
矩阵相乘就是用前一个矩阵的每一行与后一个矩阵的每一列逐元素相乘,再把这些乘积相加,得到结果矩阵中对应位置(该行、该列)的数值。具体算法回顾一下线性代数即可。结果矩阵中各个元素的计算彼此独立:在 CPU 上用朴素的串行算法只能一行一列地依次相乘,而放到 GPU 里则可以让各个元素并行计算,如果维数很大就会大大节约时间。
具体代码如下:先是 OpenCL 内核(保存为 HelloWorld_Kernel.cl),然后是主机端的 C++ 程序。
/*
 * simpleMultiply - naive matrix product C = A * B.
 *
 * Each work-item computes one element of C. Global id 0 selects the column
 * and global id 1 selects the row, so the host is expected to launch a 2-D
 * range of at least (widthB, heightA) work-items.
 *
 * All matrices are indexed row-major:
 *   outPutC : heightA x widthB result
 *   inputA  : heightA x widthA left operand
 *   inputB  : rows of length widthB; the loop reads widthA rows of it
 *
 * heightB is not used by the kernel. NOTE(review): it is presumably expected
 * to equal widthA - the host should verify that before launching.
 */
__kernel void simpleMultiply(__global float* outPutC,
                             int widthA,
                             int heightA,
                             int widthB,
                             int heightB,
                             __global float* inputA,
                             __global float* inputB)
{
    const int row = (int)get_global_id(1);
    const int col = (int)get_global_id(0);

    /* Work-items outside the result matrix (e.g. when the host pads the
       global size) must not touch memory. */
    if (row >= heightA || col >= widthB) {
        return;
    }

    /* Dot product of row `row` of A with column `col` of B. */
    float sum = 0.0f;
    for (int i = 0; i < widthA; i++) {
        sum += inputA[row * widthA + i] * inputB[i * widthB + col];
    }

    outPutC[row * widthB + col] = sum;
}
// FirstOpenCL.cpp : 定义控制台应用程序的入口点。 // #include "stdafx.h" #include <iostream> #include<time.h> #include <string> #include<math.h> #include <vector> #include <CL/cl.h> #include <fstream> using namespace std; #pragma comment (lib,"OpenCL.lib") std::string convertToString(const char *filename) { size_t size; char* str; std::string s; std::fstream f(filename, (std::fstream::in | std::fstream::binary)); if(f.is_open()) { size_t fileSize; f.seekg(0, std::fstream::end); size = fileSize = (size_t)f.tellg(); f.seekg(0, std::fstream::beg); str = new char[size+1]; if(!str) { f.close(); std::cout << "Memory allocation failed"; return NULL; } f.read(str, fileSize); f.close(); str[size] = '\0'; s = str; delete[] str; return s; } else { std::cout << "\nFile containg the kernel code(\".cl\") not found. Please copy the required file in the folder containg the executable.\n"; exit(1); } return NULL; } int main() { //查询平台 cl_int ciErrNum; cl_platform_id platform; ciErrNum = clGetPlatformIDs(1, &platform, NULL); //获取设备信息 cl_device_id device; cl_int status; cl_uint maxDims; cl_event events[2]; size_t globalThreads[1]; size_t localThreads[1]; size_t maxWorkGroupSize; size_t maxWorkItemSizes[3]; ciErrNum = clGetDeviceIDs(platform, CL_DEVICE_TYPE_ALL, 1, &device, NULL); status = clGetDeviceInfo( device, CL_DEVICE_MAX_WORK_GROUP_SIZE, sizeof(size_t),(void*)&maxWorkGroupSize,NULL); status = clGetDeviceInfo( device, CL_DEVICE_MAX_WORK_ITEM_DIMENSIONS, sizeof(cl_uint),(void*)&maxDims, NULL); status = clGetDeviceInfo( device, CL_DEVICE_MAX_WORK_ITEM_SIZES, sizeof(size_t)*maxDims,(void*)maxWorkItemSizes, NULL); //创建上下文 cl_context ctx = clCreateContext(NULL, 1, &device, NULL, NULL, &ciErrNum); cl_command_queue myqueue = clCreateCommandQueue(ctx,device,0,&ciErrNum); int wA=6,hA=6; int wB=6,hB=6; int wC=6,hC=6; // 数组的大小 const int elementsA = wA*hA; const int elementsB = wB*hB; const int elementsC = hA*wB; // 计算内存大小 size_t datasizeA = sizeof(float)*elementsA; size_t datasizeB = 
sizeof(float)*elementsB; size_t datasizeC = sizeof(float)*elementsC; // 分配内存空间 float *A = (float*)malloc(datasizeA); float *B = (float*)malloc(datasizeB); float *C = (float*)malloc(datasizeC); // 初始化输入数组 for(int i = 0;i < elementsA;i++) { A[i] = (float)((float)i + 1.0); } for(int i = 0;i < elementsB;i++) { B[i] = (float)((float)i + 1.0); } cl_mem bufferA = clCreateBuffer(ctx,CL_MEM_READ_ONLY,wA*hA*sizeof(float),NULL,&ciErrNum); ciErrNum = clEnqueueWriteBuffer(myqueue,bufferA,CL_TRUE,0,wA*hA*sizeof(float),(void*)A,0,NULL,NULL); cl_mem bufferB = clCreateBuffer(ctx,CL_MEM_READ_ONLY,wB*hB*sizeof(float),NULL,&ciErrNum); ciErrNum = clEnqueueWriteBuffer(myqueue,bufferB,CL_TRUE,0,wB*hB*sizeof(float),(void*)B,0,NULL,NULL); cl_mem bufferC = clCreateBuffer(ctx,CL_MEM_WRITE_ONLY,hA*wB*sizeof(float),NULL,&ciErrNum); //运行时kernel编译 const char * filename = "HelloWorld_Kernel.cl"; std::string sourceStr = convertToString(filename); const char * source = sourceStr.c_str(); size_t sourceSize[] = { strlen(source) }; //直接将CL文件读到记忆体 cl_program myprog = clCreateProgramWithSource( ctx, 1,&source, sourceSize, &ciErrNum); //cl_program myprog = clCreateProgramWithSource(ctx,1,(const char**)&programSource,NULL,&ciErrNum); ciErrNum = clBuildProgram(myprog,0,NULL,NULL,NULL,NULL); cl_kernel mykernel = clCreateKernel(myprog,"simpleMultiply",&ciErrNum); //运行程序 clSetKernelArg(mykernel,0,sizeof(cl_mem),(void*)&bufferC); clSetKernelArg(mykernel,1,sizeof(cl_mem),(void*)&wA); clSetKernelArg(mykernel,2,sizeof(cl_mem),(void*)&hA); clSetKernelArg(mykernel,3,sizeof(cl_mem),(void*)&wB); clSetKernelArg(mykernel,4,sizeof(cl_mem),(void*)&hB); clSetKernelArg(mykernel,5,sizeof(cl_mem),(void*)&bufferA); clSetKernelArg(mykernel,6,sizeof(cl_mem),(void*)&bufferB); size_t localws[2] ={wC,wC}; size_t globalws[2]={wC,hC}; ciErrNum = clEnqueueNDRangeKernel(myqueue,mykernel,2,NULL,globalws,localws,0,NULL,&events[0]); status = clWaitForEvents(1, &events[0]); status = clReleaseEvent(events[0]); //将结果拷贝到主机端 ciErrNum = 
clEnqueueReadBuffer(myqueue,bufferC,CL_TRUE,0,wC*hC*sizeof(float),(void*)C,0,NULL,&events[1]); status = clWaitForEvents(1, &events[1]); status = clReleaseEvent(events[1]); printf("\nArray A:\n"); for (int i = 0; i < wA; i++) { for (int j = 0; j < hA; j++) printf("%4.3f\t", A[i*hA + j]); printf("\n"); } printf("\nArray B:\n"); for (int i = 0; i < wB; i++) { for (int j = 0; j < hB; j++) printf("%4.3f\t", B[i*hB + j]); printf("\n"); } printf("\nArray C:\n"); for (int i = 0; i < wC; i++) { for (int j = 0; j < hC; j++) printf("%4.3f\t", C[i*hC + j]); printf("\n"); } getchar(); return 0; }
相关文章推荐
- 矩阵相乘--高效实现方法
- 二维矩阵相乘的串行和并行实现
- 矩阵相乘的一维数组实现
- C语言实现矩阵相乘
- 利用Hadoop实现超大矩阵相乘之我见(一)
- 求逆矩阵以及两矩阵相乘的算法实现
- 算法系列:矩阵相乘算法的MapReduce实现
- hadoop矩阵相乘简单实现
- 矩阵相乘的三种实现
- c++实现两个矩阵相乘
- 矩阵相乘求最少相乘次数的C++实现代码
- hadoop矩阵相乘简单实现
- opencv 矩阵与一个常数相乘的 两种实现方法
- 矩阵相乘的两种实现方法
- openCL-矩阵相乘
- [转载]采用加法器数乘法器实现17位有符号数相乘(Verilog)
- 矩阵相乘--递归与非递归实现
- 基于MPI的矩阵相乘summa算法实现(附源程序)
- 文件读写方式实现矩阵相乘
- 矩阵相乘算法的Java语言实现