openCL-矩阵相乘
2012-11-20 22:10
183 查看
这是 AMD 异构计算大赛所发教材上的例子,我在其基础上加了一些内容并自己实现了一遍。不过我还没有学会如何分析运行时间,先把代码贴上来。
这是我第一次使用 AMD APP KernelAnalyzer,感觉还不错。
simpleMultiply.cl
main.cpp
这是我第一次使用 AMD APP KernelAnalyzer,感觉还不错。
simpleMultiply.cl
// Naive matrix multiplication: outPutC = inputA * inputB.
//
// All matrices are stored row-major as flat float arrays:
//   inputA  is heightA x widthA
//   inputB  is heightB x widthB (the loop uses widthA as the inner dimension,
//           so the caller must pass heightB == widthA)
//   outPutC is heightA x widthB
//
// Launch as a 2-D NDRange: dimension 0 indexes the output column and
// dimension 1 the output row. One work-item computes one element of outPutC.
__kernel void simpleMultiply(__global float* outPutC,
                             int widthA,
                             int heightA,
                             int widthB,
                             int heightB,
                             __global float* inputA,
                             __global float* inputB)
{
    int row = get_global_id(1);
    int col = get_global_id(0);

    // The host may round the global size up to a multiple of the work-group
    // size; work-items that fall outside the output matrix must do nothing.
    if (row >= heightA || col >= widthB)
    {
        return;
    }

    // Dot product of row `row` of A with column `col` of B.
    float sum = 0.0f;
    for (int i = 0; i < widthA; i++)
    {
        sum += inputA[row * widthA + i] * inputB[i * widthB + col];
    }

    outPutC[row * widthB + col] = sum;
}
main.cpp
/* 项目:openCL的矩阵相乘 作者:刘荣 时间:2012.11.20 */ #include <iostream> #include<time.h> #include <string> #include<math.h> #include <vector> #include <CL/cl.h> #include <fstream> using namespace std; //kernel函数 std::string convertToString(const char *filename)//将kernel源码,即自己写的并行化的函数,转化成字符串 { size_t size; char* str; std::string s; std::fstream f(filename, (std::fstream::in | std::fstream::binary)); if(f.is_open()) { size_t fileSize; f.seekg(0, std::fstream::end); size = fileSize = (size_t)f.tellg(); f.seekg(0, std::fstream::beg); str = new char[size+1]; if(!str) { f.close(); std::cout << "Memory allocation failed"; return NULL; } f.read(str, fileSize); f.close(); str[size] = '\0'; s = str; delete[] str; return s; } else { std::cout << "\nFile containg the kernel code(\".cl\") not found. Please copy the required file in the folder containg the executable.\n"; exit(1); } return NULL; } int main() { double start,end,time1,time2; //查询平台 cl_int ciErrNum; cl_platform_id platform; ciErrNum = clGetPlatformIDs(1, &platform, NULL); if(ciErrNum != CL_SUCCESS) { cout<<"获取设备失败"<<endl; return 0; } //获取设备信息 cl_device_id device; cl_int status; cl_uint maxDims; cl_event events[2]; size_t globalThreads[1]; size_t localThreads[1]; size_t maxWorkGroupSize; size_t maxWorkItemSizes[3]; //////////////////////////////////////////////////////////////////// // STEP 7 Analyzing proper workgroup size for the kernel // by querying device information // 7.1 Device Info CL_DEVICE_MAX_WORK_GROUP_SIZE // 7.2 Device Info CL_DEVICE_MAX_WORK_ITEM_DIMENSIONS // 7.3 Device Info CL_DEVICE_MAX_WORK_ITEM_SIZES //////////////////////////////////////////////////////////////////// /** * Query device capabilities. 
Maximum * work item dimensions and the maximmum * work item sizes */ ciErrNum = clGetDeviceIDs(platform, CL_DEVICE_TYPE_ALL, 1, &device, NULL); status = clGetDeviceInfo( device, CL_DEVICE_MAX_WORK_GROUP_SIZE, sizeof(size_t), (void*)&maxWorkGroupSize, NULL); if(status != CL_SUCCESS) { std::cout << "Error: Getting Device Info. (clGetDeviceInfo)\n"; return 0; } status = clGetDeviceInfo( device, CL_DEVICE_MAX_WORK_ITEM_DIMENSIONS, sizeof(cl_uint), (void*)&maxDims, NULL); if(status != CL_SUCCESS) { std::cout << "Error: Getting Device Info. (clGetDeviceInfo)\n"; return 0; } status = clGetDeviceInfo( device, CL_DEVICE_MAX_WORK_ITEM_SIZES, sizeof(size_t)*maxDims, (void*)maxWorkItemSizes, NULL); if(status != CL_SUCCESS) { std::cout << "Error: Getting Device Info. (clGetDeviceInfo)\n"; return 0; } cout<<"maxWorkItemSizes"<<maxWorkItemSizes<<endl; cout<<"maxDims"<<maxDims<<endl; cout<<"maxWorkGroupSize"<<(int)maxWorkGroupSize<<endl; //创建上下文 cl_context_properties cps[3] = {CL_CONTEXT_PLATFORM, (cl_context_properties)platform, 0}; cl_context ctx = clCreateContext(cps, 1, &device, NULL, NULL, &ciErrNum); if(ciErrNum != CL_SUCCESS) { cout<<"创建上下文失败"<<endl; return 0; } cl_command_queue myqueue = clCreateCommandQueue(ctx,device,0,&ciErrNum); if(ciErrNum != CL_SUCCESS) { cout<<"命令队列失败"<<endl; return 0; } //声明buffer,传输数据 float *A = NULL; // 输入数组 float *B = NULL; // 输入数组 float *C = NULL; // 输出数组 int wA=20,hA=20; int wB=20,hB=20; int wC=20,hC=20; // 数组的大小 const int elementsA = wA*hA; const int elementsB = wB*hB; const int elementsC = hA*wB; // 计算内存大小 size_t datasizeA = sizeof(float)*elementsA; size_t datasizeB = sizeof(float)*elementsB; size_t datasizeC = sizeof(float)*elementsC; // 分配内存空间 A = (float*)malloc(datasizeA); B = (float*)malloc(datasizeB); C = (float*)malloc(datasizeC); // 初始化输入数组 for(int i = 0;i < elementsA;i++) { A[i] = std::rand()/1.5; //B[i] = std::rand()/1.5; } for(int i = 0;i < elementsB;i++) { B[i] = std::rand()/1.5; //B[i] = std::rand()/1.5; } cl_mem bufferA = 
clCreateBuffer(ctx,CL_MEM_READ_ONLY,wA*hA*sizeof(float),NULL,&ciErrNum); ciErrNum = clEnqueueWriteBuffer(myqueue,bufferA,CL_TRUE,0,wA*hA*sizeof(float),(void*)A,0,NULL,NULL); cl_mem bufferB = clCreateBuffer(ctx,CL_MEM_READ_ONLY,wB*hB*sizeof(float),NULL,&ciErrNum); ciErrNum = clEnqueueWriteBuffer(myqueue,bufferB,CL_TRUE,0,wB*hB*sizeof(float),(void*)B,0,NULL,NULL); cl_mem bufferC = clCreateBuffer(ctx,CL_MEM_WRITE_ONLY,hA*wB*sizeof(float),NULL,&ciErrNum); //运行时kernel编译 const char * filename = "simpleMultiply.cl"; std::string sourceStr = convertToString(filename); const char * source = sourceStr.c_str(); size_t sourceSize[] = { strlen(source) }; //直接将CL文件读到记忆体 cl_program myprog = clCreateProgramWithSource( ctx, 1, &source, sourceSize, &ciErrNum); //cl_program myprog = clCreateProgramWithSource(ctx,1,(const char**)&programSource,NULL,&ciErrNum); ciErrNum = clBuildProgram(myprog,0,NULL,NULL,NULL,NULL); cl_kernel mykernel = clCreateKernel(myprog,"simpleMultiply",&ciErrNum); //运行程序 clSetKernelArg(mykernel,0,sizeof(cl_mem),(void*)&bufferC); clSetKernelArg(mykernel,1,sizeof(cl_mem),(void*)&wA); clSetKernelArg(mykernel,2,sizeof(cl_mem),(void*)&hA); clSetKernelArg(mykernel,3,sizeof(cl_mem),(void*)&wB); clSetKernelArg(mykernel,4,sizeof(cl_mem),(void*)&hB); clSetKernelArg(mykernel,5,sizeof(cl_mem),(void*)&bufferA); clSetKernelArg(mykernel,6,sizeof(cl_mem),(void*)&bufferB); size_t localws[2] ={20,20}; size_t globalws[2]={wC,hC}; // // start = clock(); ciErrNum = clEnqueueNDRangeKernel(myqueue,mykernel,2,NULL,globalws,localws,0,NULL,&events[0]); //时间同步 status = clWaitForEvents(1, &events[0]); if(status != CL_SUCCESS) { std::cout << "Error: Waiting for kernel run to finish. 
\ (clWaitForEvents)\n"; return 0; } status = clReleaseEvent(events[0]); //将结果拷贝到主机端 end = clock(); time1=end-start; cout<<"shijian "<<time1<<endl; ciErrNum = clEnqueueReadBuffer(myqueue,bufferC,CL_TRUE,0,wC*hC*sizeof(float),(void*)C,0,NULL,&events[1]); status = clWaitForEvents(1, &events[1]); if(status != CL_SUCCESS) { std::cout << "Error: Waiting for read buffer call to finish. \ (clWaitForEvents)\n"; return 0; } status = clReleaseEvent(events[1]); if(status != CL_SUCCESS) { std::cout << "Error: Release event object. \ (clReleaseEvent)\n"; return 0; } // return 0; }
相关文章推荐
- OpenCL例程3-矩阵相乘
- 【转载】OpenCL实现矩阵相乘
- C++实战之OpenCL矩阵相乘
- C++实战之OpenCL矩阵相乘优化(二)
- OpenCL 初实践(1)矩阵相乘
- [转]用矩阵相乘解线性递推方程
- 矩阵相乘的Strassen算法
- Hadoop 2.6 使用Map Reduce实现矩阵相乘1 矩阵转置
- 利用Hadoop实现超大矩阵相乘之我见(二)
- 稀疏矩阵利用三元组相乘(c语言)
- 矩阵相乘的最优顺序
- Erlang实现的矩阵相乘C=A*B单线程与并行多线程性能对比
- numpy中矩阵相乘的用法:dot函数和乘号*
- 矩阵相乘这个代码也应该是需要掌握的内容,今天一大早就写了一下
- MapReduce实现矩阵相乘
- 矩阵的应用--矩阵相乘与矩阵快速幂
- 矩阵相乘-c++代码实现及运行实例结果
- 矩阵相乘strassen-c++代码实现及运行实例结果
- spark矩阵向量-矩阵矩阵相乘
- POJ 3318 两个大矩阵相乘是否与另一矩阵结果相等