您的位置:首页 > 运维架构

【转载】OpenCL实现矩阵相乘

2017-01-13 14:02 393 查看
矩阵相乘就是用前一个矩阵的每一行与后一个矩阵的每一列的对应元素逐个相乘,再把这些乘积相加,得到结果矩阵中对应位置(该行、该列)的数值。具体算法回顾一下线性代数即可。结果矩阵中每个元素的计算都是相互独立的:在CPU上按朴素写法只能一行一列地串行计算,而放到GPU里则可以让每个元素并行计算,如果矩阵维数很大,就能大大节约时间。

具体代码如下(先是内核代码,需保存为 HelloWorld_Kernel.cl 并与可执行文件放在同一目录;随后是主机端代码):

/*
 * simpleMultiply - naive matrix product C = A * B.
 *
 * Each work-item computes one element of C: global id 1 selects the row and
 * global id 0 selects the column. All three matrices are indexed as flat
 * row-major arrays (element (r, c) lives at r * width + c).
 *
 *   outPutC  result matrix, heightA rows x widthB columns
 *   widthA   number of columns of A (also the length of the dot product)
 *   heightA  number of rows of A
 *   widthB   number of columns of B
 *   heightB  number of rows of B; not read by the kernel, the host is
 *            expected to pass a value equal to widthA
 *   inputA   left operand,  heightA x widthA
 *   inputB   right operand, widthA  x widthB
 */
__kernel
void simpleMultiply(__global float* outPutC,
                    int widthA,
                    int heightA,
                    int widthB,
                    int heightB,
                    __global float* inputA,
                    __global float* inputB)
{
    const int row = get_global_id(1);
    const int col = get_global_id(0);

    /* The host may round the global size up (e.g. to a multiple of the
     * work-group size); work-items outside the result matrix must not
     * touch memory. */
    if (row >= heightA || col >= widthB)
    {
        return;
    }

    float sum = 0.0f;
    for (int i = 0; i < widthA; i++)
    {
        sum += inputA[row * widthA + i] * inputB[i * widthB + col];
    }
    outPutC[row * widthB + col] = sum;
}


// FirstOpenCL.cpp : Defines the entry point for the console application.
//
#include "stdafx.h"

#include <math.h>
#include <time.h>

#include <cstdio>
#include <cstdlib>
#include <fstream>
#include <iostream>
#include <string>
#include <vector>

#include <CL/cl.h>

using namespace std;

#pragma comment (lib,"OpenCL.lib")
// Reads the whole file `filename` (opened in binary mode, so line endings are
// preserved) and returns its contents as a std::string.
//
// If the file cannot be opened (or `filename` is null), a message is printed
// to std::cout and the process terminates with exit code 1.
// If the file is empty or its size cannot be determined, an empty string is
// returned.
std::string convertToString(const char *filename)
{
    std::ifstream file;
    if (filename != NULL)
    {
        file.open(filename, std::ios::in | std::ios::binary);
    }
    if (!file.is_open())
    {
        std::cout << "\nFile containg the kernel code(\".cl\") not found. Please copy the required file in the folder containg the executable.\n";
        exit(1);
    }

    // Determine the file size by seeking to the end. tellg() reports -1 on
    // failure, which must not be turned into an allocation size.
    file.seekg(0, std::ios::end);
    const std::streamoff length = file.tellg();
    file.seekg(0, std::ios::beg);
    if (length <= 0 || !file)
    {
        return std::string();
    }

    // Read straight into the string's own storage; no manual new[]/delete[].
    std::string contents(static_cast<std::string::size_type>(length), '\0');
    file.read(&contents[0], static_cast<std::streamsize>(length));
    // Keep only what was actually read in case of a short read.
    contents.resize(static_cast<std::string::size_type>(file.gcount()));
    return contents;
}

int main()
{
//查询平台
cl_int ciErrNum;
cl_platform_id platform;
ciErrNum = clGetPlatformIDs(1, &platform, NULL);

//获取设备信息
cl_device_id device;
cl_int   status;
cl_uint maxDims;
cl_event events[2];
size_t globalThreads[1];
size_t localThreads[1];
size_t maxWorkGroupSize;
size_t maxWorkItemSizes[3];

ciErrNum = clGetDeviceIDs(platform, CL_DEVICE_TYPE_ALL, 1, &device, NULL);
status = clGetDeviceInfo( device, CL_DEVICE_MAX_WORK_GROUP_SIZE, sizeof(size_t),(void*)&maxWorkGroupSize,NULL);
status = clGetDeviceInfo( device, CL_DEVICE_MAX_WORK_ITEM_DIMENSIONS, sizeof(cl_uint),(void*)&maxDims, NULL);
status = clGetDeviceInfo( device, CL_DEVICE_MAX_WORK_ITEM_SIZES, sizeof(size_t)*maxDims,(void*)maxWorkItemSizes, NULL);

//创建上下文
cl_context ctx = clCreateContext(NULL, 1, &device, NULL, NULL, &ciErrNum);
cl_command_queue myqueue = clCreateCommandQueue(ctx,device,0,&ciErrNum);

int wA=6,hA=6;
int wB=6,hB=6;
int wC=6,hC=6;

// 数组的大小
const int  elementsA = wA*hA;
const int  elementsB = wB*hB;
const int  elementsC = hA*wB;

// 计算内存大小
size_t datasizeA = sizeof(float)*elementsA;
size_t datasizeB = sizeof(float)*elementsB;
size_t datasizeC = sizeof(float)*elementsC;
// 分配内存空间
float *A = (float*)malloc(datasizeA);
float *B = (float*)malloc(datasizeB);
float *C = (float*)malloc(datasizeC);

// 初始化输入数组
for(int i = 0;i < elementsA;i++)
{
A[i] = (float)((float)i + 1.0);
}
for(int i = 0;i < elementsB;i++)
{
B[i] = (float)((float)i + 1.0);
}

cl_mem bufferA = clCreateBuffer(ctx,CL_MEM_READ_ONLY,wA*hA*sizeof(float),NULL,&ciErrNum);
ciErrNum = clEnqueueWriteBuffer(myqueue,bufferA,CL_TRUE,0,wA*hA*sizeof(float),(void*)A,0,NULL,NULL);

cl_mem bufferB = clCreateBuffer(ctx,CL_MEM_READ_ONLY,wB*hB*sizeof(float),NULL,&ciErrNum);
ciErrNum = clEnqueueWriteBuffer(myqueue,bufferB,CL_TRUE,0,wB*hB*sizeof(float),(void*)B,0,NULL,NULL);

cl_mem bufferC = clCreateBuffer(ctx,CL_MEM_WRITE_ONLY,hA*wB*sizeof(float),NULL,&ciErrNum);

//运行时kernel编译
const char * filename  = "HelloWorld_Kernel.cl";
std::string  sourceStr = convertToString(filename);
const char * source    = sourceStr.c_str();
size_t sourceSize[]    = { strlen(source) };
//直接将CL文件读到记忆体
cl_program myprog = clCreateProgramWithSource( ctx, 1,&source, sourceSize, &ciErrNum);
//cl_program myprog = clCreateProgramWithSource(ctx,1,(const char**)&programSource,NULL,&ciErrNum);
ciErrNum = clBuildProgram(myprog,0,NULL,NULL,NULL,NULL);

cl_kernel mykernel = clCreateKernel(myprog,"simpleMultiply",&ciErrNum);
//运行程序
clSetKernelArg(mykernel,0,sizeof(cl_mem),(void*)&bufferC);
clSetKernelArg(mykernel,1,sizeof(cl_mem),(void*)&wA);
clSetKernelArg(mykernel,2,sizeof(cl_mem),(void*)&hA);
clSetKernelArg(mykernel,3,sizeof(cl_mem),(void*)&wB);
clSetKernelArg(mykernel,4,sizeof(cl_mem),(void*)&hB);
clSetKernelArg(mykernel,5,sizeof(cl_mem),(void*)&bufferA);
clSetKernelArg(mykernel,6,sizeof(cl_mem),(void*)&bufferB);

size_t localws[2] ={wC,wC};
size_t globalws[2]={wC,hC};

ciErrNum = clEnqueueNDRangeKernel(myqueue,mykernel,2,NULL,globalws,localws,0,NULL,&events[0]);
status = clWaitForEvents(1, &events[0]);
status = clReleaseEvent(events[0]);
//将结果拷贝到主机端
ciErrNum = clEnqueueReadBuffer(myqueue,bufferC,CL_TRUE,0,wC*hC*sizeof(float),(void*)C,0,NULL,&events[1]);

status = clWaitForEvents(1, &events[1]);
status = clReleaseEvent(events[1]);

printf("\nArray A:\n");
for (int i = 0; i < wA; i++) {
for (int j = 0; j < hA; j++)
printf("%4.3f\t", A[i*hA + j]);
printf("\n");
}
printf("\nArray B:\n");
for (int i = 0; i < wB; i++) {
for (int j = 0; j < hB; j++)
printf("%4.3f\t", B[i*hB + j]);
printf("\n");
}
printf("\nArray C:\n");
for (int i = 0; i < wC; i++) {
for (int j = 0; j < hC; j++)
printf("%4.3f\t", C[i*hC + j]);
printf("\n");
}

getchar();
return 0;
}
内容来自用户分享和网络整理,不保证内容的准确性,如有侵权内容,可联系管理员处理 点击这里给我发消息
标签: