您的位置:首页 > 运维架构

openCL在GPU与CPU设备上的区别--计算E值

2012-12-27 10:04 405 查看
最近老师布置了个作业,让考虑计算E的不同并行算法和不同语言的实现,于是考虑到了最近学习的openCL的实现方式,之前也考虑了openMP及多线程的实现,这里只讨论openCL的实现及发现的问题,之后整理一下自己的文章,把所有的并行语言的计算E值的实现贴出来。

一、首先介绍两种计算E值并行结构

     






二、代码实现

  1.多项式分段kernel

   

// Enter your kernel in this window
/*
 * CaluE: strided partial sums of the series e = sum(1/i!).
 *
 * Work-item `id` accumulates the terms 1/i! for i = id+1, id+1+MaxItem, ...
 * up to and including StepNum, and stores its partial sum in result[id].
 * The i = 0 term (1/0! = 1) is not included here.
 *
 * result  : output buffer, one float per work-item.
 * StepNum : largest term index to include.
 * MaxItem : stride between consecutive terms handled by one work-item.
 *           The running factorial is only correct when id < MaxItem,
 *           i.e. when the kernel is launched with MaxItem work-items.
 */
__kernel
void CaluE(__global float* result,
           int StepNum,
           int MaxItem)
{
    int id = get_global_id(0);
    float fact = 1.0f;   // running factorial; equals i! after each inner loop
    float e = 0.0f;

    for (int i = id + 1; i <= StepNum; i += MaxItem)
    {
        // Extend the factorial from (i - MaxItem)! to i!. On the first pass
        // the `j < i` bound stops the product at 1, giving i! directly.
        for (int j = 0; j < MaxItem && j < i; j++)
        {
            fact *= (float)(i - j);
        }
        // Single-precision literal: an unsuffixed 1.0 is a double constant,
        // which relies on double-precision support that not every device has.
        e += 1.0f / fact;
    }
    result[id] = e;
}

 2.提取公因式kernel

// Enter your kernel in this window
/*
 * CaluE_2: per-segment partial sums with the common factor pulled out.
 *
 * The indices 1..StepNum are cut into MaxItem contiguous segments of
 * StepNum / MaxItem indices each (integer division: a remainder, if any,
 * is not covered). Work-item `id` handles the segment [first, last) and
 * writes two floats:
 *   result[2*id]     = sum over i in the segment of 1/(first*(first+1)*...*i)
 *   result[2*id + 1] = product of every index in the segment
 *
 * result  : output buffer, two floats per work-item.
 * StepNum : total number of series indices to split up.
 * MaxItem : number of segments; must be non-zero.
 */
__kernel
void CaluE_2(__global float* result,
             int StepNum,
             int MaxItem)
{
    int id = get_global_id(0);
    int segLen = StepNum / MaxItem;

    // Integer bounds: held in float they stop being exact above 2^24.
    int first = id * segLen + 1;
    int last  = (id + 1) * segLen + 1;   // exclusive

    float sum  = 0.0f;
    float prod = 1.0f;
    for (int i = first; i < last; i++)
    {
        prod *= (float)i;
        // Single-precision literal keeps the kernel free of double arithmetic.
        sum += 1.0f / prod;
    }

    result[id * 2]     = sum;
    result[id * 2 + 1] = prod;
}


 3.主机端程序

/*
项目:openCL的计算E
作者:刘荣
时间:2012.11.20

在本次运行中,采用了两种计算E的方法:
1.多项式分段
2.提取公因式
*/
#include <math.h>
#include <stdio.h>
#include <stdlib.h>
#include <string.h>
#include <time.h>

#include <fstream>
#include <iostream>
#include <string>
#include <vector>

#include <CL/cl.h>
using namespace std;
//kernel函数
/**
 * Reads a whole file (the kernel source) into a string.
 *
 * @param filename  path of the file to load; it is opened in binary mode.
 * @return          the file contents, or an empty string when the file is
 *                  empty or its size cannot be determined.
 *
 * If the file cannot be opened, a message is printed and the process is
 * terminated with exit(1).
 */
std::string
convertToString(const char *filename)
{
    std::fstream f(filename, (std::fstream::in | std::fstream::binary));
    if(!f.is_open())
    {
        std::cout << "\nFile containg the kernel code(\".cl\") not found. Please copy the required file in the folder containg the executable.\n";
        exit(1);
    }

    // Measure the file by seeking to its end; tellg() reports -1 on failure.
    f.seekg(0, std::fstream::end);
    const std::streamoff length = f.tellg();
    f.seekg(0, std::fstream::beg);
    if(length <= 0)
    {
        return std::string();
    }

    // Read straight into the string's own storage, then trim to the number
    // of bytes actually delivered in case the read came up short.
    std::string contents(static_cast<size_t>(length), '\0');
    f.read(&contents[0], static_cast<std::streamsize>(length));
    contents.resize(static_cast<size_t>(f.gcount()));
    return contents;
}

double method1(int MaxItem, int StepNum, int dev_id=0)
{
//int MaxItem=10;
//int StepNum = 1000;
//
double start,end,time1,time2;
//查询平台
cl_int ciErrNum;
cl_platform_id platform;
ciErrNum = clGetPlatformIDs(1, &platform, NULL);
if(ciErrNum != CL_SUCCESS)
{
cout<<"获取设备失败"<<endl;
return 0;
}
//获取设备信息
cl_device_id device;
cl_int   status;
cl_uint maxDims;
cl_event events[3];
size_t globalThreads[1];
size_t localThreads[1];
size_t maxWorkGroupSize;
size_t maxWorkItemSizes[3];
//创建设备
if(dev_id==0)
{
ciErrNum = clGetDeviceIDs(platform, CL_DEVICE_TYPE_CPU, 1, &device, NULL);
}
else
{
ciErrNum = clGetDeviceIDs(platform, CL_DEVICE_TYPE_GPU, 1, &device, NULL);
}

//创建上下文
cl_context_properties cps[3] = {CL_CONTEXT_PLATFORM, (cl_context_properties)platform, 0};
cl_context ctx = clCreateContext(cps, 1, &device, NULL, NULL, &ciErrNum);
if(ciErrNum != CL_SUCCESS)
{
cout<<"创建上下文失败"<<endl;
return 0;
}
cl_command_queue myqueue = clCreateCommandQueue(ctx,device,0,&ciErrNum);
if(ciErrNum != CL_SUCCESS)
{
cout<<"命令队列失败"<<endl;
return 0;
}
//声明buffer,传输数据
float *C = NULL; // 输出数组

size_t datasize = sizeof(float)*MaxItem;

// 分配内存空间
C = (float*)malloc(datasize);

// 初始化输入数组
cl_mem bufferC = clCreateBuffer(ctx,CL_MEM_WRITE_ONLY,datasize*sizeof(float),NULL,&ciErrNum);

//运行时kernel编译
const char * filename  = "CaluE.cl";
std::string  sourceStr = convertToString(filename);
const char * source    = sourceStr.c_str();
size_t sourceSize[]    = { strlen(source) };
//直接将CL文件读到记忆体
cl_program myprog = clCreateProgramWithSource(
ctx,
1,
&source,
sourceSize,
&ciErrNum);
//cl_program myprog = clCreateProgramWithSource(ctx,1,(const char**)&programSource,NULL,&ciErrNum);
if(ciErrNum != 0)
{
cout<<"createprogram failed"<<endl;
}
ciErrNum = clBuildProgram(myprog,0,NULL,NULL,NULL,NULL);
if(ciErrNum != 0)
{
cout<<"clBuildProgram failed"<<endl;
}

cl_kernel mykernel = clCreateKernel(myprog,"CaluE",&ciErrNum);
if(ciErrNum != 0)
{
cout<<"clCreateKernel failed"<<endl;
}
//运行程序,设置参数
clSetKernelArg(mykernel,0,sizeof(cl_mem),(void*)&bufferC);
clSetKernelArg(mykernel,1,sizeof(int),&StepNum);
clSetKernelArg(mykernel,2,sizeof(int),&MaxItem);

size_t globalWorkSize[1];
globalWorkSize[0] = MaxItem;
//
start = clock();
ciErrNum = clEnqueueNDRangeKernel(myqueue,mykernel,1,NULL,globalWorkSize,NULL,0,NULL,&events[0]);
if(ciErrNum != 0)
{
cout<<"clEnqueueNDRangeKernel failed"<<endl;
}
//时间同步
status = clWaitForEvents(1, &events[0]);
if(status != CL_SUCCESS)
{
std::cout <<
"Error: Waiting for kernel run to finish. \
(clWaitForEvents0)\n";
return 0;
}
status = clReleaseEvent(events[0]);
//将结果拷贝到主机端
end = clock();
time1=end-start;
cout<<"method1 time: "<<time1<<endl;
ciErrNum = clEnqueueReadBuffer(myqueue,bufferC,CL_TRUE,0,datasize,C,0,NULL,&events[1]);

status = clWaitForEvents(1, &events[1]);
if(status != CL_SUCCESS)
{
std::cout <<
"Error: Waiting for read buffer call to finish. \
(clWaitForEvents1)n";
return 0;
}
status = clReleaseEvent(events[1]);
if(status != CL_SUCCESS)
{
std::cout <<
"Error: Release event object. \
(clReleaseEvent)\n";
return 0;
}
double e=0;
//
for(int i=0; i<MaxItem; i++)
{
e += C[i];
}
printf("method1 e = %1.22f  \n",e+1);
return time1;
}
double method2(int MaxItem, int StepNum,int dev_id=0)
{
//
double start,end,time1,time2;
//查询平台
cl_int ciErrNum;
cl_platform_id platform;
ciErrNum = clGetPlatformIDs(1, &platform, NULL);
if(ciErrNum != CL_SUCCESS)
{
cout<<"获取设备失败"<<endl;
return 0;
}
//获取设备信息
cl_device_id device;
cl_int   status;
cl_uint maxDims;
cl_event events[3];
size_t globalThreads[1];
size_t localThreads[1];
size_t maxWorkGroupSize;
size_t maxWorkItemSizes[3];
//创建设备
if(dev_id==0)
{
ciErrNum = clGetDeviceIDs(platform, CL_DEVICE_TYPE_CPU, 1, &device, NULL);
}
else
{
ciErrNum = clGetDeviceIDs(platform, CL_DEVICE_TYPE_GPU, 1, &device, NULL);
}

//创建上下文
cl_context_properties cps[3] = {CL_CONTEXT_PLATFORM, (cl_context_properties)platform, 0};
cl_context ctx = clCreateContext(cps, 1, &device, NULL, NULL, &ciErrNum);
if(ciErrNum != CL_SUCCESS)
{
cout<<"创建上下文失败"<<endl;
return 0;
}
cl_command_queue myqueue = clCreateCommandQueue(ctx,device,0,&ciErrNum);
if(ciErrNum != CL_SUCCESS)
{
cout<<"命令队列失败"<<endl;
return 0;
}
//声明buffer,传输数据
float *C = NULL; // 输出数组
//int MaxItem=100;
//	int StepNum = 1000000000;
size_t datasize = sizeof(float)*MaxItem*2;

// 分配内存空间
C = (float*)malloc(datasize);

// 初始化输入数组
cl_mem bufferC = clCreateBuffer(ctx,CL_MEM_WRITE_ONLY,datasize,NULL,&ciErrNum);

//运行时kernel编译
const char * filename  = "CaluE_2.cl";
std::string  sourceStr = convertToString(filename);
const char * source    = sourceStr.c_str();
size_t sourceSize[]    = { strlen(source) };
//直接将CL文件读到记忆体
cl_program myprog = clCreateProgramWithSource(
ctx,
1,
&source,
sourceSize,
&ciErrNum);
//cl_program myprog = clCreateProgramWithSource(ctx,1,(const char**)&programSource,NULL,&ciErrNum);
if(ciErrNum != 0)
{
cout<<"createprogram failed"<<endl;
}
ciErrNum = clBuildProgram(myprog,0,NULL,NULL,NULL,NULL);
if(ciErrNum != 0)
{
cout<<"clBuildProgram failed"<<endl;
}

cl_kernel mykernel = clCreateKernel(myprog,"CaluE_2",&ciErrNum);
if(ciErrNum != 0)
{
cout<<"clCreateKernel failed"<<endl;
}
//运行程序,设置参数
clSetKernelArg(mykernel,0,sizeof(cl_mem),(void*)&bufferC);
clSetKernelArg(mykernel,1,sizeof(int),&StepNum);
clSetKernelArg(mykernel,2,sizeof(int),&MaxItem);

size_t globalWorkSize[1];
globalWorkSize[0] = MaxItem;
//
start = clock();
ciErrNum = clEnqueueNDRangeKernel(myqueue,mykernel,1,NULL,globalWorkSize,NULL,0,NULL,&events[0]);
if(ciErrNum != 0)
{
cout<<"clEnqueueNDRangeKernel failed"<<endl;
}
//时间同步
status = clWaitForEvents(1, &events[0]);
if(status != CL_SUCCESS)
{
std::cout <<
"Error: Waiting for kernel run to finish. \
(clWaitForEvents0)\n";
return 0;
}
status = clReleaseEvent(events[0]);
//将结果拷贝到主机端
end = clock();
time1=end-start;
cout<<"method2 时间: "<<time1<<endl;
ciErrNum = clEnqueueReadBuffer(myqueue,bufferC,CL_TRUE,0,datasize,C,0,NULL,&events[1]);

status = clWaitForEvents(1, &events[1]);
if(status != CL_SUCCESS)
{
std::cout <<
"Error: Waiting for read buffer call to finish. \
(clWaitForEvents1)n";
return 0;
}
status = clReleaseEvent(events[1]);
if(status != CL_SUCCESS)
{
std::cout <<
"Error: Release event object. \
(clReleaseEvent)\n";
return 0;
}
double e=0;
double result = 0;
double temp = 1;
//
for(int i=0; i<MaxItem; i++)
{
result = C[i*2];
e += (1/temp)*result;
temp = C[i*2+1];
}
printf("method2 e = %1.18f \n",e+1);
return time1;

}
/**
 * Serial reference: sums the series 1/i! for i = 0 .. StepNum-1 in double
 * precision and reports how long the loop took.
 *
 * @param StepNum  exclusive upper bound of the term index.
 * @return         clock() ticks spent in the summation loop.
 *
 * Prints the computed value and the elapsed ticks to stdout.
 */
double serial(int StepNum)
{
    double fact = 1;
    double e = 1;   // the i = 0 term

    // Keep the timestamps as clock_t; squeezing them into int can truncate.
    const clock_t start = clock();
    for(int i=1; i < StepNum; i++)
    {
        fact *= i;
        e += (1.0/fact);
    }
    const clock_t end = clock();
    const clock_t elapsed = end - start;

    printf("串行结果:%1.12f \n",e);
    printf("串行时间: %ld \n",static_cast<long>(elapsed));
    return static_cast<double>(elapsed);
}
//测试
/**
 * Test driver: asks for the device, MaxItem and StepNum, runs the serial
 * version and both OpenCL methods, then prints the speed-up of each method
 * relative to the serial run.
 *
 * @return 0 on success, 1 if the input is invalid.
 */
int main()
{
    int MaxItem = 100;
    int StepNum = 1000000;
    int dev_id = 0;

    // The same kernel can give different results on different devices.
    std::cout<<"选择kernel设备(0-cpu;1-gpu)"<<std::endl;
    if(!(std::cin>>dev_id))
    {
        std::cout<<"输入无效"<<std::endl;
        return 1;
    }
    std::cout<<"输入MaxItem(1-128)"<<std::endl;
    // MaxItem is used as a divisor and as the launch size, so it must be positive.
    if(!(std::cin>>MaxItem) || MaxItem < 1)
    {
        std::cout<<"输入无效"<<std::endl;
        return 1;
    }
    std::cout<<"输入StepNum"<<std::endl;
    if(!(std::cin>>StepNum) || StepNum < 1)
    {
        std::cout<<"输入无效"<<std::endl;
        return 1;
    }

    const double time1 = serial(StepNum);
    const double time2 = method1(MaxItem, StepNum,dev_id);
    const double time3 = method2(MaxItem, StepNum, dev_id);

    // A method that fails or finishes within one clock tick reports 0;
    // skip the division in that case instead of printing inf/nan.
    if(time2 > 0)
    {
        printf("方法1与串行加速比:%f \n",time1/time2);
    }
    else
    {
        printf("方法1与串行加速比:N/A \n");
    }
    if(time3 > 0)
    {
        printf("方法2与串行加速比:%f \n",time1/time3);
    }
    else
    {
        printf("方法2与串行加速比:N/A \n");
    }

    return 0;
}


三、测试结果

      本人cpu为amd两核两线程,显卡为amd hd 4000

     1.基于cpu

      在测试中

  


      (  3.2对应方法2,3.3对应方法1)

     2.基于gpu

      在本机上测试,当步数变大时,电脑就死机了。因此只测试了步数较小的情况,此时加速不明显

四、总结

     由于临近考试,时间有限,没有做进一步的分析,希望大牛给予解释;另外,在GPU上运行的kernel中不能使用double,很是郁闷,求解决
内容来自用户分享和网络整理,不保证内容的准确性,如有侵权内容,可联系管理员处理 点击这里给我发消息
标签: