CUDA第一个程序(只有一个线程)
2010-05-17 12:27
375 查看
#include <stdio.h>
#include <cuda_runtime.h>
#include<stdlib.h>
#define TOTAL_NUM 50000
/*
 * Enumerate CUDA devices, print the properties of the first device whose
 * properties can be queried, and make that device current.
 * Returns true on success, false when no device is available or usable.
 *
 * Fixes vs. original: escape sequences were written as "/n" and "/t"
 * (literal slashes, no newlines/tabs) -- corrected to "\n"/"\t"; size_t
 * members of cudaDeviceProp were printed with %d (undefined behavior on
 * 64-bit hosts) -- corrected to %zu.
 */
bool CUDA_initial(void)
{
    int i;
    int device_count;
    /* cudaGetDeviceCount returns a non-zero error code on failure. */
    if (cudaGetDeviceCount(&device_count) != cudaSuccess)
    {
        printf(" There is zero device beyond 1.0\n");
        return false;
    }
    else
        printf("There is %d device beyond 1.0\n", device_count);
    for (i = 0; i < device_count; i++)
    {
        struct cudaDeviceProp device_prop;
        if (cudaGetDeviceProperties(&device_prop, i) == cudaSuccess)
        {
            /* size_t fields (memory sizes, pitch, alignment) use %zu;
             * plain int fields use %d. */
            printf("device properties is :\n"
                   "\t device name is %s\n"
                   "\t totalGlobalMem is %zu\n"
                   "\t sharedMemPerBlock is %zu\n"
                   "\t regsPerBlock is %d\n"
                   "\t warpSize is %d\n"
                   "\t memPitch is %zu\n"
                   "\t maxThreadsPerBlock is %d\n"
                   "\t maxThreadsDim [3] is %d X %d X %d\n"
                   "\t maxGridSize [3] is %d X %d X %d\n"
                   "\t totalConstMem is %zu\n"
                   "\t device version is major %d ,minor %d\n"
                   "\t clockRate is %d\n"
                   "\t textureAlignment is %zu\n"
                   "\t deviceOverlap is %d\n"
                   "\t multiProcessorCount is %d\n",
                   device_prop.name,
                   device_prop.totalGlobalMem,
                   device_prop.sharedMemPerBlock,
                   device_prop.regsPerBlock,
                   device_prop.warpSize,
                   device_prop.memPitch,
                   device_prop.maxThreadsPerBlock,
                   device_prop.maxThreadsDim[0], device_prop.maxThreadsDim[1],
                   device_prop.maxThreadsDim[2],
                   device_prop.maxGridSize[0], device_prop.maxGridSize[1],
                   device_prop.maxGridSize[2],
                   device_prop.totalConstMem,
                   device_prop.major, device_prop.minor,
                   device_prop.clockRate,
                   device_prop.textureAlignment,
                   device_prop.deviceOverlap,
                   device_prop.multiProcessorCount);
            break;  /* report and use the first queryable device */
        }
    }
    if (i == device_count)
    {
        printf("Get the propertites of device occurred error\n");
        return false;
    }
    if (cudaSetDevice(i) == cudaErrorInvalidDevice)
    {
        printf("Set Device occurred error\n");
        return false;
    }
    return true;
}
/* Fill num[0 .. data_num-1] with pseudo-random single digits (0-9). */
void generate_num(int *num, int data_num)
{
    int idx;
    for (idx = 0; idx < data_num; idx++)
        num[idx] = rand() % 10;
}
/*
 * Single-thread kernel (launched <<<1,1>>>): writes the sum of squares of
 * num[0 .. num_of_num-1] into *result, and the device cycle count elapsed
 * around the loop into *time.
 */
__global__ void square_sum(int *num, int num_of_num, int *result, clock_t *time)
{
    clock_t t0 = clock();
    int acc = 0;
    int k;
    /* Accumulate in a register instead of writing through *result each
     * iteration -- repeated global-memory traffic would waste cycles. */
    for (k = 0; k < num_of_num; k++)
        acc += num[k] * num[k];
    *result = acc;
    *time = clock() - t0;
}
/*
 * Host driver: generate TOTAL_NUM random ints, compute the sum of their
 * squares once on the GPU (one block, one thread -- a serial baseline)
 * and once on the CPU, then print both results plus the GPU cycle count
 * measured inside the kernel.
 *
 * Fixes vs. original: "/n" corrected to "\n"; stops (returns 1) when
 * device initialization fails instead of continuing with no usable
 * device; clock_t is cast before printing with %d.
 */
int main()
{
    if (CUDA_initial() == true)
        printf("CUDA initial successed!\n");
    else
        return 1;                       /* no usable device: stop here */

    int num_str[TOTAL_NUM];
    generate_num(num_str, TOTAL_NUM);

    int *gpudata;
    int *result;
    clock_t *time;
    cudaMalloc((void **)&gpudata, sizeof(int) * TOTAL_NUM);
    cudaMalloc((void **)&result, sizeof(int));
    cudaMalloc((void **)&time, sizeof(clock_t));
    cudaMemcpy((void *)gpudata, num_str, sizeof(int) * TOTAL_NUM,
               cudaMemcpyHostToDevice);

    /* Single block, single thread: intentionally serial reference run. */
    square_sum<<<1, 1>>>(gpudata, TOTAL_NUM, result, time);

    int result_in_GPU;
    cudaMemcpy((void *)&result_in_GPU, result, sizeof(int),
               cudaMemcpyDeviceToHost);
    clock_t time_used;
    cudaMemcpy((void *)&time_used, time, sizeof(clock_t),
               cudaMemcpyDeviceToHost);
    printf("In GPU result is %d\n", result_in_GPU);
    /* clock_t width is implementation-defined; cast for the %d format. */
    printf("In GPU time used is %d\n", (int)time_used);

    int result_in_CPU = 0;
    int i;
    for (i = 0; i < TOTAL_NUM; i++)
    {
        result_in_CPU += num_str[i] * num_str[i];
    }
    printf("In CPU result is %d\n", result_in_CPU);

    cudaFree(gpudata);
    cudaFree(result);
    cudaFree(time);
    return 0;
}
测试结果:
There is 1 device beyond 1.0
device properties is :
device name is GeForce 9800 GT
totalGlobalMem is 536543232
sharedMemPerBlock is 16384
regsPerBlock is 8192
warpSize is 32
memPitch is 262144
maxThreadsPerBlock is 512
maxThreadsDim [3] is 512 X 512 X 64
maxGridSize [3] is 65535 X 65535 X 1
totalConstMem is 65536
device version is major 1 ,minor 1
clockRate is 1350000
textureAlignment is 256
deviceOverlap is 1
multiProcessorCount is 14
CUDA initial successed!
In GPU result is 1419240
In GPU time used is 29763916
In CPU result is 1419240
请按任意键继续. . .
计算执行的时间:29763916/1.35GHz=0.022s;
内存带宽:50000/1048576*4/0.022=8.67MB/s
#include <stdio.h>
#include <stdlib.h>
#include <cuda_runtime.h>
#define TOTAL_NUM 50000
/*
 * Enumerate CUDA devices, print the properties of the first device whose
 * properties can be queried, and make that device current.
 * Returns true on success, false when no device is available or usable.
 *
 * Fixes vs. original: escape sequences were written as "/n" and "/t"
 * (literal slashes, no newlines/tabs) -- corrected to "\n"/"\t"; size_t
 * members of cudaDeviceProp were printed with %d (undefined behavior on
 * 64-bit hosts) -- corrected to %zu.
 */
bool CUDA_initial(void)
{
    int i;
    int device_count;
    /* cudaGetDeviceCount returns a non-zero error code on failure. */
    if (cudaGetDeviceCount(&device_count) != cudaSuccess)
    {
        printf(" There is zero device beyond 1.0\n");
        return false;
    }
    else
        printf("There is %d device beyond 1.0\n", device_count);
    for (i = 0; i < device_count; i++)
    {
        struct cudaDeviceProp device_prop;
        if (cudaGetDeviceProperties(&device_prop, i) == cudaSuccess)
        {
            /* size_t fields (memory sizes, pitch, alignment) use %zu;
             * plain int fields use %d. */
            printf("device properties is :\n"
                   "\t device name is %s\n"
                   "\t totalGlobalMem is %zu\n"
                   "\t sharedMemPerBlock is %zu\n"
                   "\t regsPerBlock is %d\n"
                   "\t warpSize is %d\n"
                   "\t memPitch is %zu\n"
                   "\t maxThreadsPerBlock is %d\n"
                   "\t maxThreadsDim [3] is %d X %d X %d\n"
                   "\t maxGridSize [3] is %d X %d X %d\n"
                   "\t totalConstMem is %zu\n"
                   "\t device version is major %d ,minor %d\n"
                   "\t clockRate is %d\n"
                   "\t textureAlignment is %zu\n"
                   "\t deviceOverlap is %d\n"
                   "\t multiProcessorCount is %d\n",
                   device_prop.name,
                   device_prop.totalGlobalMem,
                   device_prop.sharedMemPerBlock,
                   device_prop.regsPerBlock,
                   device_prop.warpSize,
                   device_prop.memPitch,
                   device_prop.maxThreadsPerBlock,
                   device_prop.maxThreadsDim[0], device_prop.maxThreadsDim[1],
                   device_prop.maxThreadsDim[2],
                   device_prop.maxGridSize[0], device_prop.maxGridSize[1],
                   device_prop.maxGridSize[2],
                   device_prop.totalConstMem,
                   device_prop.major, device_prop.minor,
                   device_prop.clockRate,
                   device_prop.textureAlignment,
                   device_prop.deviceOverlap,
                   device_prop.multiProcessorCount);
            break;  /* report and use the first queryable device */
        }
    }
    if (i == device_count)
    {
        printf("Get the propertites of device occurred error\n");
        return false;
    }
    if (cudaSetDevice(i) == cudaErrorInvalidDevice)
    {
        printf("Set Device occurred error\n");
        return false;
    }
    return true;
}
/* Fill num[0 .. data_num-1] with pseudo-random single digits (0-9). */
void generate_num(int *num, int data_num)
{
    int idx;
    for (idx = 0; idx < data_num; idx++)
        num[idx] = rand() % 10;
}
/*
 * Single-thread kernel (launched <<<1,1>>>): writes the sum of squares of
 * num[0 .. num_of_num-1] into *result, and the device cycle count elapsed
 * around the loop into *time.
 */
__global__ void square_sum(int *num, int num_of_num, int *result, clock_t *time)
{
    clock_t t0 = clock();
    int acc = 0;
    int k;
    /* Accumulate in a register instead of writing through *result each
     * iteration -- repeated global-memory traffic would waste cycles. */
    for (k = 0; k < num_of_num; k++)
        acc += num[k] * num[k];
    *result = acc;
    *time = clock() - t0;
}
/*
 * Host driver: generate TOTAL_NUM random ints, compute the sum of their
 * squares once on the GPU (one block, one thread -- a serial baseline)
 * and once on the CPU, then print both results plus the GPU cycle count
 * measured inside the kernel.
 *
 * Fixes vs. original: "/n" corrected to "\n"; stops (returns 1) when
 * device initialization fails instead of continuing with no usable
 * device; clock_t is cast before printing with %d.
 */
int main()
{
    if (CUDA_initial() == true)
        printf("CUDA initial successed!\n");
    else
        return 1;                       /* no usable device: stop here */

    int num_str[TOTAL_NUM];
    generate_num(num_str, TOTAL_NUM);

    int *gpudata;
    int *result;
    clock_t *time;
    cudaMalloc((void **)&gpudata, sizeof(int) * TOTAL_NUM);
    cudaMalloc((void **)&result, sizeof(int));
    cudaMalloc((void **)&time, sizeof(clock_t));
    cudaMemcpy((void *)gpudata, num_str, sizeof(int) * TOTAL_NUM,
               cudaMemcpyHostToDevice);

    /* Single block, single thread: intentionally serial reference run. */
    square_sum<<<1, 1>>>(gpudata, TOTAL_NUM, result, time);

    int result_in_GPU;
    cudaMemcpy((void *)&result_in_GPU, result, sizeof(int),
               cudaMemcpyDeviceToHost);
    clock_t time_used;
    cudaMemcpy((void *)&time_used, time, sizeof(clock_t),
               cudaMemcpyDeviceToHost);
    printf("In GPU result is %d\n", result_in_GPU);
    /* clock_t width is implementation-defined; cast for the %d format. */
    printf("In GPU time used is %d\n", (int)time_used);

    int result_in_CPU = 0;
    int i;
    for (i = 0; i < TOTAL_NUM; i++)
    {
        result_in_CPU += num_str[i] * num_str[i];
    }
    printf("In CPU result is %d\n", result_in_CPU);

    cudaFree(gpudata);
    cudaFree(result);
    cudaFree(time);
    return 0;
}
测试结果:
There is 1 device beyond 1.0
device properties is :
device name is GeForce 9800 GT
totalGlobalMem is 536543232
sharedMemPerBlock is 16384
regsPerBlock is 8192
warpSize is 32
memPitch is 262144
maxThreadsPerBlock is 512
maxThreadsDim [3] is 512 X 512 X 64
maxGridSize [3] is 65535 X 65535 X 1
totalConstMem is 65536
device version is major 1 ,minor 1
clockRate is 1350000
textureAlignment is 256
deviceOverlap is 1
multiProcessorCount is 14
CUDA initial successed!
In GPU result is 1419240
In GPU time used is 29763916
In CPU result is 1419240
请按任意键继续. . .
计算执行的时间:29763916/1.35GHz=0.022s;
内存带宽:50000/1048576*4/0.022=8.67MB/s
相关文章推荐
- JavaScript引擎是单线程运行的,浏览器无论在什么时候都只且只有一个线程在运行JavaScript程序
- 一个程序是不是可以只有进程没有线程呢?
- 线程,是不是打破了“程序是顺序执行“的真谛?如一个main函数中,顺序定义了二个线程,并启动,第一个线程还没执行完,第二个线程就开始执行了
- CUDA第一个程序优化(若干个线程)
- 命令行方式实现第一个CUDA程序的运行(Win7环境)
- cocos2d-x 基础系列 (01) 创建一个只有退出程序按钮的demo
- 确保只有一个程序实例运行(C#)之解决方案
- java程序中,如何安全的结束一个正在运行的线程?
- 编写一个程序,将两个字符串链接起来,结果取代第一个字符串。要求用方法
- Win32 程序 确保只有一个实例运行
- 3D球体一个(纪念第一个opengl程序~~)
- 编写一个程序,开启 3 个线程,这三个线程的 ID 分别为 A、B、C,每个线程将自己的 ID 在屏幕上打印 10 遍,要求输出的结果必须按顺序显示。如:ABCABCABC…… 依次递归
- 编写一个程序,开启3个线程,这3个线程的ID分别为A、B、C,每个线程将自己的ID在屏幕上打印10遍,要求输出结果必须按ABC的顺序显示;如:ABCABC….依次递推。
- 第一个Silverlight程序,绘制一个长方型、一个椭圆、画一条线
- CUDA从入门到精通(二):第一个CUDA程序
- 编写一个程序,将两个字符串链接起来, 结果取代第一个字符串
- 第一个cuda程序
- 3D球体一个(纪念第一个opengl程序~~)
- 在同一个线程中,只有存在一个活动的Display
- 使用互斥量保证程序最多只有一个实例运行