测试求和 waiting
2014-04-21 21:45
211 查看
//duide 1
#include "h.h"
#define G 4 //灰度阶
#include "cuda_runtime.h"
#include "device_launch_parameters.h"
//测试求和
// CPU reference for the "energy" texture feature of a G x G co-occurrence
// matrix: sum over all cells of (cell / total)^2.
//
// pdMatrix: row-major G*G matrix.
// total:    normalization divisor (typically the sum of all cells).
// Returns 0.0f when total is 0 (the original divided by zero here).
float ComputeFeature1( int* pdMatrix,int total)
{
    int i, j;
    float f1 = 0.0f;
    if (total == 0)          // guard: avoid 0/0 -> NaN propagation
        return 0.0f;
    for (i = 0; i < G; i++)
    {
        for (j = 0; j < G; j++)
        {
            float t = (float)pdMatrix[i * G + j] / (float)total;  // normalized cell
            f1 += t * t;
        }
    }
    return f1;
}
// GPU version of the energy feature: *sum1 = sum over all cells of
// (pdMatrix[i] / *total)^2 for a G*G matrix.
//
// Launch configuration assumed: a SINGLE block of exactly G x G (4x4)
// threads (tid below is linear within that block).
// width/height are unused but kept for signature compatibility.
// NOTE(review): the store is transposed (x*G+y) relative to the linear load
// (tid = x + y*G); harmless here because every element is squared and summed.
__global__ static void ComputeFeature1( float *sum1,int* pdMatrix,int width,int height,int *total) //186ye
{
    int i;
    int tid = threadIdx.x + threadIdx.y * G;   // linear thread id in the block
    __shared__ float stemp1[G * G];

    // Normalize and square each cell into shared memory (same thread writes
    // and squares the same slot, so no barrier is needed in between; the
    // original's separate zero-init pass was dead and has been dropped).
    if (threadIdx.x < G && threadIdx.y < G)
    {
        float v = (float)pdMatrix[tid] / ((float)(*total));
        stemp1[threadIdx.x * G + threadIdx.y] = v * v;
    }
    __syncthreads();

    // In-place tree reduction over the G*G partials.
    // The barrier is kept OUTSIDE any divergent branch (the original nested
    // it under "if (tid < G*G)", which only worked because the block had
    // exactly G*G threads).
    for (i = G * G / 2; i > 0; i /= 2)
    {
        if (tid < i)
        {
            stemp1[tid] = stemp1[tid] + stemp1[tid + i];
        }
        __syncthreads();
    }

    // Single writer: the original had every thread store *sum1 (a benign but
    // wasteful same-value race).
    if (tid == 0)
    {
        *sum1 = stemp1[0];
    }
}
// Host driver: fills a 4x4 matrix with 0..15, computes the energy feature on
// the CPU and on the GPU (single 4x4 block), and prints both for comparison.
int main()
{
    float *sum1 = NULL, sum2 = 0.0f;
    dim3 block(4, 4);
    int *pdMatrix = NULL;
    int *p = NULL;
    int width, height;
    width = height = 4;
    int t = height * width;
    int tot = 0;
    int *total = NULL;
    float f1 = 0.0f;
    cudaError_t cudaStatus = cudaSuccess;

    p = (int *)malloc(t * sizeof(int));   // was sizeof(float): wrong element type
    for (int i = 0; i < t; i++)
    {
        p[i] = i;
        tot += p[i];
    }

    // CPU reference value.
    f1 = ComputeFeature1(p, tot);
    printf("%f\n", f1);

    // Device buffers. The original cudaMalloc'ed `total` twice (leak).
    cudaMalloc((void **)&total, sizeof(int));
    cudaMemcpy(total, &tot, sizeof(int), cudaMemcpyHostToDevice);
    cudaMalloc((void **)&pdMatrix, t * sizeof(int));
    cudaMalloc((void **)&sum1, sizeof(float));
    cudaMemset(sum1, 0, sizeof(float));
    cudaMemcpy(pdMatrix, p, t * sizeof(int), cudaMemcpyHostToDevice);

    ComputeFeature1<<<1, block>>>(sum1, pdMatrix, width, height, total);
    cudaStatus = cudaGetLastError();          // catch launch-config errors
    if (cudaStatus != cudaSuccess) {
        fprintf(stderr, "ComputeFeature1 launch failed: %s\n", cudaGetErrorString(cudaStatus));
        goto Error;
    }
    cudaStatus = cudaDeviceSynchronize();     // surface async kernel errors before reading results
    if (cudaStatus != cudaSuccess) {
        fprintf(stderr, "cudaDeviceSynchronize returned error code %d after launching ComputeFeature1!\n", cudaStatus);
        goto Error;
    }
    cudaMemcpy(&sum2, sum1, sizeof(float), cudaMemcpyDeviceToHost);
    printf("%f\n", sum2);

Error:
    // The original leaked every allocation.
    cudaFree(total);
    cudaFree(pdMatrix);
    cudaFree(sum1);
    free(p);
    return cudaStatus;
}
//duide 1 end
#include "h.h"
#include "cuda_runtime.h"
#include "device_launch_parameters.h"
//测试求和
// Destructively sums all width*height entries of pdMatrix (in place) and
// writes the total to *sum1.
//
// Launch configuration assumed by the indexing: grid(2,2) x block(2,2)
// covering a 4x4 matrix. g_f1 is currently unused (kept for signature
// compatibility).
//
// NOTE(review): the tree reduction below operates on GLOBAL memory across
// blocks but only has a per-block barrier (__syncthreads); there is no
// grid-wide synchronization, so cross-block iterations are a data race. It
// happens to produce the right answer for this tiny launch -- confirm before
// reusing on larger grids.
//
// Fixed here: the original contained `/if(sum1!=0)` (a single slash, not a
// comment) which did not compile, plus a dead shared-memory staging pass
// whose only consumer was commented out.
__global__ static void ComputeFeature1( int *sum1,int* pdMatrix,int width,int height,float *g_f1) //186ye
{
    int i;
    unsigned int xIndex = __mul24(blockDim.x, blockIdx.x) + threadIdx.x;
    unsigned int yIndex = __mul24(blockDim.y, blockIdx.y) + threadIdx.y;
    unsigned int index = yIndex * width + xIndex;   // linear index into the matrix

    // In-place tree reduction over the whole matrix.
    if (index < (unsigned int)(height * width))
    {
        for (i = height * width / 2; i > 0; i /= 2)
        {
            if (index < (unsigned int)i)
            {
                pdMatrix[index] = pdMatrix[index] + pdMatrix[index + i];
            }
            __syncthreads();   // per-block barrier only; see NOTE above
        }
    }
    __syncthreads();

    // Single writer: the original had every thread store *sum1.
    if (index == 0)
    {
        *sum1 = pdMatrix[0];
    }
}
// Host driver for the destructive global-memory sum kernel: fills a 4x4
// matrix with 0..15, reduces it on the GPU with a 2x2 grid of 2x2 blocks,
// and prints the sum plus the (currently all-zero) per-block buffer g_f2.
int main()
{
    int *sum1 = NULL, sum2 = 0;
    dim3 grid(2, 2, 1);
    dim3 block(2, 2, 1);
    int *pdMatrix = NULL;
    int *p = NULL;
    float *g_f1 = NULL, *g_f2 = NULL;
    int width, height;
    width = height = 4;
    int t = height * width;
    int tt = t / 4;            // one partial per block
    float fsum = 0.0f;         // float accumulator (the original summed floats into an int)
    cudaError_t cudaStatus = cudaSuccess;

    p = (int *)malloc(t * sizeof(int));   // was sizeof(float): wrong element type
    for (int i = 0; i < t; i++)
    {
        p[i] = i;
    }

    cudaMalloc((void **)&pdMatrix, t * sizeof(int));
    cudaMalloc((void **)&g_f1, tt * sizeof(float));
    cudaMemset(g_f1, 0, sizeof(float) * tt);
    cudaMalloc((void **)&sum1, sizeof(int));
    cudaMemset(sum1, 0, sizeof(int));
    g_f2 = (float *)malloc(tt * sizeof(float));
    memset(g_f2, 0, sizeof(float) * tt);
    cudaMemcpy(pdMatrix, p, t * sizeof(int), cudaMemcpyHostToDevice);

    ComputeFeature1<<<grid, block>>>(sum1, pdMatrix, width, height, g_f1);
    cudaStatus = cudaGetLastError();          // catch launch-config errors
    if (cudaStatus != cudaSuccess) {
        fprintf(stderr, "ComputeFeature1 launch failed: %s\n", cudaGetErrorString(cudaStatus));
        goto Error;
    }
    cudaStatus = cudaDeviceSynchronize();     // surface async errors before reading results
    if (cudaStatus != cudaSuccess) {
        fprintf(stderr, "cudaDeviceSynchronize returned error code %d after launching ComputeFeature1!\n", cudaStatus);
        goto Error;
    }
    cudaMemcpy(g_f2, g_f1, tt * sizeof(float), cudaMemcpyDeviceToHost);
    cudaMemcpy(&sum2, sum1, sizeof(int), cudaMemcpyDeviceToHost);

    printf("%d\n", sum2);
    for (int i = 0; i < tt; i++)
    {
        printf("%f ", g_f2[i]);
        fsum += g_f2[i];       // accumulate in float, not int (truncation bug fixed)
    }
    printf("\n%f\n", fsum);

Error:
    // The original leaked every allocation.
    cudaFree(pdMatrix);
    cudaFree(g_f1);
    cudaFree(sum1);
    free(p);
    free(g_f2);
    return cudaStatus;
}
#include "h.h"
#define G 4 //灰度阶
#include "cuda_runtime.h"
#include "device_launch_parameters.h"
//测试求和
// CPU reference for the "energy" texture feature of a G x G co-occurrence
// matrix: sum over all cells of (cell / total)^2.
//
// pdMatrix: row-major G*G matrix.
// total:    normalization divisor (typically the sum of all cells).
// Returns 0.0f when total is 0 (the original divided by zero here).
float ComputeFeature1( int* pdMatrix,int total)
{
    int i, j;
    float f1 = 0.0f;
    if (total == 0)          // guard: avoid 0/0 -> NaN propagation
        return 0.0f;
    for (i = 0; i < G; i++)
    {
        for (j = 0; j < G; j++)
        {
            float t = (float)pdMatrix[i * G + j] / (float)total;  // normalized cell
            f1 += t * t;
        }
    }
    return f1;
}
// GPU version of the energy feature: *sum1 = sum over all cells of
// (pdMatrix[i] / *total)^2 for a G*G matrix.
//
// Launch configuration assumed: a SINGLE block of exactly G x G (4x4)
// threads (tid below is linear within that block).
// width/height are unused but kept for signature compatibility.
// NOTE(review): the store is transposed (x*G+y) relative to the linear load
// (tid = x + y*G); harmless here because every element is squared and summed.
__global__ static void ComputeFeature1( float *sum1,int* pdMatrix,int width,int height,int *total) //186ye
{
    int i;
    int tid = threadIdx.x + threadIdx.y * G;   // linear thread id in the block
    __shared__ float stemp1[G * G];

    // Normalize and square each cell into shared memory (same thread writes
    // and squares the same slot, so no barrier is needed in between; the
    // original's separate zero-init pass was dead and has been dropped).
    if (threadIdx.x < G && threadIdx.y < G)
    {
        float v = (float)pdMatrix[tid] / ((float)(*total));
        stemp1[threadIdx.x * G + threadIdx.y] = v * v;
    }
    __syncthreads();

    // In-place tree reduction over the G*G partials.
    // The barrier is kept OUTSIDE any divergent branch (the original nested
    // it under "if (tid < G*G)", which only worked because the block had
    // exactly G*G threads).
    for (i = G * G / 2; i > 0; i /= 2)
    {
        if (tid < i)
        {
            stemp1[tid] = stemp1[tid] + stemp1[tid + i];
        }
        __syncthreads();
    }

    // Single writer: the original had every thread store *sum1 (a benign but
    // wasteful same-value race).
    if (tid == 0)
    {
        *sum1 = stemp1[0];
    }
}
// Host driver: fills a 4x4 matrix with 0..15, computes the energy feature on
// the CPU and on the GPU (single 4x4 block), and prints both for comparison.
int main()
{
    float *sum1 = NULL, sum2 = 0.0f;
    dim3 block(4, 4);
    int *pdMatrix = NULL;
    int *p = NULL;
    int width, height;
    width = height = 4;
    int t = height * width;
    int tot = 0;
    int *total = NULL;
    float f1 = 0.0f;
    cudaError_t cudaStatus = cudaSuccess;

    p = (int *)malloc(t * sizeof(int));   // was sizeof(float): wrong element type
    for (int i = 0; i < t; i++)
    {
        p[i] = i;
        tot += p[i];
    }

    // CPU reference value.
    f1 = ComputeFeature1(p, tot);
    printf("%f\n", f1);

    // Device buffers. The original cudaMalloc'ed `total` twice (leak).
    cudaMalloc((void **)&total, sizeof(int));
    cudaMemcpy(total, &tot, sizeof(int), cudaMemcpyHostToDevice);
    cudaMalloc((void **)&pdMatrix, t * sizeof(int));
    cudaMalloc((void **)&sum1, sizeof(float));
    cudaMemset(sum1, 0, sizeof(float));
    cudaMemcpy(pdMatrix, p, t * sizeof(int), cudaMemcpyHostToDevice);

    ComputeFeature1<<<1, block>>>(sum1, pdMatrix, width, height, total);
    cudaStatus = cudaGetLastError();          // catch launch-config errors
    if (cudaStatus != cudaSuccess) {
        fprintf(stderr, "ComputeFeature1 launch failed: %s\n", cudaGetErrorString(cudaStatus));
        goto Error;
    }
    cudaStatus = cudaDeviceSynchronize();     // surface async kernel errors before reading results
    if (cudaStatus != cudaSuccess) {
        fprintf(stderr, "cudaDeviceSynchronize returned error code %d after launching ComputeFeature1!\n", cudaStatus);
        goto Error;
    }
    cudaMemcpy(&sum2, sum1, sizeof(float), cudaMemcpyDeviceToHost);
    printf("%f\n", sum2);

Error:
    // The original leaked every allocation.
    cudaFree(total);
    cudaFree(pdMatrix);
    cudaFree(sum1);
    free(p);
    return cudaStatus;
}
//duide 1 end
#include "h.h"
#include "cuda_runtime.h"
#include "device_launch_parameters.h"
//测试求和
// Destructively sums all width*height entries of pdMatrix (in place) and
// writes the total to *sum1.
//
// Launch configuration assumed by the indexing: grid(2,2) x block(2,2)
// covering a 4x4 matrix. g_f1 is currently unused (kept for signature
// compatibility).
//
// NOTE(review): the tree reduction below operates on GLOBAL memory across
// blocks but only has a per-block barrier (__syncthreads); there is no
// grid-wide synchronization, so cross-block iterations are a data race. It
// happens to produce the right answer for this tiny launch -- confirm before
// reusing on larger grids.
//
// Fixed here: the original contained `/if(sum1!=0)` (a single slash, not a
// comment) which did not compile, plus a dead shared-memory staging pass
// whose only consumer was commented out.
__global__ static void ComputeFeature1( int *sum1,int* pdMatrix,int width,int height,float *g_f1) //186ye
{
    int i;
    unsigned int xIndex = __mul24(blockDim.x, blockIdx.x) + threadIdx.x;
    unsigned int yIndex = __mul24(blockDim.y, blockIdx.y) + threadIdx.y;
    unsigned int index = yIndex * width + xIndex;   // linear index into the matrix

    // In-place tree reduction over the whole matrix.
    if (index < (unsigned int)(height * width))
    {
        for (i = height * width / 2; i > 0; i /= 2)
        {
            if (index < (unsigned int)i)
            {
                pdMatrix[index] = pdMatrix[index] + pdMatrix[index + i];
            }
            __syncthreads();   // per-block barrier only; see NOTE above
        }
    }
    __syncthreads();

    // Single writer: the original had every thread store *sum1.
    if (index == 0)
    {
        *sum1 = pdMatrix[0];
    }
}
// Host driver for the destructive global-memory sum kernel: fills a 4x4
// matrix with 0..15, reduces it on the GPU with a 2x2 grid of 2x2 blocks,
// and prints the sum plus the (currently all-zero) per-block buffer g_f2.
int main()
{
    int *sum1 = NULL, sum2 = 0;
    dim3 grid(2, 2, 1);
    dim3 block(2, 2, 1);
    int *pdMatrix = NULL;
    int *p = NULL;
    float *g_f1 = NULL, *g_f2 = NULL;
    int width, height;
    width = height = 4;
    int t = height * width;
    int tt = t / 4;            // one partial per block
    float fsum = 0.0f;         // float accumulator (the original summed floats into an int)
    cudaError_t cudaStatus = cudaSuccess;

    p = (int *)malloc(t * sizeof(int));   // was sizeof(float): wrong element type
    for (int i = 0; i < t; i++)
    {
        p[i] = i;
    }

    cudaMalloc((void **)&pdMatrix, t * sizeof(int));
    cudaMalloc((void **)&g_f1, tt * sizeof(float));
    cudaMemset(g_f1, 0, sizeof(float) * tt);
    cudaMalloc((void **)&sum1, sizeof(int));
    cudaMemset(sum1, 0, sizeof(int));
    g_f2 = (float *)malloc(tt * sizeof(float));
    memset(g_f2, 0, sizeof(float) * tt);
    cudaMemcpy(pdMatrix, p, t * sizeof(int), cudaMemcpyHostToDevice);

    ComputeFeature1<<<grid, block>>>(sum1, pdMatrix, width, height, g_f1);
    cudaStatus = cudaGetLastError();          // catch launch-config errors
    if (cudaStatus != cudaSuccess) {
        fprintf(stderr, "ComputeFeature1 launch failed: %s\n", cudaGetErrorString(cudaStatus));
        goto Error;
    }
    cudaStatus = cudaDeviceSynchronize();     // surface async errors before reading results
    if (cudaStatus != cudaSuccess) {
        fprintf(stderr, "cudaDeviceSynchronize returned error code %d after launching ComputeFeature1!\n", cudaStatus);
        goto Error;
    }
    cudaMemcpy(g_f2, g_f1, tt * sizeof(float), cudaMemcpyDeviceToHost);
    cudaMemcpy(&sum2, sum1, sizeof(int), cudaMemcpyDeviceToHost);

    printf("%d\n", sum2);
    for (int i = 0; i < tt; i++)
    {
        printf("%f ", g_f2[i]);
        fsum += g_f2[i];       // accumulate in float, not int (truncation bug fixed)
    }
    printf("\n%f\n", fsum);

Error:
    // The original leaked every allocation.
    cudaFree(pdMatrix);
    cudaFree(g_f1);
    cudaFree(sum1);
    free(p);
    free(g_f2);
    return cudaStatus;
}
相关文章推荐
- 蓝桥试题测试---序列求和
- zzuli OJ 1081: n个数求和 (多实例测试)
- 1081: n个数求和 (多实例测试)
- 【JMeter】分布式测试(waiting)
- zzuli OJ 1081: n个数求和 (多实例测试)
- BJ 集训测试6 求和
- 小米手机真机测试 waitingForDebugger 不动了
- [原创]性能测试工具介绍
- 需求测试总结
- 软件测试过程模型的种类之--------H模型
- 由测试部门进行单元测试为什么成本昂贵?
- ESX搭建网络测试环境
- GUI自动测试工具
- 【DOC】软件测试工程师工作总结
- 测试知识大闯关第24期 回答
- Junit+Selenium+Maven+SVN+Eclipse+AutoFrame全自动化测试实践实例(一)
- 一个有趣的测试
- apt-get正在等待报头(waiting for headers)解决方法
- 真机测试,file explorer无法打开data文件夹或其他
- 真机测试INSTALL_FAILED_INSUFFICIENT_STORAGE 的解决方法