测试求和 waiting
2014-04-21 21:45
211 查看
//duide 1
#include "h.h"
#define G 4 //灰度阶
#include "cuda_runtime.h"
#include "device_launch_parameters.h"
//测试求和
// CPU reference for the "energy" texture feature of a G x G co-occurrence
// matrix: sum over all cells of (cell / total)^2.
//
// pdMatrix: row-major G*G matrix.
// total:    normalization divisor (typically the sum of all cells).
// Returns 0.0f when total is 0 (the original divided by zero here).
float ComputeFeature1( int* pdMatrix,int total)
{
    int i, j;
    float f1 = 0.0f;
    if (total == 0)          // guard: avoid 0/0 -> NaN propagation
        return 0.0f;
    for (i = 0; i < G; i++)
    {
        for (j = 0; j < G; j++)
        {
            float t = (float)pdMatrix[i * G + j] / (float)total;  // normalized cell
            f1 += t * t;
        }
    }
    return f1;
}
// GPU version of the energy feature: *sum1 = sum over all cells of
// (pdMatrix[i] / *total)^2 for a G*G matrix.
//
// Launch configuration assumed: a SINGLE block of exactly G x G (4x4)
// threads (tid below is linear within that block).
// width/height are unused but kept for signature compatibility.
// NOTE(review): the store is transposed (x*G+y) relative to the linear load
// (tid = x + y*G); harmless here because every element is squared and summed.
__global__ static void ComputeFeature1( float *sum1,int* pdMatrix,int width,int height,int *total) //186ye
{
    int i;
    int tid = threadIdx.x + threadIdx.y * G;   // linear thread id in the block
    __shared__ float stemp1[G * G];

    // Normalize and square each cell into shared memory (same thread writes
    // and squares the same slot, so no barrier is needed in between; the
    // original's separate zero-init pass was dead and has been dropped).
    if (threadIdx.x < G && threadIdx.y < G)
    {
        float v = (float)pdMatrix[tid] / ((float)(*total));
        stemp1[threadIdx.x * G + threadIdx.y] = v * v;
    }
    __syncthreads();

    // In-place tree reduction over the G*G partials.
    // The barrier is kept OUTSIDE any divergent branch (the original nested
    // it under "if (tid < G*G)", which only worked because the block had
    // exactly G*G threads).
    for (i = G * G / 2; i > 0; i /= 2)
    {
        if (tid < i)
        {
            stemp1[tid] = stemp1[tid] + stemp1[tid + i];
        }
        __syncthreads();
    }

    // Single writer: the original had every thread store *sum1 (a benign but
    // wasteful same-value race).
    if (tid == 0)
    {
        *sum1 = stemp1[0];
    }
}
// Host driver: fills a 4x4 matrix with 0..15, computes the energy feature on
// the CPU and on the GPU (single 4x4 block), and prints both for comparison.
int main()
{
    float *sum1 = NULL, sum2 = 0.0f;
    dim3 block(4, 4);
    int *pdMatrix = NULL;
    int *p = NULL;
    int width, height;
    width = height = 4;
    int t = height * width;
    int tot = 0;
    int *total = NULL;
    float f1 = 0.0f;
    cudaError_t cudaStatus = cudaSuccess;

    p = (int *)malloc(t * sizeof(int));   // was sizeof(float): wrong element type
    for (int i = 0; i < t; i++)
    {
        p[i] = i;
        tot += p[i];
    }

    // CPU reference value.
    f1 = ComputeFeature1(p, tot);
    printf("%f\n", f1);

    // Device buffers. The original cudaMalloc'ed `total` twice (leak).
    cudaMalloc((void **)&total, sizeof(int));
    cudaMemcpy(total, &tot, sizeof(int), cudaMemcpyHostToDevice);
    cudaMalloc((void **)&pdMatrix, t * sizeof(int));
    cudaMalloc((void **)&sum1, sizeof(float));
    cudaMemset(sum1, 0, sizeof(float));
    cudaMemcpy(pdMatrix, p, t * sizeof(int), cudaMemcpyHostToDevice);

    ComputeFeature1<<<1, block>>>(sum1, pdMatrix, width, height, total);
    cudaStatus = cudaGetLastError();          // catch launch-config errors
    if (cudaStatus != cudaSuccess) {
        fprintf(stderr, "ComputeFeature1 launch failed: %s\n", cudaGetErrorString(cudaStatus));
        goto Error;
    }
    cudaStatus = cudaDeviceSynchronize();     // surface async kernel errors before reading results
    if (cudaStatus != cudaSuccess) {
        fprintf(stderr, "cudaDeviceSynchronize returned error code %d after launching ComputeFeature1!\n", cudaStatus);
        goto Error;
    }
    cudaMemcpy(&sum2, sum1, sizeof(float), cudaMemcpyDeviceToHost);
    printf("%f\n", sum2);

Error:
    // The original leaked every allocation.
    cudaFree(total);
    cudaFree(pdMatrix);
    cudaFree(sum1);
    free(p);
    return cudaStatus;
}
//duide 1 end
#include "h.h"
#include "cuda_runtime.h"
#include "device_launch_parameters.h"
//测试求和
// Destructively sums all width*height entries of pdMatrix (in place) and
// writes the total to *sum1.
//
// Launch configuration assumed by the indexing: grid(2,2) x block(2,2)
// covering a 4x4 matrix. g_f1 is currently unused (kept for signature
// compatibility).
//
// NOTE(review): the tree reduction below operates on GLOBAL memory across
// blocks but only has a per-block barrier (__syncthreads); there is no
// grid-wide synchronization, so cross-block iterations are a data race. It
// happens to produce the right answer for this tiny launch -- confirm before
// reusing on larger grids.
//
// Fixed here: the original contained `/if(sum1!=0)` (a single slash, not a
// comment) which did not compile, plus a dead shared-memory staging pass
// whose only consumer was commented out.
__global__ static void ComputeFeature1( int *sum1,int* pdMatrix,int width,int height,float *g_f1) //186ye
{
    int i;
    unsigned int xIndex = __mul24(blockDim.x, blockIdx.x) + threadIdx.x;
    unsigned int yIndex = __mul24(blockDim.y, blockIdx.y) + threadIdx.y;
    unsigned int index = yIndex * width + xIndex;   // linear index into the matrix

    // In-place tree reduction over the whole matrix.
    if (index < (unsigned int)(height * width))
    {
        for (i = height * width / 2; i > 0; i /= 2)
        {
            if (index < (unsigned int)i)
            {
                pdMatrix[index] = pdMatrix[index] + pdMatrix[index + i];
            }
            __syncthreads();   // per-block barrier only; see NOTE above
        }
    }
    __syncthreads();

    // Single writer: the original had every thread store *sum1.
    if (index == 0)
    {
        *sum1 = pdMatrix[0];
    }
}
// Host driver for the destructive global-memory sum kernel: fills a 4x4
// matrix with 0..15, reduces it on the GPU with a 2x2 grid of 2x2 blocks,
// and prints the sum plus the (currently all-zero) per-block buffer g_f2.
int main()
{
    int *sum1 = NULL, sum2 = 0;
    dim3 grid(2, 2, 1);
    dim3 block(2, 2, 1);
    int *pdMatrix = NULL;
    int *p = NULL;
    float *g_f1 = NULL, *g_f2 = NULL;
    int width, height;
    width = height = 4;
    int t = height * width;
    int tt = t / 4;            // one partial per block
    float fsum = 0.0f;         // float accumulator (the original summed floats into an int)
    cudaError_t cudaStatus = cudaSuccess;

    p = (int *)malloc(t * sizeof(int));   // was sizeof(float): wrong element type
    for (int i = 0; i < t; i++)
    {
        p[i] = i;
    }

    cudaMalloc((void **)&pdMatrix, t * sizeof(int));
    cudaMalloc((void **)&g_f1, tt * sizeof(float));
    cudaMemset(g_f1, 0, sizeof(float) * tt);
    cudaMalloc((void **)&sum1, sizeof(int));
    cudaMemset(sum1, 0, sizeof(int));
    g_f2 = (float *)malloc(tt * sizeof(float));
    memset(g_f2, 0, sizeof(float) * tt);
    cudaMemcpy(pdMatrix, p, t * sizeof(int), cudaMemcpyHostToDevice);

    ComputeFeature1<<<grid, block>>>(sum1, pdMatrix, width, height, g_f1);
    cudaStatus = cudaGetLastError();          // catch launch-config errors
    if (cudaStatus != cudaSuccess) {
        fprintf(stderr, "ComputeFeature1 launch failed: %s\n", cudaGetErrorString(cudaStatus));
        goto Error;
    }
    cudaStatus = cudaDeviceSynchronize();     // surface async errors before reading results
    if (cudaStatus != cudaSuccess) {
        fprintf(stderr, "cudaDeviceSynchronize returned error code %d after launching ComputeFeature1!\n", cudaStatus);
        goto Error;
    }
    cudaMemcpy(g_f2, g_f1, tt * sizeof(float), cudaMemcpyDeviceToHost);
    cudaMemcpy(&sum2, sum1, sizeof(int), cudaMemcpyDeviceToHost);

    printf("%d\n", sum2);
    for (int i = 0; i < tt; i++)
    {
        printf("%f ", g_f2[i]);
        fsum += g_f2[i];       // accumulate in float, not int (truncation bug fixed)
    }
    printf("\n%f\n", fsum);

Error:
    // The original leaked every allocation.
    cudaFree(pdMatrix);
    cudaFree(g_f1);
    cudaFree(sum1);
    free(p);
    free(g_f2);
    return cudaStatus;
}
#include "h.h"
#define G 4 //灰度阶
#include "cuda_runtime.h"
#include "device_launch_parameters.h"
//测试求和
// CPU reference for the "energy" texture feature of a G x G co-occurrence
// matrix: sum over all cells of (cell / total)^2.
//
// pdMatrix: row-major G*G matrix.
// total:    normalization divisor (typically the sum of all cells).
// Returns 0.0f when total is 0 (the original divided by zero here).
float ComputeFeature1( int* pdMatrix,int total)
{
    int i, j;
    float f1 = 0.0f;
    if (total == 0)          // guard: avoid 0/0 -> NaN propagation
        return 0.0f;
    for (i = 0; i < G; i++)
    {
        for (j = 0; j < G; j++)
        {
            float t = (float)pdMatrix[i * G + j] / (float)total;  // normalized cell
            f1 += t * t;
        }
    }
    return f1;
}
// GPU version of the energy feature: *sum1 = sum over all cells of
// (pdMatrix[i] / *total)^2 for a G*G matrix.
//
// Launch configuration assumed: a SINGLE block of exactly G x G (4x4)
// threads (tid below is linear within that block).
// width/height are unused but kept for signature compatibility.
// NOTE(review): the store is transposed (x*G+y) relative to the linear load
// (tid = x + y*G); harmless here because every element is squared and summed.
__global__ static void ComputeFeature1( float *sum1,int* pdMatrix,int width,int height,int *total) //186ye
{
    int i;
    int tid = threadIdx.x + threadIdx.y * G;   // linear thread id in the block
    __shared__ float stemp1[G * G];

    // Normalize and square each cell into shared memory (same thread writes
    // and squares the same slot, so no barrier is needed in between; the
    // original's separate zero-init pass was dead and has been dropped).
    if (threadIdx.x < G && threadIdx.y < G)
    {
        float v = (float)pdMatrix[tid] / ((float)(*total));
        stemp1[threadIdx.x * G + threadIdx.y] = v * v;
    }
    __syncthreads();

    // In-place tree reduction over the G*G partials.
    // The barrier is kept OUTSIDE any divergent branch (the original nested
    // it under "if (tid < G*G)", which only worked because the block had
    // exactly G*G threads).
    for (i = G * G / 2; i > 0; i /= 2)
    {
        if (tid < i)
        {
            stemp1[tid] = stemp1[tid] + stemp1[tid + i];
        }
        __syncthreads();
    }

    // Single writer: the original had every thread store *sum1 (a benign but
    // wasteful same-value race).
    if (tid == 0)
    {
        *sum1 = stemp1[0];
    }
}
// Host driver: fills a 4x4 matrix with 0..15, computes the energy feature on
// the CPU and on the GPU (single 4x4 block), and prints both for comparison.
int main()
{
    float *sum1 = NULL, sum2 = 0.0f;
    dim3 block(4, 4);
    int *pdMatrix = NULL;
    int *p = NULL;
    int width, height;
    width = height = 4;
    int t = height * width;
    int tot = 0;
    int *total = NULL;
    float f1 = 0.0f;
    cudaError_t cudaStatus = cudaSuccess;

    p = (int *)malloc(t * sizeof(int));   // was sizeof(float): wrong element type
    for (int i = 0; i < t; i++)
    {
        p[i] = i;
        tot += p[i];
    }

    // CPU reference value.
    f1 = ComputeFeature1(p, tot);
    printf("%f\n", f1);

    // Device buffers. The original cudaMalloc'ed `total` twice (leak).
    cudaMalloc((void **)&total, sizeof(int));
    cudaMemcpy(total, &tot, sizeof(int), cudaMemcpyHostToDevice);
    cudaMalloc((void **)&pdMatrix, t * sizeof(int));
    cudaMalloc((void **)&sum1, sizeof(float));
    cudaMemset(sum1, 0, sizeof(float));
    cudaMemcpy(pdMatrix, p, t * sizeof(int), cudaMemcpyHostToDevice);

    ComputeFeature1<<<1, block>>>(sum1, pdMatrix, width, height, total);
    cudaStatus = cudaGetLastError();          // catch launch-config errors
    if (cudaStatus != cudaSuccess) {
        fprintf(stderr, "ComputeFeature1 launch failed: %s\n", cudaGetErrorString(cudaStatus));
        goto Error;
    }
    cudaStatus = cudaDeviceSynchronize();     // surface async kernel errors before reading results
    if (cudaStatus != cudaSuccess) {
        fprintf(stderr, "cudaDeviceSynchronize returned error code %d after launching ComputeFeature1!\n", cudaStatus);
        goto Error;
    }
    cudaMemcpy(&sum2, sum1, sizeof(float), cudaMemcpyDeviceToHost);
    printf("%f\n", sum2);

Error:
    // The original leaked every allocation.
    cudaFree(total);
    cudaFree(pdMatrix);
    cudaFree(sum1);
    free(p);
    return cudaStatus;
}
//duide 1 end
#include "h.h"
#include "cuda_runtime.h"
#include "device_launch_parameters.h"
//测试求和
// Destructively sums all width*height entries of pdMatrix (in place) and
// writes the total to *sum1.
//
// Launch configuration assumed by the indexing: grid(2,2) x block(2,2)
// covering a 4x4 matrix. g_f1 is currently unused (kept for signature
// compatibility).
//
// NOTE(review): the tree reduction below operates on GLOBAL memory across
// blocks but only has a per-block barrier (__syncthreads); there is no
// grid-wide synchronization, so cross-block iterations are a data race. It
// happens to produce the right answer for this tiny launch -- confirm before
// reusing on larger grids.
//
// Fixed here: the original contained `/if(sum1!=0)` (a single slash, not a
// comment) which did not compile, plus a dead shared-memory staging pass
// whose only consumer was commented out.
__global__ static void ComputeFeature1( int *sum1,int* pdMatrix,int width,int height,float *g_f1) //186ye
{
    int i;
    unsigned int xIndex = __mul24(blockDim.x, blockIdx.x) + threadIdx.x;
    unsigned int yIndex = __mul24(blockDim.y, blockIdx.y) + threadIdx.y;
    unsigned int index = yIndex * width + xIndex;   // linear index into the matrix

    // In-place tree reduction over the whole matrix.
    if (index < (unsigned int)(height * width))
    {
        for (i = height * width / 2; i > 0; i /= 2)
        {
            if (index < (unsigned int)i)
            {
                pdMatrix[index] = pdMatrix[index] + pdMatrix[index + i];
            }
            __syncthreads();   // per-block barrier only; see NOTE above
        }
    }
    __syncthreads();

    // Single writer: the original had every thread store *sum1.
    if (index == 0)
    {
        *sum1 = pdMatrix[0];
    }
}
// Host driver for the destructive global-memory sum kernel: fills a 4x4
// matrix with 0..15, reduces it on the GPU with a 2x2 grid of 2x2 blocks,
// and prints the sum plus the (currently all-zero) per-block buffer g_f2.
int main()
{
    int *sum1 = NULL, sum2 = 0;
    dim3 grid(2, 2, 1);
    dim3 block(2, 2, 1);
    int *pdMatrix = NULL;
    int *p = NULL;
    float *g_f1 = NULL, *g_f2 = NULL;
    int width, height;
    width = height = 4;
    int t = height * width;
    int tt = t / 4;            // one partial per block
    float fsum = 0.0f;         // float accumulator (the original summed floats into an int)
    cudaError_t cudaStatus = cudaSuccess;

    p = (int *)malloc(t * sizeof(int));   // was sizeof(float): wrong element type
    for (int i = 0; i < t; i++)
    {
        p[i] = i;
    }

    cudaMalloc((void **)&pdMatrix, t * sizeof(int));
    cudaMalloc((void **)&g_f1, tt * sizeof(float));
    cudaMemset(g_f1, 0, sizeof(float) * tt);
    cudaMalloc((void **)&sum1, sizeof(int));
    cudaMemset(sum1, 0, sizeof(int));
    g_f2 = (float *)malloc(tt * sizeof(float));
    memset(g_f2, 0, sizeof(float) * tt);
    cudaMemcpy(pdMatrix, p, t * sizeof(int), cudaMemcpyHostToDevice);

    ComputeFeature1<<<grid, block>>>(sum1, pdMatrix, width, height, g_f1);
    cudaStatus = cudaGetLastError();          // catch launch-config errors
    if (cudaStatus != cudaSuccess) {
        fprintf(stderr, "ComputeFeature1 launch failed: %s\n", cudaGetErrorString(cudaStatus));
        goto Error;
    }
    cudaStatus = cudaDeviceSynchronize();     // surface async errors before reading results
    if (cudaStatus != cudaSuccess) {
        fprintf(stderr, "cudaDeviceSynchronize returned error code %d after launching ComputeFeature1!\n", cudaStatus);
        goto Error;
    }
    cudaMemcpy(g_f2, g_f1, tt * sizeof(float), cudaMemcpyDeviceToHost);
    cudaMemcpy(&sum2, sum1, sizeof(int), cudaMemcpyDeviceToHost);

    printf("%d\n", sum2);
    for (int i = 0; i < tt; i++)
    {
        printf("%f ", g_f2[i]);
        fsum += g_f2[i];       // accumulate in float, not int (truncation bug fixed)
    }
    printf("\n%f\n", fsum);

Error:
    // The original leaked every allocation.
    cudaFree(pdMatrix);
    cudaFree(g_f1);
    cudaFree(sum1);
    free(p);
    free(g_f2);
    return cudaStatus;
}
相关文章推荐
- 蓝桥试题测试---序列求和
- zzuli OJ 1081: n个数求和 (多实例测试)
- 1081: n个数求和 (多实例测试)
- 【JMeter】分布式测试(waiting)
- zzuli OJ 1081: n个数求和 (多实例测试)
- BJ 集训测试6 求和
- 小米手机真机测试 waitingForDebugger 不动了
- [原创]性能测试工具介绍
- 需求测试总结
- 软件测试过程模型的种类之--------H模型
- 由测试部门进行单元测试为什么成本昂贵?
- ESX搭建网络测试环境
- GUI自动测试工具
- 【DOC】软件测试工程师工作总结
- 测试知识大闯关第24期 回答
- Junit+Selenium+Maven+SVN+Eclipse+AutoFrame全自动化测试实践实例(一)
- 一个有趣的测试
- apt-get正在等待报头(waiting for headers)解决方法
- 真机测试,file explorer无法打开data文件夹或其他
- 真机测试INSTALL_FAILED_INSUFFICIENT_STORAGE 的解决方法