您的位置:首页 > 其它

VS2010下FPU和SSE优化测试结果

2011-05-16 15:52 176 查看
#include <xmmintrin.h>
#include <math.h>
#include <time.h>
#include <iostream>
using namespace std;
#define asm __asm
#define ARRAY_SIZE 3000000
clock_t start,finish;
// Baseline initializer: writes sin(i) + cos(i) into pSource[i] for every
// i in [0, nCount) using the runtime library's sin/cos, then prints the
// elapsed time in milliseconds to stdout.
// The timestamps are kept in the file-level `start` / `finish` variables.
void init_CPLUSPLUS(float* pSource ,int nCount)
{
    start = clock();

    for (int idx = 0; idx < nCount; ++idx)
    {
        const float angle = (float)idx;
        // Each term is narrowed to float before the two are added.
        pSource[idx] = (float)(sin(angle)) + (float)cos(angle);
    }

    finish = clock();
    cout << "init CPLUSPLUS" << (finish - start) * 1000.0 / CLOCKS_PER_SEC << endl;
}
// x87 initializer: writes sin(i) + cos(i) into pSource[i] for every
// i in [0, nCount) using a single FSINCOS instruction per element, then
// prints the elapsed time in milliseconds to stdout.
//
// Uses MSVC-style inline assembly (`asm` is mapped to `__asm` by a macro
// earlier in this file), so it only builds where that syntax is supported.
// The timestamps are kept in the file-level `start` / `finish` variables.
void init_CPLUSPLUSFPU(float* pSource ,int nCount)
{
    start = clock();
    for(int i=0;i<nCount;i++)
    {
        // Instruction sequence, in order:
        //   fild i        - push the integer loop index onto the FPU stack
        //   fsincos       - replace it with its sine and cosine (two stack entries)
        //   fadd          - no-operand form: add the two entries and pop, leaving the sum
        //   mov eax,...   - load the address of the current output element
        //   fstp dword... - store the sum as a 32-bit float and pop, leaving the
        //                   FPU stack as it was before `fild`
        asm
        {
            fild i;
            fsincos;
            fadd;
            mov eax,pSource;
            fstp dword ptr[eax];
        }
        // Advance the (by-value) output pointer to the next element.
        pSource++;
    }
    finish = clock();
    cout<<"init CPLUSPLUSFPU"<<(finish-start)*1000.0/CLOCKS_PER_SEC<<endl;
}

// Scalar reference kernel: for every i in [0, nCount) stores
// sqrt(pSource1[i]^2 + pSource2[i]^2) + 0.5 into pResult[i], then prints
// the elapsed time in milliseconds to stdout.
// The timestamps are kept in the file-level `start` / `finish` variables.
void sqrt_CPLUSPLUS(float* pSource1,float* pSource2,float* pResult,int nCount)
{
    start = clock();

    for (int idx = 0; idx < nCount; ++idx)
    {
        // Read both inputs before writing the output element.
        const float a = pSource1[idx];
        const float b = pSource2[idx];
        // The 0.5 literal is a double, so the addition is done in double
        // and the result is converted back to float on assignment.
        pResult[idx] = sqrt(a*a + b*b) + 0.5;
    }

    finish = clock();
    cout << "sqrt_CPLUSPLUS" << (finish - start) * 1000.0 / CLOCKS_PER_SEC << endl;
}

// SSE kernel: for every i in [0, nCount) stores
// sqrt(pSource1[i]^2 + pSource2[i]^2) + 0.5f into pResult[i], processing
// four floats per iteration, then prints the elapsed time in milliseconds
// to stdout.
//
// pSource1, pSource2 and pResult must be 16-byte aligned because the
// vector loop uses the aligned load/store intrinsics.
// Elements left over when nCount is not a multiple of 4 are handled by a
// scalar loop so that every output element is written.
// The timestamps are kept in the file-level `start` / `finish` variables.
void sqrt_CPLUSPLUSSSE(float* pSource1,float* pSource2,float* pResult,int nCount)
{
    start = clock();

    // Largest multiple of 4 not exceeding nCount (for non-negative nCount);
    // for a negative nCount neither loop below executes.
    const int nVectorized = nCount - (nCount % 4);
    const __m128 half = _mm_set_ps1(0.5f);

    for (int i = 0; i < nVectorized; i += 4)
    {
        __m128 s1 = _mm_load_ps(pSource1 + i);
        __m128 s2 = _mm_load_ps(pSource2 + i);
        s1 = _mm_mul_ps(s1, s1);
        s2 = _mm_mul_ps(s2, s2);
        __m128 r = _mm_sqrt_ps(_mm_add_ps(s1, s2));
        r = _mm_add_ps(r, half);
        _mm_store_ps(pResult + i, r);
    }

    // Scalar tail: at most 3 remaining elements, computed in single
    // precision like the vector lanes above.
    for (int i = nVectorized; i < nCount; i++)
    {
        const float a = pSource1[i];
        const float b = pSource2[i];
        pResult[i] = sqrtf(a*a + b*b) + 0.5f;
    }

    finish = clock();
    cout<<"sqrt_CPLUSPLUSSSE"<<(finish-start)*1000.0/CLOCKS_PER_SEC<<endl;
}
void main()
{
//	__declspec(align(16)) float fArray[ARRAY_SIZE];

float* pSource1 = (float*)_aligned_malloc(ARRAY_SIZE*sizeof(float),16);
float* pSource2 = (float*)_aligned_malloc(ARRAY_SIZE*sizeof(float),16);
float* pResult1 = (float*)_aligned_malloc(ARRAY_SIZE*sizeof(float),16);
float* pResult2 = (float*)_aligned_malloc(ARRAY_SIZE*sizeof(float),16);

init_CPLUSPLUS(pSource1,ARRAY_SIZE);

init_CPLUSPLUSFPU(pSource2,ARRAY_SIZE);

for(int i=0;i<ARRAY_SIZE;i++)
{
if(fabs(pSource1[i]-pSource2[i])>0.00001f)
{
cout<<"init error";
break;
}
}

sqrt_CPLUSPLUS(pSource1,pSource2,pResult1,ARRAY_SIZE);
sqrt_CPLUSPLUSSSE(pSource1,pSource2,pResult2,ARRAY_SIZE);

for(int i=0;i<ARRAY_SIZE;i++)
{
if(fabs(pResult1[i]-pResult2[i])>0.00001f)
{
cout<<"sqrt error";
break;
}
}

_aligned_free(pResult2);
_aligned_free(pResult1);
_aligned_free(pSource2);
_aligned_free(pSource1);
system("pause");
}


分别用 x87 指令(fsincos)和 SSE 指令(_mm_sqrt_ps 等)对三角函数计算和开方计算进行优化,并与普通 C++ 实现对比耗时。

平台:软件 VS2010,CPU Intel Celeron E3400 2.6 GHz

Release版本运行结果如下:

init CPLUSPLUS:359ms

init CPLUSPLUSFPU:141ms

sqrt_CPLUSPLUS:47ms

sqrt_CPLUSPLUSSSE:15ms
内容来自用户分享和网络整理,不保证内容的准确性,如有侵权内容,可联系管理员处理 点击这里给我发消息
标签: