关于使用MMX/SSE技术优化memcpy的尝试
2010-08-18 09:28
281 查看
近来,希望能通过使用某种技术优化常规memcpy()的性能,于是尝试了 MMX/SSE,希望能借此实现一个性能更高的memcpy函数。
代码如下(里面的USE1函数是借用别人的,但性能也不怎么样):
试验结果:
1. 未优化,memcpy 100M数据:
[root@localhost opt]# ./test
memcpy Use 94 ms
memcpy Use 61 ms
memcpy Use 61
ms
memcpy Use 61 ms
memcpy Use 62 ms
memcpy Use 61 ms
memcpy Use 61
ms
memcpy Use 61 ms
2. 使用MMX/SSE优化,memcpy 100M数据:
[root@localhost
opt]# ./test 1
your choice is 1
memcopy0 Use 110
ms
memcopy1 Use 110 ms
memcopy2 Use 110 ms
memcopy0 Use
40 ms
memcopy1 Use 42 ms
memcopy2 Use 40 ms
memcopy0 Use 48
ms
memcopy1 Use 40 ms
memcopy2 Use 41 ms
memcopy0 Use 40 ms
memcopy1
Use 40
ms
初步结论:
使用MMX/SSE技术对memcpy的性能优化空间不太大,而且在执行初期,优化的性能甚至比不上未优化的性能。
从原理上讲,SSE会比MMX快,MMX会比常规memcpy快。可能受限于AT&T汇编掌握程度,暂时未能给出理想的优化结果。
如果谁有更好的想法,欢迎随时交流。
代码如下(里面的USE1函数是借用别人的,但性能也不怎么样):
#include <stdio.h>
#include <stdlib.h>
#include <string.h>
#include <unistd.h>
#include <sys/time.h>

#define LEN (100 * 1024 * 1024)   /* 100 MB per buffer */
#define USE1                      /* select which memcopy() variant to build */

/*
 * Scoped wall-clock timer: records gettimeofday() on construction and
 * prints "<msg> Use <N> ms" when it goes out of scope.
 */
class TimeUse {
public:
    TimeUse(const char *cMsg)
    {
        memset(m_cMsg, 0, sizeof(m_cMsg));
        /* snprintf always NUL-terminates and respects the destination size.
         * The original strncpy(m_cMsg, cMsg, strlen(cMsg)) bounded the copy
         * by the SOURCE length, so a long message could overflow m_cMsg. */
        snprintf(m_cMsg, sizeof(m_cMsg), "%s", cMsg);
        gettimeofday(&tTime1, NULL);
    }
    ~TimeUse()
    {
        gettimeofday(&tTime2, NULL);
        unsigned long ulDiff = (tTime2.tv_sec - tTime1.tv_sec) * 1000
                             + (tTime2.tv_usec - tTime1.tv_usec) / 1000;
        /* "%lu" to match unsigned long; the original used "%ld" and a
         * garbled "/n" instead of "\n". */
        printf("%s Use %lu ms\n", m_cMsg, ulDiff);
    }
private:
    struct timeval tTime1, tTime2;
    char m_cMsg[255];
};

#ifdef USE0
/*
 * MMX copy: 64 bytes per iteration through mm0-mm7 with non-temporal
 * (movntq) stores; the sub-64-byte tail falls back to libc memcpy.
 * Measured ~85 ms per 100 MB by the original author (32-bit x86 only).
 */
static inline void *memcopy(void *dest, const void *src, int size)
{
    char *to = (char *)dest;
    const char *from = (const char *)src;
    int len = size;
    int iCount = len / 64;
    /* fsave/frstor need a 108-byte area; the FPU state must be saved
     * before the MMX registers are touched and restored afterwards. */
    static char cFSave[108];

    if (iCount > 0) {
        /*
         * Fixes vs. the original asm:
         *  - local label "1:" / "jnz 1b" instead of the global "loop:"
         *    (a global label is a duplicate-symbol error as soon as the
         *    inline function is emitted twice);
         *  - the save area is a real operand instead of ".lcomm buffer";
         *  - from/to/iCount are "+" (read-write) operands, since the asm
         *    modifies them — the original declared them input-only, which
         *    is undefined behavior.
         */
        __asm__ __volatile__(
            "fsave %3\n"
            "1:\n"
            "movq (%0), %%mm0\n"
            "movq 8(%0), %%mm1\n"
            "movq 16(%0), %%mm2\n"
            "movq 24(%0), %%mm3\n"
            "movq 32(%0), %%mm4\n"
            "movq 40(%0), %%mm5\n"
            "movq 48(%0), %%mm6\n"
            "movq 56(%0), %%mm7\n"
            "movntq %%mm0, (%1)\n"
            "movntq %%mm1, 8(%1)\n"
            "movntq %%mm2, 16(%1)\n"
            "movntq %%mm3, 24(%1)\n"
            "movntq %%mm4, 32(%1)\n"
            "movntq %%mm5, 40(%1)\n"
            "movntq %%mm6, 48(%1)\n"
            "movntq %%mm7, 56(%1)\n"
            "addl $64, %0\n"
            "addl $64, %1\n"
            "dec %2\n"
            "jnz 1b\n"
            "frstor %3\n"
            "emms\n"
            : "+a" (from), "+b" (to), "+c" (iCount), "+m" (*cFSave)
            :
            : "memory");
    }
    if (len % 64) {
        memcpy((char *)dest + (len / 64) * 64,
               (const char *)src + (len / 64) * 64, len % 64);
    }
    return dest;   /* memcpy contract: return the original dest */
}
#endif

#ifdef USE1
/*
 * SSE copy adapted from:
 *   http://people.redhat.com/mingo/mmx-patches/mmx-2.3.99-A0
 *   http://mail-index.netbsd.org/tech-perform/2002/10/23/0004.html
 * 32 bytes per iteration with movntps non-temporal stores plus software
 * prefetch. Measured ~85 ms per 100 MB by the original author.
 */
static inline void *memcopy(void *dest, const void *src, int size)
{
    char *to = (char *)dest;
    const char *from = (const char *)src;
    int n = size;
    size_t chunk;   /* renamed: the original shadowed parameter "size" */
#define STEP  0x20
#define ALIGN 0x10

    /* Head: if dest is not 16-byte aligned, emit one unaligned 16-byte
     * copy and advance both pointers to the next alignment boundary
     * (the overlapping bytes are simply copied again by the main loop). */
    if ((unsigned long)to & (ALIGN - 1)) {
        chunk = ALIGN - ((unsigned long)to & (ALIGN - 1));
        __asm__ __volatile__("movups (%0),%%xmm0\n\t"
                             "movups %%xmm0,(%1)\n\t"
                             : : "r" (from), "r" (to) : "memory");
        n -= chunk;
        from += chunk;
        to += chunk;
    }

    /* Tail: handle a trailing fragment now with one unaligned 16-byte
     * copy covering the last 16 bytes, then round n down to a multiple
     * of the alignment. */
    if (n & (ALIGN - 1)) {
        chunk = n - ALIGN;
        __asm__ __volatile__("movups (%0),%%xmm0\n\t"
                             "movups %%xmm0,(%1)\n\t"
                             : : "r" (from + chunk), "r" (to + chunk)
                             : "memory");
        n &= ~(ALIGN - 1);
    }

    /* Warm up the first two cache lines of the source. */
    __asm__ __volatile__("prefetchnta 0x00(%0)\n\t"
                         "prefetchnta 0x20(%0)\n\t"
                         : : "r" (from));

    while (n >= STEP) {
        __asm__ __volatile__(
            "movups 0x00(%0),%%xmm0\n\t"
            "movups 0x10(%0),%%xmm1\n\t"
            "movntps %%xmm0,0x00(%1)\n\t"
            "movntps %%xmm1,0x10(%1)\n\t"
            : : "r" (from), "r" (to) : "memory");
        from += STEP;
        /* The prefetch is interleaved at exactly this point because the
         * original authors found it fastest by trial and error — do not
         * reorder without re-measuring. */
        __asm__ __volatile__("prefetchnta 0x20(%0)\n\t" : : "r" (from));
        to += STEP;
        n -= STEP;
    }
    return dest;   /* was "return to" (advanced past the end) — memcpy
                    * returns its first argument */
}
#endif

#ifdef USE2
/*
 * MMX copy with interleaved loads/non-temporal stores and per-iteration
 * software prefetch, 64 bytes per pass; tail via libc memcpy.
 * Measured ~85 ms per 100 MB by the original author.
 */
static inline void *memcopy(void *dest, const void *src, int size)
{
    char *to = (char *)dest;
    const char *from = (const char *)src;
    int len = size;

    /* Warm up the first four cache lines of the source. */
    __asm__ __volatile__(
        "prefetchnta (%0)\n"
        "prefetchnta 64(%0)\n"
        "prefetchnta 128(%0)\n"
        "prefetchnta 192(%0)\n"
        : : "r" (from));

    for (int i = 0; i < len / 64; i++) {
        __asm__ __volatile__(
            "prefetchnta 168(%0)\n"
            "movq (%0), %%mm0\n"
            "movntq %%mm0, (%1)\n"
            "movq 8(%0), %%mm1\n"
            "movntq %%mm1, 8(%1)\n"
            "movq 16(%0), %%mm2\n"
            "movntq %%mm2, 16(%1)\n"
            "movq 24(%0), %%mm3\n"
            "movntq %%mm3, 24(%1)\n"
            "movq 32(%0), %%mm4\n"
            "movntq %%mm4, 32(%1)\n"
            "movq 40(%0), %%mm5\n"
            "movntq %%mm5, 40(%1)\n"
            "movq 48(%0), %%mm6\n"
            "movntq %%mm6, 48(%1)\n"
            "movq 56(%0), %%mm7\n"
            "movntq %%mm7, 56(%1)\n"
            : : "r" (from), "r" (to) : "memory");
        from += 64;
        to += 64;
    }
    if (len & 63)
        memcpy(to, from, len & 63);
    return dest;   /* was "return to": memcpy contract returns dest */
}
#endif

/* Usage: ./test 0  (libc memcpy)   or   ./test 1  (MMX/SSE memcopy) */
int main(int argc, char **argv)
{
    int iChoice = 0;

    if (argc > 2) {
        printf("Usage: './test 0' to use memcpy; or './test 1' to use memcopy\n");
    }
    if (argc == 2) {
        sscanf(argv[1], "%d", &iChoice);
        printf("your choice is %d\n", iChoice);
    }

    char *pcSrc   = new char[LEN];
    char *pcDst   = new char[LEN];
    char *pcSrc1  = new char[LEN];
    char *pcDst1  = new char[LEN];
    char *pcSrc2  = new char[LEN];
    char *pcDst2  = new char[LEN];
    float *pfData = new float[LEN];

    int iLoop = 0;
    while ((iLoop++) <= 255) {
        {
            /* Touch every destination/source page before timing the copy,
             * so the first timed run is not dominated by demand-zero
             * page faults. */
            {
                int *piTemp = (int *)pcDst;
                TimeUse t("==");
                for (int iTemp = 0; iTemp < LEN / 4; iTemp++) {
                    *piTemp++ = 123;
                }
            }
            {
                TimeUse t("memset");
                memset(pcSrc, iLoop, LEN);
            }
            memset(pcSrc1, iLoop, LEN);
            memset(pcSrc2, iLoop, LEN);
            for (int iLoop2 = 0; iLoop2 < LEN; iLoop2++) {
                pfData[iLoop2] = 1.0123456789 + iLoop;
            }
        }
        if (iChoice == 0) {
            TimeUse t("memcpy");
            memcpy(pcDst, pcSrc, LEN);
        } else {
            {
                TimeUse t("memcopy0");
                memcopy(pcDst, pcSrc, LEN);
            }
            {
                TimeUse t("memcopy1");
                /* disabled in the original run as well */
                //memcopy(pcDst1, pcSrc1, LEN);
            }
            {
                TimeUse t("memcopy2");
                //memcopy(pcDst2, pcSrc2, LEN);
            }
        }
        usleep(20000);
    }

    /* The original leaked all seven allocations. */
    delete[] pcSrc;
    delete[] pcDst;
    delete[] pcSrc1;
    delete[] pcDst1;
    delete[] pcSrc2;
    delete[] pcDst2;
    delete[] pfData;
    return 0;
}
试验结果:
1. 未优化,memcpy 100M数据:
[root@localhost opt]# ./test
memcpy Use 94 ms
memcpy Use 61 ms
memcpy Use 61
ms
memcpy Use 61 ms
memcpy Use 62 ms
memcpy Use 61 ms
memcpy Use 61
ms
memcpy Use 61 ms
2. 使用MMX/SSE优化,memcpy 100M数据:
[root@localhost
opt]# ./test 1
your choice is 1
memcopy0 Use 110
ms
memcopy1 Use 110 ms
memcopy2 Use 110 ms
memcopy0 Use
40 ms
memcopy1 Use 42 ms
memcopy2 Use 40 ms
memcopy0 Use 48
ms
memcopy1 Use 40 ms
memcopy2 Use 41 ms
memcopy0 Use 40 ms
memcopy1
Use 40
ms
初步结论:
使用MMX/SSE技术对memcpy的性能优化空间不太大,而且在执行初期,优化的性能甚至比不上未优化的性能。
从原理上讲,SSE会比MMX快,MMX会比常规memcpy快。可能受限于AT&T汇编掌握程度,暂时未能给出理想的优化结果。
如果谁有更好的想法,欢迎随时交流。
相关文章推荐
- 关于GCC下使用内建的多媒体指令集(MMX、SSE)函数
- Flash 平台技术的优化(十一) 关于性能方面的其他优化
- 使用procedure analyse()分析mysql给出的关于表结构的优化建议
- 关于使用EF网站优化的几个问题
- 关于不同类型之间使用memcpy
- MySQL查询优化技术之:使用索引
- 【图文】关于Android内存和性能优化的使用教程
- 关于大型网站技术演进的思考(十九)--网站静态化处理—web前端优化—上(11)
- 关于优化程序(优化代码和使用异步委托)
- 关于尝试GUI的简易使用过程中遇到的问题1
- 关于在网页中使用IP的一次技术讨论
- YUV转RGB--使用MMX和CUDA优化
- 淘宝的新Sprite方法——使用Img Sprite技术对按钮加载顺序优化的简单研究
- 关于vue 框架与后台框架的混合使用的尝试
- Memcache技术分享:介绍、使用、存储、算法、优化、命中率
- 关于使用地图 创建 类型数目可控的海量“精灵节点”的优化封装(研究续集)
- 关于strcpy、memset、memcpy的使用详解
- 关于大型网站技术演进的思考(二十一):网站静态化处理—Web前端优化(下)(13)
- 关于 web cam 使用自家的摄像头实现 视频捕捉技术 高级篇
- 关于大型网站技术演进的思考(二十)--网站静态化处理—web前端优化—中(12)