#include "stdafx.h"
#include <stdlib.h>
#include <windows.h>
#include <xmmintrin.h>
#include <stdio.h>
#include <string.h>
#include <tchar.h>

/*
 * Element-wise float addition using SSE: dest[i] = srcA[i] + srcB[i].
 *
 * srcA, srcB - input arrays of at least n floats
 * dest       - output array of at least n floats
 * n          - number of elements to add; values <= 0 do nothing
 *
 * All three pointers must be 16-byte aligned, because the vector part uses
 * the aligned load/store intrinsics. Elements are processed four at a time;
 * when n is not a multiple of 4, the remaining 1-3 elements are added with
 * plain scalar code so that every element of dest[0..n-1] is written.
 */
void sse_add(float *srcA, float *srcB, float *dest, int n)
{
    int vec_end;
    int i = 0;

    if (n <= 0) {
        return;
    }

    /* Largest multiple of 4 that does not exceed n. */
    vec_end = n - (n % 4);

    for (; i < vec_end; i += 4) {
        _mm_store_ps(dest + i,
                     _mm_add_ps(_mm_load_ps(srcA + i), _mm_load_ps(srcB + i)));
    }

    /* Scalar tail for the elements that do not fill a whole 4-float vector. */
    for (; i < n; i++) {
        dest[i] = srcA[i] + srcB[i];
    }
}

/*
 * Element-wise float addition with a plain scalar loop:
 * dest[i] = srcA[i] + srcB[i] for i in [0, n).
 * Serves as the baseline that sse_add is timed against.
 */
void normal_add(float *srcA, float *srcB, float *dest, int n)
{
    for (int i = 0; i < n; i++) {
        dest[i] = srcA[i] + srcB[i];
    }
}

/*
 * Benchmark driver: runs normal_add and sse_add `count` times each over
 * arrays of `size` floats and prints the elapsed time of each variant in
 * seconds, measured with GetTickCount().
 * Returns 0 on success, EXIT_FAILURE if a buffer could not be allocated.
 */
int main(int argc, _TCHAR* argv[])
{
    DWORD timeStart = 0, timeEnd = 0;
    const int size = 10000, count = 10000;

    /* Allocate 16-byte aligned buffers (required by sse_add). */
    float *srcA = (float *)_mm_malloc(sizeof *srcA * size, 16);
    float *srcB = (float *)_mm_malloc(sizeof *srcB * size, 16);
    float *dest = (float *)_mm_malloc(sizeof *dest * size, 16);

    (void)argc;
    (void)argv;

    if (srcA == NULL || srcB == NULL || dest == NULL) {
        fprintf(stderr, "failed to allocate aligned buffers\n");
        if (srcA != NULL) {
            _mm_free(srcA);
        }
        if (srcB != NULL) {
            _mm_free(srcB);
        }
        if (dest != NULL) {
            _mm_free(dest);
        }
        return EXIT_FAILURE;
    }

    /* Initialize: srcA[i] = i, and srcB is a copy of srcA. */
    for (int i = 0; i < size; i++) {
        srcA[i] = (float)i;
    }
    memcpy_s(srcB, sizeof(float) * size, srcA, sizeof(float) * size);

    /* Scalar addition. */
    timeStart = GetTickCount();
    for (int i = 0; i < count; i++) {
        normal_add(srcA, srcB, dest, size);
    }
    timeEnd = GetTickCount();
    /* Tick counts are in milliseconds; scale to seconds for display. */
    printf("normal test...time ---> %f \n", (timeEnd - timeStart) * 0.001);

    /* SSE addition. */
    timeStart = GetTickCount();
    for (int i = 0; i < count; i++) {
        sse_add(srcA, srcB, dest, size);
    }
    timeEnd = GetTickCount();
    printf("sse test...time ---> %f \n", (timeEnd - timeStart) * 0.001);

    /* Release the buffers. */
    _mm_free(srcA);
    _mm_free(srcB);
    _mm_free(dest);

    system("pause");

    return 0;
}

/*
 * Note from the original article: after building this program with VS 2005
 * in Release mode, comparing the two printed timings showed that the
 * addition using SSE instructions was clearly more efficient. (The article's
 * result listing is not reproduced here.)
 */
// Original article: http://blog.csdn.net/grafx/article/details/20001589