/*
 * Micro-benchmark: element-wise float accumulation, scalar loop vs. SSE
 * intrinsics, timed with the x86-64 time-stamp counter.
 *
 * compiled with -msse2, etc
 *
 * NOTE(review): the header names in the include block were lost in the copy
 * this file was recovered from; the names below are reconstructed from the
 * trailing comments and from what the code uses -- confirm against the
 * original.
 */
//#include <mmintrin.h>  /* MMX */
//#include <xmmintrin.h> /* SSE */
#include <emmintrin.h>   /* SSE2 */
//#include <pmmintrin.h> /* SSE3 */
//#include <tmmintrin.h> /* SSSE3 */
//#include <nmmintrin.h> /* SSE4.2 SSE4.1 */
#include <inttypes.h>
#include <stdint.h>
#include <stdio.h>
#include <stdlib.h>

typedef uint64_t ticks;

enum {
    NUM_ELEMS = 8,    /* floats per array; must be a multiple of SSE_LANES */
    SSE_LANES = 4,    /* floats per __m128 */
    NUM_REPS  = 100   /* accumulation passes per timed section */
};

/*
 * Read the time-stamp counter at the start of a timed region.
 * cpuid is issued first so that earlier instructions are not still in
 * flight when rdtsc samples the counter.
 * Returns the 64-bit counter value (edx:eax).
 */
static __inline__ ticks ticks_start(void)
{
    unsigned a, d;

    /* Serialize + read TSC. cpuid also writes ebx/ecx, hence the clobbers. */
    __asm__ __volatile__(
        "cpuid\n\t"
        "rdtsc\n\t"
        : "=a"(a), "=d"(d)
        : "a"(0)
        : "rbx", "rcx", "memory"
    );
    return ((ticks)d << 32) | a;
}

/*
 * Read the time-stamp counter at the end of a timed region.
 * rdtscp samples the counter (it is partially serializing); the cpuid that
 * follows completes the serialization so later instructions do not drift
 * into the timed region.
 * Returns the 64-bit counter value (edx:eax) captured by rdtscp.
 */
static __inline__ ticks ticks_end(void)
{
    unsigned a, d;
    unsigned leaf = 0;

    /* rdtscp also writes ecx (IA32_TSC_AUX), hence the clobber. */
    __asm__ __volatile__(
        "rdtscp\n\t"
        : "=a"(a), "=d"(d)
        :
        : "rcx", "memory"
    );
    /*
     * cpuid overwrites eax, so eax must be declared read-write ("+a");
     * listing it as input-only would let the compiler assume it still
     * holds 0 afterwards.
     */
    __asm__ __volatile__(
        "cpuid\n\t"
        : "+a"(leaf)
        :
        : "rbx", "rcx", "rdx", "memory"
    );
    return ((ticks)d << 32) | a;
}

/*
 * Time NUM_REPS passes of "acc[i] += z1[i] + z2[i]" twice -- once with a
 * scalar loop into z3, once with SSE intrinsics into z4 -- print both tick
 * counts, then print and compare the two result arrays.
 * Returns 0 when the results match, EXIT_FAILURE otherwise.
 */
int main(void)
{
    ticks t1, t2;
    float z1[NUM_ELEMS] = {1.0f, 1.0f, 1.0f, 1.0f, 1.0f, 1.0f, 1.0f, 1.0f};
    float z2[NUM_ELEMS] = {1.0f, 1.0f, 1.0f, 1.0f, 1.0f, 1.0f, 1.0f, 1.0f};
    float z3[NUM_ELEMS] = {0.0f};   /* scalar accumulator */
    float z4[NUM_ELEMS] = {0.0f};   /* SSE accumulator; must start at zero */
    int i, j;
    int mismatch = 0;

    /* Scalar reference. */
    t1 = ticks_start();
    for (j = 0; j < NUM_REPS; j++) {
        for (i = 0; i < NUM_ELEMS; i++) {
            z3[i] += z1[i] + z2[i];
        }
    }
    t2 = ticks_end();
    printf("time = %" PRIu64 "\n", t2 - t1);

    /*
     * SSE version. Unaligned loads/stores are used because the arrays are
     * plain float[] with no 16-byte alignment guarantee; dereferencing them
     * through a cast __m128 pointer would require that alignment. The
     * per-pass sum stays in a register so z3 (the scalar reference) is not
     * overwritten.
     */
    t1 = ticks_start();
    for (j = 0; j < NUM_REPS; j++) {
        for (i = 0; i < NUM_ELEMS; i += SSE_LANES) {
            __m128 sum = _mm_add_ps(_mm_loadu_ps(&z1[i]), _mm_loadu_ps(&z2[i]));
            __m128 acc = _mm_loadu_ps(&z4[i]);
            _mm_storeu_ps(&z4[i], _mm_add_ps(acc, sum));
        }
    }
    t2 = ticks_end();
    printf("time = %" PRIu64 "\n", t2 - t1);

    /*
     * Both paths perform the same additions in the same order per element,
     * so an exact comparison is intended here.
     */
    for (i = 0; i < NUM_ELEMS; i++) {
        if (z3[i] != z4[i]) {
            mismatch = 1;
        }
        printf("%d %f %f\n", i, z3[i], z4[i]);
    }

    if (mismatch) {
        printf("Wrong.\n");
        return EXIT_FAILURE;
    }
    printf("Correct, results matched.\n");
    return 0;
}