/* compiled with -msse2, etc */

/*
 * NOTE(review): every header name in this file was lost (the "<...>" part of
 * each #include directive is missing). The intrinsics header names below are
 * reconstructed from their instruction-set comments; confirm them against the
 * original source.
 */
//#include <mmintrin.h>  /* MMX */
//#include <xmmintrin.h> /* SSE */
#include <emmintrin.h>   /* SSE2 */
//#include <pmmintrin.h> /* SSE3 */
//#include <tmmintrin.h> /* SSSE3 */
//#include <nmmintrin.h> /* SSE4.2 SSE4.1 */

/*
 * NOTE(review): four more #include directives followed here with their header
 * names missing. <stdint.h> is required by the uint64_t typedef below; the
 * other three cannot be determined from the visible code and must be restored
 * by hand.
 */
#include <stdint.h>

/* Raw 64-bit time-stamp-counter value. */
typedef uint64_t ticks;

/*
 * Read the time-stamp counter at the start of a measured region.
 *
 * CPUID (leaf 0) is executed first as a serializing instruction, then RDTSC
 * reads the counter into EDX:EAX.
 *
 * The clobber list names 64-bit registers, so this is x86-64 only.
 *
 * Returns the 64-bit counter value assembled from EDX (high) and EAX (low).
 */
static inline ticks ticks_start(void)
{
    unsigned lo, hi;

    /* Serialize + read TSC. CPUID also overwrites EBX and ECX. */
    __asm__ __volatile__(
        "cpuid\n\t"
        "rdtsc\n\t"
        : "=a"(lo), "=d"(hi)
        : "a"(0)
        : "rbx", "rcx", "memory"
    );

    return ((ticks)hi << 32) | lo;
}

/*
 * Read the time-stamp counter at the end of a measured region.
 *
 * RDTSCP reads the counter into EDX:EAX (and writes ECX, hence the clobber);
 * it is only partially serializing, so a CPUID is issued afterwards to
 * complete the serialization.
 *
 * The clobber list names 64-bit registers, so this is x86-64 only.
 *
 * Returns the 64-bit counter value assembled from EDX (high) and EAX (low).
 */
static inline ticks ticks_end(void)
{
    unsigned lo, hi;
    unsigned leaf = 0;

    /* Read TSC first so the value is captured before CPUID overwrites EAX/EDX. */
    __asm__ __volatile__(
        "rdtscp\n\t"
        : "=a"(lo), "=d"(hi)
        :
        : "rcx", "memory"
    );

    /*
     * CPUID overwrites EAX as well as EBX/ECX/EDX. EAX is therefore declared
     * as a read-write operand ("+a"); an input-only operand would let the
     * compiler assume the register still holds 0 afterwards.
     */
    __asm__ __volatile__(
        "cpuid\n\t"
        : "+a"(leaf)
        :
        : "rbx", "rcx", "rdx", "memory"
    );
    (void)leaf;

    return ((ticks)hi << 32) | lo;
}

/*
 * NOTE(review): the file is cut off in the middle of the definition below.
 * The fragment is kept verbatim under #if 0 so the rest of the file compiles;
 * restore the missing remainder of the loop before enabling it.
 */
#if 0
/* plain version x = a*x */
void sapxy(int n, float a, float *x) { int i; for (i=0; i
#endif