#include <stdio.h>
#include <stdlib.h>
#include <sys/time.h>

struct timeval t1, t2;

#define BLOCK_SIZE 16

/* Threads per block for mmkernel; also the depth of the shared tile of B. */
#define MM_TILE 32
/* Rows of C produced by one block: every thread computes two rows, MM_TILE apart. */
#define MM_ROWS_PER_BLOCK (2 * MM_TILE)
/* Upper limit for gridDim.y on CUDA devices. */
#define MM_MAX_GRID_Y 65535

/* Abort with a readable message when a CUDA runtime call fails. */
#define MM_CUDA_CHECK(call)                                                   \
    do {                                                                      \
        cudaError_t mm_err_ = (call);                                         \
        if (mm_err_ != cudaSuccess) {                                         \
            fprintf(stderr, "CUDA error %s:%d: %s\n", __FILE__, __LINE__,     \
                    cudaGetErrorString(mm_err_));                             \
            exit(EXIT_FAILURE);                                               \
        }                                                                     \
    } while (0)

/*
 * Kernel MM routine: c = a * b with all matrices stored column-major.
 *
 *   a : N x M, element (i,k) at a[i + N*k]
 *   b : M x K, element (k,j) at b[k + M*j]
 *   c : N x K, element (i,j) at c[i + N*j]
 *
 * Launch layout: 1-D blocks of exactly MM_TILE (32) threads. Block x covers
 * MM_ROWS_PER_BLOCK (64) consecutive rows of c; blockIdx.y selects the column
 * of c, and columns beyond gridDim.y are reached by striding over j.
 * Uses MM_TILE floats of static shared memory.
 * N, M and K may be any positive values; partial tiles are guarded.
 */
__global__ void mmkernel(float *a, float *b, float *c, int N, int M, int K)
{
    const int tx = threadIdx.x;
    const int row0 = blockIdx.x * MM_ROWS_PER_BLOCK + tx;
    const int row1 = row0 + MM_TILE;
    const bool have0 = row0 < N;
    const bool have1 = row1 < N;

    __shared__ float cb[MM_TILE];

    /* j depends only on blockIdx/gridDim, so every thread of the block runs
     * the same number of iterations and the barriers below stay uniform. */
    for (int j = blockIdx.y; j < K; j += gridDim.y) {
        float sum0 = 0.0f, sum1 = 0.0f;

        for (int ks = 0; ks < M; ks += MM_TILE) {
            /* Stage up to MM_TILE entries of column j of b; pad the tail with
             * zeros so no thread reads past the end of the column. */
            const int kload = ks + tx;
            cb[tx] = (kload < M) ? b[kload + (size_t)M * j] : 0.0f;
            __syncthreads();

            const int depth = (M - ks < MM_TILE) ? (M - ks) : MM_TILE;
            for (int kk = 0; kk < depth; kk++) {
                const size_t colA = (size_t)N * (ks + kk);
                const float bv = cb[kk];
                if (have0) sum0 += a[row0 + colA] * bv;
                if (have1) sum1 += a[row1 + colA] * bv;
            }
            /* Do not overwrite cb until every thread is done reading it. */
            __syncthreads();
        }

        if (have0) c[row0 + (size_t)N * j] = sum0;
        if (have1) c[row1 + (size_t)N * j] = sum1;
    }
}

/*
 * Host multiplication function: C = A * B on the GPU.
 *
 *   A is an N x M matrix, B is an M x K matrix, C is an N x K matrix,
 *   all column-major host arrays (the layout mmkernel indexes).
 *
 * Prints "N M K", copies A and B to the device, launches mmkernel, copies the
 * result back into C and frees the device buffers. Any CUDA failure prints a
 * diagnostic to stderr and terminates the process.
 * If N or K is not positive there is nothing to compute; if M is not positive
 * the product is the empty sum, so C is filled with zeros.
 */
void Mul(const float *A, const float *B, float *C, int N, int M, int K)
{
    float *dev_A = NULL, *dev_B = NULL, *dev_C = NULL;

    printf("%d %d %d\n", N, M, K);

    if (N <= 0 || K <= 0)
        return;
    if (M <= 0) {
        const size_t total = (size_t)N * K;
        for (size_t idx = 0; idx < total; idx++)
            C[idx] = 0.0f;
        return;
    }

    /* size_t arithmetic: the byte counts can exceed INT_MAX. */
    const size_t bytesA = (size_t)N * M * sizeof(float);
    const size_t bytesB = (size_t)M * K * sizeof(float);
    const size_t bytesC = (size_t)N * K * sizeof(float);

    MM_CUDA_CHECK(cudaMalloc((void **)&dev_A, bytesA));
    MM_CUDA_CHECK(cudaMemcpy(dev_A, A, bytesA, cudaMemcpyHostToDevice));

    MM_CUDA_CHECK(cudaMalloc((void **)&dev_B, bytesB));
    MM_CUDA_CHECK(cudaMemcpy(dev_B, B, bytesB, cudaMemcpyHostToDevice));

    MM_CUDA_CHECK(cudaMalloc((void **)&dev_C, bytesC));

    /* x: ceil(N / 64) row strips; y: one block per column of C (K columns,
     * not N), capped at the hardware limit - the kernel strides over the rest. */
    dim3 dimBlock(MM_TILE);
    dim3 dimGrid((N + MM_ROWS_PER_BLOCK - 1) / MM_ROWS_PER_BLOCK,
                 K < MM_MAX_GRID_Y ? K : MM_MAX_GRID_Y);

    mmkernel<<<dimGrid, dimBlock>>>(dev_A, dev_B, dev_C, N, M, K);
    MM_CUDA_CHECK(cudaGetLastError());

    /* Blocking copy: also surfaces any fault raised while the kernel ran. */
    MM_CUDA_CHECK(cudaMemcpy(C, dev_C, bytesC, cudaMemcpyDeviceToHost));

    MM_CUDA_CHECK(cudaFree(dev_A));
    MM_CUDA_CHECK(cudaFree(dev_B));
    MM_CUDA_CHECK(cudaFree(dev_C));
}

/*
 * Program entry point: reads N M K iter method from argv and allocates the
 * host matrices.
 * NOTE(review): the source text ends inside the loop header below; the rest
 * of main() is not visible here, so this part is left exactly as found.
 */
int main( int argc, char *argv[])
{
    float *A, *B, *C;
    int N, M, K, iter, i;
    int method;

    if (argc < 6) {
        printf("Usage: a.out N M K iter method\n");
        exit(0);
    }

    N= atoi(argv[1]);
    M = atoi(argv[2]);
    K = atoi(argv[3]);
    iter = 1;
    if (argc >=5) iter = atoi(argv[4]);
    method = 0;
    if (argc >= 6) method = atoi(argv[5]);

    A = (float *)malloc(N*M*sizeof(float));
    B = (float *)malloc(M*K*sizeof(float));
    C = (float *)malloc(N*K*sizeof(float));

    srand48(100);
    for (i=0; i