#include #include #include #include #include #include "mpi.h" struct timeval t1, t2; /* * N, M, K are the global sizes, A matrix is N x M, * B matrix is M x K * C matrix is N x K * assuming that A and C matrices are stored in a row partition format * B is stored in column partition format * local matrix A is a N/nprocs x M * B is a M x K/nprocs * C is a N/procs x K * The last node (nprocs -1) has the leaf-over rows of the matrics */ void my_mm6_sse2_mpi(int N, int M, int K, double *a, double *b, double *c) { // this is the routine that you must implement } int main( int argc, char *argv[]) { double *A, *B, *C, *W, *Z, *WORK; int N, M, K, I, iter, i, j; int method; int myid, nprocs; int localN, localK; MPI_Status status; MPI_Init(&argc, &argv); MPI_Comm_rank(MPI_COMM_WORLD, &myid); MPI_Comm_size(MPI_COMM_WORLD, &nprocs); if (argc < 6) { if (myid == 0) printf("Usage: a.out N M K iter method\n"); MPI_Finalize(); exit(0); } N= atoi(argv[1]); M = atoi(argv[2]); K = atoi(argv[3]); iter = 1; if (argc >=5) iter = atoi(argv[4]); method = 0; if (argc >= 6) method = atoi(argv[5]); /* A = malloc(N*M*sizeof(double)); B = malloc(M*K*sizeof(double)); C = malloc(N*K*sizeof(double)); W = malloc(M*K*sizeof(double)); // reorder B */ if (myid != nprocs -1) { localN = N/nprocs; localK = K/nprocs; } else { localN = N - (N/nprocs*(nprocs-1)); localK = K-K/nprocs*(nprocs-1); } posix_memalign((void **)&A, 16, localN*M*sizeof(double)); posix_memalign((void **)&B, 16, M*localK*sizeof(double)); posix_memalign((void **)&C, 16, localN*K*sizeof(double)); W = malloc(2*localN*K*sizeof(double)); srand48(100+myid); for (i=0; i