I want to check the performance of LU factorization on our cluster. As a first step, I called the pdgetrf routine from the MKL ScaLAPACK library and ran it on my machine (Intel® Xeon Phi™ Processor 7250).
With a matrix size of 20000 x 20000, pdgetrf achieves about 170 GFLOPS when run on 4 MPI processes.
Please help me check my code. Does my program have an error? How can I improve the performance of pdgetrf on the KNL 7250, and what is the maximum performance I should expect from pdgetrf?
Thanks a lot.
- This is the compile command:
mpiicc pdgetrf.c -O3 -qopenmp -lmemkind -mkl -xMIC-AVX512 -restrict \
-o pdgetrf -I./ -I/opt/intel/compilers_and_libraries_2018.5.274/linux/mkl/include/ -lmkl_scalapack_lp64 -lmkl_core -lmkl_blacs_intelmpi_lp64 -lpthread -liomp5
- This is the run command: mpirun -np 4 ./pdgetrf
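Note on the run command: with 4 MPI ranks on a single 68-core 7250, the MKL thread count per rank and the pinning matter a lot, since unpinned threads can oversubscribe cores. A hedged sketch of the environment I would try (the 4 x 17 split and the pin-domain choice are my assumptions, not something established in this post):

export MKL_NUM_THREADS=17   # 4 ranks x 17 threads covers the 68 cores exactly once (assumed split)
export OMP_NUM_THREADS=17
mpirun -np 4 -genv I_MPI_PIN_DOMAIN=auto ./pdgetrf   # Intel MPI: one contiguous core domain per rank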
#include <stdio.h>
#include <time.h>
#include <string.h>
#include <stdlib.h>
#include "mpi.h"

/* BLACS / ScaLAPACK prototypes (Fortran calling convention, LP64 integers) */
extern void Cblacs_pinfo(int *mypnum, int *nprocs);
extern void Cblacs_get(int icontxt, int what, int *val);
extern void Cblacs_gridinit(int *icontxt, const char *order, int nprow, int npcol);
extern void Cblacs_gridinfo(int icontxt, int *nprow, int *npcol, int *myrow, int *mycol);
extern void Cblacs_gridexit(int icontxt);
extern int  numroc_(const int *n, const int *nb, const int *iproc, const int *isrcproc, const int *nprocs);
extern void descinit_(int *desc, const int *m, const int *n, const int *mb, const int *nb,
                      const int *irsrc, const int *icsrc, const int *ictxt, const int *lld, int *info);
extern void pdgetrf_(const int *m, const int *n, double *a, const int *ia, const int *ja,
                     const int *desca, int *ipiv, int *info);

int main(int argc, char **argv)
{
    int j;

    /************ MPI ***************************/
    int myrank_mpi, nprocs_mpi;
    MPI_Init(&argc, &argv);
    MPI_Comm_rank(MPI_COMM_WORLD, &myrank_mpi);
    MPI_Comm_size(MPI_COMM_WORLD, &nprocs_mpi);

    /************ BLACS *************************/
    int ictxt, nprow, npcol, myrow, mycol, nb;
    int info;
    int ZERO = 0, ONE = 1;
    nprow = 2;               /* 2 x 2 process grid */
    npcol = 2;
    nb    = 500;             /* block size */
    int M = nprow * 10000;   /* global matrix is 20000 x 20000 */
    int K = npcol * 10000;

    Cblacs_pinfo(&myrank_mpi, &nprocs_mpi);
    Cblacs_get(-1, 0, &ictxt);
    Cblacs_gridinit(&ictxt, "Row", nprow, npcol);
    Cblacs_gridinfo(ictxt, &nprow, &npcol, &myrow, &mycol);

    /* Dimensions of the local block owned by this process */
    int rA = numroc_(&M, &nb, &myrow, &ZERO, &nprow);
    int cA = numroc_(&K, &nb, &mycol, &ZERO, &npcol);

    double *A = (double *) malloc((size_t)rA * cA * sizeof(double));
    int descA[9];
    /* IPIV needs LOCr(M) + MB entries */
    int *IPIV = (int *) calloc(rA + nb, sizeof(int));
    descinit_(descA, &M, &K, &nb, &nb, &ZERO, &ZERO, &ictxt, &rA, &info);

    /* Fill the local block with random values in [-0.5, 0.5]; a different
       seed per process. Note rand() serializes this loop, so the original
       #pragma simd had no effect and is dropped. */
    srand(time(NULL) * myrow + mycol);
    for (j = 0; j < rA * cA; j++) {
        A[j] = ((double)rand() - (double)RAND_MAX * 0.5) / (double)RAND_MAX;
    }

    MPI_Barrier(MPI_COMM_WORLD);
    double start = MPI_Wtime();
    pdgetrf_(&M, &K, A, &ONE, &ONE, descA, IPIV, &info);
    MPI_Barrier(MPI_COMM_WORLD);
    double end = MPI_Wtime();
    double duration = end - start;

    /* LU flop count: M*K^2 - K^3/3 for M >= K (and symmetrically for K >= M);
       (2/3)*K^3 in the square case */
    if (myrow == 0 && mycol == 0) {
        if (M > K) {
            printf("%f Gigaflops\n",
                   ((double)K * K * M - (double)K * K * K / 3.0) * 1.0e-9 / duration);
        } else if (K > M) {  /* original tested K < M here, which is unreachable after M > K */
            printf("%f Gigaflops\n",
                   ((double)M * M * K - (double)M * M * M / 3.0) * 1.0e-9 / duration);
        } else {
            printf("%f Gigaflops\n",
                   (2.0 * (double)K * K * K / 3.0) * 1.0e-9 / duration);
        }
    }

    free(A);
    free(IPIV);
    Cblacs_gridexit(ictxt);  /* original passed 0; the grid context is ictxt */
    MPI_Finalize();
    return 0;
}
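One more observation: the compile line links -lmemkind, but the program allocates A with plain malloc, so the matrix only lands in MCDRAM if the node is booted in cache mode. If the node is in flat mode, below is a minimal sketch of what I assume was intended, using memkind's hbwmalloc interface (hbw_check_available, hbw_malloc, and hbw_free are the library's real API; the array size is illustrative):

/* sketch: place a large array in MCDRAM via memkind (compile with -lmemkind) */
#include <stdio.h>
#include <stdlib.h>
#include <hbwmalloc.h>

int main(void)
{
    size_t n = (size_t)10000 * 10000;  /* ~800 MB, fits in the 7250's 16 GB MCDRAM */
    /* hbw_check_available() returns 0 when HBM (MCDRAM) is exposed to the OS */
    if (hbw_check_available() != 0)
        printf("no HBM visible; hbw_malloc falls back to DDR (default PREFERRED policy)\n");
    double *A = (double *) hbw_malloc(n * sizeof(double));
    if (A == NULL) { perror("hbw_malloc"); return 1; }
    for (size_t j = 0; j < n; j++) A[j] = 1.0;  /* first touch commits the pages */
    printf("allocated %zu MiB\n", (n * sizeof(double)) >> 20);
    hbw_free(A);  /* pair hbw_free with hbw_malloc, not with plain malloc */
    return 0;
}

In the pdgetrf program itself, the same change would mean allocating A (and freeing it) with hbw_malloc/hbw_free instead of malloc/free.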