I am building an example that spawns a variable number of processes and binds them to sockets, on a small heterogeneous network whose two nodes differ in architecture and number of CPUs.
I compile and run with:
mpiicpc avg_4.c -qopenmp -axSSE4.2,AVX,CORE-AVX2 -O3 -par-affinity=noverbose,granularity=core,compact -o b
mpiexec.hydra -machinefile f19 -genv I_MPI_PIN=1 -genv I_MPI_PIN_DOMAIN=socket -genv I_MPI_PIN_ORDER=compact -n 1 ./b
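As far as I understand, the resulting pinning can be checked by adding -genv I_MPI_DEBUG=4 to the same launch line (this should make Intel MPI print its pin map at startup), i.e.:

mpiexec.hydra -machinefile f19 -genv I_MPI_DEBUG=4 -genv I_MPI_PIN=1 -genv I_MPI_PIN_DOMAIN=socket -genv I_MPI_PIN_ORDER=compact -n 1 ./b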
The machinefile f19 (master node ma plus slave node s19) is:
s19:1
ma:1
#include <stdio.h>
#include <stdlib.h>
#include <omp.h>
#include <sched.h>
#include <mpi.h>

int *create_mlu(int n_omp, int ws) {
    int *mlu = (int *)calloc(n_omp * ws, sizeof(int));
    for (int i = 0; i < ws; i++)
        for (int j = 0; j < n_omp; j++)
            mlu[j + i * n_omp] = j + 100 * i;
    return mlu;
}

int *C4_Re(int *mal, int n_omp, int wr, int ws) {
    int *rM8 = (int *)malloc(sizeof(int) * n_omp);
    char nod[MPI_MAX_PROCESSOR_NAME];
    int n_l;
    MPI_Get_processor_name(nod, &n_l);
    #pragma omp parallel for
    for (int i = 0; i < n_omp; i++) {
        rM8[i] = mal[i] + 10 * omp_get_thread_num();
        printf("ws%2d\t\tmpi%2d\t\tmaxTh%2d\t\tmaxPr%2d\t\tomp%2d\t\tcore%3d\t\trM8%4d\t\tnod %s\n",
               ws, wr, omp_get_num_threads(), omp_get_num_procs(),
               omp_get_thread_num(), sched_getcpu(), rM8[i], nod);
    }
    return rM8;
}

int main(void) {
    MPI_Init(NULL, NULL);
    int ts[2] = {7, 9}; // number of processes
    for (int t = 0; t < 2; t++) {
        int ws = ts[t];
        int errcodes[ws];
        MPI_Comm parentcomm, intercomm;
        MPI_Comm_get_parent(&parentcomm);
        if (parentcomm == MPI_COMM_NULL) {
            MPI_Comm_spawn("./b", MPI_ARGV_NULL, ws, MPI_INFO_NULL, 0,
                           MPI_COMM_WORLD, &intercomm, errcodes);
            //printf("I'm the parent.\n");
        } else {
            int wr;
            MPI_Comm_rank(MPI_COMM_WORLD, &wr); // printf("wr %d\n", wr);
            //int ps; MPI_Comm_size(parentcomm, &ps); // printf("ps %d\n", ps);
            //int pr; MPI_Comm_rank(parentcomm, &pr); // printf("pr %d\n", pr);
            int n_omp = 8, *mlu = NULL;
            if (wr == 0) {
                mlu = create_mlu(n_omp, ws);
                //for (int i = 0; i < n_omp*ws; i++) printf("\tmlu[%2d] = %d\n", i, mlu[i]);
            }
            int *mal = (int *)malloc(n_omp * sizeof(int));
            MPI_Scatter(mlu, n_omp, MPI_INT, mal, n_omp, MPI_INT, 0, MPI_COMM_WORLD);
            //for (int i = 0; i < n_omp; i++) printf("\t\tmal[%2d] = %d\trank %d\n", i, mal[i], wr);
            int *rM8 = NULL;
            rM8 = C4_Re(mal, n_omp, wr, ws);
            int *rS8 = NULL;
            if (wr == 0)
                rS8 = (int *)malloc(sizeof(int) * ws * n_omp);
            MPI_Gather(rM8, n_omp, MPI_INT, rS8, n_omp, MPI_INT, 0, MPI_COMM_WORLD);
            if (wr == 0) {
                //for (int i = 0; i < n_omp * ws; i++) printf("\t\trS8[%2d] = %d\n", i, rS8[i]);
                free(mlu);
                free(rS8);
            }
            free(mal);
            free(rM8);
        }
        //fflush(stdout);
    }
    fflush(stdout);
    MPI_Finalize();
    return 0;
}
There is memory corruption somewhere and I need help finding it.
Some results look like (columns reordered here to match the printf format):

ws 7  mpi 7  maxTh 6  maxPr 6  omp 4  core 4  rM8 -37253944  nod ma

but they should look like:

ws 7  mpi 6  maxTh 6  maxPr 6  omp 2  core 2  rM8 624  nod ma
Additional questions:
1 - Why is using parentcomm for Scatter and Gather not correct? My understanding was that parentcomm is the new communicator.
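From what I have read, parentcomm (and intercomm on the parent side) is an intercommunicator, and collectives behave differently there: the root argument names a rank in the remote group, the root process itself passes MPI_ROOT, and any other processes in the root's group pass MPI_PROC_NULL. If I understand that correctly, scattering from the parent over the intercommunicator would have to look roughly like this (untested sketch; it also assumes mlu is built on the parent instead of on child rank 0):

// Untested sketch of intercommunicator collective semantics as I understand them.
if (parentcomm == MPI_COMM_NULL) {
    // parent side: the single parent process passes MPI_ROOT;
    // its receive arguments are ignored
    MPI_Scatter(mlu, n_omp, MPI_INT, NULL, 0, MPI_INT, MPI_ROOT, intercomm);
} else {
    // child side: root is the parent's rank (0) in the remote group;
    // the send arguments are ignored
    MPI_Scatter(NULL, 0, MPI_INT, mal, n_omp, MPI_INT, 0, parentcomm);
}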
2 - Should I create different communicators for the 7- and 9-process runs?
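For what it is worth, each MPI_Comm_spawn call already seems to return its own intercommunicator, so perhaps the parent just needs to keep the two handles apart, something like this (names are mine, untested):

// Sketch: one intercommunicator per spawned group.
MPI_Comm intercomms[2];
for (int t = 0; t < 2; t++)
    MPI_Comm_spawn("./b", MPI_ARGV_NULL, ts[t], MPI_INFO_NULL, 0,
                   MPI_COMM_WORLD, &intercomms[t], MPI_ERRCODES_IGNORE);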
3 - mpicc gives me wrong results and I don't know why.
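The flags above are Intel-specific (-qopenmp, -axSSE4.2,AVX,CORE-AVX2, -par-affinity) and are not recognized by a GCC-based mpicc, so I assume the closest equivalent compile line would be roughly:

mpicc avg_4.c -fopenmp -O3 -o b

In particular the OpenMP flag matters: without -fopenmp GCC ignores the omp pragma (and the omp_* calls fail to link), so the behavior would differ from the Intel build. Whether that explains the wrong results I cannot tell.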