diff --git a/ChangeLog b/ChangeLog
index 95ed8c16..dd498448 100644
--- a/ChangeLog
+++ b/ChangeLog
@@ -2,6 +2,14 @@
 -Date
 -Name
 -changes
+
+--------------
+Dec 10, 2024
+Name: Xin Jing
+Changes: (linearAlgebra.c, eigenSolver.c, eigenSolverKpt.c)
+1. Support ELPA
+2. Fix a bug in initialization
+
 --------------
 Dec 4, 2024
 Name: Lucas Timmerman
diff --git a/src/include/linearAlgebra.h b/src/include/linearAlgebra.h
index e5705063..2996ed7c 100644
--- a/src/include/linearAlgebra.h
+++ b/src/include/linearAlgebra.h
@@ -145,4 +145,13 @@ void pdgemm_subcomm(
     int *descA, const double *B, int *descB, double beta, double *C, int *descC, MPI_Comm comm, int max_nproc);

+#ifdef USE_ELPA
+void elpa_pdsygvx(
+    double *a, int desca[9], double *b, int descb[9], int nev, double *ev, double *z, MPI_Comm comm, int *ierr);
+void elpa_pdsyevx(
+    double *a, int desca[9], int nev, double *ev, double *z, MPI_Comm comm, int *ierr);
+void elpa_pzhegvx(
+    double _Complex *a, int desca[9], double _Complex *b, int descb[9], int nev, double *ev, double _Complex *z, MPI_Comm comm, int *ierr);
+#endif
+
 #endif // LINEARALGEBRA_H
\ No newline at end of file
diff --git a/src/initialization.c b/src/initialization.c
index 4225d72e..d304baeb 100644
--- a/src/initialization.c
+++ b/src/initialization.c
@@ -429,6 +429,13 @@ void Initialize(SPARC_OBJ *pSPARC, int argc, char *argv[]) {
         if (rank == 0) printf("The dimension of subgrid for eigen sovler is (%d x %d).\n",
             pSPARC->eig_paral_subdims[0], pSPARC->eig_paral_subdims[1]);
         #endif
+
+        #ifdef USE_ELPA
+        // No processor can have zero data in ELPA
+        int maxdim = max(pSPARC->eig_paral_subdims[0], pSPARC->eig_paral_subdims[1]);
+        pSPARC->eig_paral_blksz = max(1,min(pSPARC->Nstates/maxdim, pSPARC->eig_paral_blksz));
+        // if (rank == 0) printf("eig_paral_blksz %d\n", pSPARC->eig_paral_blksz);
+        #endif
     }
 }
@@ -2601,6 +2608,14 @@ void SPARC_copy_input(SPARC_OBJ *pSPARC, SPARC_INPUT_OBJ *pSPARC_Input) {
         }
     }

+#if defined(USE_ELPA)
+    #if !defined(USE_MKL) && !defined(USE_SCALAPACK)
+    if (rank == 0)
+        printf(RED "ERROR: To use ELPA, please turn on MKL or SCALAPACK in makefile!\n"RESET);
+    exit(EXIT_FAILURE);
+    #endif
+#endif
+
 #if !defined(USE_MKL) && !defined(USE_FFTW)
     if (pSPARC->ixc[3] != 0){
         if (rank == 0)
@@ -3649,7 +3664,7 @@ void write_output_init(SPARC_OBJ *pSPARC) {
     }

     fprintf(output_fp,"***************************************************************************\n");
-    fprintf(output_fp,"* SPARC (version Dec 4, 2024) *\n");
+    fprintf(output_fp,"* SPARC (version Dec 10, 2024) *\n");
     fprintf(output_fp,"* Copyright (c) 2020 Material Physics & Mechanics Group, Georgia Tech *\n");
     fprintf(output_fp,"* Distributed under GNU General Public License 3 (GPL) *\n");
     fprintf(output_fp,"* Start time: %s *\n",c_time_str);
@@ -4281,8 +4296,7 @@ void SPARC_Input_MPI_create(MPI_Datatype *pSPARC_INPUT_MPI) {
                          MPI_DOUBLE, MPI_DOUBLE, MPI_DOUBLE, MPI_DOUBLE, MPI_DOUBLE,
                          MPI_DOUBLE, MPI_DOUBLE, MPI_DOUBLE, MPI_DOUBLE, MPI_DOUBLE,
                          MPI_CHAR, MPI_CHAR, MPI_CHAR, MPI_CHAR, MPI_CHAR,
-                         MPI_CHAR, MPI_CHAR, MPI_CHAR, MPI_CHAR, MPI_CHAR,
-                         MPI_CHAR};
+                         MPI_CHAR, MPI_CHAR, MPI_CHAR, MPI_CHAR, MPI_CHAR};
     int blens[N_MEMBR] = {3, 3, 7, /* int array */
                           1, 1, 1, 1, 1, 1, 1, 1, 1, 1,
@@ -4325,8 +4339,7 @@
                           1, 1, 1, 1, 1, 1, /* double */
                           32, 32, 32, L_STRING, L_STRING, /* char */
-                          L_STRING, L_STRING, L_STRING, L_STRING,
-                          L_STRING};
+                          L_STRING, L_STRING, L_STRING, L_STRING, L_STRING};

     // calculating offsets in an architecture independent manner
     MPI_Aint addr[N_MEMBR],disps[N_MEMBR], base;
diff --git a/src/linearAlgebra.c b/src/linearAlgebra.c
index 9a8cf5f3..1b883b31 100644
--- a/src/linearAlgebra.c
+++ b/src/linearAlgebra.c
@@ -24,6 +24,11 @@
 #include "blacs.h"     // Cblacs_*
 #include "scalapack.h" // ScaLAPACK functions
 #endif
+/* ELPA routines */
+#ifdef USE_ELPA
+#include <elpa/elpa.h>
+#define assert_elpa_ok(x) assert(x == ELPA_OK)
+#endif

 #include "linearAlgebra.h"
 #include "parallelization.h"
@@ -530,8 +535,9 @@ void pdsygvx_subcomm_ (
             " total number of processors in the provided communicator.\n", dims[0], dims[1]);
         exit(EXIT_FAILURE);
     }
+
 #ifdef DEBUGSUBGRID
-    if (rank == 0) printf("pdsyevx_subcomm_: process grid = (%d, %d)\n", dims[0], dims[1]);
+    if (rank == 0) printf("pdsygvx_subcomm_: process grid = (%d, %d)\n", dims[0], dims[1]);
     fflush(stdout);
 #endif
     // generate a (subset) process grid within comm
@@ -546,6 +552,11 @@
     // create new context with dimensions: nproc x 1
     Cblacs_gridinit(&ictxt_old, "Row", nproc, 1);

+#ifdef USE_ELPA
+    MPI_Comm subcomm;
+    MPI_Comm_split(comm, ictxt >= 0, 0, &subcomm);
+#endif
+
     if (ictxt >= 0) {
         int nprow, npcol, myrow, mycol;
         Cblacs_gridinfo(ictxt, &nprow, &npcol, &myrow, &mycol);
@@ -590,10 +601,14 @@
         // if abstol is not provided in advance, use the most orthogonal setting
         if (*abstol < 0) *abstol = pdlamch_(&ictxt, "U");

+#ifdef USE_ELPA
+        elpa_pdsygvx(A_BLCYC, descA_BLCYC, B_BLCYC, descB_BLCYC, *n, w, Z_BLCYC, subcomm, info);
+#else
         automem_pdsygvx_(ibtype, jobz, range, uplo, n, A_BLCYC, &ONE, &ONE,
             descA_BLCYC, B_BLCYC, &ONE, &ONE, descB_BLCYC, vl, vu, il, iu, abstol,
             m, nz, w, orfac, Z_BLCYC, &ONE, &ONE, descZ_BLCYC, ifail, info);
-
+#endif
+
 #ifdef DEBUGSUBGRID
         t2 = MPI_Wtime();
         if (!rank) printf("pdsygvx_subcomm: AZ=ZD: %.3f ms\n", (t2-t1)*1e3);
@@ -625,6 +640,12 @@
     }
     if (dims[0] * dims[1] < nproc)
         Bcast_eigenvalues(w, N, ictxt, dims[0]*dims[1], comm);
+
+#ifdef USE_ELPA
+    if (subcomm != MPI_COMM_NULL)
+        MPI_Comm_free(&subcomm);
+#endif
+
     Cblacs_gridexit(ictxt_old);
 #endif // (USE_MKL or USE_SCALAPACK)
 }
@@ -665,6 +686,11 @@
     // limitations: only correct when a, b, z are in the same context
     int ictxt_old = desca[1];

+#ifdef USE_ELPA
+    MPI_Comm subcomm;
+    MPI_Comm_split(comm, ictxt >= 0, 0, &subcomm);
+#endif
+
     if (ictxt >= 0) {
         int nprow, npcol, myrow, mycol;
         Cblacs_gridinfo(ictxt, &nprow, &npcol, &myrow, &mycol);
@@ -700,10 +726,14 @@
         t1 = MPI_Wtime();
 #endif

+#ifdef USE_ELPA
+        elpa_pdsyevx(A_BLCYC, descA_BLCYC, *n, w, Z_BLCYC, subcomm, info);
+#else
         automem_pdsyevx_ (
             jobz, range, uplo, n, A_BLCYC, &ONE, &ONE, descA_BLCYC,
             vl, vu, il, iu, abstol, m, nz, w, orfac, Z_BLCYC,
             &ONE, &ONE, descZ_BLCYC, ifail, info);
+#endif

 #ifdef DEBUGSUBGRID
         t2 = MPI_Wtime();
@@ -734,6 +764,12 @@
     if (dims[0] * dims[1] < nproc)
         Bcast_eigenvalues(w, N, ictxt, dims[0]*dims[1], comm);
+
+#ifdef USE_ELPA
+    if (subcomm != MPI_COMM_NULL)
+        MPI_Comm_free(&subcomm);
+#endif
+
 #endif // (USE_MKL or USE_SCALAPACK)
 }
@@ -774,6 +810,11 @@
     // limitations: only correct when a, b, z are in the same context
     int ictxt_old = desca[1];

+#ifdef USE_ELPA
+    MPI_Comm subcomm;
+    MPI_Comm_split(comm, ictxt >= 0, 0, &subcomm);
+#endif
+
     if (ictxt >= 0) {
         int nprow, npcol, myrow, mycol;
         Cblacs_gridinfo(ictxt, &nprow, &npcol, &myrow, &mycol);
@@ -816,10 +857,14 @@
         t1 = MPI_Wtime();
 #endif

+#ifdef USE_ELPA
+        elpa_pzhegvx(A_BLCYC, descA_BLCYC, B_BLCYC, descB_BLCYC, *n, w, Z_BLCYC, subcomm, info);
+#else
         automem_pzhegvx_(ibtype, jobz, range, uplo, n, A_BLCYC, &ONE, &ONE,
             descA_BLCYC, B_BLCYC, &ONE, &ONE, descB_BLCYC, vl, vu, il, iu, abstol,
             m, nz, w, orfac, Z_BLCYC, &ONE, &ONE, descZ_BLCYC, ifail, info);
-
+#endif
+
 #ifdef DEBUGSUBGRID
         t2 = MPI_Wtime();
         if (!rank) printf("pzhegvx_subcomm_: AZ=ZD: %.3f ms\n", (t2-t1)*1e3);
@@ -851,6 +896,12 @@
     if (dims[0] * dims[1] < nproc)
         Bcast_eigenvalues(w, N, ictxt, dims[0]*dims[1], comm);
+
+#ifdef USE_ELPA
+    if (subcomm != MPI_COMM_NULL)
+        MPI_Comm_free(&subcomm);
+#endif
+
 #endif // (USE_MKL or USE_SCALAPACK)
 }
@@ -1124,3 +1175,202 @@ void pdgemm_subcomm(
     }
 #endif // (USE_MKL or USE_SCALAPACK)
 }
+
+
+#ifdef USE_ELPA
+void elpa_pdsygvx(
+    double *a, int desca[9], double *b, int descb[9],
+    int nev, double *ev, double *z, MPI_Comm comm, int *ierr)
+{
+    int ictxt = desca[1];
+    int nprow, npcol, my_prow, my_pcol;
+    Cblacs_gridinfo(ictxt, &nprow, &npcol, &my_prow, &my_pcol);
+
+    int na = desca[2];
+    int nblk = desca[4];
+    int ZERO = 0;
+    int na_rows = numroc_(&na, &nblk, &my_prow, &ZERO, &nprow);
+    int na_cols = numroc_(&na, &nblk, &my_pcol, &ZERO, &npcol);
+
+    elpa_t handle;
+
+    if (elpa_init(20171202) != ELPA_OK) {
+        printf("Invalid elpa version\n");
+        exit(1);
+    }
+
+    handle = elpa_allocate(ierr);
+    assert_elpa_ok(*ierr);
+
+    elpa_set_integer(handle, "cannon_for_generalized", 0, ierr);
+    assert_elpa_ok(*ierr);
+    elpa_set_integer(handle, "na", na, ierr);
+    assert_elpa_ok(*ierr);
+    elpa_set_integer(handle, "nev", nev, ierr);
+    assert_elpa_ok(*ierr);
+    elpa_set_integer(handle, "local_nrows", na_rows, ierr);
+    assert_elpa_ok(*ierr);
+    elpa_set_integer(handle, "local_ncols", na_cols, ierr);
+    assert_elpa_ok(*ierr);
+    elpa_set_integer(handle, "nblk", nblk, ierr);
+    assert_elpa_ok(*ierr);
+    elpa_set_integer(handle, "mpi_comm_parent", (int)(MPI_Comm_c2f(comm)), ierr);
+    assert_elpa_ok(*ierr);
+    elpa_set(handle, "process_row", my_prow, ierr);   // row coordinate of MPI process
+    assert_elpa_ok(*ierr);
+    elpa_set(handle, "process_col", my_pcol, ierr);   // column coordinate of MPI process
+    assert_elpa_ok(*ierr);
+
+    elpa_set_integer(handle, "blacs_context", (int) ictxt, ierr);
+    assert_elpa_ok(*ierr);
+
+    assert_elpa_ok(elpa_setup(handle));
+
+    elpa_set_integer(handle, "solver", ELPA_SOLVER_2STAGE, ierr);
+    assert_elpa_ok(*ierr);
+
+    elpa_generalized_eigenvectors_d(handle, a, b, ev, z, 0, ierr);
+    if (*ierr != ELPA_OK) {
+        printf("Unable to solve elpa_pdsygvx\n");
+        exit(-1);
+    }
+
+    elpa_deallocate(handle, ierr);
+    elpa_uninit(ierr);
+}
+
+
+void elpa_pdsyevx(
+    double *a, int desca[9], int nev, double *ev, double *z, MPI_Comm comm, int *ierr)
+{
+    int ictxt = desca[1];
+    int nprow, npcol, my_prow, my_pcol;
+    Cblacs_gridinfo(ictxt, &nprow, &npcol, &my_prow, &my_pcol);
+
+    int na = desca[2];
+    int nblk = desca[4];
+    int ZERO = 0;
+    int na_rows = numroc_(&na, &nblk, &my_prow, &ZERO, &nprow);
+    int na_cols = numroc_(&na, &nblk, &my_pcol, &ZERO, &npcol);
+
+    elpa_t handle;
+
+    if (elpa_init(20171201) != ELPA_OK) {   // put here the API version that you are using
+        printf("Invalid elpa version\n");
+        exit(1);
+    }
+
+    handle = elpa_allocate(ierr);
+    assert_elpa_ok(*ierr);
+
+
+    /* Set parameters of the matrix and its MPI distribution */
+    elpa_set(handle, "na", na, ierr);                 // size of the na x na matrix
+    assert_elpa_ok(*ierr);
+    elpa_set(handle, "nev", nev, ierr);               // number of eigenvectors that should be computed (1 <= nev <= na)
+    assert_elpa_ok(*ierr);
+    elpa_set(handle, "local_nrows", na_rows, ierr);   // number of local rows of the distributed matrix on this MPI task
+    assert_elpa_ok(*ierr);
+    elpa_set(handle, "local_ncols", na_cols, ierr);   // number of local columns of the distributed matrix on this MPI task
+    assert_elpa_ok(*ierr);
+    elpa_set(handle, "nblk", nblk, ierr);             // size of the BLACS block cyclic distribution
+    assert_elpa_ok(*ierr);
+    elpa_set(handle, "mpi_comm_parent", MPI_Comm_c2f(comm), ierr);   // the global MPI communicator
+    assert_elpa_ok(*ierr);
+    elpa_set(handle, "process_row", my_prow, ierr);   // row coordinate of MPI process
+    assert_elpa_ok(*ierr);
+    elpa_set(handle, "process_col", my_pcol, ierr);   // column coordinate of MPI process
+    assert_elpa_ok(*ierr);
+
+    /* Setup */
+    elpa_set_integer(handle, "blacs_context", (int) ictxt, ierr);
+    assert_elpa_ok(*ierr);
+
+    assert_elpa_ok(elpa_setup(handle));
+
+    /* if desired, set any number of tunable run-time options */
+    /* look at the list of possible options as detailed later in
+       USERS_GUIDE.md */
+
+    elpa_set(handle, "solver", ELPA_SOLVER_2STAGE, ierr);
+
+    // set the AVX BLOCK2 kernel, otherwise ELPA_2STAGE_REAL_DEFAULT will
+    // be used
+    // elpa_set(handle, "real_kernel", ELPA_2STAGE_REAL_AVX_BLOCK2, ierr);
+    assert_elpa_ok(*ierr);
+
+    /* use method solve to solve the eigenvalue problem */
+    /* other possible methods are described in USERS_GUIDE.md */
+    elpa_eigenvectors(handle, a, ev, z, ierr);
+    if (*ierr != ELPA_OK) {
+        printf("Unable to solve elpa_pdsyevx\n");
+        exit(-1);
+    }
+
+    /* cleanup */
+    elpa_deallocate(handle, ierr);
+    elpa_uninit(ierr);
+}
+
+
+void elpa_pzhegvx(
+    double _Complex *a, int desca[9], double _Complex *b, int descb[9],
+    int nev, double *ev, double _Complex *z, MPI_Comm comm, int *ierr)
+{
+    int ictxt = desca[1];
+    int nprow, npcol, my_prow, my_pcol;
+    Cblacs_gridinfo(ictxt, &nprow, &npcol, &my_prow, &my_pcol);
+
+    int na = desca[2];
+    int nblk = desca[4];
+    int ZERO = 0;
+    int na_rows = numroc_(&na, &nblk, &my_prow, &ZERO, &nprow);
+    int na_cols = numroc_(&na, &nblk, &my_pcol, &ZERO, &npcol);
+
+    elpa_t handle;
+
+    if (elpa_init(20171202) != ELPA_OK) {
+        printf("Invalid elpa version\n");
+        exit(1);
+    }
+
+    handle = elpa_allocate(ierr);
+    assert_elpa_ok(*ierr);
+
+    elpa_set_integer(handle, "cannon_for_generalized", 0, ierr);
+    assert_elpa_ok(*ierr);
+    elpa_set_integer(handle, "na", na, ierr);
+    assert_elpa_ok(*ierr);
+    elpa_set_integer(handle, "nev", nev, ierr);
+    assert_elpa_ok(*ierr);
+    elpa_set_integer(handle, "local_nrows", na_rows, ierr);
+    assert_elpa_ok(*ierr);
+    elpa_set_integer(handle, "local_ncols", na_cols, ierr);
+    assert_elpa_ok(*ierr);
+    elpa_set_integer(handle, "nblk", nblk, ierr);
+    assert_elpa_ok(*ierr);
+    elpa_set_integer(handle, "mpi_comm_parent", (int)(MPI_Comm_c2f(comm)), ierr);
+    assert_elpa_ok(*ierr);
+    elpa_set(handle, "process_row", my_prow, ierr);   // row coordinate of MPI process
+    assert_elpa_ok(*ierr);
+    elpa_set(handle, "process_col", my_pcol, ierr);   // column coordinate of MPI process
+    assert_elpa_ok(*ierr);
+
+    elpa_set_integer(handle, "blacs_context", (int) ictxt, ierr);
+    assert_elpa_ok(*ierr);
+
+    assert_elpa_ok(elpa_setup(handle));
+
+    elpa_set_integer(handle, "solver", ELPA_SOLVER_2STAGE, ierr);
+    assert_elpa_ok(*ierr);
+
+    elpa_generalized_eigenvectors_dc(handle, a, b, ev, z, 0, ierr);
+    if (*ierr != ELPA_OK) {
+        printf("Unable to solve elpa_pzhegvx\n");
+        exit(-1);
+    }
+
+    elpa_deallocate(handle, ierr);
+    elpa_uninit(ierr);
+}
+#endif
\ No newline at end of file
diff --git a/src/makefile b/src/makefile
index 075e9c58..09616d2d 100644
--- a/src/makefile
+++ b/src/makefile
@@ -16,7 +16,10 @@ USE_FFTW = 0
 DEBUG_MODE = 0

 # Set USE_SOCKET = 0 to disable compilation with socket support
-USE_SOCKET = 1
+USE_SOCKET = 0
+
+# Set USE_ELPA = 1 to use ELPA for the parallel eigensolver. ELPA requires MKL or ScaLAPACK
+USE_ELPA = 0

 # Enable SIMD vectorization for complex stencil routines
 # CAUTION: for some compilers this results in wrong results! Use for intel/19.0.3 or later versions
@@ -80,6 +83,13 @@ ifeq ($(ENABLE_SIMD_COMPLEX), 1)
 CPPFLAGS += -DENABLE_SIMD_COMPLEX
 endif

+# to enable ELPA for parallel eigensolver
+ifeq ($(USE_ELPA), 1)
+CPPFLAGS += -I${ELPA_ROOT}/include/elpa_openmp-2021.11.001 -DUSE_ELPA
+LDFLAGS += -L${ELPA_ROOT}/lib
+LDLIBS += -lelpa_openmp
+endif
+
 # for old Intel compiler, use -qopenmp instead of -fopenmp. ICC 17 and later also accepts -fopenmp.
 CFLAGS = -std=gnu99 -O3 -fopenmp
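
Usage sketch (illustrative only, not part of the patch): the snippet below shows how a caller that already owns a block-cyclically distributed matrix could drive the new elpa_pdsyevx wrapper declared in linearAlgebra.h. The helper name solve_standard_eig_elpa and its setup are hypothetical; inside SPARC the wrappers are reached only through pdsygvx_subcomm_, pdsyevx_subcomm_ and pzhegvx_subcomm_ under USE_ELPA, as in the hunks above.

#include <mpi.h>
#include <elpa/elpa.h>       // ELPA_OK
#include "linearAlgebra.h"   // elpa_pdsyevx prototype added by this patch

#ifdef USE_ELPA
// Hypothetical helper: solve the standard symmetric eigenproblem A z = lambda z
// for the lowest nev eigenpairs. A_BLCYC and Z_BLCYC are the local pieces of
// N x N matrices in the block-cyclic layout described by the 9-integer
// ScaLAPACK descriptor descA; comm must contain exactly the processes of the
// BLACS grid stored in descA[1], and every process should own at least one
// block (which the eig_paral_blksz clamp in initialization.c is there to ensure).
static void solve_standard_eig_elpa(double *A_BLCYC, int descA[9], int nev,
                                    double *eigvals, double *Z_BLCYC, MPI_Comm comm)
{
    int ierr = 0;
    elpa_pdsyevx(A_BLCYC, descA, nev, eigvals, Z_BLCYC, comm, &ierr);
    // The wrapper exits on solver failure; ierr here reflects the final
    // ELPA deallocate/uninit status.
    if (ierr != ELPA_OK) MPI_Abort(comm, ierr);
}
#endif

To enable this path, set USE_ELPA = 1 in src/makefile (or run make USE_ELPA=1) with MKL or ScaLAPACK enabled, and make sure ELPA_ROOT points at an ELPA installation; note that the include path added above is pinned to elpa_openmp-2021.11.001, so a different ELPA version requires adjusting that line.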