diff --git a/ChangeLog b/ChangeLog
index 95ed8c16..dd498448 100644
--- a/ChangeLog
+++ b/ChangeLog
@@ -2,6 +2,14 @@
 -Date
 -Name
 -changes
+
+--------------
+Dec 10, 2024
+Name: Xin Jing
+Changes: (linearAlgebra.c, eigenSolver.c, eigenSolverKpt.c)
+1. Support ELPA
+2. Fix a bug in initialization
+
 --------------
 Dec 4, 2024
 Name: Lucas Timmerman
diff --git a/src/include/linearAlgebra.h b/src/include/linearAlgebra.h
index e5705063..2996ed7c 100644
--- a/src/include/linearAlgebra.h
+++ b/src/include/linearAlgebra.h
@@ -145,4 +145,13 @@ void pdgemm_subcomm(
     int *descA, const double *B, int *descB, double beta, double *C, int *descC, MPI_Comm comm, int max_nproc);

+#ifdef USE_ELPA
+void elpa_pdsygvx(
+    double *a, int desca[9], double *b, int descb[9], int nev, double *ev, double *z, MPI_Comm comm, int *ierr);
+void elpa_pdsyevx(
+    double *a, int desca[9], int nev, double *ev, double *z, MPI_Comm comm, int *ierr);
+void elpa_pzhegvx(
+    double _Complex *a, int desca[9], double _Complex *b, int descb[9], int nev, double *ev, double _Complex *z, MPI_Comm comm, int *ierr);
+#endif
+
 #endif // LINEARALGEBRA_H
\ No newline at end of file
diff --git a/src/initialization.c b/src/initialization.c
index 4225d72e..d304baeb 100644
--- a/src/initialization.c
+++ b/src/initialization.c
@@ -429,6 +429,13 @@ void Initialize(SPARC_OBJ *pSPARC, int argc, char *argv[]) {
         if (rank == 0) printf("The dimension of subgrid for eigen sovler is (%d x %d).\n",
             pSPARC->eig_paral_subdims[0], pSPARC->eig_paral_subdims[1]);
         #endif
+
+        #ifdef USE_ELPA
+        // No processor can have zero data in ELPA
+        int maxdim = max(pSPARC->eig_paral_subdims[0], pSPARC->eig_paral_subdims[1]);
+        pSPARC->eig_paral_blksz = max(1,min(pSPARC->Nstates/maxdim, pSPARC->eig_paral_blksz));
+        // if (rank == 0) printf("eig_paral_blksz %d\n", pSPARC->eig_paral_blksz);
+        #endif
     }
 }
@@ -2601,6 +2608,14 @@ void SPARC_copy_input(SPARC_OBJ *pSPARC, SPARC_INPUT_OBJ *pSPARC_Input) {
         }
     }

+#if defined(USE_ELPA)
+    #if !defined(USE_MKL) && !defined(USE_SCALAPACK)
+    if (rank == 0)
+        printf(RED "ERROR: To use ELPA, please turn on MKL or SCALAPACK in makefile!\n"RESET);
+    exit(EXIT_FAILURE);
+    #endif
+#endif
+
 #if !defined(USE_MKL) && !defined(USE_FFTW)
     if (pSPARC->ixc[3] != 0){
         if (rank == 0)
@@ -3649,7 +3664,7 @@ void write_output_init(SPARC_OBJ *pSPARC) {
     }

     fprintf(output_fp,"***************************************************************************\n");
-    fprintf(output_fp,"* SPARC (version Dec 4, 2024) *\n");
+    fprintf(output_fp,"* SPARC (version Dec 10, 2024) *\n");
     fprintf(output_fp,"* Copyright (c) 2020 Material Physics & Mechanics Group, Georgia Tech *\n");
     fprintf(output_fp,"* Distributed under GNU General Public License 3 (GPL) *\n");
     fprintf(output_fp,"* Start time: %s *\n",c_time_str);
@@ -4281,8 +4296,7 @@ void SPARC_Input_MPI_create(MPI_Datatype *pSPARC_INPUT_MPI) {
                          MPI_DOUBLE, MPI_DOUBLE, MPI_DOUBLE, MPI_DOUBLE, MPI_DOUBLE,
                          MPI_DOUBLE, MPI_DOUBLE, MPI_DOUBLE, MPI_DOUBLE, MPI_DOUBLE,
                          MPI_CHAR, MPI_CHAR, MPI_CHAR, MPI_CHAR, MPI_CHAR,
-                         MPI_CHAR, MPI_CHAR, MPI_CHAR, MPI_CHAR, MPI_CHAR,
-                         MPI_CHAR};
+                         MPI_CHAR, MPI_CHAR, MPI_CHAR, MPI_CHAR, MPI_CHAR};
     int blens[N_MEMBR] = {3, 3, 7, /* int array */
                           1, 1, 1, 1, 1, 1, 1, 1, 1, 1,
@@ -4325,8 +4339,7 @@
                           1, 1, 1, 1, 1, 1, /* double */
                           32, 32, 32, L_STRING, L_STRING, /* char */
-                          L_STRING, L_STRING, L_STRING, L_STRING,
-                          L_STRING};
+                          L_STRING, L_STRING, L_STRING, L_STRING, L_STRING};

     // calculating offsets in an architecture independent manner
     MPI_Aint addr[N_MEMBR],disps[N_MEMBR], base;
diff --git a/src/linearAlgebra.c b/src/linearAlgebra.c
index 9a8cf5f3..1b883b31 100644
--- a/src/linearAlgebra.c
+++ b/src/linearAlgebra.c
@@ -24,6 +24,11 @@
 #include "blacs.h"     // Cblacs_*
 #include "scalapack.h" // ScaLAPACK functions
 #endif
+/* ELPA routines */
+#ifdef USE_ELPA
+#include <elpa/elpa.h>
+#define assert_elpa_ok(x) assert(x == ELPA_OK)
+#endif

 #include "linearAlgebra.h"
 #include "parallelization.h"
@@ -530,8 +535,9 @@ void pdsygvx_subcomm_ (
             " total number of processors in the provided communicator.\n", dims[0], dims[1]);
         exit(EXIT_FAILURE);
     }
+
 #ifdef DEBUGSUBGRID
-    if (rank == 0) printf("pdsyevx_subcomm_: process grid = (%d, %d)\n", dims[0], dims[1]);
+    if (rank == 0) printf("pdsygvx_subcomm_: process grid = (%d, %d)\n", dims[0], dims[1]);
     fflush(stdout);
 #endif
     // generate a (subset) process grid within comm
@@ -546,6 +552,11 @@
     // create new context with dimensions: nproc x 1
     Cblacs_gridinit(&ictxt_old, "Row", nproc, 1);

+#ifdef USE_ELPA
+    MPI_Comm subcomm;
+    MPI_Comm_split(comm, ictxt >= 0, 0, &subcomm);
+#endif
+
     if (ictxt >= 0) {
         int nprow, npcol, myrow, mycol;
         Cblacs_gridinfo(ictxt, &nprow, &npcol, &myrow, &mycol);
@@ -590,10 +601,14 @@
         // if abstol is not provided in advance, use the most orthogonal setting
         if (*abstol < 0) *abstol = pdlamch_(&ictxt, "U");

+#ifdef USE_ELPA
+        elpa_pdsygvx(A_BLCYC, descA_BLCYC, B_BLCYC, descB_BLCYC, *n, w, Z_BLCYC, subcomm, info);
+#else
         automem_pdsygvx_(ibtype, jobz, range, uplo, n, A_BLCYC, &ONE, &ONE,
             descA_BLCYC, B_BLCYC, &ONE, &ONE, descB_BLCYC, vl, vu, il, iu, abstol,
             m, nz, w, orfac, Z_BLCYC, &ONE, &ONE, descZ_BLCYC, ifail, info);
-
+#endif
+
 #ifdef DEBUGSUBGRID
         t2 = MPI_Wtime();
         if (!rank) printf("pdsygvx_subcomm: AZ=ZD: %.3f ms\n", (t2-t1)*1e3);
@@ -625,6 +640,12 @@
     }
     if (dims[0] * dims[1] < nproc)
         Bcast_eigenvalues(w, N, ictxt, dims[0]*dims[1], comm);
+
+#ifdef USE_ELPA
+    if (subcomm != MPI_COMM_NULL)
+        MPI_Comm_free(&subcomm);
+#endif
+
     Cblacs_gridexit(ictxt_old);
 #endif // (USE_MKL or USE_SCALAPACK)
 }
@@ -665,6 +686,11 @@
     // limitations: only correct when a, b, z are in the same context
     int ictxt_old = desca[1];

+#ifdef USE_ELPA
+    MPI_Comm subcomm;
+    MPI_Comm_split(comm, ictxt >= 0, 0, &subcomm);
+#endif
+
     if (ictxt >= 0) {
         int nprow, npcol, myrow, mycol;
         Cblacs_gridinfo(ictxt, &nprow, &npcol, &myrow, &mycol);
@@ -700,10 +726,14 @@
         t1 = MPI_Wtime();
 #endif

+#ifdef USE_ELPA
+        elpa_pdsyevx(A_BLCYC, descA_BLCYC, *n, w, Z_BLCYC, subcomm, info);
+#else
         automem_pdsyevx_ (
             jobz, range, uplo, n, A_BLCYC, &ONE, &ONE, descA_BLCYC,
             vl, vu, il, iu, abstol, m, nz, w, orfac, Z_BLCYC,
             &ONE, &ONE, descZ_BLCYC, ifail, info);
+#endif

 #ifdef DEBUGSUBGRID
         t2 = MPI_Wtime();
@@ -734,6 +764,12 @@
     if (dims[0] * dims[1] < nproc)
         Bcast_eigenvalues(w, N, ictxt, dims[0]*dims[1], comm);
+
+#ifdef USE_ELPA
+    if (subcomm != MPI_COMM_NULL)
+        MPI_Comm_free(&subcomm);
+#endif
+
 #endif // (USE_MKL or USE_SCALAPACK)
 }
@@ -774,6 +810,11 @@
     // limitations: only correct when a, b, z are in the same context
     int ictxt_old = desca[1];

+#ifdef USE_ELPA
+    MPI_Comm subcomm;
+    MPI_Comm_split(comm, ictxt >= 0, 0, &subcomm);
+#endif
+
     if (ictxt >= 0) {
         int nprow, npcol, myrow, mycol;
         Cblacs_gridinfo(ictxt, &nprow, &npcol, &myrow, &mycol);
@@ -816,10 +857,14 @@
         t1 = MPI_Wtime();
 #endif

+#ifdef USE_ELPA
+        elpa_pzhegvx(A_BLCYC, descA_BLCYC, B_BLCYC, descB_BLCYC, *n, w, Z_BLCYC, subcomm, info);
+#else
         automem_pzhegvx_(ibtype, jobz, range, uplo, n, A_BLCYC, &ONE, &ONE,
             descA_BLCYC, B_BLCYC, &ONE, &ONE, descB_BLCYC, vl, vu, il, iu, abstol,
             m, nz, w, orfac, Z_BLCYC, &ONE, &ONE, descZ_BLCYC, ifail, info);
-
+#endif
+
 #ifdef DEBUGSUBGRID
         t2 = MPI_Wtime();
         if (!rank) printf("pzhegvx_subcomm_: AZ=ZD: %.3f ms\n", (t2-t1)*1e3);
@@ -851,6 +896,12 @@
     if (dims[0] * dims[1] < nproc)
         Bcast_eigenvalues(w, N, ictxt, dims[0]*dims[1], comm);
+
+#ifdef USE_ELPA
+    if (subcomm != MPI_COMM_NULL)
+        MPI_Comm_free(&subcomm);
+#endif
+
 #endif // (USE_MKL or USE_SCALAPACK)
 }
@@ -1124,3 +1175,202 @@ void pdgemm_subcomm(
     }
 #endif // (USE_MKL or USE_SCALAPACK)
 }
+
+
+#ifdef USE_ELPA
+void elpa_pdsygvx(
+    double *a, int desca[9], double *b, int descb[9],
+    int nev, double *ev, double *z, MPI_Comm comm, int *ierr)
+{
+    int ictxt = desca[1];
+    int nprow, npcol, my_prow, my_pcol;
+    Cblacs_gridinfo(ictxt, &nprow, &npcol, &my_prow, &my_pcol);
+
+    int na = desca[2];
+    int nblk = desca[4];
+    int ZERO = 0;
+    int na_rows = numroc_(&na, &nblk, &my_prow, &ZERO, &nprow);
+    int na_cols = numroc_(&na, &nblk, &my_pcol, &ZERO, &npcol);
+
+    elpa_t handle;
+
+    if (elpa_init(20171202) != ELPA_OK) {
+        printf("Invalid elpa version\n");
+        exit(1);
+    }
+
+    handle = elpa_allocate(ierr);
+    assert_elpa_ok(*ierr);
+
+    elpa_set_integer(handle, "cannon_for_generalized", 0, ierr);
+    assert_elpa_ok(*ierr);
+    elpa_set_integer(handle, "na", na, ierr);
+    assert_elpa_ok(*ierr);
+    elpa_set_integer(handle, "nev", nev, ierr);
+    assert_elpa_ok(*ierr);
+    elpa_set_integer(handle, "local_nrows", na_rows, ierr);
+    assert_elpa_ok(*ierr);
+    elpa_set_integer(handle, "local_ncols", na_cols, ierr);
+    assert_elpa_ok(*ierr);
+    elpa_set_integer(handle, "nblk", nblk, ierr);
+    assert_elpa_ok(*ierr);
+    elpa_set_integer(handle, "mpi_comm_parent", (int)(MPI_Comm_c2f(comm)), ierr);
+    assert_elpa_ok(*ierr);
+    elpa_set(handle, "process_row", my_prow, ierr);   // row coordinate of MPI process
+    assert_elpa_ok(*ierr);
+    elpa_set(handle, "process_col", my_pcol, ierr);   // column coordinate of MPI process
+    assert_elpa_ok(*ierr);
+
+    elpa_set_integer(handle, "blacs_context", (int) ictxt, ierr);
+    assert_elpa_ok(*ierr);
+
+    assert_elpa_ok(elpa_setup(handle));
+
+    elpa_set_integer(handle, "solver", ELPA_SOLVER_2STAGE, ierr);
+    assert_elpa_ok(*ierr);
+
+    elpa_generalized_eigenvectors_d(handle, a, b, ev, z, 0, ierr);
+    if (*ierr != ELPA_OK) {
+        printf("Unable to solve elpa_pdsygvx\n");
+        exit(-1);
+    }
+
+    elpa_deallocate(handle, ierr);
+    elpa_uninit(ierr);
+}
+
+
+void elpa_pdsyevx(
+    double *a, int desca[9], int nev, double *ev, double *z, MPI_Comm comm, int *ierr)
+{
+    int ictxt = desca[1];
+    int nprow, npcol, my_prow, my_pcol;
+    Cblacs_gridinfo(ictxt, &nprow, &npcol, &my_prow, &my_pcol);
+
+    int na = desca[2];
+    int nblk = desca[4];
+    int ZERO = 0;
+    int na_rows = numroc_(&na, &nblk, &my_prow, &ZERO, &nprow);
+    int na_cols = numroc_(&na, &nblk, &my_pcol, &ZERO, &npcol);
+
+    elpa_t handle;
+
+    if (elpa_init(20171201) != ELPA_OK) {   // put here the API version that you are using
+        printf("Invalid elpa version\n");
+        exit(1);
+    }
+
+    handle = elpa_allocate(ierr);
+    assert_elpa_ok(*ierr);
+
+
+    /* Set parameters of the matrix and its MPI distribution */
+    elpa_set(handle, "na", na, ierr);                 // size of the na x na matrix
+    assert_elpa_ok(*ierr);
+    elpa_set(handle, "nev", nev, ierr);               // number of eigenvectors that should be computed (1 <= nev <= na)
+    assert_elpa_ok(*ierr);
+    elpa_set(handle, "local_nrows", na_rows, ierr);   // number of local rows of the distributed matrix on this MPI task
+    assert_elpa_ok(*ierr);
+    elpa_set(handle, "local_ncols", na_cols, ierr);   // number of local columns of the distributed matrix on this MPI task
+    assert_elpa_ok(*ierr);
+    elpa_set(handle, "nblk", nblk, ierr);             // size of the BLACS block cyclic distribution
+    assert_elpa_ok(*ierr);
+    elpa_set(handle, "mpi_comm_parent", MPI_Comm_c2f(comm), ierr);   // the global MPI communicator
+    assert_elpa_ok(*ierr);
+    elpa_set(handle, "process_row", my_prow, ierr);   // row coordinate of MPI process
+    assert_elpa_ok(*ierr);
+    elpa_set(handle, "process_col", my_pcol, ierr);   // column coordinate of MPI process
+    assert_elpa_ok(*ierr);
+
+    /* Setup */
+    elpa_set_integer(handle, "blacs_context", (int) ictxt, ierr);
+    assert_elpa_ok(*ierr);
+
+    assert_elpa_ok(elpa_setup(handle));
+
+    /* if desired, set any number of tunable run-time options */
+    /* look at the list of possible options as detailed later in
+       USERS_GUIDE.md */
+
+    elpa_set(handle, "solver", ELPA_SOLVER_2STAGE, ierr);
+
+    // set the AVX BLOCK2 kernel, otherwise ELPA_2STAGE_REAL_DEFAULT will
+    // be used
+    // elpa_set(handle, "real_kernel", ELPA_2STAGE_REAL_AVX_BLOCK2, ierr);
+    assert_elpa_ok(*ierr);
+
+    /* use method solve to solve the eigenvalue problem */
+    /* other possible methods are described in USERS_GUIDE.md */
+    elpa_eigenvectors(handle, a, ev, z, ierr);
+    if (*ierr != ELPA_OK) {
+        printf("Unable to solve elpa_pdsyevx\n");
+        exit(-1);
+    }
+
+    /* cleanup */
+    elpa_deallocate(handle, ierr);
+    elpa_uninit(ierr);
+}
+
+
+void elpa_pzhegvx(
+    double _Complex *a, int desca[9], double _Complex *b, int descb[9],
+    int nev, double *ev, double _Complex *z, MPI_Comm comm, int *ierr)
+{
+    int ictxt = desca[1];
+    int nprow, npcol, my_prow, my_pcol;
+    Cblacs_gridinfo(ictxt, &nprow, &npcol, &my_prow, &my_pcol);
+
+    int na = desca[2];
+    int nblk = desca[4];
+    int ZERO = 0;
+    int na_rows = numroc_(&na, &nblk, &my_prow, &ZERO, &nprow);
+    int na_cols = numroc_(&na, &nblk, &my_pcol, &ZERO, &npcol);
+
+    elpa_t handle;
+
+    if (elpa_init(20171202) != ELPA_OK) {
+        printf("Invalid elpa version\n");
+        exit(1);
+    }
+
+    handle = elpa_allocate(ierr);
+    assert_elpa_ok(*ierr);
+
+    elpa_set_integer(handle, "cannon_for_generalized", 0, ierr);
+    assert_elpa_ok(*ierr);
+    elpa_set_integer(handle, "na", na, ierr);
+    assert_elpa_ok(*ierr);
+    elpa_set_integer(handle, "nev", nev, ierr);
+    assert_elpa_ok(*ierr);
+    elpa_set_integer(handle, "local_nrows", na_rows, ierr);
+    assert_elpa_ok(*ierr);
+    elpa_set_integer(handle, "local_ncols", na_cols, ierr);
+    assert_elpa_ok(*ierr);
+    elpa_set_integer(handle, "nblk", nblk, ierr);
+    assert_elpa_ok(*ierr);
+    elpa_set_integer(handle, "mpi_comm_parent", (int)(MPI_Comm_c2f(comm)), ierr);
+    assert_elpa_ok(*ierr);
+    elpa_set(handle, "process_row", my_prow, ierr);   // row coordinate of MPI process
+    assert_elpa_ok(*ierr);
+    elpa_set(handle, "process_col", my_pcol, ierr);   // column coordinate of MPI process
+    assert_elpa_ok(*ierr);
+
+    elpa_set_integer(handle, "blacs_context", (int) ictxt, ierr);
+    assert_elpa_ok(*ierr);
+
+    assert_elpa_ok(elpa_setup(handle));
+
+    elpa_set_integer(handle, "solver", ELPA_SOLVER_2STAGE, ierr);
+    assert_elpa_ok(*ierr);
+
+    elpa_generalized_eigenvectors_dc(handle, a, b, ev, z, 0, ierr);
+    if (*ierr != ELPA_OK) {
+        printf("Unable to solve elpa_pzhegvx\n");
+        exit(-1);
+    }
+
+    elpa_deallocate(handle, ierr);
+    elpa_uninit(ierr);
+}
+#endif
\ No newline at end of file
diff --git a/src/makefile b/src/makefile
index 075e9c58..09616d2d 100644
--- a/src/makefile
+++ b/src/makefile
@@ -16,7 +16,10 @@ USE_FFTW = 0
 DEBUG_MODE = 0

 # Set USE_SOCKET = 0 to disable compilation with socket support
-USE_SOCKET = 1
+USE_SOCKET = 0
+
+# Set USE_ELPA = 1 to use ELPA for the parallel eigensolver. ELPA requires MKL or ScaLAPACK
+USE_ELPA = 0

 # Enable SIMD vectorization for complex stencil routines
 # CAUTION: for some compilers this results in wrong results! Use for intel/19.0.3 or later versions
@@ -80,6 +83,13 @@ ifeq ($(ENABLE_SIMD_COMPLEX), 1)
 CPPFLAGS += -DENABLE_SIMD_COMPLEX
 endif

+# to enable ELPA for parallel eigensolver
+ifeq ($(USE_ELPA), 1)
+CPPFLAGS += -I${ELPA_ROOT}/include/elpa_openmp-2021.11.001 -DUSE_ELPA
+LDFLAGS += -L${ELPA_ROOT}/lib
+LDLIBS += -lelpa_openmp
+endif
+
 # for old Intel compiler, use -qopenmp instead of -fopenmp. ICC 17 and later also accepts -fopenmp.
 CFLAGS = -std=gnu99 -O3 -fopenmp
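
Usage sketch (illustrative only, not part of the patch): the snippet below shows how a caller that already owns a block-cyclically distributed matrix could drive the new elpa_pdsyevx wrapper declared in linearAlgebra.h. The helper name solve_standard_eig_elpa and its setup are hypothetical; inside SPARC the wrappers are reached only through pdsygvx_subcomm_, pdsyevx_subcomm_ and pzhegvx_subcomm_ under USE_ELPA, as in the hunks above.

#include <mpi.h>
#include <elpa/elpa.h>       // ELPA_OK
#include "linearAlgebra.h"   // elpa_pdsyevx prototype added by this patch

#ifdef USE_ELPA
// Hypothetical helper: solve the standard symmetric eigenproblem A z = lambda z
// for the lowest nev eigenpairs. A_BLCYC and Z_BLCYC are the local pieces of
// N x N matrices in the block-cyclic layout described by the 9-integer
// ScaLAPACK descriptor descA; comm must contain exactly the processes of the
// BLACS grid stored in descA[1], and every process should own at least one
// block (which the eig_paral_blksz clamp in initialization.c is there to ensure).
static void solve_standard_eig_elpa(double *A_BLCYC, int descA[9], int nev,
                                    double *eigvals, double *Z_BLCYC, MPI_Comm comm)
{
    int ierr = 0;
    elpa_pdsyevx(A_BLCYC, descA, nev, eigvals, Z_BLCYC, comm, &ierr);
    // The wrapper exits on solver failure; ierr here reflects the final
    // ELPA deallocate/uninit status.
    if (ierr != ELPA_OK) MPI_Abort(comm, ierr);
}
#endif

To enable this path, set USE_ELPA = 1 in src/makefile (or run make USE_ELPA=1) with MKL or ScaLAPACK enabled, and make sure ELPA_ROOT points at an ELPA installation; note that the include path added above is pinned to elpa_openmp-2021.11.001, so a different ELPA version requires adjusting that line.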