From 779df092b103096073b8de0131135b577ad0a493 Mon Sep 17 00:00:00 2001
From: Ben Wibking
Date: Tue, 4 Feb 2025 07:20:17 +1100
Subject: [PATCH] Enable GPU-aware MPI by default (#4318)

## Summary

This turns on GPU-aware MPI by default.

## Additional background

On all current machines, simulations run faster with GPU-aware MPI enabled.
Two technical issues that prevented this are now resolved: AMReX now has the
communication arena, which does not use managed memory, and SLURM no longer
uses cgroup isolation for GPU bindings by default.

Closes https://github.com/AMReX-Codes/amrex/issues/2967.

---------

Co-authored-by: Weiqun Zhang
---
 Docs/sphinx_documentation/source/GPU.rst | 31 ++++++++++++------------
 Src/Base/AMReX_ParallelDescriptor.cpp    | 31 ++++++++++++++++++++++++
 2 files changed, 47 insertions(+), 15 deletions(-)

diff --git a/Docs/sphinx_documentation/source/GPU.rst b/Docs/sphinx_documentation/source/GPU.rst
index b4b7aba9b0f..9e370b8ee20 100644
--- a/Docs/sphinx_documentation/source/GPU.rst
+++ b/Docs/sphinx_documentation/source/GPU.rst
@@ -1643,7 +1643,7 @@ Finally, the parallel communication of particle data has been ported and optimiz
 platforms. This includes :cpp:`Redistribute()`, which moves particles back to the proper grids
 after their positions have changed, as well as :cpp:`fillNeighbors()` and :cpp:`updateNeighbors()`,
 which are used to exchange halo particles. As with :cpp:`MultiFab` data, these have been designed to minimize host / device traffic as much as possible, and can
-take advantage of the Cuda-aware MPI implementations available on platforms such as ORNL's Summit.
+take advantage of the GPU-aware MPI implementations available on platforms such as ORNL's Frontier.


 Profiling with GPUs
@@ -1742,17 +1742,18 @@ Inputs Parameters
 The following inputs parameters control the behavior of amrex when running on GPUs.
 They should be prefaced by "amrex" in your :cpp:`inputs` file.

-+----------------------------+-----------------------------------------------------------------------+-------------+----------+
-| | Description | Type | Default |
-+============================+=======================================================================+=============+==========+
-| use_gpu_aware_mpi | Whether to use GPU memory for communication buffers during MPI calls. | Bool | 0 |
-| | If true, the buffers will use device memory. If false (i.e., 0), they | | |
-| | will use pinned memory. In practice, we find it is not always worth | | |
-| | it to use GPU aware MPI. | | |
-+----------------------------+-----------------------------------------------------------------------+-------------+----------+
-| abort_on_out_of_gpu_memory | If the size of free memory on the GPU is less than the size of a | Bool | 0 |
-| | requested allocation, AMReX will call AMReX::Abort() with an error | | |
-| | describing how much free memory there is and what was requested. | | |
-+----------------------------+-----------------------------------------------------------------------+-------------+----------+
-| the_arena_is_managed | Whether :cpp:`The_Arena()` allocates managed memory. | Bool | 0 |
-+----------------------------+-----------------------------------------------------------------------+-------------+----------+
++----------------------------+-----------------------------------------------------------------------+-------------+----------------+
+| | Description | Type | Default |
++============================+=======================================================================+=============+================+
+| use_gpu_aware_mpi | Whether to use GPU memory for communication buffers during MPI calls. | Bool | MPI-dependent |
+| | If true, the buffers will use device memory. If false (i.e., 0), they | | |
+| | will use pinned memory. It will be activated if AMReX detects that | | |
+| | GPU-aware MPI is supported by the MPI library (MPICH, OpenMPI, and | | |
+| | derivative implementations). | | |
++----------------------------+-----------------------------------------------------------------------+-------------+----------------+
+| abort_on_out_of_gpu_memory | If the size of free memory on the GPU is less than the size of a | Bool | 0 |
+| | requested allocation, AMReX will call AMReX::Abort() with an error | | |
+| | describing how much free memory there is and what was requested. | | |
++----------------------------+-----------------------------------------------------------------------+-------------+----------------+
+| the_arena_is_managed | Whether :cpp:`The_Arena()` allocates managed memory. | Bool | 0 |
++----------------------------+-----------------------------------------------------------------------+-------------+----------------+

diff --git a/Src/Base/AMReX_ParallelDescriptor.cpp b/Src/Base/AMReX_ParallelDescriptor.cpp
index f6ac26e7984..88e11c3b324 100644
--- a/Src/Base/AMReX_ParallelDescriptor.cpp
+++ b/Src/Base/AMReX_ParallelDescriptor.cpp
@@ -10,6 +10,9 @@
 #ifdef BL_USE_MPI
 #include <mpi.h>
+#if __has_include(<mpi-ext.h>) && defined(OPEN_MPI)
+# include <mpi-ext.h>
+#endif
 #endif

 #ifdef AMREX_PMI
@@ -1510,6 +1513,34 @@ ReadAndBcastFile (const std::string& filename, Vector<char>& charBuf,
 void
 Initialize ()
 {
+#if defined(AMREX_USE_CUDA)
+
+#if (defined(OMPI_HAVE_MPI_EXT_CUDA) && OMPI_HAVE_MPI_EXT_CUDA) || (defined(MPICH) && defined(MPIX_GPU_SUPPORT_CUDA))
+    use_gpu_aware_mpi = (bool) MPIX_Query_cuda_support();
+#endif
+
+#elif defined(AMREX_USE_HIP)
+
+#if defined(OMPI_HAVE_MPI_EXT_ROCM) && OMPI_HAVE_MPI_EXT_ROCM
+    use_gpu_aware_mpi = (bool) MPIX_Query_rocm_support();
+#elif defined(MPICH) && defined(MPIX_GPU_SUPPORT_HIP)
+    int is_supported = 0;
+    if (MPIX_GPU_query_support(MPIX_GPU_SUPPORT_HIP, &is_supported) == MPI_SUCCESS) {
+        use_gpu_aware_mpi = (bool) is_supported;
+    }
+#endif
+
+#elif defined(AMREX_USE_SYCL)
+
+#if defined(MPICH) && defined(MPIX_GPU_SUPPORT_ZE)
+    int is_supported = 0;
+    if (MPIX_GPU_query_support(MPIX_GPU_SUPPORT_ZE, &is_supported) == MPI_SUCCESS) {
+        use_gpu_aware_mpi = (bool) is_supported;
+    }
+#endif
+
+#endif
+
 #ifndef BL_AMRPROF
     ParmParse pp("amrex");
     pp.queryAdd("use_gpu_aware_mpi", use_gpu_aware_mpi);
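
As a quick sanity check of what this patch will detect on a given system, here is a minimal standalone sketch (not part of the patch) that mirrors the CUDA branch of the detection logic above. It assumes Open MPI or MPICH built with CUDA support; the file name `check_gpu_aware.cpp` and the `mpicxx` invocation are illustrative only, and the HIP and SYCL branches follow the same pattern with `MPIX_Query_rocm_support()` / `MPIX_GPU_query_support()` as in the patch.

```cpp
// check_gpu_aware.cpp -- hypothetical standalone probe, not part of AMReX.
// Mirrors the Open MPI / MPICH CUDA branch of the detection logic added above.
#include <cstdio>
#include <mpi.h>
#if __has_include(<mpi-ext.h>) && defined(OPEN_MPI)
# include <mpi-ext.h>   // declares MPIX_Query_cuda_support() for Open MPI
#endif

int main (int argc, char** argv)
{
    MPI_Init(&argc, &argv);

    bool gpu_aware = false; // stays false if the MPI library exposes no query
#if (defined(OMPI_HAVE_MPI_EXT_CUDA) && OMPI_HAVE_MPI_EXT_CUDA) || (defined(MPICH) && defined(MPIX_GPU_SUPPORT_CUDA))
    // Open MPI and MPICH provide MPIX_Query_cuda_support() when built with
    // CUDA support; it returns 1 if device buffers can be passed to MPI calls.
    gpu_aware = (bool) MPIX_Query_cuda_support();
#endif

    int rank = 0;
    MPI_Comm_rank(MPI_COMM_WORLD, &rank);
    if (rank == 0) {
        std::printf("GPU-aware (CUDA) MPI detected: %s\n", gpu_aware ? "yes" : "no");
    }

    MPI_Finalize();
    return 0;
}
```

Compile with something like `mpicxx check_gpu_aware.cpp -o check_gpu_aware` and run it under the system's MPI launcher. Note that the detected value is only a default: because `pp.queryAdd("use_gpu_aware_mpi", use_gpu_aware_mpi)` runs after the detection block, an explicit `amrex.use_gpu_aware_mpi = 0` (or `1`) in the inputs file still overrides it.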