From 779df092b103096073b8de0131135b577ad0a493 Mon Sep 17 00:00:00 2001
From: Ben Wibking
Date: Tue, 4 Feb 2025 07:20:17 +1100
Subject: [PATCH] Enable GPU-aware MPI by default (#4318)

## Summary

This turns on GPU-aware MPI by default.

## Additional background

On all current machines, simulations run faster with GPU-aware MPI enabled.
Two technical issues that prevented this are now resolved: AMReX now has the
communication arena, which does not use managed memory, and SLURM no longer
uses cgroup isolation for GPU bindings by default.

Closes https://github.com/AMReX-Codes/amrex/issues/2967.

---------

Co-authored-by: Weiqun Zhang
---
 Docs/sphinx_documentation/source/GPU.rst | 31 ++++++++++++------------
 Src/Base/AMReX_ParallelDescriptor.cpp    | 31 ++++++++++++++++++++++++
 2 files changed, 47 insertions(+), 15 deletions(-)

diff --git a/Docs/sphinx_documentation/source/GPU.rst b/Docs/sphinx_documentation/source/GPU.rst
index b4b7aba9b0f..9e370b8ee20 100644
--- a/Docs/sphinx_documentation/source/GPU.rst
+++ b/Docs/sphinx_documentation/source/GPU.rst
@@ -1643,7 +1643,7 @@ Finally, the parallel communication of particle data has been ported and optimiz
 platforms. This includes :cpp:`Redistribute()`, which moves particles back to the proper grids
 after their positions have changed, as well as :cpp:`fillNeighbors()` and :cpp:`updateNeighbors()`,
 which are used to exchange halo particles. As with :cpp:`MultiFab` data, these have been designed to minimize host / device traffic as much as possible, and can
-take advantage of the Cuda-aware MPI implementations available on platforms such as ORNL's Summit.
+take advantage of the GPU-aware MPI implementations available on platforms such as ORNL's Frontier.


 Profiling with GPUs
@@ -1742,17 +1742,18 @@ Inputs Parameters
 The following inputs parameters control the behavior of amrex when running on GPUs.
 They should be prefaced by "amrex" in your :cpp:`inputs` file.

-+----------------------------+-----------------------------------------------------------------------+-------------+----------+
-| | Description | Type | Default |
-+============================+=======================================================================+=============+==========+
-| use_gpu_aware_mpi | Whether to use GPU memory for communication buffers during MPI calls. | Bool | 0 |
-| | If true, the buffers will use device memory. If false (i.e., 0), they | | |
-| | will use pinned memory. In practice, we find it is not always worth | | |
-| | it to use GPU aware MPI. | | |
-+----------------------------+-----------------------------------------------------------------------+-------------+----------+
-| abort_on_out_of_gpu_memory | If the size of free memory on the GPU is less than the size of a | Bool | 0 |
-| | requested allocation, AMReX will call AMReX::Abort() with an error | | |
-| | describing how much free memory there is and what was requested. | | |
-+----------------------------+-----------------------------------------------------------------------+-------------+----------+
-| the_arena_is_managed | Whether :cpp:`The_Arena()` allocates managed memory. | Bool | 0 |
-+----------------------------+-----------------------------------------------------------------------+-------------+----------+
++----------------------------+-----------------------------------------------------------------------+-------------+----------------+
+| | Description | Type | Default |
++============================+=======================================================================+=============+================+
+| use_gpu_aware_mpi | Whether to use GPU memory for communication buffers during MPI calls. | Bool | MPI-dependent |
+| | If true, the buffers will use device memory. If false (i.e., 0), they | | |
+| | will use pinned memory. It will be activated if AMReX detects that | | |
+| | GPU-aware MPI is supported by the MPI library (MPICH, OpenMPI, and | | |
+| | derivative implementations). | | |
++----------------------------+-----------------------------------------------------------------------+-------------+----------------+
+| abort_on_out_of_gpu_memory | If the size of free memory on the GPU is less than the size of a | Bool | 0 |
+| | requested allocation, AMReX will call AMReX::Abort() with an error | | |
+| | describing how much free memory there is and what was requested. | | |
++----------------------------+-----------------------------------------------------------------------+-------------+----------------+
+| the_arena_is_managed | Whether :cpp:`The_Arena()` allocates managed memory. | Bool | 0 |
++----------------------------+-----------------------------------------------------------------------+-------------+----------------+

diff --git a/Src/Base/AMReX_ParallelDescriptor.cpp b/Src/Base/AMReX_ParallelDescriptor.cpp
index f6ac26e7984..88e11c3b324 100644
--- a/Src/Base/AMReX_ParallelDescriptor.cpp
+++ b/Src/Base/AMReX_ParallelDescriptor.cpp
@@ -10,6 +10,9 @@
 #ifdef BL_USE_MPI
 #include <mpi.h>
+#if __has_include(<mpi-ext.h>) && defined(OPEN_MPI)
+# include <mpi-ext.h>
+#endif
 #endif

 #ifdef AMREX_PMI
@@ -1510,6 +1513,34 @@ ReadAndBcastFile (const std::string& filename, Vector<char>& charBuf,
 void
 Initialize ()
 {
+#if defined(AMREX_USE_CUDA)
+
+#if (defined(OMPI_HAVE_MPI_EXT_CUDA) && OMPI_HAVE_MPI_EXT_CUDA) || (defined(MPICH) && defined(MPIX_GPU_SUPPORT_CUDA))
+    use_gpu_aware_mpi = (bool) MPIX_Query_cuda_support();
+#endif
+
+#elif defined(AMREX_USE_HIP)
+
+#if defined(OMPI_HAVE_MPI_EXT_ROCM) && OMPI_HAVE_MPI_EXT_ROCM
+    use_gpu_aware_mpi = (bool) MPIX_Query_rocm_support();
+#elif defined(MPICH) && defined(MPIX_GPU_SUPPORT_HIP)
+    int is_supported = 0;
+    if (MPIX_GPU_query_support(MPIX_GPU_SUPPORT_HIP, &is_supported) == MPI_SUCCESS) {
+        use_gpu_aware_mpi = (bool) is_supported;
+    }
+#endif
+
+#elif defined(AMREX_USE_SYCL)
+
+#if defined(MPICH) && defined(MPIX_GPU_SUPPORT_ZE)
+    int is_supported = 0;
+    if (MPIX_GPU_query_support(MPIX_GPU_SUPPORT_ZE, &is_supported) == MPI_SUCCESS) {
+        use_gpu_aware_mpi = (bool) is_supported;
+    }
+#endif
+
+#endif
+
 #ifndef BL_AMRPROF
     ParmParse pp("amrex");
     pp.queryAdd("use_gpu_aware_mpi", use_gpu_aware_mpi);
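
As a quick sanity check of what this patch will detect on a given system, here is a minimal standalone sketch (not part of the patch) that mirrors the CUDA branch of the detection logic above. It assumes Open MPI or MPICH built with CUDA support; the file name `check_gpu_aware.cpp` and the `mpicxx` invocation are illustrative only, and the HIP and SYCL branches follow the same pattern with `MPIX_Query_rocm_support()` / `MPIX_GPU_query_support()` as in the patch.

```cpp
// check_gpu_aware.cpp -- hypothetical standalone probe, not part of AMReX.
// Mirrors the Open MPI / MPICH CUDA branch of the detection logic added above.
#include <cstdio>
#include <mpi.h>
#if __has_include(<mpi-ext.h>) && defined(OPEN_MPI)
# include <mpi-ext.h>   // declares MPIX_Query_cuda_support() for Open MPI
#endif

int main (int argc, char** argv)
{
    MPI_Init(&argc, &argv);

    bool gpu_aware = false; // stays false if the MPI library exposes no query
#if (defined(OMPI_HAVE_MPI_EXT_CUDA) && OMPI_HAVE_MPI_EXT_CUDA) || (defined(MPICH) && defined(MPIX_GPU_SUPPORT_CUDA))
    // Open MPI and MPICH provide MPIX_Query_cuda_support() when built with
    // CUDA support; it returns 1 if device buffers can be passed to MPI calls.
    gpu_aware = (bool) MPIX_Query_cuda_support();
#endif

    int rank = 0;
    MPI_Comm_rank(MPI_COMM_WORLD, &rank);
    if (rank == 0) {
        std::printf("GPU-aware (CUDA) MPI detected: %s\n", gpu_aware ? "yes" : "no");
    }

    MPI_Finalize();
    return 0;
}
```

Compile with something like `mpicxx check_gpu_aware.cpp -o check_gpu_aware` and run it under the system's MPI launcher. Note that the detected value is only a default: because `pp.queryAdd("use_gpu_aware_mpi", use_gpu_aware_mpi)` runs after the detection block, an explicit `amrex.use_gpu_aware_mpi = 0` (or `1`) in the inputs file still overrides it.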