diff --git a/CMakeLists.txt b/CMakeLists.txt index 512a7b9baa..abcf27de99 100644 --- a/CMakeLists.txt +++ b/CMakeLists.txt @@ -404,6 +404,9 @@ endif() find_package(Threads REQUIRED) if(QUDA_OPENMP) + if(${CMAKE_CXX_COMPILER_ID} MATCHES "NVHPC") + message(FATAL_ERROR "Host compiler (nvc++) not supported with QUDA_OPENMP=ON") + endif() find_package(OpenMP REQUIRED) endif() diff --git a/include/dirac_quda.h b/include/dirac_quda.h index d6b123603a..2ceb4f5b70 100644 --- a/include/dirac_quda.h +++ b/include/dirac_quda.h @@ -1911,14 +1911,12 @@ namespace quda { const bool gpu_setup; /** Where to do the coarse-operator construction*/ mutable bool init_gpu; /** Whether this instance did the GPU allocation or not */ mutable bool init_cpu; /** Whether this instance did the CPU allocation or not */ - const bool mapped; /** Whether we allocate Y and X GPU fields in mapped memory or not */ /** @brief Allocate the Y and X fields @param[in] gpu Whether to allocate on gpu (true) or cpu (false) - @param[in] mapped whether to put gpu allocations into mapped memory */ - void createY(bool gpu = true, bool mapped = false) const; + void createY(bool gpu = true) const; /** @brief Allocate the Yhat and Xinv fields @@ -1935,9 +1933,8 @@ namespace quda { /** @param[in] param Parameters defining this operator @param[in] gpu_setup Whether to do the setup on GPU or CPU - @param[in] mapped Set to true to put Y and X fields in mapped memory */ - DiracCoarse(const DiracParam &param, bool gpu_setup=true, bool mapped=false); + DiracCoarse(const DiracParam &param, bool gpu_setup=true); /** @param[in] param Parameters defining this operator diff --git a/include/quda.h b/include/quda.h index da78a50a3b..99793cb51a 100644 --- a/include/quda.h +++ b/include/quda.h @@ -776,11 +776,6 @@ extern "C" { /** Whether to use eigenvectors for the nullspace or, if the coarsest instance deflate*/ QudaBoolean use_eig_solver[QUDA_MAX_MG_LEVEL]; - /** Minimize device memory allocations during the adaptive setup, - placing 
temporary fields in mapped memory instad of device - memory */ - QudaBoolean setup_minimize_memory; - /** Whether to compute the null vectors or reload them */ QudaComputeNullVector compute_null_vector; @@ -1814,6 +1809,15 @@ extern "C" { */ void destroyDeflationQuda(void *df_instance); + /** + * @brief Flush the memory pools associated with the supplied type. + * At present this only supports the options QUDA_MEMORY_DEVICE and + * QUDA_MEMORY_HOST_PINNED, and any other type will result in an + * error. + * @param[in] type The memory type whose pool we wish to flush. + */ + void flushPoolQuda(QudaMemoryType type); + void setMPICommHandleQuda(void *mycomm); // Parameter set for quark smearing operations diff --git a/lib/check_params.h b/lib/check_params.h index cdbe36169b..738b310b77 100644 --- a/lib/check_params.h +++ b/lib/check_params.h @@ -1024,12 +1024,6 @@ void printQudaMultigridParam(QudaMultigridParam *param) { #endif } -#ifdef INIT_PARAM - P(setup_minimize_memory, QUDA_BOOLEAN_FALSE); -#else - P(setup_minimize_memory, QUDA_BOOLEAN_INVALID); -#endif - P(compute_null_vector, QUDA_COMPUTE_NULL_VECTOR_INVALID); P(generate_all_levels, QUDA_BOOLEAN_INVALID); diff --git a/lib/dirac_coarse.cpp b/lib/dirac_coarse.cpp index 62f4f7639e..9b27b996ab 100644 --- a/lib/dirac_coarse.cpp +++ b/lib/dirac_coarse.cpp @@ -7,7 +7,7 @@ namespace quda { - DiracCoarse::DiracCoarse(const DiracParam &param, bool gpu_setup, bool mapped) : + DiracCoarse::DiracCoarse(const DiracParam &param, bool gpu_setup) : Dirac(param), mass(param.mass), mu(param.mu), @@ -23,8 +23,7 @@ namespace quda { enable_cpu(false), gpu_setup(gpu_setup), init_gpu(gpu_setup), - init_cpu(!gpu_setup), - mapped(mapped) + init_cpu(!gpu_setup) { initializeCoarse(); } @@ -59,8 +58,7 @@ namespace quda { enable_cpu(Y_h ? true : false), gpu_setup(true), init_gpu(enable_gpu ? false : true), - init_cpu(enable_cpu ? false : true), - mapped(Y_d->MemType() == QUDA_MEMORY_MAPPED) + init_cpu(enable_cpu ? 
false : true) { constexpr QudaGaugeFieldOrder gOrder = QUDA_MILC_GAUGE_ORDER; @@ -116,12 +114,11 @@ namespace quda { enable_cpu(dirac.enable_cpu), gpu_setup(dirac.gpu_setup), init_gpu(enable_gpu ? false : true), - init_cpu(enable_cpu ? false : true), - mapped(dirac.mapped) + init_cpu(enable_cpu ? false : true) { } - void DiracCoarse::createY(bool gpu, bool mapped) const + void DiracCoarse::createY(bool gpu) const { int ndim = transfer->Vectors().Ndim(); lat_dim_t x; @@ -146,7 +143,6 @@ namespace quda { gParam.ghostExchange = QUDA_GHOST_EXCHANGE_PAD; gParam.nFace = 1; gParam.geometry = QUDA_COARSE_GEOMETRY; - if (mapped) gParam.mem_type = QUDA_MEMORY_MAPPED; int pad = std::max( { (x[0]*x[1]*x[2])/2, (x[1]*x[2]*x[3])/2, (x[0]*x[2]*x[3])/2, (x[0]*x[1]*x[3])/2 } ); gParam.pad = gpu ? gParam.nFace * pad * 2 : 0; // factor of 2 since we have to store bi-directional ghost zone @@ -228,7 +224,7 @@ namespace quda { void DiracCoarse::initializeCoarse() { - createY(gpu_setup, mapped); + createY(gpu_setup); if (!gpu_setup) { @@ -318,7 +314,7 @@ namespace quda { switch(location) { case QUDA_CUDA_FIELD_LOCATION: if (enable_gpu) return; - createY(true, mapped); + createY(true); createYhat(true); Y_d->copy(*Y_h); if (need_aos_gauge_copy) { Y_aos_d->copy(*Y_d); } diff --git a/lib/interface_quda.cpp b/lib/interface_quda.cpp index 6fd6382488..139ffcfc60 100644 --- a/lib/interface_quda.cpp +++ b/lib/interface_quda.cpp @@ -1336,6 +1336,20 @@ void freeCloverQuda(void) void flushChronoQuda(int i) { flushChrono(i); } +void flushPoolQuda(QudaMemoryType type) +{ + switch (type) { + case QUDA_MEMORY_DEVICE: + pool::flush_device(); + break; + case QUDA_MEMORY_HOST_PINNED: + pool::flush_pinned(); + break; + default: + errorQuda("MemoryType %d not supported", type); + } +} + void endQuda(void) { if (!initialized) return; @@ -2755,7 +2769,6 @@ multigrid_solver::multigrid_solver(QudaMultigridParam &mg_param) Bprec = (mg_param.setup_location[0] == QUDA_CPU_FIELD_LOCATION && Bprec < 
QUDA_SINGLE_PRECISION ? QUDA_SINGLE_PRECISION : Bprec); csParam.setPrecision(Bprec, Bprec, true); if (mg_param.setup_location[0] == QUDA_CPU_FIELD_LOCATION) csParam.fieldOrder = QUDA_SPACE_SPIN_COLOR_FIELD_ORDER; - csParam.mem_type = mg_param.setup_minimize_memory == QUDA_BOOLEAN_TRUE ? QUDA_MEMORY_MAPPED : QUDA_MEMORY_DEVICE; B.resize(mg_param.n_vec[0]); if (mg_param.transfer_type[0] == QUDA_TRANSFER_COARSE_KD || mg_param.transfer_type[0] == QUDA_TRANSFER_OPTIMIZED_KD diff --git a/lib/inv_bicgstab_quda.cpp b/lib/inv_bicgstab_quda.cpp index 2cffc2aeed..199c9d581c 100644 --- a/lib/inv_bicgstab_quda.cpp +++ b/lib/inv_bicgstab_quda.cpp @@ -42,7 +42,6 @@ namespace quda { } else { csParam.create = QUDA_NULL_FIELD_CREATE; resize(r0, b.size(), csParam); - blas::copy(r0, r); } } else { csParam.create = QUDA_NULL_FIELD_CREATE; @@ -126,7 +125,10 @@ namespace quda { if (param.use_init_guess == QUDA_USE_INIT_GUESS_YES) { mat(r, x); r2 = blas::xmyNorm(b, r); + for (auto i = 0u; i < b.size(); i++) + if (b2[i] == 0) b2[i] = r2[i]; for (auto i = 0u; i < x.size(); i++) std::swap(y[i], x[i]); + create_alias(x_sloppy, x); // need to update alias since x has been swapped } else { blas::copy(r, b); r2 = b2; @@ -145,6 +147,8 @@ namespace quda { if (param.precision != param.precision_sloppy) { blas::copy(r_sloppy, r); blas::copy(r0, param.compute_null_vector == QUDA_COMPUTE_NULL_VECTOR_NO ? 
b : r); + } else { + if (param.compute_null_vector == QUDA_COMPUTE_NULL_VECTOR_YES) blas::copy(r0, r); } getProfile().TPSTOP(QUDA_PROFILE_INIT); diff --git a/lib/inv_bicgstabl_quda.cpp b/lib/inv_bicgstabl_quda.cpp index 98176af28a..5ce3422007 100644 --- a/lib/inv_bicgstabl_quda.cpp +++ b/lib/inv_bicgstabl_quda.cpp @@ -521,6 +521,8 @@ namespace quda { blas::xpay(b, -1.0, r_full); r2 = b2; // dummy setting } + for (auto i = 0u; i < b.size(); i++) + if (b2[i] == 0) b2[i] = r2[i]; blas::copy(y, x); // we accumulate into y } else { blas::copy(r_full, b); // r[0] = b diff --git a/lib/milc_interface.cpp b/lib/milc_interface.cpp index 23473eaa4b..a4b794712b 100644 --- a/lib/milc_interface.cpp +++ b/lib/milc_interface.cpp @@ -2491,9 +2491,6 @@ void milcSetMultigridParam(milcMultigridPack *mg_pack, QudaPrecision host_precis mg_param.setup_location[i] = QUDA_CUDA_FIELD_LOCATION; // setup_location[i]; } - // whether to run GPU setup but putting temporaries into mapped (slow CPU) memory - mg_param.setup_minimize_memory = QUDA_BOOLEAN_FALSE; - // coarsening the spin on the first restriction is undefined for staggered fields. mg_param.spin_block_size[0] = 0; if (input_struct.optimized_kd == QUDA_TRANSFER_OPTIMIZED_KD diff --git a/lib/multigrid.cpp b/lib/multigrid.cpp index f355203c6a..e4137bf4f4 100644 --- a/lib/multigrid.cpp +++ b/lib/multigrid.cpp @@ -412,8 +412,7 @@ namespace quda diracParam.dslash_use_mma = param.mg_global.dslash_use_mma[param.level + 1]; diracParam.allow_truncation = (param.mg_global.allow_truncation == QUDA_BOOLEAN_TRUE) ? true : false; - diracCoarseResidual = new DiracCoarse(diracParam, param.setup_location == QUDA_CUDA_FIELD_LOCATION ? true : false, - param.mg_global.setup_minimize_memory == QUDA_BOOLEAN_TRUE ? true : false); + diracCoarseResidual = new DiracCoarse(diracParam, param.setup_location == QUDA_CUDA_FIELD_LOCATION ? 
true : false); // create smoothing operators diracParam.dirac = const_cast(param.matSmooth->Expose()); diff --git a/lib/targets/cuda/malloc.cpp b/lib/targets/cuda/malloc.cpp index 2b0d3c97ba..46920505be 100644 --- a/lib/targets/cuda/malloc.cpp +++ b/lib/targets/cuda/malloc.cpp @@ -774,6 +774,7 @@ namespace quda void flush_pinned() { + logQuda(QUDA_DEBUG_VERBOSE, "Flushing host pinned memory pool\n"); if (pinned_memory_pool) { for (auto it : pinnedCache) { host_free(it.second); } pinnedCache.clear(); @@ -782,6 +783,7 @@ namespace quda void flush_device() { + logQuda(QUDA_DEBUG_VERBOSE, "Flushing device memory pool\n"); if (device_memory_pool) { for (auto it : deviceCache) { device_free(it.second); } deviceCache.clear(); diff --git a/lib/targets/hip/malloc.cpp b/lib/targets/hip/malloc.cpp index f46c44c8d0..4107e2efdd 100644 --- a/lib/targets/hip/malloc.cpp +++ b/lib/targets/hip/malloc.cpp @@ -713,6 +713,7 @@ namespace quda void flush_pinned() { if (pinned_memory_pool) { + logQuda(QUDA_DEBUG_VERBOSE, "Flushing host pinned memory pool\n"); std::multimap::iterator it; for (it = pinnedCache.begin(); it != pinnedCache.end(); it++) { void *ptr = it->second; @@ -725,7 +726,8 @@ namespace quda void flush_device() { if (device_memory_pool) { - std::multimap::iterator it; + logQuda(QUDA_DEBUG_VERBOSE, "Flushing device memory pool\n"); + std::multimap::iterator it; for (it = deviceCache.begin(); it != deviceCache.end(); it++) { void *ptr = it->second; device_free(ptr); diff --git a/tests/utils/set_params.cpp b/tests/utils/set_params.cpp index 98bf9da590..e71b28793d 100644 --- a/tests/utils/set_params.cpp +++ b/tests/utils/set_params.cpp @@ -609,9 +609,6 @@ void setMultigridParam(QudaMultigridParam &mg_param) mg_param.setup_location[i] = setup_location[i]; } - // whether to run GPU setup but putting temporaries into mapped (slow CPU) memory - mg_param.setup_minimize_memory = QUDA_BOOLEAN_FALSE; - // only coarsen the spin on the first restriction mg_param.spin_block_size[0] = 2; 
@@ -1226,9 +1223,6 @@ void setStaggeredMultigridParam(QudaMultigridParam &mg_param) nu_post[i] = 2; } - // whether to run GPU setup but putting temporaries into mapped (slow CPU) memory - mg_param.setup_minimize_memory = QUDA_BOOLEAN_FALSE; - // coarsening the spin on the first restriction is undefined for staggered fields. mg_param.spin_block_size[0] = 0;