From a5117a633a73a08fc70490c9868039db82af6875 Mon Sep 17 00:00:00 2001 From: chang-l Date: Thu, 21 Nov 2024 12:06:32 -0800 Subject: [PATCH] Fix format/CI --- cpp/src/nvml_wrap.cpp | 24 ++++++++++-------- cpp/src/nvml_wrap.h | 6 +++-- cpp/src/wholememory/communicator.cpp | 38 +++++++++++++++------------- cpp/src/wholememory/system_info.hpp | 4 +-- 4 files changed, 41 insertions(+), 31 deletions(-) diff --git a/cpp/src/nvml_wrap.cpp b/cpp/src/nvml_wrap.cpp index 4e32fac51..fc2551858 100644 --- a/cpp/src/nvml_wrap.cpp +++ b/cpp/src/nvml_wrap.cpp @@ -12,10 +12,12 @@ // See the License for the specific language governing permissions and // limitations under the License. +#include "nvml_wrap.h" + +#if CUDA_VERSION >= 12030 #include -#include #include -#include "nvml_wrap.h" +#include namespace { @@ -23,7 +25,8 @@ void* nvml_handle = nullptr; std::mutex nvml_mutex; bool nvml_loaded = false; -bool LoadNvmlLibrary() { +bool LoadNvmlLibrary() +{ nvml_handle = dlopen("libnvidia-ml.so.1", RTLD_NOW); if (!nvml_handle) { nvml_handle = dlopen("libnvidia-ml.so", RTLD_NOW); @@ -36,11 +39,10 @@ bool LoadNvmlLibrary() { } template -T LoadNvmlSymbol(const char* name) { +T LoadNvmlSymbol(const char* name) +{ void* symbol = dlsym(nvml_handle, name); - if (!symbol) { - return nullptr; - } + if (!symbol) { return nullptr; } return reinterpret_cast(symbol); } @@ -51,10 +53,11 @@ nvmlDeviceGetHandleByIndexFunc nvmlDeviceGetHandleByIndexPtr = nullptr; nvmlDeviceGetGpuFabricInfoFunc nvmlDeviceGetGpuFabricInfoPtr = nullptr; // Ensure NVML is loaded and symbols are initialized -bool NvmlFabricSymbolLoaded() { +bool NvmlFabricSymbolLoaded() +{ std::lock_guard lock(nvml_mutex); if (nvml_loaded) { - return true; // Already loaded + return true; // Already loaded } if (LoadNvmlLibrary()) { @@ -71,4 +74,5 @@ bool NvmlFabricSymbolLoaded() { } } return nvml_loaded; -} \ No newline at end of file +} +#endif diff --git a/cpp/src/nvml_wrap.h b/cpp/src/nvml_wrap.h index c482a0eb9..f8b22fc7f 100644 --- a/cpp/src/nvml_wrap.h +++ b/cpp/src/nvml_wrap.h @@ -13,14 +13,16 @@ // limitations under the License. #pragma once +#include +#if CUDA_VERSION >= 12030 #include - bool NvmlFabricSymbolLoaded(); typedef nvmlReturn_t (*nvmlDeviceGetHandleByIndexFunc)(unsigned int, nvmlDevice_t*); typedef nvmlReturn_t (*nvmlDeviceGetGpuFabricInfoFunc)(nvmlDevice_t, nvmlGpuFabricInfo_t*); extern nvmlDeviceGetHandleByIndexFunc nvmlDeviceGetHandleByIndexPtr; -extern nvmlDeviceGetGpuFabricInfoFunc nvmlDeviceGetGpuFabricInfoPtr; \ No newline at end of file +extern nvmlDeviceGetGpuFabricInfoFunc nvmlDeviceGetGpuFabricInfoPtr; +#endif diff --git a/cpp/src/wholememory/communicator.cpp b/cpp/src/wholememory/communicator.cpp index 2601981f1..34053ad7e 100644 --- a/cpp/src/wholememory/communicator.cpp +++ b/cpp/src/wholememory/communicator.cpp @@ -535,7 +535,7 @@ void exchange_rank_info(wholememory_comm_t wm_comm) wm_comm->clique_info.is_in_clique = 0; #if CUDA_VERSION >= 12030 - if(nvmlFabricSymbolLoaded) { + if (nvmlFabricSymbolLoaded) { memset(&ri.fabric_info, 0, sizeof(ri.fabric_info)); WHOLEMEMORY_CHECK_NOTHROW(GetGpuFabricInfo(wm_comm->dev_id, &ri.fabric_info) == WHOLEMEMORY_SUCCESS); @@ -548,7 +548,9 @@ void exchange_rank_info(wholememory_comm_t wm_comm) wm_comm->clique_info.is_in_clique = 1; } } else { - WHOLEMEMORY_WARN("Some required NVML symbols are missing, likely due to an outdated GPU display driver. MNNVL support will be disabled."); + WHOLEMEMORY_WARN( + "Some required NVML symbols are missing, likely due to an outdated GPU display driver. MNNVL " + "support will be disabled."); } #endif @@ -578,30 +580,32 @@ void exchange_rank_info(wholememory_comm_t wm_comm) } #if CUDA_VERSION >= 12030 - if(nvmlFabricSymbolLoaded) { - if ((memcmp(ri.fabric_info.clusterUuid, - p_rank_info.get()[r].fabric_info.clusterUuid, - NVML_GPU_FABRIC_UUID_LEN) == 0) && - (ri.fabric_info.cliqueId == p_rank_info.get()[r].fabric_info.cliqueId)) { - if (r == wm_comm->world_rank) { - wm_comm->clique_info.clique_rank = wm_comm->clique_info.clique_rank_num; + if (nvmlFabricSymbolLoaded) { + if ((memcmp(ri.fabric_info.clusterUuid, + p_rank_info.get()[r].fabric_info.clusterUuid, + NVML_GPU_FABRIC_UUID_LEN) == 0) && + (ri.fabric_info.cliqueId == p_rank_info.get()[r].fabric_info.cliqueId)) { + if (r == wm_comm->world_rank) { + wm_comm->clique_info.clique_rank = wm_comm->clique_info.clique_rank_num; + } + if (wm_comm->clique_info.clique_rank_num == 0) { + wm_comm->clique_info.clique_first_rank = r; + } + wm_comm->clique_info.clique_rank_num++; } - if (wm_comm->clique_info.clique_rank_num == 0) { wm_comm->clique_info.clique_first_rank = r; } - wm_comm->clique_info.clique_rank_num++; + clique_uuids.insert( + std::string(reinterpret_cast(p_rank_info.get()[r].fabric_info.clusterUuid), + NVML_GPU_FABRIC_UUID_LEN)); } - clique_uuids.insert( - std::string(reinterpret_cast(p_rank_info.get()[r].fabric_info.clusterUuid), - NVML_GPU_FABRIC_UUID_LEN)); - } #endif } #if CUDA_VERSION >= 12030 - if(nvmlFabricSymbolLoaded) { + if (nvmlFabricSymbolLoaded) { wm_comm->clique_info.clique_num = clique_uuids.size(); std::string uuid = std::string(reinterpret_cast(ri.fabric_info.clusterUuid), - NVML_GPU_FABRIC_UUID_LEN); + NVML_GPU_FABRIC_UUID_LEN); int id = 0; for (auto clique_uuid : clique_uuids) { if (clique_uuid == uuid) { wm_comm->clique_info.clique_id = id; } diff --git a/cpp/src/wholememory/system_info.hpp b/cpp/src/wholememory/system_info.hpp index 380a91d31..4d6c52c27 100644 --- a/cpp/src/wholememory/system_info.hpp +++ b/cpp/src/wholememory/system_info.hpp @@ -18,8 +18,8 @@ #include "wholememory/wholememory.h" #if CUDA_VERSION >= 12030 -#include #include "nvml_wrap.h" +#include #endif bool DevAttrPagebleMemoryAccess(); @@ -41,6 +41,6 @@ namespace wholememory { inline bool nvmlFabricSymbolLoaded = NvmlFabricSymbolLoaded(); wholememory_error_code_t GetGpuFabricInfo(int dev, nvmlGpuFabricInfo_t* gpuFabricInfo); -} +} // namespace wholememory #endif