ROCm · isaki001 · Jan 14, 2025 · Jan 2, 2025 · Jan 3, 2025 · Jan 3, 2025
@@ -68,9 +68,13 @@ if(ENABLE_MSCCLPP)
            WORKING_DIRECTORY ${MSCCLPP_SOURCE}
         )
 	execute_process(
-           COMMAND git apply ${CMAKE_CURRENT_SOURCE_DIR}/ext-src/read-allred.patch
-           WORKING_DIRECTORY ${MSCCLPP_SOURCE}
-        )
+	   COMMAND git apply ${CMAKE_CURRENT_SOURCE_DIR}/ext-src/read-allred.patch
+	   WORKING_DIRECTORY ${MSCCLPP_SOURCE}
+	)
+	execute_process(
+	   COMMAND git apply ${CMAKE_CURRENT_SOURCE_DIR}/ext-src/mem-reg.patch
+	   WORKING_DIRECTORY ${MSCCLPP_SOURCE}
+	)
 
         message(STATUS "Building mscclpp only for gfx942.")
 
@@ -98,13 +102,18 @@ if(ENABLE_MSCCLPP)
 
         find_package(mscclpp_nccl REQUIRED)
 	execute_process(
-    		COMMAND git apply --reverse ${CMAKE_CURRENT_SOURCE_DIR}/ext-src/cpx.patch
-        	WORKING_DIRECTORY ${MSCCLPP_SOURCE}
-    	)
-	execute_process(
-           COMMAND git apply --reverse ${CMAKE_CURRENT_SOURCE_DIR}/ext-src/read-allred.patch
+           COMMAND git apply --reverse ${CMAKE_CURRENT_SOURCE_DIR}/ext-src/cpx.patch
            WORKING_DIRECTORY ${MSCCLPP_SOURCE}
         )
+	execute_process(
+	   COMMAND git apply --reverse ${CMAKE_CURRENT_SOURCE_DIR}/ext-src/read-allred.patch
+	   WORKING_DIRECTORY ${MSCCLPP_SOURCE}
+	)
+	execute_process(
+	   COMMAND git apply --reverse ${CMAKE_CURRENT_SOURCE_DIR}/ext-src/mem-reg.patch
+	   WORKING_DIRECTORY ${MSCCLPP_SOURCE}
+	)
+
     endif()
 
     execute_process(COMMAND objcopy

@@ -0,0 +1,89 @@
+diff --git a/apps/nccl/include/nccl.h b/apps/nccl/include/nccl.h
+index 7f50792..b8b146d 100644
+--- a/apps/nccl/include/nccl.h
++++ b/apps/nccl/include/nccl.h
+@@ -344,6 +344,12 @@ ncclResult_t ncclAllGather(const void* sendbuff, void* recvbuff, size_t sendcoun
+ ncclResult_t pncclAllGather(const void* sendbuff, void* recvbuff, size_t sendcount, ncclDataType_t datatype,
+                             ncclComm_t comm, cudaStream_t stream);
+
++/*
++ * Register/Deregister
++ */
++ncclResult_t ncclCommRegister(ncclComm_t comm, void* buff, size_t size, void** handle);
++ncclResult_t ncclCommDeregister(ncclComm_t comm, void* handle);
++ncclResult_t ncclBuffIsRegistered(ncclComm_t comm, const void* buff, size_t count, bool* registered);
+ /*
+  * Send
+  *
+diff --git a/apps/nccl/src/nccl.cu b/apps/nccl/src/nccl.cu
+index a697be2..d8497e7 100644
+--- a/apps/nccl/src/nccl.cu
++++ b/apps/nccl/src/nccl.cu
+@@ -577,6 +577,67 @@ NCCL_API ncclResult_t ncclAllGather(const void* sendbuff, void* recvbuff, size_t
+   return ncclSuccess;
+ }
+
++NCCL_API ncclResult_t ncclCommRegister(ncclComm_t comm, void* buff, size_t size, void** handle) {
++  size_t buffBytes = size;
++  CUdeviceptr buffBasePtr;
++  MSCCLPP_CUTHROW(cuMemGetAddressRange(&buffBasePtr, &buffBytes, (CUdeviceptr)buff));
++  // Caches channel info for the range cuMemGetAddressRange reports for buff; buffKey uses that base/size, not `size`.
++  int rank = comm->comm->bootstrap()->getRank();
++  channelKey buffKey{(void*)buffBasePtr, buffBytes};
++  // NOTE(review): the iterators assigned below (buffIt/sendIt/recvIt) are never read afterwards.
++  std::vector<mscclpp::RegisteredMemory> remoteMemories;
++
++  // For each of the three per-communicator maps, build and cache channels only when buffKey is absent.
++  auto buffIt = comm->channelScratchInfos.find(buffKey);
++  if (buffIt == comm->channelScratchInfos.end()) {
++     std::vector<mscclpp::SmChannel> channels =
++          setupSmChannels(comm, comm->remoteScratchRegMemories, const_cast<void*>((void*)buffBasePtr));
++     ChannelInfo channelInfo{channels, channels, setupSmChannelDeviceHandles(channels), setupSmChannelDeviceHandles(channels)};
++     buffIt = comm->channelScratchInfos.emplace(buffKey, channelInfo).first;
++  }
++  auto sendIt = comm->channelInInfos.find(buffKey);
++  if (sendIt == comm->channelInInfos.end()) {
++      std::vector<mscclpp::SmChannel> channels =
++          setupSmChannels(comm, comm->remoteScratchRegMemories, const_cast<void*>((void*)buffBasePtr));
++      // NOTE(review): setupRemoteMemories presumably exchanges this buffer's registration with peer ranks - confirm.
++      remoteMemories =
++          setupRemoteMemories(comm->comm, rank, (void*)buffBasePtr, buffBytes, mscclpp::Transport::CudaIpc);
++      std::vector<mscclpp::SmChannel> channels1 =
++          setupSmChannels(comm, remoteMemories, const_cast<void*>((void*)buffBasePtr));
++
++      ChannelInfo channelInfo{channels, channels1, setupSmChannelDeviceHandles(channels), setupSmChannelDeviceHandles(channels1)};
++      sendIt = comm->channelInInfos.emplace(buffKey, channelInfo).first;
++  }
++  auto recvIt = comm->channelOutInfos.find(buffKey);
++    if (recvIt == comm->channelOutInfos.end()) {
++      remoteMemories =
++          setupRemoteMemories(comm->comm, rank, (void*)buffBasePtr, buffBytes, mscclpp::Transport::CudaIpc);
++      std::vector<mscclpp::SmChannel> outChannels =
++          setupSmChannels(comm, remoteMemories, const_cast<void*>((void*)buffBasePtr));
++      ChannelInfo channelInfo{outChannels, outChannels, setupSmChannelDeviceHandles(outChannels), setupSmChannelDeviceHandles(outChannels)};
++      recvIt = comm->channelOutInfos.emplace(buffKey, channelInfo).first;
++  }
++  *handle = (void*) buffBasePtr;
++  // The handle returned to the caller is the same base address that buffKey was built from.
++  return ncclSuccess;
++}
++
++// Currently a no-op: nothing is removed from the communicator's channel maps here (NOTE(review): confirm intended).
++NCCL_API ncclResult_t ncclCommDeregister(ncclComm_t comm, void* handle) {
++  (void)comm;    // unused; kept so the signature matches the declaration in nccl.h
++  (void)handle;  // passed by value, so the former `handle = nullptr` never reached the caller
++  return ncclSuccess;
++}
++
++// Stores in *registered whether the address range containing buff has an entry in comm->channelScratchInfos.
++NCCL_API ncclResult_t ncclBuffIsRegistered(ncclComm_t comm, const void* buff, size_t count, bool* registered){
++  size_t rangeBytes;
++  CUdeviceptr rangeBase;
++  MSCCLPP_CUTHROW(cuMemGetAddressRange(&rangeBase, &rangeBytes, (CUdeviceptr)buff));
++  channelKey lookupKey{(void*)rangeBase, rangeBytes};
++  *registered = comm->channelScratchInfos.find(lookupKey) != comm->channelScratchInfos.end();
++  return ncclSuccess;
++}
+ NCCL_API ncclResult_t ncclSend(const void*, size_t, ncclDataType_t, int, ncclComm_t, cudaStream_t) {
+   // TODO: implement this function
+   return ncclInternalError;
@@ -38,6 +38,12 @@ extern "C" {
   /* See ncclAllGather. */
   ncclResult_t  mscclpp_ncclAllGather(const void* sendbuff, void* recvbuff, size_t sendcount,
       ncclDataType_t datatype, mscclppComm_t comm, hipStream_t stream);
+
+  ncclResult_t mscclpp_ncclCommRegister(mscclppComm_t comm, void* buff, size_t size, void** handle);
+
+  ncclResult_t mscclpp_ncclCommDeregister(mscclppComm_t comm, void* handle);
+
+  ncclResult_t mscclpp_ncclBuffIsRegistered(mscclppComm_t comm, const void* buff, size_t count, bool* registered);
 }
 
 namespace std {

@@ -524,8 +524,15 @@ ncclResult_t mscclEnqueueCheck(
           NCCLCHECK(mscclGetCaptureStatus(comm->rank, stream));
         }
 
+        /* Ask MSCCL++ about each user buffer separately; the receive flag must be derived from recvBuff. */
+        bool sendBuffRegistered = false, recvBuffRegistered = false;
+        mscclpp_ncclBuffIsRegistered(comm->mscclpp_comm, sendBuff, count, &sendBuffRegistered);
+        mscclpp_ncclBuffIsRegistered(comm->mscclpp_comm, recvBuff, count, &recvBuffRegistered);
+        const bool graphMode = threadLocalStatus.captureStatus != mscclNoCapture;
+        const bool buffsRegistedNonGraphMode = !graphMode && sendBuffRegistered && recvBuffRegistered;
+
         /* check if one rank per GPU and graph mode is enabled */
-        if ((threadLocalStatus.captureStatus != mscclNoCapture) && comm->mscclCompatible && nBytes > 0 && (nBytes & 31) == 0) {
+        if ((graphMode || buffsRegistedNonGraphMode) && comm->mscclCompatible && nBytes > 0 && (nBytes & 31) == 0) {
           bool isManagedBuffer = false;
           if (sendBuff) CUDACHECK(hipPointerGetAttribute(&isManagedBuffer, HIP_POINTER_ATTRIBUTE_IS_MANAGED, const_cast<void*>(sendBuff)));
           if (!isManagedBuffer && recvBuff) CUDACHECK(hipPointerGetAttribute(&isManagedBuffer, HIP_POINTER_ATTRIBUTE_IS_MANAGED, const_cast<void*>(recvBuff)));
@@ -565,8 +572,15 @@ ncclResult_t mscclEnqueueCheck(
           NCCLCHECK(mscclGetCaptureStatus(comm->rank, stream));
         }
 
+        /* Ask MSCCL++ about each user buffer separately; the receive flag must be derived from recvBuff. */
+        bool sendBuffRegistered = false, recvBuffRegistered = false;
+        mscclpp_ncclBuffIsRegistered(comm->mscclpp_comm, sendBuff, count, &sendBuffRegistered);
+        mscclpp_ncclBuffIsRegistered(comm->mscclpp_comm, recvBuff, count, &recvBuffRegistered);
+        const bool graphMode = threadLocalStatus.captureStatus != mscclNoCapture;
+        const bool buffsRegistedNonGraphMode = !graphMode && sendBuffRegistered && recvBuffRegistered;
+
         /* check if one rank per GPU and graph mode is enabled */
-        if ((threadLocalStatus.captureStatus != mscclNoCapture) && comm->mscclCompatible && nBytes > 0 && (nBytes & 31) == 0) {
+        if ((graphMode || buffsRegistedNonGraphMode) && comm->mscclCompatible && nBytes > 0 && (nBytes & 31) == 0) {
           bool isManagedBuffer = false;
           if (sendBuff) CUDACHECK(hipPointerGetAttribute(&isManagedBuffer, HIP_POINTER_ATTRIBUTE_IS_MANAGED, const_cast<void*>(sendBuff)));
           if (!isManagedBuffer && recvBuff) CUDACHECK(hipPointerGetAttribute(&isManagedBuffer, HIP_POINTER_ATTRIBUTE_IS_MANAGED, const_cast<void*>(recvBuff)));

@@ -30,3 +30,6 @@ ncclRedOpDestroy mscclpp_ncclRedOpDestroy
 ncclReduce mscclpp_ncclReduce
 ncclReduceScatter mscclpp_ncclReduceScatter
 ncclSend mscclpp_ncclSend
+ncclCommRegister mscclpp_ncclCommRegister
+ncclCommDeregister mscclpp_ncclCommDeregister
+ncclBuffIsRegistered mscclpp_ncclBuffIsRegistered
@@ -10,6 +10,7 @@
 #include "net.h"
 #include "register.h"
 #include "api_trace.h"
+#include "mscclpp/mscclpp_nccl.h"
 
 ncclResult_t ncclNetDeregister(struct ncclComm* comm, struct ncclReg* reg) {
   struct ncclRegCache* cache = &comm->regCache;
@@ -155,32 +156,59 @@ NCCL_API(ncclResult_t, ncclCommRegister, const ncclComm_t comm, void* buff, size
 ncclResult_t ncclCommRegister_impl(const ncclComm_t comm, void* buff, size_t size, void** handle) {
   NCCLCHECK(CommCheck(comm, "ncclCommRegister", "comm"));
   if (comm->checkPointers) NCCLCHECK(CudaPtrCheck(buff, comm, "buff", "ncclCommRegister"));
-  NCCLCHECK(ncclRegister(comm, buff, size, handle));
+  // Route the registration to MSCCL++ when the buffer qualifies, otherwise to RCCL's ncclRegister.
+#ifdef ENABLE_MSCCLPP
+  // Qualifies: MSCCL-compatible comm, non-empty size that is a multiple of 32 bytes and
+  // within comm->mscclpp_threshold, and the buffer is not managed memory.
+  if (comm->mscclCompatible && size > 0 && (size & 31) == 0 && size <= comm->mscclpp_threshold) {
+    bool isManagedBuffer = false;
+    CUDACHECK(hipPointerGetAttribute(&isManagedBuffer, HIP_POINTER_ATTRIBUTE_IS_MANAGED, const_cast<void*>(buff)));
+    if (!isManagedBuffer) {
+      INFO(NCCL_INIT, "MSCCL++: ncclCommRegister");
+      NCCLCHECK(mscclpp_ncclCommRegister(comm->mscclpp_comm, buff, size, handle));
+      return ncclSuccess;
+    }
+    // Managed memory: fall through to the RCCL path so *handle is always written
+    // before ncclSuccess is returned, instead of being left uninitialized.
+    WARN("MSCCL++: Cannot register user-buffers on managed memory");
+  }
+#endif
+  INFO(NCCL_INIT, "RCCL: ncclCommRegister");
+  NCCLCHECK(ncclRegister(comm, buff, size, handle));
   return ncclSuccess;
 }
 
 NCCL_API(ncclResult_t, ncclCommDeregister, const ncclComm_t comm, void* handle);
 ncclResult_t ncclCommDeregister_impl(const ncclComm_t comm, void* handle) {
-  NCCLCHECK(CommCheck(comm, "ncclCommRegister", "comm"));
-  struct ncclReg* reg = (struct ncclReg*)handle;
-  struct ncclRegCache* cache = &comm->regCache;
-  int slot;
-  for (slot=0; slot<cache->population && cache->slots[slot] != reg; slot++);
-  if (slot == cache->population) {
-    WARN("Deregister: Could not find handle");
-    return ncclInvalidUsage;
-  }
-  if (--reg->refs) return ncclSuccess;
-  NCCLCHECK(ncclNetDeregister(comm, reg));
-  if (reg->state & NVLS_REG_COMPLETE) {
-    NCCLCHECK(ncclNvlsDeregBuffer(&reg->mcHandle, reg->regAddr, reg->dev, reg->regSize));
-    reg->regAddr = (CUdeviceptr)NULL;
-  }
-  if (reg->state & COLLNET_REG_COMPLETE) {
-    NCCLCHECK(ncclCollnetDeregBuffer(comm, reg->proxyconn, reg->collnetHandle));
-  }
-  free(reg);
-  memmove(cache->slots+slot, cache->slots+slot+1, (cache->population-slot-1)*sizeof(struct ncclReg*));
-  cache->population -= 1;
+  // Release a handle from ncclCommRegister: entries found in RCCL's cache are freed here, others go to MSCCL++.
+  NCCLCHECK(CommCheck(comm, "ncclCommDeregister", "comm"));
+  struct ncclReg* reg = (struct ncclReg*)handle;
+  struct ncclRegCache* cache = &comm->regCache;
+  int slot;
+  for (slot=0; slot<cache->population && cache->slots[slot] != reg; slot++);  // pointer compare only; reg is not dereferenced
+  if (slot == cache->population) {
+#ifdef ENABLE_MSCCLPP
+    // Not in RCCL's cache: the handle was presumably issued by mscclpp_ncclCommRegister.
+    if (comm->mscclCompatible) {
+      NCCLCHECK(mscclpp_ncclCommDeregister(comm->mscclpp_comm, handle));
+      return ncclSuccess;
+    }
+#endif
+    WARN("Deregister: Could not find handle");
+    return ncclInvalidUsage;
+  }
+  // The handle is an RCCL registration: drop one reference, tear down only on the last one.
+  if (--reg->refs) return ncclSuccess;
+  NCCLCHECK(ncclNetDeregister(comm, reg));
+  if (reg->state & NVLS_REG_COMPLETE) {
+    NCCLCHECK(ncclNvlsDeregBuffer(&reg->mcHandle, reg->regAddr, reg->dev, reg->regSize));
+    reg->regAddr = (CUdeviceptr)NULL;
+  }
+  if (reg->state & COLLNET_REG_COMPLETE) {
+    NCCLCHECK(ncclCollnetDeregBuffer(comm, reg->proxyconn, reg->collnetHandle));
+  }
+  free(reg);
+  memmove(cache->slots+slot, cache->slots+slot+1, (cache->population-slot-1)*sizeof(struct ncclReg*));
+  cache->population -= 1;
   return ncclSuccess;
 }