Skip to content

Commit

Permalink
init: Avoid hang by forcing SENDRECV in case of neuron v4 API usage
Browse files Browse the repository at this point in the history
v4 API may block indefinitely when executed with RDMA protocol because
communicator creation is (a) a blocking operation by definition of the v4
API and (b) performing a 4-way handshake in case of RDMA
protocol. Therefore, we force it to use SENDRECV protocol in case the
neuron specific API is used. We do not force SENDRECV protocol in case
of NCCL API, since there is no known platform that uses RDMA protocol
with v4 API. Note, on P5 instances, NCCL needs to support
a more recent API anyway.

Signed-off-by: Michael Axtmann <[email protected]>
  • Loading branch information
maxtmann committed Aug 21, 2024
1 parent 7c014f0 commit 2b1d795
Show file tree
Hide file tree
Showing 4 changed files with 22 additions and 3 deletions.
3 changes: 3 additions & 0 deletions include/nccl_ofi.h
Original file line number Diff line number Diff line change
Expand Up @@ -137,6 +137,9 @@ extern float net_latency;
/* Size of system memory pages */
extern long system_page_size;

/* Indicates whether neuron v4 interface is used for initialization.
 * Declaration only: no initializer here, otherwise every translation
 * unit including this header would define the symbol. The single
 * definition (initialized to false) lives in src/nccl_ofi_net.c. */
extern bool v4_neuron_interface;

struct nccl_net_ofi_plugin;
struct nccl_net_ofi_device;
struct nccl_net_ofi_ep;
Expand Down
9 changes: 7 additions & 2 deletions src/nccl_ofi_interface_neuron.c
Original file line number Diff line number Diff line change
Expand Up @@ -7,6 +7,12 @@
#include "nccl_ofi.h"
#include "nccl_ofi_api.h"

/*
 * @brief	Init entry point of the neuron v4 plugin interface
 *
 * Records that the neuron v4 interface is in use by setting
 * v4_neuron_interface, then delegates to nccl_net_ofi_init(). The flag
 * must be set before the delegate runs; platform_init() checks it to
 * select the SENDRECV protocol (presumably reached during
 * initialization -- confirm against nccl_net_ofi_init()).
 *
 * @param	logFunction
 *		Logger passed through unchanged to nccl_net_ofi_init()
 * @return	Whatever nccl_net_ofi_init() returns
 */
static ncclResult_t init_v4(ncclDebugLogger_t logFunction)
{
	ncclResult_t ret;

	/* Mark v4 usage first so that initialization can observe it */
	v4_neuron_interface = true;

	ret = nccl_net_ofi_init(logFunction);
	return ret;
}

static ncclResult_t getProperties_v4(int dev_id, ncclNetProperties_v4_t *props)
{
nccl_ofi_properties_t ofi_properties;
Expand All @@ -29,10 +35,9 @@ static ncclResult_t getProperties_v4(int dev_id, ncclNetProperties_v4_t *props)
return ncclSuccess;
}


NCCL_OFI_EXPORT_SYMBOL const ncclNet_v4_t ncclNetPlugin_v4 = {
.name = "AWS Libfabric",
.init = nccl_net_ofi_init,
.init = init_v4,
.devices = nccl_net_ofi_devices,
.getProperties = getProperties_v4,
.listen = nccl_net_ofi_listen_v4,
Expand Down
3 changes: 3 additions & 0 deletions src/nccl_ofi_net.c
Original file line number Diff line number Diff line change
Expand Up @@ -69,6 +69,9 @@ float net_latency = .0;
/* Size of a memory page */
long system_page_size = -1;

/* Indicates whether neuron v4 interface is used for initialization.
 * Defaults to false; init_v4() in nccl_ofi_interface_neuron.c sets it
 * to true before calling nccl_net_ofi_init(), and platform_init()
 * reads it to force the SENDRECV protocol. */
bool v4_neuron_interface = false;

/*
* @brief Allocate memory region for memory registration
*
Expand Down
10 changes: 9 additions & 1 deletion src/platform-aws.c
Original file line number Diff line number Diff line change
Expand Up @@ -584,7 +584,15 @@ int platform_init(const char **provider_filter)
net_latency);
}

if (select_efa && ofi_nccl_protocol() == NULL && platform_data) {
/* v4 API may block indefinitely when executed with RDMA
* protocol because communicator creation is (a) a blocking
* operation by definition of the v4 interface and (b) performing
* a 4-way handshake in case of RDMA protocol. Therefore, we
* force it to use SENDRECV protocol in case the neuron
* specific API is used. */
if (v4_neuron_interface) {
nccl_ofi_selected_protocol = "SENDRECV";
} else if (select_efa && ofi_nccl_protocol() == NULL && platform_data) {
nccl_ofi_selected_protocol = platform_data->default_protocol;
}

Expand Down

0 comments on commit 2b1d795

Please sign in to comment.