Skip to content

Commit

Permalink
RMA: Add implementation of RMA operation to RDMA protocol
Browse files Browse the repository at this point in the history
So far, the Neuron V5 API interface does not provide implementations
for the RMA operations. Instead, it sets `rma_supported=0` when the user
queries device properties.
This commit provides those implementations and reports RMA support to
the user.

Signed-off-by: Michael Axtmann <[email protected]>
  • Loading branch information
maxtmann committed Aug 30, 2024
1 parent 05b8d20 commit 7de3626
Show file tree
Hide file tree
Showing 8 changed files with 552 additions and 16 deletions.
26 changes: 26 additions & 0 deletions include/nccl_ofi_rdma.h
Original file line number Diff line number Diff line change
Expand Up @@ -60,6 +60,10 @@ typedef enum nccl_net_ofi_rdma_req_state {
} nccl_net_ofi_rdma_req_state_t;

typedef enum nccl_net_ofi_rdma_req_type {
/* Write request */
NCCL_OFI_RDMA_WRITE,
/* Read request */
NCCL_OFI_RDMA_READ,
/* Send request */
NCCL_OFI_RDMA_SEND,
/* Receive request */
Expand Down Expand Up @@ -200,6 +204,27 @@ typedef struct {
nccl_net_ofi_rdma_ep_t *ep;
} rdma_req_bounce_data_t;

/*
 * Per-request state for an RMA operation. It is stored as the
 * `rma_op_data' member of the data union in nccl_net_ofi_rdma_req.
 * NOTE(review): presumably used by both NCCL_OFI_RDMA_WRITE and
 * NCCL_OFI_RDMA_READ requests -- confirm against the code that
 * populates it.
 */
typedef struct {
/* Remote buffer address. The original comment calls it the
 * destination; NOTE(review): for a read request the remote side is
 * presumably the source -- confirm. */
uint64_t remote_buff;
/* Remote MR key */
uint64_t remote_mr_key;
/* Number of rails where we have successfully posted the network xfer.
 * Used mostly when the network xfer is sliced across multiple rails */
uint64_t xferred_rail_id;
/* Application-provided local src/dst buffer */
void *buff;
/* Length of application-provided buffer */
size_t buff_len;
/* First rail descriptor from memory registration of `buff' */
void *desc;
/* Additional flags */
uint64_t flags;
/* Total number of completions. Expect one completion for receiving the
 * control message and one completion for each send segment.
 * NOTE(review): this wording mentions a control message and send
 * segments; confirm that it matches how RMA requests count
 * completions. */
int total_num_compls;
} rdma_req_rma_op_data_t;

typedef struct {
/* True for eager messages */
bool eager;
Expand Down Expand Up @@ -331,6 +356,7 @@ typedef struct nccl_net_ofi_rdma_req {
int ncompls;

union {
rdma_req_rma_op_data_t rma_op_data;
rdma_req_send_data_t send_data;
rdma_req_recv_data_t recv_data;
rdma_req_send_ctrl_data_t send_ctrl_data;
Expand Down
10 changes: 10 additions & 0 deletions include/nccl_ofi_tracepoint.h
Original file line number Diff line number Diff line change
Expand Up @@ -103,6 +103,16 @@
NCCL_OFI_TRACE_FLUSH_NVTX(request, nccl_req); \
} while(0)

/*
 * Tracing hook for read operations: fires the LTTng-UST tracepoint `Read'
 * of the `nccl_ofi_plugin' provider with (request, nccl_req), then passes
 * the same two arguments to NCCL_OFI_TRACE_READ_NVTX. The do/while(0)
 * wrapper makes the expansion usable as a single statement.
 */
#define NCCL_OFI_TRACE_READ(request, nccl_req) do { \
lttng_ust_tracepoint(nccl_ofi_plugin, Read, request, nccl_req); \
NCCL_OFI_TRACE_READ_NVTX(request, nccl_req); \
} while(0)

/*
 * Tracing hook for write operations: fires the LTTng-UST tracepoint `Write'
 * of the `nccl_ofi_plugin' provider with (request, nccl_req), then passes
 * the same two arguments to NCCL_OFI_TRACE_WRITE_NVTX. The do/while(0)
 * wrapper makes the expansion usable as a single statement.
 */
#define NCCL_OFI_TRACE_WRITE(request, nccl_req) do { \
lttng_ust_tracepoint(nccl_ofi_plugin, Write, request, nccl_req); \
NCCL_OFI_TRACE_WRITE_NVTX(request, nccl_req); \
} while(0)

#define NCCL_OFI_TRACE_PENDING_INSERT(request) do { \
lttng_ust_tracepoint(nccl_ofi_plugin, Pending_queue_insert, request); \
NCCL_OFI_TRACE_PENDING_INSERT_NVTX(request); \
Expand Down
26 changes: 26 additions & 0 deletions include/tracing_impl/lttng.h
Original file line number Diff line number Diff line change
Expand Up @@ -234,6 +234,32 @@ LTTNG_UST_TRACEPOINT_EVENT(
)
)

/*
 * LTTng-UST event `Read' of provider `nccl_ofi_plugin'.
 * Takes two pointer arguments (request, nccl_req) and records each one as
 * a hexadecimal 64-bit integer field of the same name.
 */
LTTNG_UST_TRACEPOINT_EVENT(
nccl_ofi_plugin,
Read,
LTTNG_UST_TP_ARGS(
void *, request,
void *, nccl_req
),
LTTNG_UST_TP_FIELDS(
lttng_ust_field_integer_hex(uint64_t, request, (uint64_t)request)
lttng_ust_field_integer_hex(uint64_t, nccl_req, (uint64_t)nccl_req)
)
)

/*
 * LTTng-UST event `Write' of provider `nccl_ofi_plugin'.
 * Takes two pointer arguments (request, nccl_req) and records each one as
 * a hexadecimal 64-bit integer field of the same name.
 */
LTTNG_UST_TRACEPOINT_EVENT(
nccl_ofi_plugin,
Write,
LTTNG_UST_TP_ARGS(
void *, request,
void *, nccl_req
),
LTTNG_UST_TP_FIELDS(
lttng_ust_field_integer_hex(uint64_t, request, (uint64_t)request)
lttng_ust_field_integer_hex(uint64_t, nccl_req, (uint64_t)nccl_req)
)
)


LTTNG_UST_TRACEPOINT_EVENT(
nccl_ofi_plugin,
Expand Down
10 changes: 10 additions & 0 deletions include/tracing_impl/nvtx.h
Original file line number Diff line number Diff line change
Expand Up @@ -186,6 +186,14 @@ static inline void nvtx_end(nvtxRangeId_t id) {
nvtx_mark_domain(NULL, "Flush", 0xA52A2A); \
} while(0)

/*
 * NVTX side of the read trace hook: calls nvtx_mark_domain() with the
 * label "Read" and the color value 0xff00ff. The `request' and `nccl_req'
 * arguments are accepted but not used by the expansion.
 * NOTE(review): the meaning of the NULL first argument is defined by
 * nvtx_mark_domain(), which is not visible here.
 */
#define NCCL_OFI_TRACE_READ_NVTX(request, nccl_req) do { \
nvtx_mark_domain(NULL, "Read", 0xff00ff); \
} while(0)

/*
 * NVTX side of the write trace hook: calls nvtx_mark_domain() with the
 * label "Write" and the color value 0xff00ff. The `request' and `nccl_req'
 * arguments are accepted but not used by the expansion.
 * NOTE(review): the meaning of the NULL first argument is defined by
 * nvtx_mark_domain(), which is not visible here.
 */
#define NCCL_OFI_TRACE_WRITE_NVTX(request, nccl_req) do { \
nvtx_mark_domain(NULL, "Write", 0xff00ff); \
} while(0)

#define NCCL_OFI_TRACE_PENDING_INSERT_NVTX(request) do { \
nvtx_mark_domain(NULL, "Pending_insert", 0xFF8C00); \
} while(0)
Expand All @@ -210,6 +218,8 @@ static inline void nvtx_end(nvtxRangeId_t id) {
#define NCCL_OFI_TRACE_RECV_SEGMENT_COMPLETE_NVTX(...)
#define NCCL_OFI_TRACE_EAGER_RECV_NVTX(...)
#define NCCL_OFI_TRACE_FLUSH_NVTX(...)
#define NCCL_OFI_TRACE_READ_NVTX(...)
#define NCCL_OFI_TRACE_WRITE_NVTX(...)
#define NCCL_OFI_TRACE_PENDING_INSERT_NVTX(...)
#define NCCL_OFI_TRACE_PENDING_REMOVE_NVTX(...)

Expand Down
3 changes: 2 additions & 1 deletion m4/check_pkg_libfabric.m4
Original file line number Diff line number Diff line change
Expand Up @@ -56,7 +56,8 @@ AC_DEFUN([CHECK_PKG_LIBFABRIC], [
FI_OPT_EFA_SENDRECV_IN_ORDER_ALIGNED_128_BYTES,
FI_OPT_EFA_WRITE_IN_ORDER_ALIGNED_128_BYTES,
FI_OPT_MAX_MSG_SIZE,
FI_OPT_SHARED_MEMORY_PERMITTED],
FI_OPT_SHARED_MEMORY_PERMITTED,
FI_OPT_INJECT_RMA_SIZE],
[], [], [AC_INCLUDES_DEFAULT
[#include <rdma/fi_endpoint.h>
#ifdef HAVE_RDMA_FI_EXT_H
Expand Down
1 change: 0 additions & 1 deletion src/nccl_ofi_api.c
Original file line number Diff line number Diff line change
Expand Up @@ -478,7 +478,6 @@ ncclResult_t nccl_net_ofi_accept_v4(void* listenComm, void** recvComm)
return nccl_net_ofi_retval_translate(ret);
}


ncclResult_t nccl_net_ofi_isend(void *sComm, void* data, int size,
int tag, void *mhandle, void** req)
{
Expand Down
2 changes: 2 additions & 0 deletions src/nccl_ofi_net.c
Original file line number Diff line number Diff line change
Expand Up @@ -476,6 +476,8 @@ int nccl_net_ofi_info_properties(struct fi_info *nic_prov, int dev_id, int num_d
#endif
}

props->max_write_inline_size = nic_prov->tx_attr->inject_size;
props->max_mr_key_size = nic_prov->domain_attr->mr_key_size;
props->rma_supported = 0;

goto exit;
Expand Down
Loading

0 comments on commit 7de3626

Please sign in to comment.