From 4e36a0d60d403520b57a2c0ebfc46532f3c0c1eb Mon Sep 17 00:00:00 2001 From: Michael Axtmann Date: Tue, 3 Sep 2024 23:05:52 +0000 Subject: [PATCH] trace: Add tracepoints to RMA read and write operations A previous commit implemented the RMA iwrite, iwriteInline, and iread API functions. This commit adds NVTX and LTTng tracepoints (Read and Write) to these functions. It also fires the existing pending-insert tracepoint in rma_write_impl after the request is queued via nccl_ofi_deque_insert_back. Signed-off-by: Michael Axtmann --- include/nccl_ofi_tracepoint.h | 10 ++++++++++ include/tracing_impl/lttng.h | 26 ++++++++++++++++++++++++++ include/tracing_impl/nvtx.h | 10 ++++++++++ src/nccl_ofi_rdma.c | 5 +++++ 4 files changed, 51 insertions(+) diff --git a/include/nccl_ofi_tracepoint.h b/include/nccl_ofi_tracepoint.h index 17f930a9c..aab9b3f61 100644 --- a/include/nccl_ofi_tracepoint.h +++ b/include/nccl_ofi_tracepoint.h @@ -103,6 +103,16 @@ NCCL_OFI_TRACE_FLUSH_NVTX(request, nccl_req); \ } while(0) +#define NCCL_OFI_TRACE_READ(request, nccl_req) do { \ + lttng_ust_tracepoint(nccl_ofi_plugin, Read, request, nccl_req); \ + NCCL_OFI_TRACE_READ_NVTX(request, nccl_req); \ +} while(0) + +#define NCCL_OFI_TRACE_WRITE(request, nccl_req) do { \ + lttng_ust_tracepoint(nccl_ofi_plugin, Write, request, nccl_req); \ + NCCL_OFI_TRACE_WRITE_NVTX(request, nccl_req); \ +} while(0) + #define NCCL_OFI_TRACE_PENDING_INSERT(request) do { \ lttng_ust_tracepoint(nccl_ofi_plugin, Pending_queue_insert, request); \ NCCL_OFI_TRACE_PENDING_INSERT_NVTX(request); \ diff --git a/include/tracing_impl/lttng.h b/include/tracing_impl/lttng.h index b4b56d7ee..8db0838bd 100644 --- a/include/tracing_impl/lttng.h +++ b/include/tracing_impl/lttng.h @@ -234,6 +234,32 @@ LTTNG_UST_TRACEPOINT_EVENT( ) ) +LTTNG_UST_TRACEPOINT_EVENT( + nccl_ofi_plugin, + Read, + LTTNG_UST_TP_ARGS( + void *, request, + void *, nccl_req + ), + LTTNG_UST_TP_FIELDS( + lttng_ust_field_integer_hex(uint64_t, request, (uint64_t)request) + lttng_ust_field_integer_hex(uint64_t, nccl_req, (uint64_t)nccl_req) + ) +) + +LTTNG_UST_TRACEPOINT_EVENT( + nccl_ofi_plugin, + Write, 
LTTNG_UST_TP_ARGS( + void *, request, + void *, nccl_req + ), + LTTNG_UST_TP_FIELDS( + lttng_ust_field_integer_hex(uint64_t, request, (uint64_t)request) + lttng_ust_field_integer_hex(uint64_t, nccl_req, (uint64_t)nccl_req) + ) +) + LTTNG_UST_TRACEPOINT_EVENT( nccl_ofi_plugin, diff --git a/include/tracing_impl/nvtx.h b/include/tracing_impl/nvtx.h index 1fbfe8b55..54dfadc8a 100644 --- a/include/tracing_impl/nvtx.h +++ b/include/tracing_impl/nvtx.h @@ -186,6 +186,14 @@ static inline void nvtx_end(nvtxRangeId_t id) { nvtx_mark_domain(NULL, "Flush", 0xA52A2A); \ } while(0) +#define NCCL_OFI_TRACE_READ_NVTX(request, nccl_req) do { \ + nvtx_mark_domain(NULL, "Read", 0xff00ff); \ +} while(0) + +#define NCCL_OFI_TRACE_WRITE_NVTX(request, nccl_req) do { \ + nvtx_mark_domain(NULL, "Write", 0xff00ff); \ +} while(0) + #define NCCL_OFI_TRACE_PENDING_INSERT_NVTX(request) do { \ nvtx_mark_domain(NULL, "Pending_insert", 0xFF8C00); \ } while(0) @@ -210,6 +218,8 @@ static inline void nvtx_end(nvtxRangeId_t id) { #define NCCL_OFI_TRACE_RECV_SEGMENT_COMPLETE_NVTX(...) #define NCCL_OFI_TRACE_EAGER_RECV_NVTX(...) #define NCCL_OFI_TRACE_FLUSH_NVTX(...) +#define NCCL_OFI_TRACE_READ_NVTX(...) +#define NCCL_OFI_TRACE_WRITE_NVTX(...) #define NCCL_OFI_TRACE_PENDING_INSERT_NVTX(...) #define NCCL_OFI_TRACE_PENDING_REMOVE_NVTX(...) 
diff --git a/src/nccl_ofi_rdma.c b/src/nccl_ofi_rdma.c index c9db1d7f5..b8ae8848f 100644 --- a/src/nccl_ofi_rdma.c +++ b/src/nccl_ofi_rdma.c @@ -3691,6 +3691,8 @@ static int rma_read(nccl_net_ofi_recv_comm_t *recv_comm, void* dest, size_t size */ (r_comm->num_inflight_reqs)++; + NCCL_OFI_TRACE_READ(req, base_req); + /* Try posting RMA read */ ret = receive_progress(req, true); @@ -5304,6 +5306,8 @@ static int rma_write_impl(nccl_net_ofi_send_comm_t *send_comm, void* src, size_t */ (s_comm->num_inflight_reqs)++; + NCCL_OFI_TRACE_WRITE(req, base_req); + /* Try posting RMA write with write_inline interface */ ret = send_progress(req); @@ -5314,6 +5318,7 @@ static int rma_write_impl(nccl_net_ofi_send_comm_t *send_comm, void* src, size_t NCCL_OFI_WARN("Failed to nccl_ofi_deque_insert_back: %d", ret); goto error; } + NCCL_OFI_TRACE_PENDING_INSERT(req); } else if (OFI_UNLIKELY(ret != 0)) { ret = -ENOTSUP; goto error;