Skip to content

Commit

Permalink
TransferBench V1.59 (#162)
Browse files Browse the repository at this point in the history
Adding NIC execution capabilities, various bug fixes introduced by header-only-library refactor
---------
Co-authored-by: Mustafa Abduljabbar <[email protected]>
  • Loading branch information
gilbertlee-amd authored Jan 21, 2025
1 parent fcac6d9 commit 5984f49
Show file tree
Hide file tree
Showing 11 changed files with 1,572 additions and 209 deletions.
24 changes: 24 additions & 0 deletions CHANGELOG.md
Original file line number Diff line number Diff line change
Expand Up @@ -3,6 +3,30 @@
Documentation for TransferBench is available at
[https://rocm.docs.amd.com/projects/TransferBench](https://rocm.docs.amd.com/projects/TransferBench).

## v1.59.00
### Added
- Adding in support for NIC executor, which allows for RDMA copies on NICs that support IBVerbs
By default, NIC executor will be enabled if IBVerbs is found in the dynamic linker cache
- NIC executor can be indexed in two methods
- "I" Ix.y will use NIC x as the source and NIC y as the destination.
E.g. (G0 I0.5 G4)
- "N" Nx.y will use NIC closest to GPU x as source, and NIC closest to GPU y as destination
E.g. (G0 N0.4 N4)
- The closest NIC can be overridden by the environment variable CLOSEST_NIC, which should be a comma-separated
list of NIC indices to use for the corresponding GPU
- This feature can be explicitly disabled at compile time by specifying DISABLE_NIC_EXEC=1

### Modified
- Changing default data size to 256M from 64M
- Adding NUM_QUEUE_PAIRS which enables NIC traffic in A2A. Each GPU will talk to the next GPU via the closest NIC
- Sweep preset now saves last sweep run configuration to /tmp/lastSweep.cfg and can be changed via SWEEP_FILE

### Fixed
- Fixed bug with reporting when using subiterations
- Fixed bug with per-Transfer data size specification
- Fixed bug when using XCC preferred table


## v1.58.00
### Fixed
- Fixed broken specific DMA-engine copies
Expand Down
14 changes: 13 additions & 1 deletion CMakeLists.txt
Original file line number Diff line number Diff line change
Expand Up @@ -7,7 +7,7 @@ else()
endif()
cmake_minimum_required(VERSION 3.5)

project(TransferBench VERSION 1.58.00 LANGUAGES CXX)
project(TransferBench VERSION 1.59.00 LANGUAGES CXX)

# Default GPU architectures to build
#==================================================================================================
Expand Down Expand Up @@ -56,6 +56,18 @@ set( CMAKE_CXX_FLAGS "${flags_str} ${CMAKE_CXX_FLAGS}")

set( CMAKE_CXX_FLAGS "${CMAKE_CXX_FLAGS} -O3 -L${ROCM_PATH}/lib")
include_directories(${ROCM_PATH}/include)
# Optional NIC (RDMA) executor support.
# Enabled when libibverbs can be located, unless the user opts out by setting
# the environment variable DISABLE_NIC_EXEC=1. The environment is read at
# configure time, so re-run CMake after changing it.
find_library(IBVERBS_LIBRARY ibverbs)
if (IBVERBS_LIBRARY)
  # Only the documented value "1" disables the feature, matching the Makefile;
  # a merely-defined variable (e.g. DISABLE_NIC_EXEC=0) must not disable it.
  if ("$ENV{DISABLE_NIC_EXEC}" STREQUAL "1")
    message(STATUS "Disabling NIC Executor support")
  else()
    message(STATUS "Found ibverbs: ${IBVERBS_LIBRARY}. Building with NIC executor support. Can set DISABLE_NIC_EXEC=1 to disable")
    add_definitions(-DNIC_EXEC_ENABLED)
    # Link the exact library find_library() resolved, so a libibverbs that
    # lives outside the default linker search path still links.
    link_libraries("${IBVERBS_LIBRARY}")
  endif()
else()
  message(WARNING "IBVerbs library not found. Building without NIC executor support")
endif()
link_libraries(numa hsa-runtime64 pthread)
set (CMAKE_RUNTIME_OUTPUT_DIRECTORY .)
add_executable(TransferBench src/client/Client.cpp)
Expand Down
25 changes: 21 additions & 4 deletions Makefile
Original file line number Diff line number Diff line change
Expand Up @@ -11,23 +11,40 @@ NVCC=$(CUDA_PATH)/bin/nvcc

# Compile TransferBenchCuda if nvcc detected
ifeq ("$(shell test -e $(NVCC) && echo found)", "found")
EXE=TransferBenchCuda
EXE=TransferBenchCuda
else
EXE=TransferBench
EXE=TransferBench
endif

CXXFLAGS = -I$(ROCM_PATH)/include -lnuma -L$(ROCM_PATH)/lib -lhsa-runtime64
NVFLAGS = -x cu -lnuma -arch=native
COMMON_FLAGS = -O3 -I./src/header -I./src/client -I./src/client/Presets
LDFLAGS += -lpthread

# Compile RDMA executor if IBVerbs is found in the Dynamic Linker cache
NIC_ENABLED = 0
ifneq ($(DISABLE_NIC_EXEC),1)
ifneq ("$(shell ldconfig -p | grep -c ibverbs)", "0")
LDFLAGS += -libverbs -DNIC_EXEC_ENABLED
NVFLAGS += -libverbs -DNIC_EXEC_ENABLED
NIC_ENABLED = 1
endif
endif

all: $(EXE)

TransferBench: ./src/client/Client.cpp $(shell find -regex ".*\.\hpp")
TransferBench: ./src/client/Client.cpp $(shell find -regex ".*\.\hpp") NicStatus
$(HIPCC) $(CXXFLAGS) $(COMMON_FLAGS) $< -o $@ $(LDFLAGS)

TransferBenchCuda: ./src/client/Client.cpp $(shell find -regex ".*\.\hpp")
TransferBenchCuda: ./src/client/Client.cpp $(shell find -regex ".*\.\hpp") NicStatus
$(NVCC) $(NVFLAGS) $(COMMON_FLAGS) $< -o $@ $(LDFLAGS)

clean:
rm -f *.o ./TransferBench ./TransferBenchCuda

NicStatus:
ifeq ($(NIC_ENABLED), 1)
$(info Building with NIC executor support. Can set DISABLE_NIC_EXEC=1 to disable)
else
$(info Building without NIC executor support)
endif
14 changes: 10 additions & 4 deletions examples/example.cfg
Original file line number Diff line number Diff line change
Expand Up @@ -13,6 +13,7 @@
# 1) CPU CPU thread
# 2) GPU GPU threadblock/Compute Unit (CU)
# 3) DMA N/A. (May only be used for copies (single SRC/DST))
# 4) NIC Queue Pair

# Each single line in the configuration file defines a set of Transfers (a Test) to run in parallel

Expand All @@ -34,9 +35,11 @@
# #SEs : Number of SubExecutors to use (CPU threads/ GPU threadblocks)
# srcMemL : Source memory locations (Where the data is to be read from)
# Executor : Executor is specified by a character indicating type, followed by device index (0-indexed)
# - C: CPU-executed (Indexed from 0 to # NUMA nodes - 1)
# - G: GPU-executed (Indexed from 0 to # GPUs - 1)
# - D: DMA-executor (Indexed from 0 to # GPUs - 1)
# - C: CPU-executed (Indexed from 0 to # NUMA nodes - 1)
# - G: GPU-executed (Indexed from 0 to # GPUs - 1)
# - D: DMA-executor (Indexed from 0 to # GPUs - 1)
# - I#.#: NIC executor (Indexed from 0 to # NICs - 1)
# - N#.#: Nearest NIC executor (Indexed from 0 to # GPUs - 1)
# dstMemL : Destination memory locations (Where the data is to be written to)
# bytesL : Number of bytes to copy (0 means use command-line specified size)
# Must be a multiple of 4 and may be suffixed with ('K','M', or 'G')
Expand All @@ -56,7 +59,10 @@
# 1 4 (C1->G2->G0) Uses 4 CUs on GPU2 to copy from CPU1 to GPU0
# 2 4 G0->G0->G1 G1->G1->G0 Copies from GPU0 to GPU1, and GPU1 to GPU0, each with 4 SEs
# -2 (G0 G0 G1 4 1M) (G1 G1 G0 2 2M) Copies 1Mb from GPU0 to GPU1 with 4 SEs, and 2Mb from GPU1 to GPU0 with 2 SEs

# 1 2 (F0->I0.2->F1) Uses 2 QPs to transfer data from GPU0 via NIC0 to GPU1 via NIC2
# 1 1 (F0->N0.1->F1) Uses 1 QP to transfer data from GPU0 via GPU0's closest NIC to GPU1 via GPU1's closest NIC
# -2 (G0->N0.1->G1 2 128M) (G1->N1.0->G0 1 256M) Uses Nearest NIC executor to copy 128Mb from GPU0 to GPU1 with 2 QPs,
# and 256Mb from GPU1 to GPU0 with 1 QP
# Round brackets and arrows '->' may be included for human clarity, but will be ignored and are unnecessary
# Lines starting with # will be ignored. Lines starting with ## will be echoed to output

Expand Down
41 changes: 28 additions & 13 deletions src/client/Client.cpp
Original file line number Diff line number Diff line change
Expand Up @@ -121,13 +121,23 @@ int main(int argc, char **argv) {
}
}

// Track which transfers have already numBytes specified
std::vector<bool> bytesSpecified(transfers.size());
int hasUnspecified = false;
for (int i = 0; i < transfers.size(); i++) {
bytesSpecified[i] = (transfers[i].numBytes != 0);
if (transfers[i].numBytes == 0) hasUnspecified = true;
}

// Run the specified numbers of bytes otherwise generate a range of values
for (size_t bytes = (1<<10); bytes <= (1<<29); bytes *= 2) {
size_t deltaBytes = std::max(1UL, bytes / ev.samplingFactor);
size_t currBytes = (numBytesPerTransfer == 0) ? bytes : numBytesPerTransfer;
do {
for (auto& t : transfers)
t.numBytes = currBytes;
for (int i = 0; i < transfers.size(); i++) {
if (!bytesSpecified[i])
transfers[i].numBytes = currBytes;
}

if (maxVarCount == 0) {
if (TransferBench::RunTransfers(cfgOptions, transfers, results)) {
Expand Down Expand Up @@ -162,17 +172,21 @@ int main(int argc, char **argv) {
PrintResults(ev, ++testNum, bestTransfers, bestResults);
PrintErrors(bestResults.errResults);
}
if (numBytesPerTransfer != 0) break;
if (numBytesPerTransfer != 0 || !hasUnspecified) break;
currBytes += deltaBytes;
} while (currBytes < bytes * 2);
if (numBytesPerTransfer != 0) break;
if (numBytesPerTransfer != 0 || !hasUnspecified) break;
}
}
}

void DisplayUsage(char const* cmdName)
{
printf("TransferBench v%s.%s\n", TransferBench::VERSION, CLIENT_VERSION);
std::string nicSupport = "";
#if NIC_EXEC_ENABLED
nicSupport = " (with NIC support)";
#endif
printf("TransferBench v%s.%s%s\n", TransferBench::VERSION, CLIENT_VERSION, nicSupport.c_str());
printf("========================================\n");

if (numa_available() == -1) {
Expand Down Expand Up @@ -218,7 +232,7 @@ void PrintResults(EnvVars const& ev, int const testNum,
ExeType const exeType = exeDevice.exeType;
int32_t const exeIndex = exeDevice.exeIndex;

printf(" Executor: %3s %02d %c %7.3f GB/s %c %8.3f ms %c %12lu bytes %c %-7.3f GB/s (sum)\n",
printf(" Executor: %3s %02d %c %8.3f GB/s %c %8.3f ms %c %12lu bytes %c %-7.3f GB/s (sum)\n",
ExeTypeName[exeType], exeIndex, sep, exeResult.avgBandwidthGbPerSec, sep,
exeResult.avgDurationMsec, sep, exeResult.numBytes, sep, exeResult.sumBandwidthGbPerSec);

Expand All @@ -230,14 +244,15 @@ void PrintResults(EnvVars const& ev, int const testNum,
char exeSubIndexStr[32] = "";
if (t.exeSubIndex != -1)
sprintf(exeSubIndexStr, ".%d", t.exeSubIndex);

printf(" Transfer %02d %c %7.3f GB/s %c %8.3f ms %c %12lu bytes %c %s -> %s%02d%s:%03d -> %s\n",
printf(" Transfer %02d %c %8.3f GB/s %c %8.3f ms %c %12lu bytes %c %s -> %c%03d%s:%03d -> %s\n",
idx, sep,
r.avgBandwidthGbPerSec, sep,
r.avgDurationMsec, sep,
r.numBytes, sep,
MemDevicesToStr(t.srcs).c_str(), ExeTypeName[exeType], exeIndex,
exeSubIndexStr, t.numSubExecs, MemDevicesToStr(t.dsts).c_str());
MemDevicesToStr(t.srcs).c_str(),
TransferBench::ExeTypeStr[t.exeDevice.exeType], t.exeDevice.exeIndex,
exeSubIndexStr, t.numSubExecs,
MemDevicesToStr(t.dsts).c_str());

// Show per-iteration timing information
if (ev.showIterations) {
Expand Down Expand Up @@ -269,7 +284,7 @@ void PrintResults(EnvVars const& ev, int const testNum,
for (auto& time : times) {
double iterDurationMsec = time.first;
double iterBandwidthGbs = (t.numBytes / 1.0E9) / iterDurationMsec * 1000.0f;
printf(" Iter %03d %c %7.3f GB/s %c %8.3f ms %c", time.second, sep, iterBandwidthGbs, sep, iterDurationMsec, sep);
printf(" Iter %03d %c %8.3f GB/s %c %8.3f ms %c", time.second, sep, iterBandwidthGbs, sep, iterDurationMsec, sep);

std::set<int> usedXccs;
if (time.second - 1 < r.perIterCUs.size()) {
Expand All @@ -285,11 +300,11 @@ void PrintResults(EnvVars const& ev, int const testNum,
printf(" %02d", x);
printf("\n");
}
printf(" StandardDev %c %7.3f GB/s %c %8.3f ms %c\n", sep, stdDevBw, sep, stdDevTime, sep);
printf(" StandardDev %c %8.3f GB/s %c %8.3f ms %c\n", sep, stdDevBw, sep, stdDevTime, sep);
}
}
}
printf(" Aggregate (CPU) %c %7.3f GB/s %c %8.3f ms %c %12lu bytes %c Overhead: %.3f ms\n",
printf(" Aggregate (CPU) %c %8.3f GB/s %c %8.3f ms %c %12lu bytes %c Overhead: %.3f ms\n",
sep, results.avgTotalBandwidthGbPerSec,
sep, results.avgTotalDurationMsec,
sep, results.totalBytesTransferred,
Expand Down
4 changes: 2 additions & 2 deletions src/client/Client.hpp
Original file line number Diff line number Diff line change
Expand Up @@ -28,9 +28,9 @@ THE SOFTWARE.
#include "TransferBench.hpp"
#include "EnvVars.hpp"

size_t const DEFAULT_BYTES_PER_TRANSFER = (1<<26);
size_t const DEFAULT_BYTES_PER_TRANSFER = (1<<28);

char const ExeTypeName[4][4] = {"CPU", "GPU", "DMA", "IBV"};
char const ExeTypeName[5][4] = {"CPU", "GPU", "DMA", "NIC", "NIC"};

// Display detected hardware
void DisplayTopology(bool outputToCsv);
Expand Down
Loading

0 comments on commit 5984f49

Please sign in to comment.