Skip to content

Commit

Permalink
tuner: move show_tuner_decisions tool to v3 API
Browse files Browse the repository at this point in the history
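Change the show_tuner_decisions tool to use the tuner v3 API: the tool now initializes a cost table, passes it to getCollInfo(), and reports the algorithm/protocol pair with the lowest resulting cost.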

Signed-off-by: Amedeo Sapio <[email protected]>
  • Loading branch information
AmedeoSapio authored and rajachan committed Aug 21, 2024
1 parent 51c9151 commit d4a76d9
Showing 1 changed file with 38 additions and 13 deletions.
51 changes: 38 additions & 13 deletions tests/unit/show_tuner_decisions.c
Original file line number Diff line number Diff line change
Expand Up @@ -17,48 +17,73 @@ void dummy_logger(ncclDebugLogLevel level, unsigned long flags, const char *file

int main(int argc, const char **argv)
{
float collCostTable[NCCL_NUM_ALGORITHMS][NCCL_NUM_PROTOCOLS];

printf("nodes,ranks,size,channels,algorithm,protocol\n");
for (size_t nodes = 1; nodes <= 1024; nodes <<= 1) {
for (size_t ranks_per_node = 8; ranks_per_node <= 8; ranks_per_node <<= 1) {
void *context = NULL;
if (ncclTunerPlugin_v2.init(ranks_per_node * nodes,
if (ncclTunerPlugin_v3.init(ranks_per_node * nodes,
nodes,
dummy_logger,
&context) != 0) {
return 1;
}

for (size_t nmibytes = 1; nmibytes <= 32 * 1024; nmibytes <<= 1) {
/*NCCL_ALGO_UNDEF is -1 */
int algorithm = -1;
/* NCCL_PROTO_UNDEF is -1 */
int protocol = -1;
int algorithm = NCCL_ALGO_UNDEF;
int protocol = NCCL_ALGO_UNDEF;

/* NCCL calls getCollInfo() with nChannels=0 and ignores this
* variable if it is unchanged.
*/
int nChannels = 0;

if (ncclTunerPlugin_v2.getCollInfo(context,
/* Init cost table with large values */
for (int a = 0; a < NCCL_NUM_ALGORITHMS; a++) {
for (int p = 0; p < NCCL_NUM_PROTOCOLS; p++) {
collCostTable[a][p] = 3600000000.0; // 1 hour;
}
}

if (ncclTunerPlugin_v3.getCollInfo(context,
ncclFuncAllReduce,
nmibytes * 1024 * 1024,
false,
true,
1,
&algorithm,
&protocol,
(float **)collCostTable,
NCCL_NUM_ALGORITHMS,
NCCL_NUM_PROTOCOLS,
&nChannels) != 0) {
return 1;
}

/* Find the combination with minimum cost */
float minTime = 3600000000.0;
for (int a = 0; a < NCCL_NUM_ALGORITHMS; a++) {
for (int p = 0; p < NCCL_NUM_PROTOCOLS; p++) {
if (collCostTable[a][p] == NCCL_ALGO_PROTO_IGNORE) {
continue;
}
if (collCostTable[a][p] >= 0.0 && collCostTable[a][p] < minTime) {
algorithm = a;
protocol = p;
minTime = collCostTable[a][p];
}
}
}

printf("%lu,%lu,%luMiB,%d,%s,%s\n",
nodes,
nodes * ranks_per_node,
nmibytes,
nChannels,
algorithm >= 0 && algorithm <= 5 ? algo_names[algorithm] : "none",
protocol >= 0 && protocol <= 2 ? proto_names[protocol] : "none");
algorithm >= 0 && algorithm <= NCCL_NUM_ALGORITHMS ? algo_names[algorithm]
: "none",
protocol >= 0 && protocol <= NCCL_NUM_PROTOCOLS ? proto_names[protocol]
: "none");
}
ncclTunerPlugin_v2.destroy(context);

ncclTunerPlugin_v3.destroy(context);
}
}
return 0;
Expand Down

0 comments on commit d4a76d9

Please sign in to comment.