Skip to content

Commit

Permalink
Limit kernel timeout messages to 100
Browse files Browse the repository at this point in the history
  • Loading branch information
wenkaidu committed Jan 10, 2025
1 parent 01a2966 commit 974a3de
Showing 1 changed file with 5 additions and 1 deletion.
6 changes: 5 additions & 1 deletion src/device/prims_simple.h
Original file line number Diff line number Diff line change
Expand Up @@ -116,12 +116,16 @@ class Primitives<
// Spin-wait until the peer's step counter has advanced far enough for this slice.
// The wait is entered only when this thread's `flags` carry the receive-wait role
// (and `noRecvWait` is false) or the send-wait role (and `noSendWait` is false).
// NOTE(review): `Recv`/`Send` are presumably 0/1 compile-time selectors that mask
// the role bit in or out -- they are declared outside this view; confirm.
if (((flags & (Recv*RoleWaitRecv)) && !noRecvWait) ||
((flags & (Send*RoleWaitSend)) && !noSendWait)) {
// Spin counter handed to checkAbort() on every iteration.
int spins = 0;
// Remaining number of trace records this wait loop may still emit (starts at 100,
// matching the commit title "Limit kernel timeout messages to 100").
// NOTE(review): this is a function-local static that is decremented below without
// any visible synchronization; if several threads reach that path concurrently the
// cap is presumably only approximate -- confirm that is acceptable.
static int repeat = 100;
// Loop while the cached step (offset by NCCL_STEPS on the send side) is still
// below step + StepPerSlice.
while (connStepCache + (isSendNotRecv ? NCCL_STEPS : 0) < step + StepPerSlice) {
// Brief hardware sleep between polls of the step value.
__builtin_amdgcn_s_sleep(1);
// Refresh the cached step from connStepPtr.
connStepCache = loadStepValue(connStepPtr);
// Leave the wait early when checkAbort() reports true.
if (checkAbort(spins)) break;
// Disabled debug print of the "spun out" condition (got vs. wanted step).
//if (spins == 0) printf("r=%d b=%d t=%d SPUN OUT got=%d want=%d\n", ncclShmem.comm.rank, blockIdx.x, threadIdx.x, int(connStepCache + (isSendNotRecv ? NCCL_STEPS : 0)), int(step+StepPerSlice));
// NOTE(review): this unconditional trace and the `repeat`-limited one below appear
// together because the diff markers are missing in this view; the commit summary
// (5 additions, 1 deletion) suggests this is the removed line -- confirm against
// the real file before relying on it.
if (spins == 0) traceData(__LINE__, threadIdx.x, int(connStepCache + (isSendNotRecv ? NCCL_STEPS : 0)), int(step+StepPerSlice));
// Record a trace entry (line, thread, observed step, wanted step) only while the
// `repeat` budget is not exhausted.
// NOTE(review): `spins == 0` presumably means checkAbort() wrapped the counter
// after a spin-out; its definition is not visible here -- confirm.
if (spins == 0 && repeat > 0) {
repeat --;
traceData(__LINE__, threadIdx.x, int(connStepCache + (isSendNotRecv ? NCCL_STEPS : 0)), int(step+StepPerSlice));
}
}
// Issue the `s_wakeup` instruction once the wait is over (presumably to wake other
// sleeping wavefronts in the workgroup -- confirm against the ISA documentation).
__asm__ __volatile__("s_wakeup");
}
Expand Down

0 comments on commit 974a3de

Please sign in to comment.