Skip to content

Commit

Permalink
Merge pull request NVIDIA#1165 from elezar/ignore-xid-109
Browse files (browse the repository at this point in the history)
Ignore XID error 109
  • Loading branch information
elezar authored Feb 20, 2025
2 parents 831c31e + f74a958 commit 23bc08a
Showing 1 changed file with 8 additions and 7 deletions.
15 changes: 8 additions & 7 deletions internal/rm/health.go
Original file line number Diff line number Diff line change
Expand Up @@ -62,16 +62,17 @@ func (r *nvmlResourceManager) checkHealth(stop <-chan interface{}, devices Devic
// FIXME: formalize the full list and document it.
// http://docs.nvidia.com/deploy/xid-errors/index.html#topic_4
// Application errors: the GPU should still be healthy
-applicationErrorXids := []uint64{
-13, // Graphics Engine Exception
-31, // GPU memory page fault
-43, // GPU stopped processing
-45, // Preemptive cleanup, due to previous errors
-68, // Video processor exception
+ignoredXids := []uint64{
+13, // Graphics Engine Exception
+31, // GPU memory page fault
+43, // GPU stopped processing
+45, // Preemptive cleanup, due to previous errors
+68, // Video processor exception
+109, // Context Switch Timeout Error
}

skippedXids := make(map[uint64]bool)
-for _, id := range applicationErrorXids {
+for _, id := range ignoredXids {
skippedXids[id] = true
}

Expand Down

0 comments on commit 23bc08a

Please sign in to comment.