diff --git a/internal/rm/health.go b/internal/rm/health.go index 3a308ff3e..453b46e57 100644 --- a/internal/rm/health.go +++ b/internal/rm/health.go @@ -62,16 +62,17 @@ func (r *nvmlResourceManager) checkHealth(stop <-chan interface{}, devices Devic // FIXME: formalize the full list and document it. // http://docs.nvidia.com/deploy/xid-errors/index.html#topic_4 // Application errors: the GPU should still be healthy - applicationErrorXids := []uint64{ - 13, // Graphics Engine Exception - 31, // GPU memory page fault - 43, // GPU stopped processing - 45, // Preemptive cleanup, due to previous errors - 68, // Video processor exception + ignoredXids := []uint64{ + 13, // Graphics Engine Exception + 31, // GPU memory page fault + 43, // GPU stopped processing + 45, // Preemptive cleanup, due to previous errors + 68, // Video processor exception + 109, // Context Switch Timeout Error } skippedXids := make(map[uint64]bool) - for _, id := range applicationErrorXids { + for _, id := range ignoredXids { skippedXids[id] = true }