From b6fac76315e4ddf0dac9ba43823ee2e908bc1ea4 Mon Sep 17 00:00:00 2001
From: Travis Downs
Date: Mon, 9 Oct 2023 23:00:55 -0300
Subject: [PATCH] Make total_steal_time() monotonic.

total_steal_time() can decrease in value from call to call, a violation
of the rule that a counter metric must never decrease. This happens
because steal time is "awake time (wall clock)" minus "cpu thread time
(get_rusage style CPU time)". The awake time is itself composed of
"time since reactor start" minus "accumulated sleep time", but it can
be under-counted because sleep time is over-counted: it is not possible
to determine the true sleep time exactly; we can only take timestamps
before and after a period that might involve a sleep.

Currently, sleep is even more significantly over-counted than the error
described above, as it is measured at a point which includes
significant non-sleep work.

The result is that when there is little to no true steal, CPU time will
exceed the measured awake wall clock time, resulting in negative steal.

This change "fixes" the problem by enforcing that steal time is
monotonically increasing. This occurs at measurement time, by checking
whether "true steal" (i.e., the old definition of steal) has increased
since the last measurement and, if so, adding that delta to our
monotonic steal counter. Otherwise the delta is dropped.

While not totally ideal, this leads to a useful metric which mostly
clamps away the error related to negative steal times, and more
importantly avoids the catastrophic failure of PromQL functions when
they are applied to non-monotonic counters.

Fixes #1521.
Fixes #2374.
---
 include/seastar/core/reactor.hh |  9 +++++++++
 src/core/reactor.cc             | 28 +++++++++++++++++++++++++++-
 2 files changed, 36 insertions(+), 1 deletion(-)

diff --git a/include/seastar/core/reactor.hh b/include/seastar/core/reactor.hh
index b3c17dca14..63e3fc0a39 100644
--- a/include/seastar/core/reactor.hh
+++ b/include/seastar/core/reactor.hh
@@ -302,6 +302,15 @@ private:
     const bool _reuseport;
     circular_buffer<double> _loads;
     double _load = 0;
+    // The next two fields are required to enforce the monotonicity of total_steal_time();
+    // see that method for details.
+
+    // Last measured accumulated steal time, i.e., the simple difference of accumulated
+    // awake time and consumed thread CPU time.
+    sched_clock::duration _last_true_steal{0};
+    // Accumulated steal time forced to be monotonic by rejecting any updates that would
+    // decrease it. See total_steal_time() for details.
+    sched_clock::duration _last_mono_steal{0};
     sched_clock::duration _total_idle{0};
     sched_clock::duration _total_sleep{0};
     sched_clock::time_point _start_time = now();
diff --git a/src/core/reactor.cc b/src/core/reactor.cc
index 7870c8dc7b..4ac5a02c9b 100644
--- a/src/core/reactor.cc
+++ b/src/core/reactor.cc
@@ -4812,7 +4812,33 @@ std::chrono::nanoseconds reactor::total_steal_time() {
     // unexpectedly blocked (since CPU time won't tick up when that occurs).
     //
     // But what we have here should be good enough and at least has a well defined meaning.
-    return total_awake_time() - total_cpu_time();
+    //
+    // Because we calculate sleep time with timestamps around polling methods that may sleep, like
+    // io_getevents, we systematically over-count sleep time, since there is CPU usage within the
+    // period timed as sleep, before and after an actual sleep occurs (and no sleep may occur at all,
+    // e.g., if there are events immediately available). Over-counting sleep means we under-count the
+    // wall-clock awake time, and so if there is no "true" steal, we will generally have a small
+    // *negative* steal time, because we under-count awake wall clock time while thread CPU time does
+    // not have a corresponding error.
+    //
+    // Because we claim "steal" is a counter, we must ensure that it never decreases, because PromQL
+    // functions which use counters will produce nonsensical results if they do. Therefore we clamp
+    // the output such that it never decreases.
+    //
+    // Finally, we don't just clamp the difference of awake and CPU time since process start at 0, but
+    // take the last value we returned from this function and then calculate the incremental steal
+    // time since that measurement, clamped to 0. This means that as soon as steal time becomes
+    // positive, it will be reflected in the measurement, rather than needing to "consume" all the
+    // accumulated negative steal time before positive steal times start showing up.
+
+
+    auto true_steal = total_awake_time() - total_cpu_time();
+    auto mono_steal = _last_mono_steal + std::max(true_steal - _last_true_steal, 0ns);
+
+    _last_true_steal = true_steal;
+    _last_mono_steal = mono_steal;
+
+    return mono_steal;
 }
 
 static std::atomic<unsigned long> s_used_scheduling_group_ids_bitmap{3}; // 0=main, 1=atexit
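
Illustrative note (not part of the patch): below is a minimal standalone sketch of the clamping rule applied in reactor::total_steal_time() above. Only the positive part of each delta in the raw ("true") steal value is accumulated, so the reported counter never decreases even when the raw value dips negative. The monotonic_steal_model type and the sample values are hypothetical and exist only for this demonstration; they are not Seastar code.

// Sketch only: model of the monotonic clamping rule described in the patch.
#include <algorithm>
#include <chrono>
#include <cstdio>

using namespace std::chrono_literals;

struct monotonic_steal_model {
    std::chrono::nanoseconds last_true{0};  // last raw (possibly decreasing) steal sample
    std::chrono::nanoseconds last_mono{0};  // monotonic value reported so far

    std::chrono::nanoseconds sample(std::chrono::nanoseconds true_steal) {
        // Accumulate only the positive part of the delta since the last sample.
        last_mono += std::max(true_steal - last_true, 0ns);
        last_true = true_steal;
        return last_mono;
    }
};

int main() {
    monotonic_steal_model m;
    // Raw steal first goes negative (sleep over-counted), then climbs.
    for (auto s : {-5ns, -7ns, -2ns, 3ns, 1ns, 10ns}) {
        std::printf("true=%3lld ns  mono=%3lld ns\n",
                    static_cast<long long>(s.count()),
                    static_cast<long long>(m.sample(s).count()));
    }
    return 0;
}

One consequence of accumulating every positive delta is that a move from one negative raw value to a less negative one is still counted, so the reported counter can start increasing before raw steal actually crosses zero; this matches the "incremental steal since the last measurement, clamped to 0" behaviour described in the comment above.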