Skip to content

Commit

Permalink
Create token metrics only when they are available
Browse files Browse the repository at this point in the history
This avoids generation of useless token/request histogram metrics
for services that use Orchestrator class, but never call its token
processing functionality.

(Helps in differentiating frontend megaservice metrics from backend
megaservice ones, especially when multiple OPEA applications run in
the same cluster.)

Also fix the metrics description in the README.

Signed-off-by: Eero Tamminen <[email protected]>
  • Loading branch information
eero-t committed Jan 3, 2025
1 parent 6419ace commit 62e24c2
Show file tree
Hide file tree
Showing 2 changed files with 25 additions and 11 deletions.
34 changes: 24 additions & 10 deletions comps/cores/mega/orchestrator.py
Original file line number Diff line number Diff line change
Expand Up @@ -25,28 +25,42 @@


class OrchestratorMetrics:
    # Metrics use "megaservice_" name prefix, and are class variables because:
    # - CI creates several orchestrator instances
    # - Prometheus requires metrics to be singletons
    # - Orchestrator instances do not have their own names
    request_pending = Gauge("megaservice_request_pending", "Count of currently pending requests (gauge)")
    # Metrics related to token processing are created on demand,
    # to avoid bogus ones for services that never handle tokens.
    first_token_latency = None
    inter_token_latency = None
    request_latency = None

def __init__(self) -> None:
pass

def token_update(self, token_start: float, is_first: bool) -> float:
# initially directed to metric creation
self.token_update = self._token_update_create
self.request_update = self._request_update_create

def _token_update_create(self, token_start: float, is_first: bool) -> float:
self.first_token_latency = Histogram("megaservice_first_token_latency", "First token latency (histogram)")
self.inter_token_latency = Histogram("megaservice_inter_token_latency", "Inter-token latency (histogram)")
self.token_update = self._token_update_real
return self.token_update(token_start, is_first)

def _request_update_create(self, req_start: float) -> None:
self.request_latency = Histogram("megaservice_request_latency", "Whole LLM request/reply latency (histogram)")
self.request_update = self._request_update_real
self.request_update(req_start)

def _token_update_real(self, token_start: float, is_first: bool) -> float:
now = time.time()
if is_first:
self.first_token_latency.observe(now - token_start)
else:
self.inter_token_latency.observe(now - token_start)
return now

def request_update(self, req_start: float) -> None:
def _request_update_real(self, req_start: float) -> None:
self.request_latency.observe(time.time() - req_start)

def pending_update(self, increase: bool) -> None:
Expand Down
2 changes: 1 addition & 1 deletion comps/cores/telemetry/README.md
Original file line number Diff line number Diff line change
Expand Up @@ -48,7 +48,7 @@ Applications' megaservice `ServiceOrchestrator` provides the following metrics:
- `megaservice_first_token_latency`: time to first token (TTFT)
- `megaservice_inter_token_latency`: inter-token latency (ITL ~ TPOT)
- `megaservice_request_latency`: whole request E2E latency = TTFT + ITL \* tokens
- `megaservice_request_pending`: how many LLM requests are still in progress
- `megaservice_request_pending`: how many requests are still in progress

Latency ones are histogram metrics i.e. include count, total value and set of value buckets for each item.

Expand Down

0 comments on commit 62e24c2

Please sign in to comment.