From 62e24c271e901a9219fcc2ff2a2b7cc7a07c6a8c Mon Sep 17 00:00:00 2001 From: Eero Tamminen Date: Mon, 23 Dec 2024 12:24:31 +0200 Subject: [PATCH] Create token metrics only when they are available This avoids generation of useless token/request histogram metrics for services that use the Orchestrator class, but never call its token processing functionality. (Helps in differentiating frontend megaservice metrics from backend megaservice ones, especially when multiple OPEA applications run in the same cluster.) Also fix metrics description in README. Signed-off-by: Eero Tamminen --- comps/cores/mega/orchestrator.py | 34 ++++++++++++++++++++++---------- comps/cores/telemetry/README.md | 2 +- 2 files changed, 25 insertions(+), 11 deletions(-) diff --git a/comps/cores/mega/orchestrator.py b/comps/cores/mega/orchestrator.py index 803965f6e1..95bc24d5f7 100644 --- a/comps/cores/mega/orchestrator.py +++ b/comps/cores/mega/orchestrator.py @@ -25,20 +25,34 @@ class OrchestratorMetrics: - # Because: + # Metrics use "megaservice_" name prefix, and are class variables because: # - CI creates several orchestrator instances # - Prometheus requires metrics to be singletons - # - Oorchestror instances are not provided their own names - # Metrics are class members with "megaservice" name prefix - first_token_latency = Histogram("megaservice_first_token_latency", "First token latency (histogram)") - inter_token_latency = Histogram("megaservice_inter_token_latency", "Inter-token latency (histogram)") - request_latency = Histogram("megaservice_request_latency", "Whole request/reply latency (histogram)") + # - Orchestrator instances do not have their own names request_pending = Gauge("megaservice_request_pending", "Count of currently pending requests (gauge)") + # Metrics related to token processing are created on demand, + # to avoid bogus ones for services that never handle tokens. 
+ first_token_latency = None + inter_token_latency = None + request_latency = None def __init__(self) -> None: - pass - - def token_update(self, token_start: float, is_first: bool) -> float: + # initially point to the variants that create the metrics on first use + self.token_update = self._token_update_create + self.request_update = self._request_update_create + + def _token_update_create(self, token_start: float, is_first: bool) -> float: + self.first_token_latency = Histogram("megaservice_first_token_latency", "First token latency (histogram)") + self.inter_token_latency = Histogram("megaservice_inter_token_latency", "Inter-token latency (histogram)") + self.token_update = self._token_update_real + return self.token_update(token_start, is_first) + + def _request_update_create(self, req_start: float) -> None: + self.request_latency = Histogram("megaservice_request_latency", "Whole LLM request/reply latency (histogram)") + self.request_update = self._request_update_real + self.request_update(req_start) + + def _token_update_real(self, token_start: float, is_first: bool) -> float: now = time.time() if is_first: self.first_token_latency.observe(now - token_start) @@ -46,7 +60,7 @@ def token_update(self, token_start: float, is_first: bool) -> float: self.inter_token_latency.observe(now - token_start) return now - def request_update(self, req_start: float) -> None: + def _request_update_real(self, req_start: float) -> None: self.request_latency.observe(time.time() - req_start) def pending_update(self, increase: bool) -> None: diff --git a/comps/cores/telemetry/README.md b/comps/cores/telemetry/README.md index 35a3710744..80d7174a43 100644 --- a/comps/cores/telemetry/README.md +++ b/comps/cores/telemetry/README.md @@ -48,7 +48,7 @@ Applications' megaservice `ServiceOrchectrator` provides following metrics: - `megaservice_first_token_latency`: time to first token (TTFT) - `megaservice_inter_token_latency`: inter-token latency (ITL ~ TPOT) - `megaservice_request_latency`: whole request E2E latency = 
TTFT + ITL \* tokens -- `megaservice_request_pending`: how many LLM requests are still in progress +- `megaservice_request_pending`: how many requests are still in progress Latency ones are histogram metrics i.e. include count, total value and set of value buckets for each item.