Skip to content

Commit

Permalink
Create token metrics only when they are available
Browse files Browse the repository at this point in the history
This avoids generation of useless token/request histogram metrics
for services that use Orchestrator class, but never call its token
processing functionality.

(Helps in differentiating frontend megaservice metrics from backend
megaservice ones, especially when multiple OPEA applications run in
the same cluster.)

Also fix the metrics description in the README.

Signed-off-by: Eero Tamminen <[email protected]>
  • Loading branch information
eero-t committed Jan 3, 2025
1 parent 6419ace commit 62e24c2
Show file tree
Hide file tree
Showing 2 changed files with 25 additions and 11 deletions.
34 changes: 24 additions & 10 deletions comps/cores/mega/orchestrator.py
Original file line number Diff line number Diff line change
Expand Up @@ -25,28 +25,42 @@


class OrchestratorMetrics:
    # Metrics use "megaservice_" name prefix, and are class variables because:
    # - CI creates several orchestrator instances
    # - Prometheus requires metrics to be singletons
    # - Orchestrator instances do not have their own names
    request_pending = Gauge("megaservice_request_pending", "Count of currently pending requests (gauge)")
    # Metrics related to token processing are created on demand,
    # to avoid bogus ones for services that never handle tokens.
    first_token_latency = None
    inter_token_latency = None
    request_latency = None

def __init__(self) -> None:
pass

def token_update(self, token_start: float, is_first: bool) -> float:
# initially directed to metric creation
self.token_update = self._token_update_create
self.request_update = self._request_update_create

def _token_update_create(self, token_start: float, is_first: bool) -> float:
self.first_token_latency = Histogram("megaservice_first_token_latency", "First token latency (histogram)")
self.inter_token_latency = Histogram("megaservice_inter_token_latency", "Inter-token latency (histogram)")
self.token_update = self._token_update_real
return self.token_update(token_start, is_first)

def _request_update_create(self, req_start: float) -> None:
self.request_latency = Histogram("megaservice_request_latency", "Whole LLM request/reply latency (histogram)")
self.request_update = self._request_update_real
self.request_update(req_start)

def _token_update_real(self, token_start: float, is_first: bool) -> float:
now = time.time()
if is_first:
self.first_token_latency.observe(now - token_start)
else:
self.inter_token_latency.observe(now - token_start)
return now

def request_update(self, req_start: float) -> None:
def _request_update_real(self, req_start: float) -> None:
self.request_latency.observe(time.time() - req_start)

def pending_update(self, increase: bool) -> None:
Expand Down
2 changes: 1 addition & 1 deletion comps/cores/telemetry/README.md
Original file line number Diff line number Diff line change
Expand Up @@ -48,7 +48,7 @@ Applications' megaservice `ServiceOrchestrator` provides the following metrics:
- `megaservice_first_token_latency`: time to first token (TTFT)
- `megaservice_inter_token_latency`: inter-token latency (ITL ~ TPOT)
- `megaservice_request_latency`: whole request E2E latency = TTFT + ITL \* tokens
- `megaservice_request_pending`: how many LLM requests are still in progress
- `megaservice_request_pending`: how many requests are still in progress

Latency ones are histogram metrics i.e. include count, total value and set of value buckets for each item.

Expand Down

0 comments on commit 62e24c2

Please sign in to comment.