[V1][Metrics] Add request_success_total counter, labelled with finish reason (#12579)

Signed-off-by: Mark McLoughlin <[email protected]>
markmc authored Feb 5, 2025
1 parent 18016a5 commit 233df6f
Showing 7 changed files with 66 additions and 27 deletions.
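The new counter is exposed through the server's Prometheus /metrics endpoint alongside the existing vllm:* metrics. A minimal sketch of checking for it, assuming a locally running OpenAI-compatible server on the default port and the requests library (both assumptions, not part of this commit):

import requests

# Scrape the Prometheus endpoint and print the new per-finish-reason counter.
text = requests.get("http://localhost:8000/metrics").text
for line in text.splitlines():
    if line.startswith("vllm:request_success_total"):
        # e.g. vllm:request_success_total{finished_reason="stop",model_name="..."} 3.0
        print(line)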
1 change: 1 addition & 0 deletions tests/entrypoints/openai/test_metrics.py
@@ -205,6 +205,7 @@ async def test_metrics_counts(server: RemoteOpenAIServer,
"vllm:gpu_cache_usage_perc",
"vllm:prompt_tokens_total",
"vllm:generation_tokens_total",
"vllm:request_success_total",
"vllm:request_prompt_tokens_sum",
"vllm:request_prompt_tokens_bucket",
"vllm:request_prompt_tokens_count",
21 changes: 19 additions & 2 deletions vllm/v1/engine/__init__.py
@@ -15,6 +15,23 @@
from vllm.sampling_params import SamplingParams


class RequestFinishedReason(enum.IntEnum):
"""
Reason a request finished - stop, length, or abort.
stop - a stop string was emitted
length - max_tokens was consumed, or max_model_len was reached
abort - aborted for another reason
"""
STOP = 0
LENGTH = 1
ABORT = 2

def __str__(self):
return self.name.lower()


@dataclass
class EngineCoreRequest:

@@ -45,7 +62,7 @@ class EngineCoreOutput(
request_id: str
new_token_ids: List[int]
finished: bool
finish_reason: Optional[str] = None
finish_reason: Optional[RequestFinishedReason] = None
stop_reason: Union[int, str, None] = None


@@ -56,7 +73,7 @@ class EngineCoreOutputs(
gc=False): # type: ignore[call-arg]

#NOTE(Nick): We could consider ways to make this more compact,
# e.g. columnwise layout and using an int enum for finish/stop reason
# e.g. columnwise layout

# [num_reqs]
outputs: List[EngineCoreOutput]
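The new RequestFinishedReason enum stringifies to the lower-cased member name, which is what flows into the OpenAI-style finish_reason field and the new metric label. A minimal usage sketch:

from vllm.v1.engine import RequestFinishedReason

reason = RequestFinishedReason.STOP
assert str(reason) == "stop"   # __str__ returns the lower-cased member name
assert int(reason) == 0        # IntEnum, so it stays compact on the wire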
9 changes: 5 additions & 4 deletions vllm/v1/engine/detokenizer.py
@@ -8,7 +8,8 @@
from vllm.sampling_params import RequestOutputKind
from vllm.transformers_utils.detokenizer_utils import (
AnyTokenizer, convert_prompt_ids_to_tokens, detokenize_incrementally)
from vllm.v1.engine import EngineCoreOutput, EngineCoreRequest
from vllm.v1.engine import (EngineCoreOutput, EngineCoreRequest,
RequestFinishedReason)

logger = init_logger(__name__)

@@ -18,7 +19,7 @@ class DetokenizerOutput:
output_text: str
token_ids: List[int]
finished: bool
finish_reason: Optional[str] = None
finish_reason: Optional[RequestFinishedReason] = None
stop_reason: Union[int, str, None] = None


@@ -147,13 +148,13 @@ def update_from_output(
stop_str, truncate_to = stop
if truncate_to != -1:
self.output_text = self.output_text[:truncate_to]
finish_reason = "stop" # TODO: use constant
finish_reason = RequestFinishedReason.STOP
stop_reason = stop_str

# TODO: handle stop_token_ids here too?

# 3) Update the RequestOutput object with the new text.
finished = bool(finish_reason)
finished = finish_reason is not None
if self.output_kind == RequestOutputKind.FINAL_ONLY \
and not finished:
return None
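Note why the truthiness check becomes an explicit None comparison here: STOP has value 0, so bool(RequestFinishedReason.STOP) is False and a truthiness test would misreport a stopped request as unfinished. A small sketch of the distinction:

from vllm.v1.engine import RequestFinishedReason

finish_reason = RequestFinishedReason.STOP   # IntEnum value 0
assert bool(finish_reason) is False          # truthiness would say "not finished"
assert finish_reason is not None             # the explicit check is correct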
22 changes: 12 additions & 10 deletions vllm/v1/engine/output_processor.py
@@ -161,8 +161,10 @@ def process_outputs(
engine_core_output)

# 3) Create and handle RequestOutput objects.
if request_output := self._make_request_output(
req_state, detokenizer_output):
if detokenizer_output is not None:
request_output = self._make_request_output(
req_state, detokenizer_output)

if req_state.queue is not None:
# AsyncLLM: put into queue for handling by generate().
req_state.queue.put_nowait(request_output)
@@ -172,6 +174,8 @@

# Free completed requests.
if request_output.finished:
assert detokenizer_output.finish_reason is not None

self.request_states.pop(req_id)
if not engine_core_output.finished:
# If req not finished in EngineCore, but Detokenizer
@@ -180,7 +184,8 @@

# Track per-request stats
iteration_stats.update_from_finished_request(
request_output, req_state.stats)
detokenizer_output.finish_reason, request_output,
req_state.stats)

return OutputProcessorOutput(
request_outputs=request_outputs,
@@ -191,12 +196,8 @@
@staticmethod
def _make_request_output(
request_state: RequestState,
detokenizer_output: Optional[DetokenizerOutput],
) -> Optional[RequestOutput]:

if detokenizer_output is None:
return None

detokenizer_output: DetokenizerOutput,
) -> RequestOutput:
request_output = RequestOutput.new(
request_state.request_id,
request_state.prompt,
@@ -207,7 +208,8 @@ def _make_request_output(
)
if detokenizer_output.finished:
completion_output = request_output.outputs[0]
completion_output.finish_reason = detokenizer_output.finish_reason
completion_output.finish_reason = str(
detokenizer_output.finish_reason)
completion_output.stop_reason = detokenizer_output.stop_reason

return request_output
15 changes: 14 additions & 1 deletion vllm/v1/metrics/loggers.py
@@ -2,13 +2,14 @@

import time
from abc import ABC, abstractmethod
from typing import List
from typing import Dict, List

import numpy as np
import prometheus_client

from vllm.config import ModelConfig
from vllm.logger import init_logger
from vllm.v1.engine import RequestFinishedReason
from vllm.v1.metrics.stats import IterationStats, SchedulerStats

logger = init_logger(__name__)
@@ -116,6 +117,17 @@ def __init__(self, model_config: ModelConfig):
documentation="Number of generation tokens processed.",
labelnames=labelnames).labels(*labelvalues)

self.counter_request_success: Dict[RequestFinishedReason,
prometheus_client.Counter] = {}
counter_request_success_base = prometheus_client.Counter(
name="vllm:request_success_total",
documentation="Count of successfully processed requests.",
labelnames=labelnames + ["finished_reason"])
for reason in RequestFinishedReason:
self.counter_request_success[
reason] = counter_request_success_base.labels(*(labelvalues +
[str(reason)]))

self.histogram_num_prompt_tokens_request = \
prometheus_client.Histogram(
name="vllm:request_prompt_tokens",
@@ -163,6 +175,7 @@ def log(self, scheduler_stats: SchedulerStats,
iteration_stats.num_generation_tokens)

for finished_request in iteration_stats.finished_requests:
self.counter_request_success[finished_request.finish_reason].inc()
self.histogram_num_prompt_tokens_request.observe(
finished_request.num_prompt_tokens)
self.histogram_num_generation_tokens_request.observe(
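The logger resolves one labelled child counter per finish reason up front, so recording a finished request is just a dictionary lookup plus inc(). A self-contained sketch of that pattern with hypothetical names (the real labelnames/labelvalues come from the surrounding logger setup):

import prometheus_client

REASONS = ("stop", "length", "abort")

counter_base = prometheus_client.Counter(
    name="demo:request_success_total",
    documentation="Count of successfully processed requests.",
    labelnames=["model_name", "finished_reason"])

# Pre-resolve the per-reason children once, at construction time.
counters = {
    reason: counter_base.labels("demo-model", reason) for reason in REASONS
}

counters["stop"].inc()   # hot path: dict lookup + inc()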
10 changes: 7 additions & 3 deletions vllm/v1/metrics/stats.py
@@ -6,7 +6,7 @@

if TYPE_CHECKING:
from vllm.outputs import RequestOutput
from vllm.v1.engine import EngineCoreOutput
from vllm.v1.engine import EngineCoreOutput, RequestFinishedReason


@dataclass
@@ -32,6 +32,7 @@ class RequestStateStats:
class FinishedRequestStats:
"""Stats associated with a finished request."""

finish_reason: "RequestFinishedReason"
num_prompt_tokens: int = 0
num_generation_tokens: int = 0

@@ -73,8 +74,11 @@ def update_from_output(self, output: "EngineCoreOutput",
request_state_stats.num_generation_tokens += num_new_generation_tokens
request_state_stats.last_token_time = now

def update_from_finished_request(self, request_output: "RequestOutput",
def update_from_finished_request(self,
finish_reason: "RequestFinishedReason",
request_output: "RequestOutput",
request_state_stats: RequestStateStats):
self.finished_requests.append(
FinishedRequestStats(len(request_output.prompt_token_ids),
FinishedRequestStats(finish_reason,
len(request_output.prompt_token_ids),
request_state_stats.num_generation_tokens))
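FinishedRequestStats now carries the finish reason alongside the token counts, so the Prometheus logger can later pick the matching labelled counter. A small construction sketch with illustrative field values:

from vllm.v1.engine import RequestFinishedReason
from vllm.v1.metrics.stats import FinishedRequestStats

stats = FinishedRequestStats(finish_reason=RequestFinishedReason.LENGTH,
                             num_prompt_tokens=512,
                             num_generation_tokens=128)
assert str(stats.finish_reason) == "length"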
15 changes: 8 additions & 7 deletions vllm/v1/request.py
@@ -6,7 +6,7 @@
from vllm.lora.request import LoRARequest
from vllm.sampling_params import SamplingParams
from vllm.sequence import RequestMetrics
from vllm.v1.engine import EngineCoreRequest
from vllm.v1.engine import EngineCoreRequest, RequestFinishedReason
from vllm.v1.utils import ConstantList

if TYPE_CHECKING:
@@ -109,7 +109,7 @@ def num_output_tokens(self) -> int:
def is_finished(self) -> bool:
return RequestStatus.is_finished(self.status)

def get_finished_reason(self) -> Union[str, None]:
def get_finished_reason(self) -> Union[RequestFinishedReason, None]:
return RequestStatus.get_finished_reason(self.status)

def has_encoder_inputs(self) -> bool:
@@ -149,7 +149,8 @@ def is_finished(status: "RequestStatus") -> bool:
return status > RequestStatus.PREEMPTED

@staticmethod
def get_finished_reason(status: "RequestStatus") -> Union[str, None]:
def get_finished_reason(
status: "RequestStatus") -> Union[RequestFinishedReason, None]:
return _FINISHED_REASON_MAP.get(status)


@@ -158,8 +159,8 @@ def get_finished_reason(status: "RequestStatus") -> Union[str, None]:
# are longer than the model's length cap. Therefore, the stop
# reason should also be "length" as in OpenAI API.
_FINISHED_REASON_MAP = {
RequestStatus.FINISHED_STOPPED: "stop",
RequestStatus.FINISHED_LENGTH_CAPPED: "length",
RequestStatus.FINISHED_ABORTED: "abort",
RequestStatus.FINISHED_IGNORED: "length",
RequestStatus.FINISHED_STOPPED: RequestFinishedReason.STOP,
RequestStatus.FINISHED_LENGTH_CAPPED: RequestFinishedReason.LENGTH,
RequestStatus.FINISHED_ABORTED: RequestFinishedReason.ABORT,
RequestStatus.FINISHED_IGNORED: RequestFinishedReason.LENGTH,
}
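With the map above, a finished status resolves directly to an enum member, while anything still in flight resolves to None. A quick sketch (RequestStatus.RUNNING is assumed to exist outside the hunk shown here):

from vllm.v1.engine import RequestFinishedReason
from vllm.v1.request import RequestStatus

assert RequestStatus.get_finished_reason(RequestStatus.RUNNING) is None
assert (RequestStatus.get_finished_reason(RequestStatus.FINISHED_IGNORED)
        is RequestFinishedReason.LENGTH)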
