[Metrics] Add --show-hidden-metrics-for-version CLI arg

Add some infrastructure to help us deprecate and remove metrics in a less user-hostile way. Our deprecation process will now be: 1) Deprecate the metric in 0.N.0 - document the deprecation in release notes, user-facing docs, and the help text in `/metrics`; 2) Hide the metric in 0.N+1.0 - users can still re-enable the metric using `--show-hidden-metrics-for-version=0.N` as an escape hatch; 3) Remove the metric completely in 0.N+2.0. `--show-hidden-metrics-for-version` takes a version string argument so that users cannot fall into the habit of always enabling all deprecated metrics, which would defeat the purpose. This approach is copied directly from kubernetes/kubernetes#85270. Signed-off-by: Mark McLoughlin <[email protected]>
vllm-project · Feb 14, 2025 · 52c20bf · 52c20bf
1 parent c9e2d64
commit 52c20bf
Show file tree

Hide file tree

Showing 8 changed files with 98 additions and 6 deletions.
diff --git a/docs/source/serving/metrics.md b/docs/source/serving/metrics.md
@@ -36,3 +36,11 @@ The following metrics are exposed:
 :language: python
 :start-after: begin-metrics-definitions
 :::
+
+The following metrics are deprecated and due to be removed in a future version:
+
+- *(No metrics are currently deprecated)*
+
+Note: when metrics are deprecated in version `X.Y`, they are hidden in version `X.Y+1`
+but can be re-enabled using the `--show-hidden-metrics-for-version=X.Y` escape hatch,
+and are then removed in version `X.Y+2`.
diff --git a/tests/test_version.py b/tests/test_version.py
@@ -0,0 +1,36 @@
+# SPDX-License-Identifier: Apache-2.0
+
+from unittest.mock import patch
+
+import pytest
+
+from vllm import version
+
+
+def test_version_is_defined():
+    assert version.__version__ is not None
+
+
+def test_version_tuple():
+    assert len(version.__version_tuple__) in (3, 4, 5)
+
+
+@pytest.mark.parametrize(
+    "version_tuple, version_str, expected",
+    [
+        ((0, 0, "dev"), "0.0", True),
+        ((0, 0, "dev"), "foobar", True),
+        ((0, 7, 4), "0.6", True),
+        ((0, 7, 4), "0.5", False),
+        ((0, 7, 4), "0.7", False),
+        ((1, 2, 3), "1.1", True),
+        ((1, 2, 3), "1.0", False),
+        ((1, 2, 3), "1.2", False),
+        # Known limitation: 1.0's "previous minor" is computed as "1.-1"
+        ((1, 0, 0), "1.-1", True),
+        ((1, 0, 0), "0.9", False),
+        ((1, 0, 0), "0.17", False),
+    ])
+def test_prev_minor_version_was(version_tuple, version_str, expected):
+    with patch("vllm.version.__version_tuple__", version_tuple):
+        assert version._prev_minor_version_was(version_str) == expected
diff --git a/vllm/config.py b/vllm/config.py
@@ -2556,7 +2556,9 @@ def __post_init__(self):
 
 @dataclass
 class ObservabilityConfig:
-    """Configuration for observability."""
+    """Configuration for observability - metrics and tracing."""
+    show_hidden_metrics: bool = False
+
     otlp_traces_endpoint: Optional[str] = None
 
     # Collecting detailed timing information for each request can be expensive.

diff --git a/vllm/engine/arg_utils.py b/vllm/engine/arg_utils.py
@@ -10,6 +10,7 @@
 import torch
 
 import vllm.envs as envs
+from vllm import version
 from vllm.config import (CacheConfig, CompilationConfig, ConfigFormat,
                          DecodingConfig, DeviceConfig, HfOverrides,
                          KVTransferConfig, LoadConfig, LoadFormat, LoRAConfig,
@@ -185,6 +186,7 @@ class EngineArgs:
     qlora_adapter_name_or_path: Optional[str] = None
     disable_logprobs_during_spec_decoding: Optional[bool] = None
 
+    show_hidden_metrics_for_version: Optional[str] = None
     otlp_traces_endpoint: Optional[str] = None
     collect_detailed_traces: Optional[str] = None
     disable_async_output_proc: bool = False
@@ -877,6 +879,18 @@ def add_cli_args(parser: FlexibleArgumentParser) -> FlexibleArgumentParser:
                             default=None,
                             help='Name or path of the QLoRA adapter.')
 
+        parser.add_argument('--show-hidden-metrics-for-version',
+                            type=str,
+                            default=None,
+                            help='Enable deprecated Prometheus metrics that '
+                            'have been hidden since the specified version. '
+                            'For example, if a previously deprecated metric '
+                            'has been hidden since the v0.7.0 release, you '
+                            'use --show-hidden-metrics-for-version=0.7 as a '
+                            'temporary escape hatch while you migrate to new '
+                            'metrics. The metric is likely to be removed '
+                            'completely in an upcoming release.')
+
         parser.add_argument(
             '--otlp-traces-endpoint',
             type=str,
@@ -1273,6 +1287,11 @@ def create_engine_config(self,
         decoding_config = DecodingConfig(
             guided_decoding_backend=self.guided_decoding_backend)
 
+        show_hidden_metrics = False
+        if self.show_hidden_metrics_for_version is not None:
+            show_hidden_metrics = version._prev_minor_version_was(
+                self.show_hidden_metrics_for_version)
+
         detailed_trace_modules = []
         if self.collect_detailed_traces is not None:
             detailed_trace_modules = self.collect_detailed_traces.split(",")
@@ -1282,6 +1301,7 @@ def create_engine_config(self,
                     f"Invalid module {m} in collect_detailed_traces. "
                     f"Valid modules are {ALLOWED_DETAILED_TRACE_MODULES}")
         observability_config = ObservabilityConfig(
+            show_hidden_metrics=show_hidden_metrics,
             otlp_traces_endpoint=self.otlp_traces_endpoint,
             collect_model_forward_time="model" in detailed_trace_modules
             or "all" in detailed_trace_modules,

diff --git a/vllm/engine/metrics.py b/vllm/engine/metrics.py
@@ -517,6 +517,10 @@ def __init__(self, local_interval: float, labels: Dict[str, str],
         self.metrics = self._metrics_cls(labelnames=list(labels.keys()),
                                          vllm_config=vllm_config)
 
+        # Use this flag to hide metrics that were deprecated in
+        # a previous release and which will be removed in a future release
+        self.show_hidden_metrics = vllm_config.show_hidden_metrics
+
     def _log_gauge(self, gauge, data: Union[int, float]) -> None:
         # Convenience function for logging to gauge.
         gauge.labels(**self.labels).set(data)

diff --git a/vllm/v1/engine/async_llm.py b/vllm/v1/engine/async_llm.py
@@ -57,7 +57,7 @@ def __init__(
         if self.log_stats:
             self.stat_loggers.extend([
                 LoggingStatLogger(),
-                PrometheusStatLogger(vllm_config.model_config),
+                PrometheusStatLogger(vllm_config),
             ])
 
         # Tokenizer (+ ensure liveness if running in another process).

diff --git a/vllm/v1/metrics/loggers.py b/vllm/v1/metrics/loggers.py
@@ -7,7 +7,7 @@
 import numpy as np
 import prometheus_client
 
-from vllm.config import ModelConfig
+from vllm.config import VllmConfig
 from vllm.logger import init_logger
 from vllm.v1.core.kv_cache_utils import PrefixCachingMetrics
 from vllm.v1.engine import FinishReason
@@ -92,13 +92,17 @@ def log(self, scheduler_stats: SchedulerStats,
 
 class PrometheusStatLogger(StatLoggerBase):
 
-    def __init__(self, model_config: ModelConfig):
+    def __init__(self, vllm_config: VllmConfig):
         self._unregister_vllm_metrics()
 
+        # Use this flag to hide metrics that were deprecated in
+        # a previous release and which will be removed in a future release
+        self.show_hidden_metrics = vllm_config.show_hidden_metrics
+
         labelnames = ["model_name"]
-        labelvalues = [model_config.served_model_name]
+        labelvalues = [vllm_config.model_config.served_model_name]
 
-        max_model_len = model_config.max_model_len
+        max_model_len = vllm_config.model_config.max_model_len
 
         self.gauge_scheduler_running = prometheus_client.Gauge(
             name="vllm:num_requests_running",

diff --git a/vllm/version.py b/vllm/version.py
@@ -11,3 +11,21 @@
 
     __version__ = "dev"
     __version_tuple__ = (0, 0, __version__)
+
+
+def _prev_minor_version_was(version_str):
+    """Check whether a given version matches the previous minor version.
+
+    Return True if version_str matches the previous minor version.
+
+    For example - return True if the current version is 0.7.4 and the
+    supplied version_str is '0.6'.
+
+    Used for --show-hidden-metrics-for-version.
+    """
+    # Match anything if this is a dev tree
+    if __version_tuple__[0:2] == (0, 0):
+        return True
+
+    # Note - this won't do the right thing when we release 1.0!
+    return version_str == f"{__version_tuple__[0]}.{__version_tuple__[1] - 1}"