diff --git a/genai-perf/genai_perf/constants.py b/genai-perf/genai_perf/constants.py
index b951524b..024dcd74 100644
--- a/genai-perf/genai_perf/constants.py
+++ b/genai-perf/genai_perf/constants.py
@@ -26,7 +26,7 @@
 
 DEFAULT_HTTP_URL = "localhost:8000"
 DEFAULT_GRPC_URL = "localhost:8001"
-
+DEFAULT_TRITON_METRICS_URL = "localhost:8002/metrics"
 
 OPEN_ORCA = "openorca"
 CNN_DAILY_MAIL = "cnn_dailymail"
diff --git a/genai-perf/genai_perf/main.py b/genai-perf/genai_perf/main.py
index caf3e804..27eb6182 100755
--- a/genai-perf/genai_perf/main.py
+++ b/genai-perf/genai_perf/main.py
@@ -112,6 +112,7 @@ def report_output(data_parser: ProfileDataParser, args: Namespace) -> None:
     else:
         raise GenAIPerfException("No valid infer mode specified")
 
+    # TPA-274 - Integrate telemetry metrics with other metrics for export
     stats = data_parser.get_statistics(infer_mode, load_level)
     reporter = OutputReporter(stats, args)
     reporter.report_output()
diff --git a/genai-perf/genai_perf/metrics/__init__.py b/genai-perf/genai_perf/metrics/__init__.py
index 01ca53c5..b3cdd6dc 100644
--- a/genai-perf/genai_perf/metrics/__init__.py
+++ b/genai-perf/genai_perf/metrics/__init__.py
@@ -27,3 +27,4 @@
 from genai_perf.metrics.llm_metrics import LLMMetrics
 from genai_perf.metrics.metrics import MetricMetadata, Metrics
 from genai_perf.metrics.statistics import Statistics
+from genai_perf.metrics.telemetry_metrics import TelemetryMetrics
diff --git a/genai-perf/genai_perf/metrics/telemetry_metrics.py b/genai-perf/genai_perf/metrics/telemetry_metrics.py
new file mode 100755
index 00000000..8caed57a
--- /dev/null
+++ b/genai-perf/genai_perf/metrics/telemetry_metrics.py
@@ -0,0 +1,82 @@
+#!/usr/bin/env python3
+
+# Copyright 2024, NVIDIA CORPORATION & AFFILIATES. All rights reserved.
+#
+# Redistribution and use in source and binary forms, with or without
+# modification, are permitted provided that the following conditions
+# are met:
+#  * Redistributions of source code must retain the above copyright
+#    notice, this list of conditions and the following disclaimer.
+#  * Redistributions in binary form must reproduce the above copyright
+#    notice, this list of conditions and the following disclaimer in the
+#    documentation and/or other materials provided with the distribution.
+#  * Neither the name of NVIDIA CORPORATION nor the names of its
+#    contributors may be used to endorse or promote products derived
+#    from this software without specific prior written permission.
+#
+# THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS ``AS IS'' AND ANY
+# EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
+# IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR
+# PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT OWNER OR
+# CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL,
+# EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO,
+# PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR
+# PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY
+# OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
+# (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
+# OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
+
+from typing import List
+
+from genai_perf.metrics.metrics import MetricMetadata
+
+
+class TelemetryMetrics:
+    """
+    A class that contains common telemetry metrics.
+    Each metric is stored as a list of lists: every inner list holds one measurement per GPU,
+    and a new inner list is appended for each collection pass (roughly once per second).
+    """
+
+    TELEMETRY_METRICS = [
+        MetricMetadata("gpu_power_usage", "watts"),
+        MetricMetadata("gpu_power_limit", "watts"),
+        MetricMetadata("energy_consumption", "joules"),
+        MetricMetadata("gpu_utilization", "percentage"),
+        MetricMetadata("total_gpu_memory", "bytes"),
+        MetricMetadata("gpu_memory_used", "bytes"),
+    ]
+
+    def __init__(
+        self,
+        gpu_power_usage: List[List[float]] = [],  # Multiple measurements per GPU
+        gpu_power_limit: List[List[float]] = [],
+        energy_consumption: List[List[float]] = [],
+        gpu_utilization: List[List[float]] = [],
+        total_gpu_memory: List[List[float]] = [],
+        gpu_memory_used: List[List[float]] = [],
+    ) -> None:
+        self.gpu_power_usage = gpu_power_usage
+        self.gpu_power_limit = gpu_power_limit
+        self.energy_consumption = energy_consumption
+        self.gpu_utilization = gpu_utilization
+        self.total_gpu_memory = total_gpu_memory
+        self.gpu_memory_used = gpu_memory_used
+
+    def update_metrics(self, measurement_data: dict) -> None:
+        """Update the metrics with new measurement data"""
+        for metric in self.TELEMETRY_METRICS:
+            metric_key = metric.name
+            if metric_key in measurement_data:
+                getattr(self, metric_key).append(measurement_data[metric_key])
+
+    def __repr__(self):
+        attr_strs = []
+        for k, v in self.__dict__.items():
+            if not k.startswith("_"):
+                attr_strs.append(f"{k}={v}")
+        return f"TelemetryMetrics({','.join(attr_strs)})"
+
+    @property
+    def telemetry_metrics(self) -> List[MetricMetadata]:
+        return self.TELEMETRY_METRICS
diff --git a/genai-perf/genai_perf/parser.py b/genai-perf/genai_perf/parser.py
index c135313d..72fdc1d0 100644
--- a/genai-perf/genai_perf/parser.py
+++ b/genai-perf/genai_perf/parser.py
@@ -38,6 +38,7 @@
     CNN_DAILY_MAIL,
     DEFAULT_ARTIFACT_DIR,
     DEFAULT_COMPARE_DIR,
+    DEFAULT_TRITON_METRICS_URL,
     OPEN_ORCA,
 )
 from genai_perf.llm_inputs.llm_inputs import (
@@ -765,9 +766,23 @@ def compare_handler(args: argparse.Namespace):
 
 
 def profile_handler(args, extra_args):
+    from genai_perf.telemetry_data.triton_telemetry_data_collector import (
+        TritonTelemetryDataCollector,
+    )
     from genai_perf.wrapper import Profiler
 
-    Profiler.run(args=args, extra_args=extra_args)
+    telemetry_data_collector = None
+    if args.service_kind == "triton":
+        # TPA-275: pass server url as a CLI option in non-default case
+        telemetry_data_collector = TritonTelemetryDataCollector(
+            server_metrics_url=DEFAULT_TRITON_METRICS_URL
+        )
+
+    Profiler.run(
+        args=args,
+        extra_args=extra_args,
+        telemetry_data_collector=telemetry_data_collector,
+    )
 
 
 ### Parser Initialization ###
diff --git a/genai-perf/genai_perf/telemetry_data/__init__.py b/genai-perf/genai_perf/telemetry_data/__init__.py
new file mode 100644
index 00000000..62ed57f8
--- /dev/null
+++ b/genai-perf/genai_perf/telemetry_data/__init__.py
@@ -0,0 +1,27 @@
+# Copyright 2024, NVIDIA CORPORATION & AFFILIATES. All rights reserved.
+#
+# Redistribution and use in source and binary forms, with or without
+# modification, are permitted provided that the following conditions
+# are met:
+#  * Redistributions of source code must retain the above copyright
+#    notice, this list of conditions and the following disclaimer.
+#  * Redistributions in binary form must reproduce the above copyright
+#    notice, this list of conditions and the following disclaimer in the
+#    documentation and/or other materials provided with the distribution.
+#  * Neither the name of NVIDIA CORPORATION nor the names of its
+#    contributors may be used to endorse or promote products derived
+#    from this software without specific prior written permission.
+#
+# THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS ``AS IS'' AND ANY
+# EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
+# IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR
+# PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT OWNER OR
+# CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL,
+# EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO,
+# PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR
+# PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY
+# OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
+# (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
+# OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
+
+from genai_perf.telemetry_data.telemetry_data_collector import TelemetryDataCollector
diff --git a/genai-perf/genai_perf/telemetry_data/telemetry_data_collector.py b/genai-perf/genai_perf/telemetry_data/telemetry_data_collector.py
new file mode 100755
index 00000000..79002ce4
--- /dev/null
+++ b/genai-perf/genai_perf/telemetry_data/telemetry_data_collector.py
@@ -0,0 +1,83 @@
+#!/usr/bin/env python3
+
+# Copyright 2024, NVIDIA CORPORATION & AFFILIATES. All rights reserved.
+#
+# Redistribution and use in source and binary forms, with or without
+# modification, are permitted provided that the following conditions
+# are met:
+#  * Redistributions of source code must retain the above copyright
+#    notice, this list of conditions and the following disclaimer.
+#  * Redistributions in binary form must reproduce the above copyright
+#    notice, this list of conditions and the following disclaimer in the
+#    documentation and/or other materials provided with the distribution.
+#  * Neither the name of NVIDIA CORPORATION nor the names of its
+#    contributors may be used to endorse or promote products derived
+#    from this software without specific prior written permission.
+#
+# THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS ``AS IS'' AND ANY
+# EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
+# IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR
+# PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT OWNER OR
+# CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL,
+# EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO,
+# PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR
+# PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY
+# OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
+# (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
+# OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
+
+
+import time
+from abc import ABC, abstractmethod
+from threading import Event, Thread
+from typing import Optional
+
+import requests
+from genai_perf.metrics.telemetry_metrics import TelemetryMetrics
+
+
+class TelemetryDataCollector(ABC):
+    def __init__(
+        self, server_metrics_url: str, collection_interval: float = 1.0  # in seconds
+    ) -> None:
+        self._server_metrics_url = server_metrics_url
+        self._collection_interval = collection_interval
+        self._metrics = TelemetryMetrics()
+        self._stop_event = Event()
+        self._thread: Optional[Thread] = None
+
+    def start(self) -> None:
+        """Start the telemetry data collection thread."""
+        if self._thread is None or not self._thread.is_alive():
+            self._stop_event.clear()
+            self._thread = Thread(target=self._collect_metrics)
+            self._thread.start()
+
+    def stop(self) -> None:
+        """Stop the telemetry data collection thread."""
+        if self._thread is not None and self._thread.is_alive():
+            self._stop_event.set()
+            self._thread.join()
+
+    def _fetch_metrics(self) -> str:
+        """Fetch metrics from the metrics endpoint"""
+        response = requests.get(self._server_metrics_url)
+        response.raise_for_status()
+        return response.text
+
+    @abstractmethod
+    def _process_and_update_metrics(self, metrics_data: str) -> None:
+        """This method should be implemented by subclasses."""
+        pass
+
+    def _collect_metrics(self) -> None:
+        """Continuously collect telemetry metrics at each collection interval"""
+        while not self._stop_event.is_set():
+            metrics_data = self._fetch_metrics()
+            self._process_and_update_metrics(metrics_data)
+            time.sleep(self._collection_interval)
+
+    @property
+    def metrics(self) -> TelemetryMetrics:
+        """Return the collected metrics."""
+        return self._metrics
diff --git a/genai-perf/genai_perf/telemetry_data/triton_telemetry_data_collector.py b/genai-perf/genai_perf/telemetry_data/triton_telemetry_data_collector.py
new file mode 100755
index 00000000..6f2fb8a9
--- /dev/null
+++ b/genai-perf/genai_perf/telemetry_data/triton_telemetry_data_collector.py
@@ -0,0 +1,100 @@
+#!/usr/bin/env python3
+
+# Copyright 2024, NVIDIA CORPORATION & AFFILIATES. All rights reserved.
+#
+# Redistribution and use in source and binary forms, with or without
+# modification, are permitted provided that the following conditions
+# are met:
+#  * Redistributions of source code must retain the above copyright
+#    notice, this list of conditions and the following disclaimer.
+#  * Redistributions in binary form must reproduce the above copyright
+#    notice, this list of conditions and the following disclaimer in the
+#    documentation and/or other materials provided with the distribution.
+#  * Neither the name of NVIDIA CORPORATION nor the names of its
+#    contributors may be used to endorse or promote products derived
+#    from this software without specific prior written permission.
+#
+# THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS ``AS IS'' AND ANY
+# EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
+# IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR
+# PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT OWNER OR
+# CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL,
+# EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO,
+# PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR
+# PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY
+# OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
+# (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
+# OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
+
+from typing import Dict, List
+
+import genai_perf.logging as logging
+from genai_perf.telemetry_data.telemetry_data_collector import TelemetryDataCollector
+
+logger = logging.getLogger(__name__)
+
+
+class TritonTelemetryDataCollector(TelemetryDataCollector):
+    """Class to collect telemetry metrics from Triton server"""
+
+    """Mapping from Triton metric names to GenAI-Perf telemetry metric names"""
+    METRIC_NAME_MAPPING = {
+        "nv_gpu_power_usage": "gpu_power_usage",
+        "nv_gpu_power_limit": "gpu_power_limit",
+        "nv_energy_consumption": "energy_consumption",
+        "nv_gpu_utilization": "gpu_utilization",
+        "nv_gpu_memory_total_bytes": "total_gpu_memory",
+        "nv_gpu_memory_used_bytes": "gpu_memory_used",
+    }
+
+    def _process_and_update_metrics(self, metrics_data: str) -> None:
+        """Process the response from Triton metrics endpoint and update metrics.
+
+        This method extracts metric names and values from the raw data. Metric names
+        are extracted from the start of each line up to the '{' character, as all metrics
+        follow the format 'metric_name{labels} value'. Only metrics defined in
+        METRIC_NAME_MAPPING are processed.
+
+        Args:
+            metrics_data (str): Raw metrics data from the Triton endpoint.
+
+        Example:
+            Given the metric data:
+            ```
+            nv_gpu_power_usage{gpu_uuid="GPU-abschdinjacgdo65gdj7"} 27.01
+            nv_gpu_utilization{gpu_uuid="GPU-abcdef123456"} 75.5
+            nv_energy_consumption{gpu_uuid="GPU-xyz789"} 1234.56
+            ```
+
+            The method will extract and process:
+            - `nv_gpu_power_usage` as `gpu_power_usage`
+            - `nv_gpu_utilization` as `gpu_utilization`
+            - `nv_energy_consumption` as `energy_consumption`
+        """
+
+        if not metrics_data.strip():
+            logger.info("Response from Triton metrics endpoint is empty")
+            return
+
+        current_measurement_interval = {
+            metric.name: [] for metric in self.metrics.TELEMETRY_METRICS
+        }  # type: Dict[str, List[float]]
+
+        for line in metrics_data.splitlines():
+            line = line.strip()
+            if not line:
+                continue
+
+            parts = line.split()
+            if len(parts) < 2:
+                continue
+
+            triton_metric_key = parts[0].split("{")[0]
+            metric_value = parts[1]
+
+            metric_key = self.METRIC_NAME_MAPPING.get(triton_metric_key, None)
+
+            if metric_key and metric_key in current_measurement_interval:
+                current_measurement_interval[metric_key].append(float(metric_value))
+
+        self.metrics.update_metrics(current_measurement_interval)
diff --git a/genai-perf/genai_perf/wrapper.py b/genai-perf/genai_perf/wrapper.py
index c7b27a6b..8951cd35 100644
--- a/genai-perf/genai_perf/wrapper.py
+++ b/genai-perf/genai_perf/wrapper.py
@@ -32,6 +32,9 @@
 import genai_perf.utils as utils
 from genai_perf.constants import DEFAULT_GRPC_URL, DEFAULT_INPUT_DATA_JSON
 from genai_perf.llm_inputs.llm_inputs import OutputFormat
+from genai_perf.telemetry_data.triton_telemetry_data_collector import (
+    TelemetryDataCollector,
+)
 
 logger = logging.getLogger(__name__)
 
@@ -142,10 +145,20 @@ def build_cmd(args: Namespace, extra_args: Optional[List[str]] = None) -> List[s
         return cmd
 
     @staticmethod
-    def run(args: Namespace, extra_args: Optional[List[str]]) -> None:
-        cmd = Profiler.build_cmd(args, extra_args)
-        logger.info(f"Running Perf Analyzer : '{' '.join(cmd)}'")
-        if args and args.verbose:
-            subprocess.run(cmd, check=True, stdout=None)
-        else:
-            subprocess.run(cmd, check=True, stdout=subprocess.DEVNULL)
+    def run(
+        args: Namespace,
+        extra_args: Optional[List[str]],
+        telemetry_data_collector: Optional[TelemetryDataCollector] = None,
+    ) -> None:
+        try:
+            if telemetry_data_collector is not None:
+                telemetry_data_collector.start()
+            cmd = Profiler.build_cmd(args, extra_args)
+            logger.info(f"Running Perf Analyzer : '{' '.join(cmd)}'")
+            if args and args.verbose:
+                subprocess.run(cmd, check=True, stdout=None)
+            else:
+                subprocess.run(cmd, check=True, stdout=subprocess.DEVNULL)
+        finally:
+            if telemetry_data_collector is not None:
+                telemetry_data_collector.stop()
diff --git a/genai-perf/tests/test_telemetry_data_collector.py b/genai-perf/tests/test_telemetry_data_collector.py
new file mode 100755
index 00000000..441c6e31
--- /dev/null
+++ b/genai-perf/tests/test_telemetry_data_collector.py
@@ -0,0 +1,145 @@
+#!/usr/bin/env python3
+
+# Copyright 2024, NVIDIA CORPORATION & AFFILIATES. All rights reserved.
+#
+# Redistribution and use in source and binary forms, with or without
+# modification, are permitted provided that the following conditions
+# are met:
+#  * Redistributions of source code must retain the above copyright
+#    notice, this list of conditions and the following disclaimer.
+#  * Redistributions in binary form must reproduce the above copyright
+#    notice, this list of conditions and the following disclaimer in the
+#    documentation and/or other materials provided with the distribution.
+#  * Neither the name of NVIDIA CORPORATION nor the names of its
+#    contributors may be used to endorse or promote products derived
+#    from this software without specific prior written permission.
+#
+# THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS ``AS IS'' AND ANY
+# EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
+# IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR
+# PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT OWNER OR
+# CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL,
+# EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO,
+# PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR
+# PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY
+# OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
+# (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
+# OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
+
+from unittest.mock import MagicMock, patch
+
+import pytest
+import requests
+from genai_perf.telemetry_data.telemetry_data_collector import TelemetryDataCollector
+
+
+class MockTelemetryDataCollector(TelemetryDataCollector):
+    def _process_and_update_metrics(self, metrics_data: str) -> None:
+        pass
+
+
+class TestTelemetryDataCollector:
+
+    TEST_SERVER_URL = "http://testserver:8080/metrics"
+
+    triton_metrics_response = """\
+    nv_gpu_power_usage{gpu="0",uuid="GPU-1234"} 123.45
+    nv_gpu_power_usage{gpu="1",uuid="GPU-5678"} 234.56
+    nv_gpu_utilization{gpu="0",uuid="GPU-1234"} 76.3
+    nv_gpu_utilization{gpu="1",uuid="GPU-5678"} 88.1
+    nv_gpu_memory_total_bytes{gpu="0",uuid="GPU-1234"} 8589934592
+    nv_gpu_memory_total_bytes{gpu="1",uuid="GPU-5678"} 8589934592
+    nv_gpu_memory_used_bytes{gpu="0",uuid="GPU-1234"} 2147483648
+    nv_gpu_memory_used_bytes{gpu="1",uuid="GPU-5678"} 3221225472
+    """
+
+    @pytest.fixture
+    def collector(self) -> MockTelemetryDataCollector:
+        return MockTelemetryDataCollector(self.TEST_SERVER_URL)
+
+    @patch("genai_perf.telemetry_data.telemetry_data_collector.Thread")
+    def test_start(
+        self, mock_thread_class: MagicMock, collector: MockTelemetryDataCollector
+    ) -> None:
+        mock_thread_instance = MagicMock()
+        mock_thread_class.return_value = mock_thread_instance
+
+        collector.start()
+
+        assert collector._thread is not None
+        assert collector._thread.is_alive()
+        mock_thread_class.assert_called_once_with(target=collector._collect_metrics)
+        mock_thread_instance.start.assert_called_once()
+
+    @patch("genai_perf.telemetry_data.telemetry_data_collector.Thread")
+    def test_stop(
+        self, mock_thread_class: MagicMock, collector: MockTelemetryDataCollector
+    ) -> None:
+        mock_thread_instance = MagicMock()
+        mock_thread_class.return_value = mock_thread_instance
+
+        collector.start()
+
+        assert collector._thread is not None
+        assert collector._thread.is_alive()
+
+        collector.stop()
+
+        assert collector._stop_event.is_set()
+        mock_thread_instance.join.assert_called_once()
+
+        mock_thread_instance.is_alive.return_value = False
+        assert not collector._thread.is_alive()
+
+    @patch("requests.get")
+    def test_fetch_metrics_success(
+        self, mock_requests_get: MagicMock, collector: MockTelemetryDataCollector
+    ) -> None:
+
+        mock_response = MagicMock()
+        mock_response.status_code = 200
+        mock_response.text = self.triton_metrics_response
+        mock_requests_get.return_value = mock_response
+
+        result = collector._fetch_metrics()
+
+        mock_requests_get.assert_called_once_with(self.TEST_SERVER_URL)
+
+        assert result == self.triton_metrics_response
+
+    @patch("requests.get")
+    def test_fetch_metrics_failure(
+        self, mock_requests_get: MagicMock, collector: MockTelemetryDataCollector
+    ) -> None:
+        mock_requests_get.side_effect = requests.exceptions.HTTPError("Not Found")
+
+        with pytest.raises(requests.exceptions.HTTPError):
+            collector._fetch_metrics()
+
+    @patch.object(MockTelemetryDataCollector, "_fetch_metrics")
+    @patch("genai_perf.telemetry_data.telemetry_data_collector.time.sleep")
+    def test_collect_metrics(
+        self,
+        mock_sleep: MagicMock,
+        mock_fetch_metrics: MagicMock,
+        collector: MockTelemetryDataCollector,
+    ) -> None:
+
+        mock_fetch_metrics.return_value = self.triton_metrics_response
+
+        with patch.object(
+            collector, "_process_and_update_metrics", new_callable=MagicMock
+        ) as mock_process_and_update_metrics:
+            # Mock _stop_event.is_set
+            collector._stop_event = MagicMock()
+            collector._stop_event.is_set = MagicMock(
+                side_effect=[False, True]
+            )  # Ensure loop exits immediately
+
+            collector._collect_metrics()
+
+        mock_fetch_metrics.assert_called_once()
+        mock_process_and_update_metrics.assert_called_once_with(
+            self.triton_metrics_response
+        )
+        mock_sleep.assert_called_once()
diff --git a/genai-perf/tests/test_telemetry_metrics.py b/genai-perf/tests/test_telemetry_metrics.py
new file mode 100755
index 00000000..12cb7210
--- /dev/null
+++ b/genai-perf/tests/test_telemetry_metrics.py
@@ -0,0 +1,73 @@
+#!/usr/bin/env python3
+
+# Copyright 2024, NVIDIA CORPORATION & AFFILIATES. All rights reserved.
+#
+# Redistribution and use in source and binary forms, with or without
+# modification, are permitted provided that the following conditions
+# are met:
+#  * Redistributions of source code must retain the above copyright
+#    notice, this list of conditions and the following disclaimer.
+#  * Redistributions in binary form must reproduce the above copyright
+#    notice, this list of conditions and the following disclaimer in the
+#    documentation and/or other materials provided with the distribution.
+#  * Neither the name of NVIDIA CORPORATION nor the names of its
+#    contributors may be used to endorse or promote products derived
+#    from this software without specific prior written permission.
+#
+# THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS ``AS IS'' AND ANY
+# EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
+# IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR
+# PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT OWNER OR
+# CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL,
+# EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO,
+# PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR
+# PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY
+# OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
+# (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
+# OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
+
+from typing import Dict, List
+
+from genai_perf.metrics.telemetry_metrics import MetricMetadata, TelemetryMetrics
+
+
+class TestTelemetryMetrics:
+
+    def test_update_metrics(self) -> None:
+        """Test update_metrics method."""
+        telemetry = TelemetryMetrics()
+        measurement_data: Dict[str, List[float]] = {
+            "gpu_power_usage": [11.1, 11.2],
+            "gpu_power_limit": [101.2, 101.2],
+            "energy_consumption": [1004.0, 1005.0],
+            "gpu_utilization": [85.0, 90.0],
+            "total_gpu_memory": [9000.0, 9000.0],
+            "gpu_memory_used": [4500.0, 4500.0],
+        }
+        telemetry.update_metrics(measurement_data)
+
+        assert telemetry.gpu_power_usage == [[11.1, 11.2]]
+        assert telemetry.gpu_power_limit == [[101.2, 101.2]]
+        assert telemetry.energy_consumption == [[1004.0, 1005.0]]
+        assert telemetry.gpu_utilization == [[85.0, 90.0]]
+        assert telemetry.total_gpu_memory == [[9000.0, 9000.0]]
+        assert telemetry.gpu_memory_used == [[4500.0, 4500.0]]
+
+    def test_telemetry_metrics_property(self) -> None:
+        """Test telemetry_metrics property."""
+        telemetry = TelemetryMetrics()
+        telemetry_metrics: List[MetricMetadata] = telemetry.telemetry_metrics
+
+        assert len(telemetry_metrics) == 6
+        assert telemetry_metrics[0].name == "gpu_power_usage"
+        assert telemetry_metrics[0].unit == "watts"
+        assert telemetry_metrics[1].name == "gpu_power_limit"
+        assert telemetry_metrics[1].unit == "watts"
+        assert telemetry_metrics[2].name == "energy_consumption"
+        assert telemetry_metrics[2].unit == "joules"
+        assert telemetry_metrics[3].name == "gpu_utilization"
+        assert telemetry_metrics[3].unit == "percentage"
+        assert telemetry_metrics[4].name == "total_gpu_memory"
+        assert telemetry_metrics[4].unit == "bytes"
+        assert telemetry_metrics[5].name == "gpu_memory_used"
+        assert telemetry_metrics[5].unit == "bytes"
diff --git a/genai-perf/tests/test_triton_telemetry_data_collector.py b/genai-perf/tests/test_triton_telemetry_data_collector.py
new file mode 100755
index 00000000..35ad3ec9
--- /dev/null
+++ b/genai-perf/tests/test_triton_telemetry_data_collector.py
@@ -0,0 +1,139 @@
+#!/usr/bin/env python3
+
+# Copyright 2024, NVIDIA CORPORATION & AFFILIATES. All rights reserved.
+#
+# Redistribution and use in source and binary forms, with or without
+# modification, are permitted provided that the following conditions
+# are met:
+#  * Redistributions of source code must retain the above copyright
+#    notice, this list of conditions and the following disclaimer.
+#  * Redistributions in binary form must reproduce the above copyright
+#    notice, this list of conditions and the following disclaimer in the
+#    documentation and/or other materials provided with the distribution.
+#  * Neither the name of NVIDIA CORPORATION nor the names of its
+#    contributors may be used to endorse or promote products derived
+#    from this software without specific prior written permission.
+#
+# THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS ``AS IS'' AND ANY
+# EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
+# IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR
+# PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT OWNER OR
+# CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL,
+# EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO,
+# PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR
+# PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY
+# OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
+# (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
+# OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
+
+from unittest.mock import MagicMock, PropertyMock, patch
+
+import pytest
+from genai_perf.metrics.telemetry_metrics import MetricMetadata, TelemetryMetrics
+from genai_perf.telemetry_data.triton_telemetry_data_collector import (
+    TritonTelemetryDataCollector,
+)
+
+
+class TestTritonTelemetryDataCollector:
+
+    TEST_SERVER_URL: str = "http://tritonserver:8002/metrics"
+
+    @pytest.fixture
+    def triton_collector(self) -> TritonTelemetryDataCollector:
+        return TritonTelemetryDataCollector(self.TEST_SERVER_URL)
+
+    @pytest.fixture
+    def mock_telemetry_metrics(self) -> MagicMock:
+        mock_telemetry_metrics = MagicMock(spec=TelemetryMetrics)
+        mock_telemetry_metrics.TELEMETRY_METRICS = [
+            MetricMetadata("gpu_power_usage", "watts"),
+            MetricMetadata("gpu_power_limit", "watts"),
+            MetricMetadata("energy_consumption", "joules"),
+            MetricMetadata("gpu_utilization", "percentage"),
+            MetricMetadata("total_gpu_memory", "bytes"),
+            MetricMetadata("gpu_memory_used", "bytes"),
+        ]
+        return mock_telemetry_metrics
+
+    @patch.object(TritonTelemetryDataCollector, "metrics", new_callable=PropertyMock)
+    def test_process_and_update_metrics_single_gpu(
+        self,
+        mock_metrics: PropertyMock,
+        triton_collector: TritonTelemetryDataCollector,
+        mock_telemetry_metrics: MagicMock,
+    ) -> None:
+
+        mock_metrics.return_value = mock_telemetry_metrics
+
+        triton_metrics_data = """nv_gpu_power_usage{gpu_uuid="GPU-1234"} 35.0
+        nv_gpu_power_limit{gpu_uuid="GPU-1234"} 250.0
+        nv_gpu_utilization{gpu_uuid="GPU-1234"} 85.0
+        nv_energy_consumption{gpu_uuid="GPU-1234"} 1500.0
+        nv_gpu_memory_total_bytes{gpu_uuid="GPU-1234"} 8000000000.0
+        nv_gpu_memory_used_bytes{gpu_uuid="GPU-1234"} 4000000000.0"""
+
+        triton_collector._process_and_update_metrics(triton_metrics_data)
+
+        expected_data = {
+            "gpu_power_usage": [35.0],
+            "gpu_power_limit": [250.0],
+            "gpu_utilization": [85.0],
+            "energy_consumption": [1500.0],
+            "total_gpu_memory": [8000000000.0],
+            "gpu_memory_used": [4000000000.0],
+        }
+
+        mock_metrics.return_value.update_metrics.assert_called_once_with(expected_data)
+
+    @patch.object(TritonTelemetryDataCollector, "metrics", new_callable=PropertyMock)
+    def test_process_and_update_metrics_multiple_gpus(
+        self,
+        mock_metrics: PropertyMock,
+        triton_collector: TritonTelemetryDataCollector,
+        mock_telemetry_metrics: MagicMock,
+    ) -> None:
+
+        mock_metrics.return_value = mock_telemetry_metrics
+
+        triton_metrics_data = """nv_gpu_power_usage{gpu_uuid="GPU-1234"} 35.0
+        nv_gpu_power_usage{gpu_uuid="GPU-5678"} 40.0
+        nv_gpu_power_limit{gpu_uuid="GPU-1234"} 250.0
+        nv_gpu_power_limit{gpu_uuid="GPU-1234"} 300.0
+        nv_gpu_utilization{gpu_uuid="GPU-1234"} 85.0
+        nv_gpu_utilization{gpu_uuid="GPU-5678"} 90.0
+        nv_energy_consumption{gpu_uuid="GPU-1234"} 1500.0
+        nv_energy_consumption{gpu_uuid="GPU-1234"} 1600.0
+        nv_gpu_memory_total_bytes{gpu_uuid="GPU-1234"} 8000000000.0
+        nv_gpu_memory_total_bytes{gpu_uuid="GPU-1234"} 9000000000.0
+        nv_gpu_memory_used_bytes{gpu_uuid="GPU-1234"} 4000000000.0
+        nv_gpu_memory_used_bytes{gpu_uuid="GPU-1234"} 4500000000.0"""
+
+        triton_collector._process_and_update_metrics(triton_metrics_data)
+
+        expected_data = {
+            "gpu_power_usage": [35.0, 40.0],
+            "gpu_power_limit": [250.0, 300.0],
+            "gpu_utilization": [85.0, 90.0],
+            "energy_consumption": [1500.0, 1600.0],
+            "total_gpu_memory": [8000000000.0, 9000000000.0],
+            "gpu_memory_used": [4000000000.0, 4500000000.0],
+        }
+
+        mock_metrics.return_value.update_metrics.assert_called_once_with(expected_data)
+
+    @patch.object(TritonTelemetryDataCollector, "metrics", new_callable=PropertyMock)
+    def test_process_and_update_metrics_empty_data(
+        self,
+        mock_metrics: PropertyMock,
+        triton_collector: TritonTelemetryDataCollector,
+        mock_telemetry_metrics: MagicMock,
+    ) -> None:
+
+        mock_metrics.return_value = mock_telemetry_metrics
+
+        triton_metrics_data = ""
+
+        triton_collector._process_and_update_metrics(triton_metrics_data)
+
+        mock_telemetry_metrics.update_metrics.assert_not_called()
diff --git a/genai-perf/tests/test_wrapper.py b/genai-perf/tests/test_wrapper.py
index fd4c34b5..0521e589 100644
--- a/genai-perf/tests/test_wrapper.py
+++ b/genai-perf/tests/test_wrapper.py
@@ -147,11 +147,17 @@ def test_service_openai(self, monkeypatch, arg):
         assert cmd_string.count(" -i http") == 1
 
     @patch("genai_perf.wrapper.subprocess.run")
-    def test_stdout_verbose(self, mock_subprocess_run):
+    @patch("genai_perf.wrapper.TelemetryDataCollector")
+    def test_stdout_verbose(self, mock_telemetry_collector, mock_subprocess_run):
         args = MagicMock()
         args.model = "test_model"
         args.verbose = True
-        Profiler.run(args=args, extra_args=None)
+        telemetry_data_collector = mock_telemetry_collector.return_value
+        Profiler.run(
+            args=args,
+            extra_args=None,
+            telemetry_data_collector=telemetry_data_collector,
+        )
 
         # Check that standard output was not redirected.
         for call_args in mock_subprocess_run.call_args_list:
@@ -161,11 +167,17 @@ def test_stdout_verbose(self, mock_subprocess_run):
             ), "With the verbose flag, stdout should not be redirected."
 
     @patch("genai_perf.wrapper.subprocess.run")
-    def test_stdout_not_verbose(self, mock_subprocess_run):
+    @patch("genai_perf.wrapper.TelemetryDataCollector")
+    def test_stdout_not_verbose(self, mock_telemetry_collector, mock_subprocess_run):
        args = MagicMock()
        args.model = "test_model"
        args.verbose = False
-        Profiler.run(args=args, extra_args=None)
+        telemetry_data_collector = mock_telemetry_collector.return_value
+        Profiler.run(
+            args=args,
+            extra_args=None,
+            telemetry_data_collector=telemetry_data_collector,
+        )
 
         # Check that standard output was redirected.
         for call_args in mock_subprocess_run.call_args_list: