From f4be42640fda2a9621c61572f56ca7b653eada65 Mon Sep 17 00:00:00 2001
From: Bangtian Liu
Date: Thu, 9 Jan 2025 21:58:23 -0600
Subject: [PATCH] address reviewer comments

Signed-off-by: Bangtian Liu
---
 tuner/tuner/libtuner.py      | 53 ++++++++++++++++-----------
 tuner/tuner/libtuner_test.py | 70 +++++++++++++++++++++---------------
 2 files changed, 74 insertions(+), 49 deletions(-)

diff --git a/tuner/tuner/libtuner.py b/tuner/tuner/libtuner.py
index 95c488cc4..63c9c5fbc 100644
--- a/tuner/tuner/libtuner.py
+++ b/tuner/tuner/libtuner.py
@@ -221,9 +221,12 @@ def validate_devices(user_devices: list[str]) -> None:
     )
 
 
-def validate_benchmark_results(
+def get_valid_benchmark_results(
     benchmark_results: list[BenchmarkResult],
 ) -> list[BenchmarkResult]:
+    """
+    Filter the benchmark_results list to return only results with finite `time` values.
+    """
     filtered_benchmark_results = [r for r in benchmark_results if math.isfinite(r.time)]
     if len(filtered_benchmark_results) == 0:
         logging.error("No successful candidate benchmarks.")
@@ -231,23 +234,29 @@
     return filtered_benchmark_results
 
 
-def map_baseline_by_device(baseline_results: list[BenchmarkResult]) -> dict[str, float]:
-    return {r.device_id: r.time for r in baseline_results}
+def check_baseline_devices_uniqueness(baseline_results: list[BenchmarkResult]) -> bool:
+    seen = set()
+    for result in baseline_results:
+        if result.device_id in seen:
+            return False
+        seen.add(result.device_id)
+    return True
 
 
-def validate_baselines_device_ids_match(
-    first_baseline_by_device: dict[str, float],
-    second_baseline_by_device: dict[str, float],
-) -> bool:
-    return first_baseline_by_device.keys() == second_baseline_by_device.keys()
+def map_baseline_by_device(baseline_results: list[BenchmarkResult]) -> dict[str, float]:
+    return {r.device_id: r.time for r in baseline_results}
 
 
-def validate_baseline_regression(
-    first_baseline_by_device: dict[str, float],
-    second_baseline_by_device: dict[str, float],
+def detect_baseline_regression(
+    first_baseline_results: list[BenchmarkResult],
+    second_baseline_results: list[BenchmarkResult],
 ) -> list[str]:
+    """
+    Detect performance regressions between two sets of baseline results.
+    """
     regression_device_ids = []
-
+    first_baseline_by_device = map_baseline_by_device(first_baseline_results)
+    second_baseline_by_device = map_baseline_by_device(second_baseline_results)
     for device_id in first_baseline_by_device:
         if device_id not in second_baseline_by_device:
             continue
@@ -899,7 +908,7 @@ def select_best_benchmark_results(
     baseline_results: list[BenchmarkResult],
     num_candidates: Optional[int],
 ) -> list[BenchmarkResult]:
-    filtered_candidate_results = validate_benchmark_results(candidate_results)
+    filtered_candidate_results = get_valid_benchmark_results(candidate_results)
     if len(filtered_candidate_results) == 0:
         logging.error("No successful candidate benchmarks.")
         return []
@@ -972,8 +981,8 @@ def benchmark(
         tuning_client=tuning_client,
         candidate_trackers=candidate_trackers,
     )
-
-    baseline_times_by_device = map_baseline_by_device(baseline_results)
+    if not check_baseline_devices_uniqueness(baseline_results):
+        logging.warning("Duplicate device IDs detected in the first baseline results.")
 
     candidate_indices = [i for i in compiled_candidates if i != 0]
     candidate_results = benchmark_candidates(
@@ -992,15 +1001,17 @@ def benchmark(
         tuning_client=tuning_client,
         candidate_trackers=candidate_trackers,
     )
-    post_baseline_times_by_device = map_baseline_by_device(post_baseline_results)
-    if validate_baselines_device_ids_match(
-        baseline_times_by_device, post_baseline_times_by_device
-    ):
+    if not check_baseline_devices_uniqueness(post_baseline_results):
+        logging.warning("Duplicate device IDs detected in the second baseline results.")
+
+    first_baseline_by_device = map_baseline_by_device(baseline_results)
+    second_baseline_by_device = map_baseline_by_device(post_baseline_results)
+    if first_baseline_by_device.keys() != second_baseline_by_device.keys():
         logging.warning("Device ID mismatch between baseline runs.")
 
 
-    regression_devices = validate_baseline_regression(
-        baseline_times_by_device, post_baseline_results
+    regression_devices = detect_baseline_regression(
+        baseline_results, post_baseline_results
     )
     if regression_devices:
         logging.warning(
diff --git a/tuner/tuner/libtuner_test.py b/tuner/tuner/libtuner_test.py
index cda738470..00c661a42 100644
--- a/tuner/tuner/libtuner_test.py
+++ b/tuner/tuner/libtuner_test.py
@@ -238,57 +238,71 @@ def test_validate_benchmark_results():
         libtuner.BenchmarkResult(0, math.inf, "hip://0"),
     ]
 
-    result = libtuner.validate_benchmark_results(benchmark_results)
+    result = libtuner.get_valid_benchmark_results(benchmark_results)
     assert result == []
 
     benchmark_results = [
         libtuner.BenchmarkResult(0, math.inf, "hip://0"),
         libtuner.BenchmarkResult(0, 0.1, "hip://1"),
     ]
-    result = libtuner.validate_benchmark_results(benchmark_results)
+    result = libtuner.get_valid_benchmark_results(benchmark_results)
     assert len(result) == 1
     assert result[0].candidate_id == 0
     assert result[0].time == 0.1
     assert result[0].device_id == "hip://1"
 
 
-def test_validate_baselines_device_id_match():
-    first_baseline = {"hip://0": 1000.0, "hip://1": 2000.0}
-    second_baseline = {"hip://1": 1500.0, "hip://2": 2500.0}
-
-    result = libtuner.validate_baselines_device_ids_match(
-        first_baseline, second_baseline
-    )
-    assert result is False
-
-    first_baseline = {"hip://0": 1000.0, "hip://1": 2000.0}
-    second_baseline = {"hip://0": 1500.0, "hip://1": 2500.0}
+def test_check_baseline_devices_uniqueness():
+    baseline_results = [
+        libtuner.BenchmarkResult(0, 1000.0, "hip://0"),
+        libtuner.BenchmarkResult(0, 2000.0, "hip://1"),
+        libtuner.BenchmarkResult(0, 3000.0, "hip://2"),
+    ]
+    assert libtuner.check_baseline_devices_uniqueness(baseline_results)
 
-    result = libtuner.validate_baselines_device_ids_match(
-        first_baseline, second_baseline
-    )
-    assert result is True
+    baseline_results = [
+        libtuner.BenchmarkResult(0, 1000.0, "hip://0"),
+        libtuner.BenchmarkResult(0, 2000.0, "hip://0"),
+        libtuner.BenchmarkResult(0, 3000.0, "hip://2"),
+    ]
+    assert not libtuner.check_baseline_devices_uniqueness(baseline_results)
 
 
-def test_validate_baseline_regression():
-    first_baseline = {"hip://0": 1000.0, "hip://1": 2000.0}
-    second_baseline = {"hip://0": 1100.0, "hip://1": 1900.0}
-    regression_devices = libtuner.validate_baseline_regression(
+def test_detect_baseline_regression():
+    first_baseline = [
+        libtuner.BenchmarkResult(0, 1000.0, "hip://0"),
+        libtuner.BenchmarkResult(0, 2000.0, "hip://1"),
+    ]
+    second_baseline = [
+        libtuner.BenchmarkResult(0, 1100.0, "hip://0"),
+        libtuner.BenchmarkResult(0, 1900.0, "hip://1"),
+    ]
+    regression_devices = libtuner.detect_baseline_regression(
         first_baseline, second_baseline
     )
     assert regression_devices == ["hip://0"]
 
-    first_baseline = {"hip://0": 1000.0, "hip://1": 2000.0}
-    second_baseline = {"hip://0": 1000.0, "hip://1": 2000.0}
-
-    regression_devices = libtuner.validate_baseline_regression(
+    first_baseline = [
+        libtuner.BenchmarkResult(0, 1000.0, "hip://0"),
+        libtuner.BenchmarkResult(0, 2000.0, "hip://1"),
+    ]
+    second_baseline = [
+        libtuner.BenchmarkResult(0, 1000.0, "hip://0"),
+        libtuner.BenchmarkResult(0, 1000.0, "hip://1"),
+    ]
+    regression_devices = libtuner.detect_baseline_regression(
         first_baseline, second_baseline
     )
     assert regression_devices == []
 
-    first_baseline = {"hip://0": 1000.0, "hip://1": 2000.0}
-    second_baseline = {"hip://0": 1100.0}
-    regression_devices = libtuner.validate_baseline_regression(
+    first_baseline = [
+        libtuner.BenchmarkResult(0, 1000.0, "hip://0"),
+        libtuner.BenchmarkResult(0, 2000.0, "hip://1"),
+    ]
+    second_baseline = [
+        libtuner.BenchmarkResult(0, 1100.0, "hip://0"),
+    ]
+    regression_devices = libtuner.detect_baseline_regression(
         first_baseline, second_baseline
     )
    assert regression_devices == ["hip://0"]
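
Not part of the patch: a minimal usage sketch of the renamed helpers, based only on the signatures and the BenchmarkResult(candidate_id, time, device_id) shape shown in the diffs above. The `from tuner import libtuner` import path is an assumption; mirror whatever libtuner_test.py actually uses.

import math

from tuner import libtuner  # assumed import path

baseline = [
    libtuner.BenchmarkResult(0, 1000.0, "hip://0"),
    libtuner.BenchmarkResult(0, 2000.0, "hip://1"),
]
post_baseline = [
    libtuner.BenchmarkResult(0, 1100.0, "hip://0"),
    libtuner.BenchmarkResult(0, math.inf, "hip://1"),
]

# Keep only benchmarks with a finite time value.
valid = libtuner.get_valid_benchmark_results(post_baseline)

# True when no device_id repeats within a single baseline run.
assert libtuner.check_baseline_devices_uniqueness(baseline)

# Device IDs whose baseline time got worse between the two runs.
regressed = libtuner.detect_baseline_regression(baseline, post_baseline)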