Merge pull request #12 from emirfabio/docs/benchmarks
docs: added benchmarks and result table
spwoodcock authored Feb 14, 2025
2 parents 112d89c + b7fbe1c commit efd4d86
Showing 6 changed files with 490 additions and 0 deletions.
138 changes: 138 additions & 0 deletions benchmarks/benchmark.py
@@ -0,0 +1,138 @@
from typing import List, Tuple
import asyncio
from benchmarker_lib import BenchmarkSession

from pg_nearest_city.base_nearest_city import Location


async def benchmark_voronoi(
test_points: List[Tuple[float, float]], warmup_runs: int = 5, test_runs: int = 1000
) -> Tuple[List[Location], BenchmarkSession]:
"""Run benchmark for Voronoi implementation"""
from pg_nearest_city import AsyncNearestCity

session = BenchmarkSession("voronoi_geocoding", test_runs)
results = []

# Initial memory snapshot
session.mark_memory("initial_state")

# Initialize geocoder
async with AsyncNearestCity() as geocoder:
session.mark("geocoder_initialized")

# Warmup runs - only care about time
session.mark_time("warmup_start")
for _ in range(warmup_runs):
await geocoder.query(test_points[0][0], test_points[0][1])
session.mark_time("warmup_complete")

# Test runs - measure time for the batch, with periodic memory checks
session.mark("test_runs_start")
for i, (lat, lon) in enumerate(test_points[:test_runs]):
result = await geocoder.query(lat, lon)
results.append(result)
            # Check memory every 1000 points (only fires when test_runs > 1000)
if i > 0 and i % 1000 == 0:
session.mark_memory(f"progress_{i}")

session.mark("test_runs_complete")

# Final memory state
session.mark_memory("final_state")

return (results, session)


def benchmark_kdtree(
test_points: List[Tuple[float, float]], warmup_runs: int = 5, test_runs: int = 1000
) -> Tuple[List[Location], BenchmarkSession]:
"""Run benchmark for KDTree implementation"""
import reverse_geocoder

session = BenchmarkSession("kdtree_geocoding", test_runs)
results = []

# Initial memory snapshot
session.mark_memory("initial_state")

# Initialize geocoder - measure both time and memory
rg = reverse_geocoder.RGeocoder(mode=2, verbose=False)
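    # NOTE: reverse_geocoder.get() below may manage its own module-level geocoder
    # instance; `rg` is created here so the data-loading cost is still captured by
    # the init time/memory marks.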
session.mark("geocoder_initialized")

# Warmup runs - only care about time
session.mark_time("warmup_start")
for _ in range(warmup_runs):
reverse_geocoder.get(test_points[0])
session.mark_time("warmup_complete")

# Test runs - measure time for the batch, with periodic memory checks
session.mark("test_runs_start")
for i, (lat, lon) in enumerate(test_points[:test_runs]):
result = reverse_geocoder.get((lat, lon))
results.append(
Location(lat=lat, lon=lon, city=result["name"], country=result["cc"])
)

        # Check memory every 1000 points (only fires when test_runs > 1000)
if i > 0 and i % 1000 == 0:
session.mark_memory(f"progress_{i}")

session.mark("test_runs_complete")

# Final memory state
session.mark_memory("final_state")

return (results, session)


def generate_test_points(count: int = 10000) -> List[Tuple[float, float]]:
"""Generate a consistent set of test points"""
import random

random.seed(42)
points = [
(random.uniform(-90, 90), random.uniform(-180, 180)) for _ in range(count)
]
random.seed()
return points


async def main():
test_points = generate_test_points()

# Run both benchmarks in separate processes
from multiprocessing import Process, Queue
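    # Note: local functions as Process targets rely on the "fork" start method
    # (the default on Linux); they cannot be pickled under "spawn".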

def kdtree_process(queue):
results = benchmark_kdtree(test_points)
queue.put(results)

def voronoi_process(queue):
results = asyncio.run(benchmark_voronoi(test_points))
queue.put(results)

kdtree_queue = Queue()
kdtree_p = Process(target=kdtree_process, args=(kdtree_queue,))
kdtree_p.start()

voronoi_queue = Queue()
voronoi_p = Process(target=voronoi_process, args=(voronoi_queue,))
voronoi_p.start()

    # Get the results before joining: a child process that has put items on a
    # Queue may not exit until those items have been consumed.
    (kdtree_results, kdtree_session) = kdtree_queue.get()
    (voronoi_results, voronoi_session) = voronoi_queue.get()

    kdtree_p.join()
    voronoi_p.join()

kdtree_session.print_summary()
voronoi_session.print_summary()

kdtree_session.save()
voronoi_session.save()


if __name__ == "__main__":
asyncio.run(main())
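
For reference, a minimal sketch (not part of this commit) of driving only the Voronoi benchmark directly. It assumes the `benchmarks/` directory is on the Python path and that the `PGNEAREST_*` connection settings from `compose.yml` point at a reachable database:

```python
import asyncio

from benchmark import benchmark_voronoi, generate_test_points


async def run_voronoi_only():
    points = generate_test_points(count=2000)
    # Reuse the benchmark entry point defined above with a smaller batch.
    results, session = await benchmark_voronoi(points, warmup_runs=5, test_runs=1000)
    session.print_summary()
    session.save()


asyncio.run(run_voronoi_only())
```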
156 changes: 156 additions & 0 deletions benchmarks/benchmarker_lib.py
@@ -0,0 +1,156 @@
from dataclasses import dataclass
from typing import Dict, List, Optional
import time
import psutil
import tracemalloc
from pathlib import Path
import json
from datetime import datetime


@dataclass
class BenchmarkPoint:
"""A single measurement point during benchmarking"""

timestamp: float
label: str
memory_mb: Optional[float] = None
duration_ms: Optional[float] = None
memory_delta_mb: Optional[float] = None
duration_delta_ms: Optional[float] = None


class BenchmarkSession:
def __init__(self, name: str, test_runs: int):
self.name = name
self.test_runs = test_runs
self.points: List[BenchmarkPoint] = []
self.start_time = time.perf_counter()
self.last_time = self.start_time
self.last_memory = None
tracemalloc.start()

def _get_current_memory(self) -> float:
"""Get current memory usage in MB"""
return psutil.Process().memory_info().rss / (1024 * 1024)

def _get_time_metrics(self, current_time: float) -> tuple[float, float]:
"""Calculate total and delta time in milliseconds"""
total_duration = (current_time - self.start_time) * 1000
delta_duration = (current_time - self.last_time) * 1000
self.last_time = current_time
return total_duration, delta_duration

def _get_memory_metrics(
self, current_memory: float
) -> tuple[float, Optional[float]]:
"""Calculate memory and delta memory in MB"""
memory_delta = None
if self.last_memory is not None:
memory_delta = current_memory - self.last_memory
self.last_memory = current_memory
return current_memory, memory_delta

def mark_time(self, label: str) -> BenchmarkPoint:
"""Create a benchmark point measuring only time"""
current_time = time.perf_counter()
duration, duration_delta = self._get_time_metrics(current_time)

point = BenchmarkPoint(
timestamp=current_time,
label=label,
duration_ms=duration,
duration_delta_ms=duration_delta,
)
self.points.append(point)
return point

def mark_memory(self, label: str) -> BenchmarkPoint:
"""Create a benchmark point measuring only memory"""
current_time = time.perf_counter()
current_memory = self._get_current_memory()
memory, memory_delta = self._get_memory_metrics(current_memory)

point = BenchmarkPoint(
timestamp=current_time,
label=label,
memory_mb=memory,
memory_delta_mb=memory_delta,
)
self.points.append(point)
return point

def mark(self, label: str) -> BenchmarkPoint:
"""Create a benchmark point measuring both time and memory"""
current_time = time.perf_counter()
current_memory = self._get_current_memory()

duration, duration_delta = self._get_time_metrics(current_time)
memory, memory_delta = self._get_memory_metrics(current_memory)

point = BenchmarkPoint(
timestamp=current_time,
label=label,
memory_mb=memory,
duration_ms=duration,
memory_delta_mb=memory_delta,
duration_delta_ms=duration_delta,
)
self.points.append(point)
return point

def get_results(self) -> Dict:
"""Get results in a structured format"""
return {
"name": self.name,
"timestamp": datetime.now().isoformat(),
"test_runs": self.test_runs,
"points": [
{
"label": p.label,
"memory_mb": round(p.memory_mb, 2)
if p.memory_mb is not None
else None,
"memory_delta_mb": round(p.memory_delta_mb, 2)
if p.memory_delta_mb is not None
else None,
"duration_ms": round(p.duration_ms, 2)
if p.duration_ms is not None
else None,
"duration_delta_ms": round(p.duration_delta_ms, 2)
if p.duration_delta_ms is not None
else None,
}
for p in self.points
],
}

def save(self, directory: str = "benchmarks/benchmark_results") -> str:
"""Save results to a JSON file"""
        Path(directory).mkdir(parents=True, exist_ok=True)

timestamp = datetime.now().strftime("%Y%m%d_%H%M%S")
filename = f"{self.name}_{timestamp}.json"
filepath = Path(directory) / filename

with open(filepath, "w") as f:
json.dump(self.get_results(), f, indent=2)

return str(filepath)

def print_summary(self):
"""Print a human-readable summary"""
print(f"\nBenchmark Summary: {self.name}")
print(f"\nTest Runs: {self.test_runs}")
print("-" * 50)

for point in self.points:
print(f"\n{point.label}:")
if point.memory_mb is not None:
print(f" Memory: {point.memory_mb:.2f} MB")
if point.memory_delta_mb is not None:
print(f" Memory Δ: {point.memory_delta_mb:+.2f} MB")
if point.duration_ms is not None:
print(f" Duration: {point.duration_ms:.2f} ms")
if point.duration_delta_ms is not None:
print(f" Duration Δ: {point.duration_delta_ms:.2f} ms")
22 changes: 22 additions & 0 deletions benchmarks/benchmarks-results.md
@@ -0,0 +1,22 @@
# Benchmarking Results

### Test Run Performance (1000 geocoding operations)

| Implementation | Test Run Time (ms) | Std Dev (ms) | Min (ms) | Max (ms) | Avg Time Per Operation (ms) |
|----------------|------------------:|-------------:|---------:|---------:|---------------------------:|
| KD-tree | 45,560.73 | 3,359.99 | 39,796.90| 47,936.96| 45.56 |
| Voronoi | 1,831.31 | 400.08 | 1,431.14 | 2,496.77 | 1.83 |

### Memory Footprint After Initialization

| Implementation | Stable Memory (MB) | Memory Std Dev (MB) | Initial Memory (MB) | Memory Growth |
|----------------|------------------:|-------------------:|-------------------:|---------------:|
| KD-tree | 336.29 | 0.08 | ~73 | +263 MB |
| Voronoi | 33.00 | 0.23 | ~25 | +8 MB |

### Initialization Times

| Implementation | Init Time (ms) | Warmup Time (ms) | Total Startup (ms) |
|----------------|---------------:|----------------:|-------------------:|
| KD-tree | ~1,380 | ~350 | ~1,730 |
| Voronoi | ~16,200 | ~15 | ~16,215 |
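
The per-operation averages in the first table are simply the batch totals divided by the 1,000 operations, e.g.:

```python
# Avg time per operation = total test-run time / number of operations
kdtree_avg_ms = 45_560.73 / 1000   # ≈ 45.56 ms per lookup
voronoi_avg_ms = 1_831.31 / 1000   # ≈ 1.83 ms per lookup
print(f"KD-tree: {kdtree_avg_ms:.2f} ms, Voronoi: {voronoi_avg_ms:.2f} ms")
```

Taken together with the other two tables, the Voronoi implementation trades a longer one-off initialization (~16 s vs ~1.7 s) for roughly 25× faster lookups and about a tenth of the steady-state memory.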
25 changes: 25 additions & 0 deletions compose.yml
@@ -63,3 +63,28 @@ services:
interval: 10s
timeout: 5s
retries: 3

benchmark:
image: "ghcr.io/hotosm/pg-nearest-city:${TAG_OVERRIDE:-ci}"
build:
target: ci
container_name: pg-nearest-city-benchmark
volumes:
# Mount project config
- ./pyproject.toml:/data/pyproject.toml:ro
# Mount local package
- ./pg_nearest_city:/opt/python/lib/python3.10/site-packages/pg_nearest_city:ro
# Mount benchmarks directory
- ./benchmarks:/data/benchmarks
depends_on:
db:
condition: service_healthy
networks:
- net
restart: "no"
environment:
- PGNEAREST_DB_HOST=db
- PGNEAREST_DB_USER=cities
- PGNEAREST_DB_PASSWORD=dummycipassword
- PGNEAREST_DB_NAME=cities
command: "python /data/benchmarks/benchmark.py"
4 changes: 4 additions & 0 deletions pyproject.toml
@@ -43,6 +43,10 @@ docs = [
"mkdocs-material>=9.5.49",
"mkdocstrings-python>=1.13.0",
]
dev = [
"psutil>=6.1.1",
"reverse-geocoder>=1.5.1",
]

[build-system]
requires = ["hatchling"]