Commit
docs: added benchmarks and result table
emirfabio committed Feb 13, 2025
1 parent 112d89c commit e91032a
Showing 6 changed files with 489 additions and 0 deletions.
139 changes: 139 additions & 0 deletions benchmarks/benchmark.py
@@ -0,0 +1,139 @@
from typing import List, Tuple
import asyncio
from benchmarker_lib import BenchmarkSession

from pg_nearest_city.base_nearest_city import Location


async def benchmark_voronoi(
    test_points: List[Tuple[float, float]], warmup_runs: int = 5, test_runs: int = 1000
) -> Tuple[List[Location], BenchmarkSession]:
    """Run benchmark for Voronoi implementation"""
    from pg_nearest_city import AsyncNearestCity

    session = BenchmarkSession("voronoi_geocoding", test_runs)
    results = []

    # Initial memory snapshot
    session.mark_memory("initial_state")

    # Initialize geocoder
    async with AsyncNearestCity() as geocoder:
        session.mark("geocoder_initialized")

        # Warmup runs - only care about time
        session.mark_time("warmup_start")
        for _ in range(warmup_runs):
            await geocoder.query(test_points[0][0], test_points[0][1])
        session.mark_time("warmup_complete")

        # Test runs - measure time for the batch, with periodic memory checks
        session.mark("test_runs_start")
        for i, (lat, lon) in enumerate(test_points[:test_runs]):
            result = await geocoder.query(lat, lon)
            results.append(result)
            # Check memory every 1000 points
            if i > 0 and i % 1000 == 0:
                session.mark_memory(f"progress_{i}")

        session.mark("test_runs_complete")

    # Final memory state
    session.mark_memory("final_state")

    return (results, session)


def benchmark_kdtree(
    test_points: List[Tuple[float, float]], warmup_runs: int = 5, test_runs: int = 1000
) -> Tuple[List[Location], BenchmarkSession]:
    """Run benchmark for KDTree implementation"""
    import reverse_geocoder

    session = BenchmarkSession("kdtree_geocoding", test_runs)
    results = []

    # Initial memory snapshot
    session.mark_memory("initial_state")

    # Initialize geocoder - measure both time and memory
    rg = reverse_geocoder.RGeocoder(mode=2, verbose=False)
    session.mark("geocoder_initialized")

    # Warmup runs - only care about time.
    # Query through the instance initialized above (RGeocoder.query takes a
    # list of (lat, lon) tuples), so the init time measured above belongs to
    # the geocoder actually exercised; the module-level get() would lazily
    # build a second instance.
    session.mark_time("warmup_start")
    for _ in range(warmup_runs):
        rg.query([test_points[0]])
    session.mark_time("warmup_complete")

    # Test runs - measure time for the batch, with periodic memory checks
    session.mark("test_runs_start")
    for i, (lat, lon) in enumerate(test_points[:test_runs]):
        result = rg.query([(lat, lon)])[0]
        results.append(
            Location(lat=lat, lon=lon, city=result["name"], country=result["cc"])
        )

        # Check memory every 1000 points
        if i > 0 and i % 1000 == 0:
            session.mark_memory(f"progress_{i}")

    session.mark("test_runs_complete")

    # Final memory state
    session.mark_memory("final_state")

    return (results, session)


def generate_test_points(count: int = 10000) -> List[Tuple[float, float]]:
    """Generate a consistent set of test points"""
    import random

    random.seed(42)
    points = [
        (random.uniform(-90, 90), random.uniform(-180, 180)) for _ in range(count)
    ]
    random.seed()
    return points


async def main():
    test_points = generate_test_points()

    # Run both benchmarks in separate processes so neither skews the other's
    # memory measurements. Passing these locally defined functions to Process
    # relies on the default "fork" start method (Linux containers).
    from multiprocessing import Process, Queue

    def kdtree_process(queue):
        results = benchmark_kdtree(test_points)
        queue.put(results)

    def voronoi_process(queue):
        results = asyncio.run(benchmark_voronoi(test_points))
        queue.put(results)

    kdtree_queue = Queue()
    kdtree_p = Process(target=kdtree_process, args=(kdtree_queue,))
    kdtree_p.start()

    voronoi_queue = Queue()
    voronoi_p = Process(target=voronoi_process, args=(voronoi_queue,))
    voronoi_p.start()

    # Get results before joining: a child blocked on a full queue pipe cannot
    # exit, so calling join() before draining the queues can deadlock.
    (kdtree_results, kdtree_session) = kdtree_queue.get()
    (voronoi_results, voronoi_session) = voronoi_queue.get()

    kdtree_p.join()
    voronoi_p.join()

    kdtree_session.print_summary()
    voronoi_session.print_summary()

    kdtree_session.save()
    voronoi_session.save()


if __name__ == "__main__":
    asyncio.run(main())
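For a quick interactive run of just the Voronoi path, outside the multiprocessing harness, something along these lines should work; this is a minimal sketch assuming it is invoked from the `benchmarks/` directory with a PostGIS database reachable via the `PGNEAREST_DB_*` variables:

```python
import asyncio

from benchmark import benchmark_voronoi, generate_test_points


async def quick_check():
    # Small sample so the check finishes quickly; counts are illustrative.
    points = generate_test_points(count=100)
    results, session = await benchmark_voronoi(points, warmup_runs=1, test_runs=100)
    session.print_summary()


asyncio.run(quick_check())
```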
157 changes: 157 additions & 0 deletions benchmarks/benchmarker_lib.py
@@ -0,0 +1,157 @@
from dataclasses import dataclass
from typing import Dict, List, Optional
import time
import psutil
import tracemalloc
from pathlib import Path
import json
from datetime import datetime


@dataclass
class BenchmarkPoint:
    """A single measurement point during benchmarking"""

    timestamp: float
    label: str
    memory_mb: Optional[float] = None
    duration_ms: Optional[float] = None
    memory_delta_mb: Optional[float] = None
    duration_delta_ms: Optional[float] = None


class BenchmarkSession:
    def __init__(self, name: str, test_runs: int):
        self.name = name
        self.test_runs = test_runs
        self.points: List[BenchmarkPoint] = []
        self.start_time = time.perf_counter()
        self.last_time = self.start_time
        self.last_memory = None
        # tracemalloc is started here but not otherwise used by this class;
        # the memory figures below come from psutil RSS.
        tracemalloc.start()

    def _get_current_memory(self) -> float:
        """Get current memory usage in MB"""
        return psutil.Process().memory_info().rss / (1024 * 1024)

    def _get_time_metrics(self, current_time: float) -> tuple[float, float]:
        """Calculate total and delta time in milliseconds"""
        total_duration = (current_time - self.start_time) * 1000
        delta_duration = (current_time - self.last_time) * 1000
        self.last_time = current_time
        return total_duration, delta_duration

    def _get_memory_metrics(
        self, current_memory: float
    ) -> tuple[float, Optional[float]]:
        """Calculate memory and delta memory in MB"""
        memory_delta = None
        if self.last_memory is not None:
            memory_delta = current_memory - self.last_memory
        self.last_memory = current_memory
        return current_memory, memory_delta

    def mark_time(self, label: str) -> BenchmarkPoint:
        """Create a benchmark point measuring only time"""
        current_time = time.perf_counter()
        duration, duration_delta = self._get_time_metrics(current_time)

        point = BenchmarkPoint(
            timestamp=current_time,
            label=label,
            duration_ms=duration,
            duration_delta_ms=duration_delta,
        )
        self.points.append(point)
        return point

    def mark_memory(self, label: str) -> BenchmarkPoint:
        """Create a benchmark point measuring only memory"""
        current_time = time.perf_counter()
        current_memory = self._get_current_memory()
        memory, memory_delta = self._get_memory_metrics(current_memory)

        point = BenchmarkPoint(
            timestamp=current_time,
            label=label,
            memory_mb=memory,
            memory_delta_mb=memory_delta,
        )
        self.points.append(point)
        return point

    def mark(self, label: str) -> BenchmarkPoint:
        """Create a benchmark point measuring both time and memory"""
        current_time = time.perf_counter()
        current_memory = self._get_current_memory()

        duration, duration_delta = self._get_time_metrics(current_time)
        memory, memory_delta = self._get_memory_metrics(current_memory)

        point = BenchmarkPoint(
            timestamp=current_time,
            label=label,
            memory_mb=memory,
            duration_ms=duration,
            memory_delta_mb=memory_delta,
            duration_delta_ms=duration_delta,
        )
        self.points.append(point)
        return point

    def get_results(self) -> Dict:
        """Get results in a structured format"""
        return {
            "name": self.name,
            "timestamp": datetime.now().isoformat(),
            "test_runs": self.test_runs,
            "points": [
                {
                    "label": p.label,
                    "memory_mb": round(p.memory_mb, 2)
                    if p.memory_mb is not None
                    else None,
                    "memory_delta_mb": round(p.memory_delta_mb, 2)
                    if p.memory_delta_mb is not None
                    else None,
                    "duration_ms": round(p.duration_ms, 2)
                    if p.duration_ms is not None
                    else None,
                    "duration_delta_ms": round(p.duration_delta_ms, 2)
                    if p.duration_delta_ms is not None
                    else None,
                }
                for p in self.points
            ],
        }

    def save(self, directory: str = "benchmarks/benchmark_results") -> str:
        """Save results to a JSON file"""
        # parents=True so the results directory is created even when the
        # parent folder does not exist yet.
        Path(directory).mkdir(parents=True, exist_ok=True)

        timestamp = datetime.now().strftime("%Y%m%d_%H%M%S")
        filename = f"{self.name}_{timestamp}.json"
        filepath = Path(directory) / filename

        with open(filepath, "w") as f:
            json.dump(self.get_results(), f, indent=2)

        return str(filepath)

    def print_summary(self):
        """Print a human-readable summary"""
        print(f"\nBenchmark Summary: {self.name}")
        print(f"\nTest Runs: {self.test_runs}")
        print("-" * 50)

        for point in self.points:
            print(f"\n{point.label}:")
            if point.memory_mb is not None:
                print(f" Memory: {point.memory_mb:.2f} MB")
            if point.memory_delta_mb is not None:
                print(f" Memory Δ: {point.memory_delta_mb:+.2f} MB")
            if point.duration_ms is not None:
                print(f" Duration: {point.duration_ms:.2f} ms")
            if point.duration_delta_ms is not None:
                print(f" Duration Δ: {point.duration_delta_ms:.2f} ms")
19 changes: 19 additions & 0 deletions benchmarks/benchmarks-results.md
@@ -0,0 +1,19 @@
# Benchmarking Results

### Test Run Performance (1000 geocoding operations)
| Implementation | Test Run Time (ms) | Std Dev (ms) | Min (ms) | Max (ms) | Avg Time Per Operation (ms) |
|----------------|------------------:|-------------:|---------:|---------:|---------------------------:|
| KD-tree | 45,560.73 | 3,359.99 | 39,796.90| 47,936.96| 45.56 |
| Voronoi | 1,831.31 | 400.08 | 1,431.14 | 2,496.77 | 1.83 |

### Memory Footprint After Initialization
| Implementation | Stable Memory (MB) | Memory Std Dev (MB) | Initial Memory (MB) | Memory Growth |
|----------------|------------------:|-------------------:|-------------------:|---------------:|
| KD-tree | 336.29 | 0.08 | ~73 | +263 MB |
| Voronoi | 33.00 | 0.23 | ~25 | +8 MB |

### Initialization Times
| Implementation | Init Time (ms) | Warmup Time (ms) | Total Startup (ms) |
|----------------|---------------:|----------------:|-------------------:|
| KD-tree | ~1,380 | ~350 | ~1,730 |
| Voronoi | ~16,200 | ~15 | ~16,215 |
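
As a rough sanity check, the per-operation averages above can be recomputed from a saved `BenchmarkSession` JSON file: the `test_runs_complete` point's `duration_delta_ms` is the batch time elapsed since `test_runs_start`. The filename below is hypothetical; real result files are timestamped.

```python
import json

# Hypothetical filename; real files look like
# benchmarks/benchmark_results/voronoi_geocoding_<YYYYmmdd_HHMMSS>.json
with open("benchmarks/benchmark_results/voronoi_geocoding_example.json") as f:
    data = json.load(f)

points = {p["label"]: p for p in data["points"]}
batch_ms = points["test_runs_complete"]["duration_delta_ms"]
print(f"{data['name']}: {batch_ms / data['test_runs']:.2f} ms per operation")
```

Applied to the table above, this works out to 45,560.73 / 1000 ≈ 45.56 ms per KD-tree lookup and 1,831.31 / 1000 ≈ 1.83 ms per Voronoi lookup.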
25 changes: 25 additions & 0 deletions compose.yml
@@ -63,3 +63,28 @@ services:
      interval: 10s
      timeout: 5s
      retries: 3

  benchmark:
    image: "ghcr.io/hotosm/pg-nearest-city:${TAG_OVERRIDE:-ci}"
    build:
      target: ci
    container_name: pg-nearest-city-benchmark
    volumes:
      # Mount project config
      - ./pyproject.toml:/data/pyproject.toml:ro
      # Mount local package
      - ./pg_nearest_city:/opt/python/lib/python3.10/site-packages/pg_nearest_city:ro
      # Mount benchmarks directory
      - ./benchmarks:/data/benchmarks
    depends_on:
      db:
        condition: service_healthy
    networks:
      - net
    restart: "no"
    environment:
      - PGNEAREST_DB_HOST=db
      - PGNEAREST_DB_USER=cities
      - PGNEAREST_DB_PASSWORD=dummycipassword
      - PGNEAREST_DB_NAME=cities
    command: "python /data/benchmarks/benchmark.py"
4 changes: 4 additions & 0 deletions pyproject.toml
@@ -43,6 +43,10 @@ docs = [
    "mkdocs-material>=9.5.49",
    "mkdocstrings-python>=1.13.0",
]
dev = [
    "psutil>=6.1.1",
    "reverse-geocoder>=1.5.1",
]

[build-system]
requires = ["hatchling"]