Merge pull request #12 from emirfabio/docs/benchmarks
docs: added benchmarks and result table
spwoodcock authored Feb 14, 2025
2 parents 112d89c + b7fbe1c commit efd4d86
Showing 6 changed files with 490 additions and 0 deletions.
138 changes: 138 additions & 0 deletions benchmarks/benchmark.py
@@ -0,0 +1,138 @@
from typing import List, Tuple
import asyncio
from benchmarker_lib import BenchmarkSession

from pg_nearest_city.base_nearest_city import Location


async def benchmark_voronoi(
test_points: List[Tuple[float, float]], warmup_runs: int = 5, test_runs: int = 1000
) -> Tuple[List[Location], BenchmarkSession]:
"""Run benchmark for Voronoi implementation"""
from pg_nearest_city import AsyncNearestCity

session = BenchmarkSession("voronoi_geocoding", test_runs)
results = []

# Initial memory snapshot
session.mark_memory("initial_state")

# Initialize geocoder
async with AsyncNearestCity() as geocoder:
session.mark("geocoder_initialized")

# Warmup runs - only care about time
session.mark_time("warmup_start")
for _ in range(warmup_runs):
await geocoder.query(test_points[0][0], test_points[0][1])
session.mark_time("warmup_complete")

# Test runs - measure time for the batch, with periodic memory checks
session.mark("test_runs_start")
for i, (lat, lon) in enumerate(test_points[:test_runs]):
result = await geocoder.query(lat, lon)
results.append(result)
            # Check memory every 1000 points (only fires when test_runs > 1000)
if i > 0 and i % 1000 == 0:
session.mark_memory(f"progress_{i}")

session.mark("test_runs_complete")

# Final memory state
session.mark_memory("final_state")

return (results, session)


def benchmark_kdtree(
test_points: List[Tuple[float, float]], warmup_runs: int = 5, test_runs: int = 1000
) -> Tuple[List[Location], BenchmarkSession]:
"""Run benchmark for KDTree implementation"""
import reverse_geocoder

session = BenchmarkSession("kdtree_geocoding", test_runs)
results = []

# Initial memory snapshot
session.mark_memory("initial_state")

# Initialize geocoder - measure both time and memory
rg = reverse_geocoder.RGeocoder(mode=2, verbose=False)
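    # NOTE: reverse_geocoder.get() below may manage its own module-level geocoder
    # instance; `rg` is created here so the data-loading cost is still captured by
    # the init time/memory marks.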
session.mark("geocoder_initialized")

# Warmup runs - only care about time
session.mark_time("warmup_start")
for _ in range(warmup_runs):
reverse_geocoder.get(test_points[0])
session.mark_time("warmup_complete")

# Test runs - measure time for the batch, with periodic memory checks
session.mark("test_runs_start")
for i, (lat, lon) in enumerate(test_points[:test_runs]):
result = reverse_geocoder.get((lat, lon))
results.append(
Location(lat=lat, lon=lon, city=result["name"], country=result["cc"])
)

        # Check memory every 1000 points (only fires when test_runs > 1000)
if i > 0 and i % 1000 == 0:
session.mark_memory(f"progress_{i}")

session.mark("test_runs_complete")

# Final memory state
session.mark_memory("final_state")

return (results, session)


def generate_test_points(count: int = 10000) -> List[Tuple[float, float]]:
"""Generate a consistent set of test points"""
import random

random.seed(42)
points = [
(random.uniform(-90, 90), random.uniform(-180, 180)) for _ in range(count)
]
random.seed()
return points


async def main():
test_points = generate_test_points()

# Run both benchmarks in separate processes
from multiprocessing import Process, Queue
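    # Note: local functions as Process targets rely on the "fork" start method
    # (the default on Linux); they cannot be pickled under "spawn".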

def kdtree_process(queue):
results = benchmark_kdtree(test_points)
queue.put(results)

def voronoi_process(queue):
results = asyncio.run(benchmark_voronoi(test_points))
queue.put(results)

kdtree_queue = Queue()
kdtree_p = Process(target=kdtree_process, args=(kdtree_queue,))
kdtree_p.start()

voronoi_queue = Queue()
voronoi_p = Process(target=voronoi_process, args=(voronoi_queue,))
voronoi_p.start()

    # Get the results before joining: a child process that has put items on a
    # Queue may not exit until those items have been consumed.
    (kdtree_results, kdtree_session) = kdtree_queue.get()
    (voronoi_results, voronoi_session) = voronoi_queue.get()

    kdtree_p.join()
    voronoi_p.join()

kdtree_session.print_summary()
voronoi_session.print_summary()

kdtree_session.save()
voronoi_session.save()


if __name__ == "__main__":
asyncio.run(main())
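
For reference, a minimal sketch (not part of this commit) of driving only the Voronoi benchmark directly. It assumes the `benchmarks/` directory is on the Python path and that the `PGNEAREST_*` connection settings from `compose.yml` point at a reachable database:

```python
import asyncio

from benchmark import benchmark_voronoi, generate_test_points


async def run_voronoi_only():
    points = generate_test_points(count=2000)
    # Reuse the benchmark entry point defined above with a smaller batch.
    results, session = await benchmark_voronoi(points, warmup_runs=5, test_runs=1000)
    session.print_summary()
    session.save()


asyncio.run(run_voronoi_only())
```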
156 changes: 156 additions & 0 deletions benchmarks/benchmarker_lib.py
@@ -0,0 +1,156 @@
from dataclasses import dataclass
from typing import Dict, List, Optional
import time
import psutil
import tracemalloc
from pathlib import Path
import json
from datetime import datetime


@dataclass
class BenchmarkPoint:
"""A single measurement point during benchmarking"""

timestamp: float
label: str
memory_mb: Optional[float] = None
duration_ms: Optional[float] = None
memory_delta_mb: Optional[float] = None
duration_delta_ms: Optional[float] = None


class BenchmarkSession:
def __init__(self, name: str, test_runs: int):
self.name = name
self.test_runs = test_runs
self.points: List[BenchmarkPoint] = []
self.start_time = time.perf_counter()
self.last_time = self.start_time
self.last_memory = None
tracemalloc.start()

def _get_current_memory(self) -> float:
"""Get current memory usage in MB"""
return psutil.Process().memory_info().rss / (1024 * 1024)

def _get_time_metrics(self, current_time: float) -> tuple[float, float]:
"""Calculate total and delta time in milliseconds"""
total_duration = (current_time - self.start_time) * 1000
delta_duration = (current_time - self.last_time) * 1000
self.last_time = current_time
return total_duration, delta_duration

def _get_memory_metrics(
self, current_memory: float
) -> tuple[float, Optional[float]]:
"""Calculate memory and delta memory in MB"""
memory_delta = None
if self.last_memory is not None:
memory_delta = current_memory - self.last_memory
self.last_memory = current_memory
return current_memory, memory_delta

def mark_time(self, label: str) -> BenchmarkPoint:
"""Create a benchmark point measuring only time"""
current_time = time.perf_counter()
duration, duration_delta = self._get_time_metrics(current_time)

point = BenchmarkPoint(
timestamp=current_time,
label=label,
duration_ms=duration,
duration_delta_ms=duration_delta,
)
self.points.append(point)
return point

def mark_memory(self, label: str) -> BenchmarkPoint:
"""Create a benchmark point measuring only memory"""
current_time = time.perf_counter()
current_memory = self._get_current_memory()
memory, memory_delta = self._get_memory_metrics(current_memory)

point = BenchmarkPoint(
timestamp=current_time,
label=label,
memory_mb=memory,
memory_delta_mb=memory_delta,
)
self.points.append(point)
return point

def mark(self, label: str) -> BenchmarkPoint:
"""Create a benchmark point measuring both time and memory"""
current_time = time.perf_counter()
current_memory = self._get_current_memory()

duration, duration_delta = self._get_time_metrics(current_time)
memory, memory_delta = self._get_memory_metrics(current_memory)

point = BenchmarkPoint(
timestamp=current_time,
label=label,
memory_mb=memory,
duration_ms=duration,
memory_delta_mb=memory_delta,
duration_delta_ms=duration_delta,
)
self.points.append(point)
return point

def get_results(self) -> Dict:
"""Get results in a structured format"""
return {
"name": self.name,
"timestamp": datetime.now().isoformat(),
"test_runs": self.test_runs,
"points": [
{
"label": p.label,
"memory_mb": round(p.memory_mb, 2)
if p.memory_mb is not None
else None,
"memory_delta_mb": round(p.memory_delta_mb, 2)
if p.memory_delta_mb is not None
else None,
"duration_ms": round(p.duration_ms, 2)
if p.duration_ms is not None
else None,
"duration_delta_ms": round(p.duration_delta_ms, 2)
if p.duration_delta_ms is not None
else None,
}
for p in self.points
],
}

def save(self, directory: str = "benchmarks/benchmark_results") -> str:
"""Save results to a JSON file"""
        Path(directory).mkdir(parents=True, exist_ok=True)

timestamp = datetime.now().strftime("%Y%m%d_%H%M%S")
filename = f"{self.name}_{timestamp}.json"
filepath = Path(directory) / filename

with open(filepath, "w") as f:
json.dump(self.get_results(), f, indent=2)

return str(filepath)

def print_summary(self):
"""Print a human-readable summary"""
print(f"\nBenchmark Summary: {self.name}")
print(f"\nTest Runs: {self.test_runs}")
print("-" * 50)

for point in self.points:
print(f"\n{point.label}:")
if point.memory_mb is not None:
print(f" Memory: {point.memory_mb:.2f} MB")
if point.memory_delta_mb is not None:
print(f" Memory Δ: {point.memory_delta_mb:+.2f} MB")
if point.duration_ms is not None:
print(f" Duration: {point.duration_ms:.2f} ms")
if point.duration_delta_ms is not None:
print(f" Duration Δ: {point.duration_delta_ms:.2f} ms")
22 changes: 22 additions & 0 deletions benchmarks/benchmarks-results.md
@@ -0,0 +1,22 @@
# Benchmarking Results

### Test Run Performance (1000 geocoding operations)

| Implementation | Test Run Time (ms) | Std Dev (ms) | Min (ms) | Max (ms) | Avg Time Per Operation (ms) |
|----------------|------------------:|-------------:|---------:|---------:|---------------------------:|
| KD-tree | 45,560.73 | 3,359.99 | 39,796.90| 47,936.96| 45.56 |
| Voronoi | 1,831.31 | 400.08 | 1,431.14 | 2,496.77 | 1.83 |

### Memory Footprint After Initialization

| Implementation | Stable Memory (MB) | Memory Std Dev (MB) | Initial Memory (MB) | Memory Growth |
|----------------|------------------:|-------------------:|-------------------:|---------------:|
| KD-tree | 336.29 | 0.08 | ~73 | +263 MB |
| Voronoi | 33.00 | 0.23 | ~25 | +8 MB |

### Initialization Times

| Implementation | Init Time (ms) | Warmup Time (ms) | Total Startup (ms) |
|----------------|---------------:|----------------:|-------------------:|
| KD-tree | ~1,380 | ~350 | ~1,730 |
| Voronoi | ~16,200 | ~15 | ~16,215 |
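
The per-operation averages in the first table are simply the batch totals divided by the 1,000 operations, e.g.:

```python
# Avg time per operation = total test-run time / number of operations
kdtree_avg_ms = 45_560.73 / 1000   # ≈ 45.56 ms per lookup
voronoi_avg_ms = 1_831.31 / 1000   # ≈ 1.83 ms per lookup
print(f"KD-tree: {kdtree_avg_ms:.2f} ms, Voronoi: {voronoi_avg_ms:.2f} ms")
```

Taken together with the other two tables, the Voronoi implementation trades a longer one-off initialization (~16 s vs ~1.7 s) for roughly 25× faster lookups and about a tenth of the steady-state memory.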
25 changes: 25 additions & 0 deletions compose.yml
@@ -63,3 +63,28 @@ services:
interval: 10s
timeout: 5s
retries: 3

benchmark:
image: "ghcr.io/hotosm/pg-nearest-city:${TAG_OVERRIDE:-ci}"
build:
target: ci
container_name: pg-nearest-city-benchmark
volumes:
# Mount project config
- ./pyproject.toml:/data/pyproject.toml:ro
# Mount local package
- ./pg_nearest_city:/opt/python/lib/python3.10/site-packages/pg_nearest_city:ro
# Mount benchmarks directory
- ./benchmarks:/data/benchmarks
depends_on:
db:
condition: service_healthy
networks:
- net
restart: "no"
environment:
- PGNEAREST_DB_HOST=db
- PGNEAREST_DB_USER=cities
- PGNEAREST_DB_PASSWORD=dummycipassword
- PGNEAREST_DB_NAME=cities
command: "python /data/benchmarks/benchmark.py"
4 changes: 4 additions & 0 deletions pyproject.toml
@@ -43,6 +43,10 @@ docs = [
"mkdocs-material>=9.5.49",
"mkdocstrings-python>=1.13.0",
]
dev = [
"psutil>=6.1.1",
"reverse-geocoder>=1.5.1",
]

[build-system]
requires = ["hatchling"]