From e3c417c34fc57be01f9edb2d7175357cf9891cf0 Mon Sep 17 00:00:00 2001 From: Conor Schaefer Date: Mon, 27 Jan 2025 13:32:52 -0800 Subject: [PATCH] test(pd): check for emitted metrics Tacks on a new integration test to catch metrics regressions, rather than waiting until post-deploy. The test is straightforward: GET the metrics endpoint, confirm a few matches. More subtle is the reordering of the smoke test suite: the pd tests come last now, since the metrics will be empty on pd start, if no work has been performed. Refs #3780, #5004. --- crates/bin/pd/Cargo.toml | 1 + crates/bin/pd/tests/network_integration.rs | 37 +++++++++++++++++++ .../compose/process-compose-smoke-test.yml | 35 +++++++++++------- 3 files changed, 59 insertions(+), 14 deletions(-) diff --git a/crates/bin/pd/Cargo.toml b/crates/bin/pd/Cargo.toml index 219ed1620f..8fa497111b 100644 --- a/crates/bin/pd/Cargo.toml +++ b/crates/bin/pd/Cargo.toml @@ -126,3 +126,4 @@ penumbra-sdk-proof-params = { workspace = true, features = [ assert_cmd = { workspace = true } predicates = "2.1" prost-reflect = "0.14.3" +regex = { workspace = true } diff --git a/crates/bin/pd/tests/network_integration.rs b/crates/bin/pd/tests/network_integration.rs index 388c19caed..691358aeb4 100644 --- a/crates/bin/pd/tests/network_integration.rs +++ b/crates/bin/pd/tests/network_integration.rs @@ -9,8 +9,45 @@ use http::StatusCode; use penumbra_sdk_proto::FILE_DESCRIPTOR_SET; use predicates::prelude::*; use prost_reflect::{DescriptorPool, ServiceDescriptor}; +use regex::Regex; use url::Url; +/// Specific patterns for spot-checking the metrics emitted by pd. +/// It's a smattering of metrics from the various components, including +/// some from outside the workspace, e.g. `cnidarium`. 
+const PD_METRICS_PATTERNS: &[&str] = &[
+    r"^cnidarium_get_raw_duration_seconds_count_seconds \d+",
+    r"^cnidarium_nonverifiable_get_raw_duration_seconds_count_seconds \d+",
+    r"^pd_async_sleep_drift_microseconds \d+",
+    r"^penumbra_funding_streams_total_processing_time_milliseconds_count_milliseconds \d+",
+    r"^penumbra_dex_path_search_duration_seconds_count_seconds \d+",
+];
+
+#[ignore]
+#[tokio::test]
+/// Confirm that prometheus metrics are being exported for scraping, by
+/// fetching `PENUMBRA_NODE_PD_METRICS_URL` (default: localhost:9000).
+/// Several times while bumping related crates we've missed a breakage to
+/// metrics, noticed only on the preview environment's grafana boards post-deploy.
+async fn confirm_metrics_emission() -> anyhow::Result<()> {
+    let metrics_url = std::env::var("PENUMBRA_NODE_PD_METRICS_URL")
+        .unwrap_or_else(|_| "http://localhost:9000/metrics".to_string());
+    let r = reqwest::Client::new().get(&metrics_url).send().await?;
+    // Check the status first, so a failed scrape isn't masked by a body-read error.
+    assert_eq!(r.status(), StatusCode::OK, "unexpected status from {}", metrics_url);
+    let body = r.text().await?;
+    // Collect every absent metric, so a single run reports all regressions at once.
+    let mut missing = Vec::new();
+    for pattern in PD_METRICS_PATTERNS {
+        // Enable multi-line support, so `^` anchors at the start of each line.
+        if !Regex::new(&format!(r"(?m){}", pattern))?.is_match(&body) {
+            missing.push(*pattern);
+        }
+    }
+    assert!(missing.is_empty(), "pd metrics missing: {:#?}", missing);
+    Ok(())
+}
+
 #[ignore]
 #[tokio::test]
 /// Confirm that permissive CORS headers are returned in HTTP responses
diff --git a/deployments/compose/process-compose-smoke-test.yml b/deployments/compose/process-compose-smoke-test.yml
index f46b3a1bc2..64ff3598ea 100644
--- a/deployments/compose/process-compose-smoke-test.yml
+++ b/deployments/compose/process-compose-smoke-test.yml
@@ -65,18 +65,12 @@ processes:
       pd:
         condition: process_healthy
 
-  # Run `pd` integration tests.
- test-pd: - command: >- - cargo test --release --package pd -- --ignored --test-threads 1 --nocapture - depends_on: - pd: - condition: process_healthy - cometbft: - condition: process_started - availability: - restart: exit_on_failure - + # The order of the integration tests is (unfortunately) important: + # + # 1. First up are the pclientd tests, because + # 2. The pcli tests assume the pclientd tests have been run first + # 3. Finally, we run the pd tests, which need work to have been performed for metrics to be emitted. + # # Run `pclientd` integration tests. test-pclientd: command: >- @@ -88,8 +82,6 @@ processes: condition: process_healthy cometbft: condition: process_started - test-pd: - condition: process_completed availability: restart: exit_on_failure @@ -109,6 +101,21 @@ processes: availability: restart: exit_on_failure + # Run `pd` integration tests. These run last, as the metrics checks + # will be empty unless actual work has been performed. + test-pd: + command: >- + cargo test --release --package pd -- --ignored --test-threads 1 --nocapture + depends_on: + pd: + condition: process_healthy + cometbft: + condition: process_started + test-pcli: + condition: process_completed + availability: + restart: exit_on_failure + # Finalizer task, which will wait until all test suites have finished. # This allows us to ensure that. summary: