From 79cb859889a1be9e31f9738d195ad2b8e52beef7 Mon Sep 17 00:00:00 2001 From: Meshiest Date: Fri, 15 Nov 2024 01:11:43 -0500 Subject: [PATCH 01/68] chore: update snarkos/snarkvm to latest canary --- Cargo.lock | 152 ++++++++++++++++---------------- Cargo.toml | 8 +- crates/aot/src/auth/auth_fee.rs | 4 +- crates/aot/src/genesis.rs | 1 + crates/aot/src/ledger/util.rs | 6 +- 5 files changed, 87 insertions(+), 84 deletions(-) diff --git a/Cargo.lock b/Cargo.lock index a99ee2d4..b417dadf 100644 --- a/Cargo.lock +++ b/Cargo.lock @@ -3299,7 +3299,7 @@ checksum = "1b6b67fb9a61334225b5b790716f609cd58395f895b3fe8b328786812a40bc3b" [[package]] name = "snarkos-account" version = "3.0.0" -source = "git+https://github.com/AleoNet/snarkOS?rev=c6de459#c6de459a31387f1700c859ce3842fed2a228a495" +source = "git+https://github.com/AleoNet/snarkOS?rev=6cce476#6cce476fcb6e056d43ec3811dfd36832641e8caf" dependencies = [ "anyhow", "colored", @@ -3353,7 +3353,7 @@ dependencies = [ [[package]] name = "snarkos-node" version = "3.0.0" -source = "git+https://github.com/AleoNet/snarkOS?rev=c6de459#c6de459a31387f1700c859ce3842fed2a228a495" +source = "git+https://github.com/AleoNet/snarkOS?rev=6cce476#6cce476fcb6e056d43ec3811dfd36832641e8caf" dependencies = [ "aleo-std", "anyhow", @@ -3361,6 +3361,7 @@ dependencies = [ "colored", "futures-util", "indexmap 2.6.0", + "lru", "num_cpus", "once_cell", "parking_lot 0.12.3", @@ -3386,7 +3387,7 @@ dependencies = [ [[package]] name = "snarkos-node-bft" version = "3.0.0" -source = "git+https://github.com/AleoNet/snarkOS?rev=c6de459#c6de459a31387f1700c859ce3842fed2a228a495" +source = "git+https://github.com/AleoNet/snarkOS?rev=6cce476#6cce476fcb6e056d43ec3811dfd36832641e8caf" dependencies = [ "aleo-std", "anyhow", @@ -3419,7 +3420,7 @@ dependencies = [ [[package]] name = "snarkos-node-bft-events" version = "3.0.0" -source = "git+https://github.com/AleoNet/snarkOS?rev=c6de459#c6de459a31387f1700c859ce3842fed2a228a495" +source = "git+https://github.com/AleoNet/snarkOS?rev=6cce476#6cce476fcb6e056d43ec3811dfd36832641e8caf" dependencies = [ "anyhow", "bytes", @@ -3436,7 +3437,7 @@ dependencies = [ [[package]] name = "snarkos-node-bft-ledger-service" version = "3.0.0" -source = "git+https://github.com/AleoNet/snarkOS?rev=c6de459#c6de459a31387f1700c859ce3842fed2a228a495" +source = "git+https://github.com/AleoNet/snarkOS?rev=6cce476#6cce476fcb6e056d43ec3811dfd36832641e8caf" dependencies = [ "async-trait", "indexmap 2.6.0", @@ -3452,7 +3453,7 @@ dependencies = [ [[package]] name = "snarkos-node-bft-storage-service" version = "3.0.0" -source = "git+https://github.com/AleoNet/snarkOS?rev=c6de459#c6de459a31387f1700c859ce3842fed2a228a495" +source = "git+https://github.com/AleoNet/snarkOS?rev=6cce476#6cce476fcb6e056d43ec3811dfd36832641e8caf" dependencies = [ "aleo-std", "anyhow", @@ -3466,7 +3467,7 @@ dependencies = [ [[package]] name = "snarkos-node-cdn" version = "3.0.0" -source = "git+https://github.com/AleoNet/snarkOS?rev=c6de459#c6de459a31387f1700c859ce3842fed2a228a495" +source = "git+https://github.com/AleoNet/snarkOS?rev=6cce476#6cce476fcb6e056d43ec3811dfd36832641e8caf" dependencies = [ "anyhow", "bincode", @@ -3485,7 +3486,7 @@ dependencies = [ [[package]] name = "snarkos-node-consensus" version = "3.0.0" -source = "git+https://github.com/AleoNet/snarkOS?rev=c6de459#c6de459a31387f1700c859ce3842fed2a228a495" +source = "git+https://github.com/AleoNet/snarkOS?rev=6cce476#6cce476fcb6e056d43ec3811dfd36832641e8caf" dependencies = [ "aleo-std", "anyhow", @@ -3507,7 +3508,7 @@ dependencies = [ 
[[package]] name = "snarkos-node-metrics" version = "3.0.0" -source = "git+https://github.com/AleoNet/snarkOS?rev=c6de459#c6de459a31387f1700c859ce3842fed2a228a495" +source = "git+https://github.com/AleoNet/snarkOS?rev=6cce476#6cce476fcb6e056d43ec3811dfd36832641e8caf" dependencies = [ "metrics-exporter-prometheus", "parking_lot 0.12.3", @@ -3520,7 +3521,7 @@ dependencies = [ [[package]] name = "snarkos-node-rest" version = "3.0.0" -source = "git+https://github.com/AleoNet/snarkOS?rev=c6de459#c6de459a31387f1700c859ce3842fed2a228a495" +source = "git+https://github.com/AleoNet/snarkOS?rev=6cce476#6cce476fcb6e056d43ec3811dfd36832641e8caf" dependencies = [ "anyhow", "axum", @@ -3548,7 +3549,7 @@ dependencies = [ [[package]] name = "snarkos-node-router" version = "3.0.0" -source = "git+https://github.com/AleoNet/snarkOS?rev=c6de459#c6de459a31387f1700c859ce3842fed2a228a495" +source = "git+https://github.com/AleoNet/snarkOS?rev=6cce476#6cce476fcb6e056d43ec3811dfd36832641e8caf" dependencies = [ "anyhow", "async-trait", @@ -3579,7 +3580,7 @@ dependencies = [ [[package]] name = "snarkos-node-router-messages" version = "3.0.0" -source = "git+https://github.com/AleoNet/snarkOS?rev=c6de459#c6de459a31387f1700c859ce3842fed2a228a495" +source = "git+https://github.com/AleoNet/snarkOS?rev=6cce476#6cce476fcb6e056d43ec3811dfd36832641e8caf" dependencies = [ "anyhow", "bytes", @@ -3597,7 +3598,7 @@ dependencies = [ [[package]] name = "snarkos-node-sync" version = "3.0.0" -source = "git+https://github.com/AleoNet/snarkOS?rev=c6de459#c6de459a31387f1700c859ce3842fed2a228a495" +source = "git+https://github.com/AleoNet/snarkOS?rev=6cce476#6cce476fcb6e056d43ec3811dfd36832641e8caf" dependencies = [ "anyhow", "indexmap 2.6.0", @@ -3610,6 +3611,7 @@ dependencies = [ "snarkos-node-router", "snarkos-node-sync-communication-service", "snarkos-node-sync-locators", + "snarkos-node-tcp", "snarkvm", "tokio", "tracing", @@ -3618,7 +3620,7 @@ dependencies = [ [[package]] name = "snarkos-node-sync-communication-service" version = "3.0.0" -source = "git+https://github.com/AleoNet/snarkOS?rev=c6de459#c6de459a31387f1700c859ce3842fed2a228a495" +source = "git+https://github.com/AleoNet/snarkOS?rev=6cce476#6cce476fcb6e056d43ec3811dfd36832641e8caf" dependencies = [ "async-trait", "tokio", @@ -3627,7 +3629,7 @@ dependencies = [ [[package]] name = "snarkos-node-sync-locators" version = "3.0.0" -source = "git+https://github.com/AleoNet/snarkOS?rev=c6de459#c6de459a31387f1700c859ce3842fed2a228a495" +source = "git+https://github.com/AleoNet/snarkOS?rev=6cce476#6cce476fcb6e056d43ec3811dfd36832641e8caf" dependencies = [ "anyhow", "indexmap 2.6.0", @@ -3639,7 +3641,7 @@ dependencies = [ [[package]] name = "snarkos-node-tcp" version = "3.0.0" -source = "git+https://github.com/AleoNet/snarkOS?rev=c6de459#c6de459a31387f1700c859ce3842fed2a228a495" +source = "git+https://github.com/AleoNet/snarkOS?rev=6cce476#6cce476fcb6e056d43ec3811dfd36832641e8caf" dependencies = [ "async-trait", "bytes", @@ -3655,7 +3657,7 @@ dependencies = [ [[package]] name = "snarkvm" version = "1.0.0" -source = "git+https://github.com/AleoNet/snarkVM?rev=4eb83d7#4eb83d7f7276514baa9c44b920155750cfe855e7" +source = "git+https://github.com/AleoNet/snarkVM?rev=0b391d2#0b391d2a0fad8c1ee49b3fc64e6a64d4b801ad2d" dependencies = [ "anstyle", "anyhow", @@ -3686,7 +3688,7 @@ dependencies = [ [[package]] name = "snarkvm-algorithms" version = "1.0.0" -source = "git+https://github.com/AleoNet/snarkVM?rev=4eb83d7#4eb83d7f7276514baa9c44b920155750cfe855e7" +source = 
"git+https://github.com/AleoNet/snarkVM?rev=0b391d2#0b391d2a0fad8c1ee49b3fc64e6a64d4b801ad2d" dependencies = [ "aleo-std", "anyhow", @@ -3717,7 +3719,7 @@ dependencies = [ [[package]] name = "snarkvm-algorithms-cuda" version = "1.0.0" -source = "git+https://github.com/AleoNet/snarkVM?rev=4eb83d7#4eb83d7f7276514baa9c44b920155750cfe855e7" +source = "git+https://github.com/AleoNet/snarkVM?rev=0b391d2#0b391d2a0fad8c1ee49b3fc64e6a64d4b801ad2d" dependencies = [ "blst", "cc", @@ -3728,7 +3730,7 @@ dependencies = [ [[package]] name = "snarkvm-circuit" version = "1.0.0" -source = "git+https://github.com/AleoNet/snarkVM?rev=4eb83d7#4eb83d7f7276514baa9c44b920155750cfe855e7" +source = "git+https://github.com/AleoNet/snarkVM?rev=0b391d2#0b391d2a0fad8c1ee49b3fc64e6a64d4b801ad2d" dependencies = [ "snarkvm-circuit-account", "snarkvm-circuit-algorithms", @@ -3742,7 +3744,7 @@ dependencies = [ [[package]] name = "snarkvm-circuit-account" version = "1.0.0" -source = "git+https://github.com/AleoNet/snarkVM?rev=4eb83d7#4eb83d7f7276514baa9c44b920155750cfe855e7" +source = "git+https://github.com/AleoNet/snarkVM?rev=0b391d2#0b391d2a0fad8c1ee49b3fc64e6a64d4b801ad2d" dependencies = [ "snarkvm-circuit-algorithms", "snarkvm-circuit-network", @@ -3753,7 +3755,7 @@ dependencies = [ [[package]] name = "snarkvm-circuit-algorithms" version = "1.0.0" -source = "git+https://github.com/AleoNet/snarkVM?rev=4eb83d7#4eb83d7f7276514baa9c44b920155750cfe855e7" +source = "git+https://github.com/AleoNet/snarkVM?rev=0b391d2#0b391d2a0fad8c1ee49b3fc64e6a64d4b801ad2d" dependencies = [ "snarkvm-circuit-types", "snarkvm-console-algorithms", @@ -3763,7 +3765,7 @@ dependencies = [ [[package]] name = "snarkvm-circuit-collections" version = "1.0.0" -source = "git+https://github.com/AleoNet/snarkVM?rev=4eb83d7#4eb83d7f7276514baa9c44b920155750cfe855e7" +source = "git+https://github.com/AleoNet/snarkVM?rev=0b391d2#0b391d2a0fad8c1ee49b3fc64e6a64d4b801ad2d" dependencies = [ "snarkvm-circuit-algorithms", "snarkvm-circuit-types", @@ -3773,7 +3775,7 @@ dependencies = [ [[package]] name = "snarkvm-circuit-environment" version = "1.0.0" -source = "git+https://github.com/AleoNet/snarkVM?rev=4eb83d7#4eb83d7f7276514baa9c44b920155750cfe855e7" +source = "git+https://github.com/AleoNet/snarkVM?rev=0b391d2#0b391d2a0fad8c1ee49b3fc64e6a64d4b801ad2d" dependencies = [ "indexmap 2.6.0", "itertools 0.11.0", @@ -3791,12 +3793,12 @@ dependencies = [ [[package]] name = "snarkvm-circuit-environment-witness" version = "1.0.0" -source = "git+https://github.com/AleoNet/snarkVM?rev=4eb83d7#4eb83d7f7276514baa9c44b920155750cfe855e7" +source = "git+https://github.com/AleoNet/snarkVM?rev=0b391d2#0b391d2a0fad8c1ee49b3fc64e6a64d4b801ad2d" [[package]] name = "snarkvm-circuit-network" version = "1.0.0" -source = "git+https://github.com/AleoNet/snarkVM?rev=4eb83d7#4eb83d7f7276514baa9c44b920155750cfe855e7" +source = "git+https://github.com/AleoNet/snarkVM?rev=0b391d2#0b391d2a0fad8c1ee49b3fc64e6a64d4b801ad2d" dependencies = [ "snarkvm-circuit-algorithms", "snarkvm-circuit-collections", @@ -3807,7 +3809,7 @@ dependencies = [ [[package]] name = "snarkvm-circuit-program" version = "1.0.0" -source = "git+https://github.com/AleoNet/snarkVM?rev=4eb83d7#4eb83d7f7276514baa9c44b920155750cfe855e7" +source = "git+https://github.com/AleoNet/snarkVM?rev=0b391d2#0b391d2a0fad8c1ee49b3fc64e6a64d4b801ad2d" dependencies = [ "paste", "snarkvm-circuit-account", @@ -3822,7 +3824,7 @@ dependencies = [ [[package]] name = "snarkvm-circuit-types" version = "1.0.0" -source = 
"git+https://github.com/AleoNet/snarkVM?rev=4eb83d7#4eb83d7f7276514baa9c44b920155750cfe855e7" +source = "git+https://github.com/AleoNet/snarkVM?rev=0b391d2#0b391d2a0fad8c1ee49b3fc64e6a64d4b801ad2d" dependencies = [ "snarkvm-circuit-environment", "snarkvm-circuit-types-address", @@ -3837,7 +3839,7 @@ dependencies = [ [[package]] name = "snarkvm-circuit-types-address" version = "1.0.0" -source = "git+https://github.com/AleoNet/snarkVM?rev=4eb83d7#4eb83d7f7276514baa9c44b920155750cfe855e7" +source = "git+https://github.com/AleoNet/snarkVM?rev=0b391d2#0b391d2a0fad8c1ee49b3fc64e6a64d4b801ad2d" dependencies = [ "snarkvm-circuit-environment", "snarkvm-circuit-types-boolean", @@ -3850,7 +3852,7 @@ dependencies = [ [[package]] name = "snarkvm-circuit-types-boolean" version = "1.0.0" -source = "git+https://github.com/AleoNet/snarkVM?rev=4eb83d7#4eb83d7f7276514baa9c44b920155750cfe855e7" +source = "git+https://github.com/AleoNet/snarkVM?rev=0b391d2#0b391d2a0fad8c1ee49b3fc64e6a64d4b801ad2d" dependencies = [ "snarkvm-circuit-environment", "snarkvm-console-types-boolean", @@ -3859,7 +3861,7 @@ dependencies = [ [[package]] name = "snarkvm-circuit-types-field" version = "1.0.0" -source = "git+https://github.com/AleoNet/snarkVM?rev=4eb83d7#4eb83d7f7276514baa9c44b920155750cfe855e7" +source = "git+https://github.com/AleoNet/snarkVM?rev=0b391d2#0b391d2a0fad8c1ee49b3fc64e6a64d4b801ad2d" dependencies = [ "snarkvm-circuit-environment", "snarkvm-circuit-types-boolean", @@ -3869,7 +3871,7 @@ dependencies = [ [[package]] name = "snarkvm-circuit-types-group" version = "1.0.0" -source = "git+https://github.com/AleoNet/snarkVM?rev=4eb83d7#4eb83d7f7276514baa9c44b920155750cfe855e7" +source = "git+https://github.com/AleoNet/snarkVM?rev=0b391d2#0b391d2a0fad8c1ee49b3fc64e6a64d4b801ad2d" dependencies = [ "snarkvm-circuit-environment", "snarkvm-circuit-types-boolean", @@ -3881,7 +3883,7 @@ dependencies = [ [[package]] name = "snarkvm-circuit-types-integers" version = "1.0.0" -source = "git+https://github.com/AleoNet/snarkVM?rev=4eb83d7#4eb83d7f7276514baa9c44b920155750cfe855e7" +source = "git+https://github.com/AleoNet/snarkVM?rev=0b391d2#0b391d2a0fad8c1ee49b3fc64e6a64d4b801ad2d" dependencies = [ "snarkvm-circuit-environment", "snarkvm-circuit-types-boolean", @@ -3893,7 +3895,7 @@ dependencies = [ [[package]] name = "snarkvm-circuit-types-scalar" version = "1.0.0" -source = "git+https://github.com/AleoNet/snarkVM?rev=4eb83d7#4eb83d7f7276514baa9c44b920155750cfe855e7" +source = "git+https://github.com/AleoNet/snarkVM?rev=0b391d2#0b391d2a0fad8c1ee49b3fc64e6a64d4b801ad2d" dependencies = [ "snarkvm-circuit-environment", "snarkvm-circuit-types-boolean", @@ -3904,7 +3906,7 @@ dependencies = [ [[package]] name = "snarkvm-circuit-types-string" version = "1.0.0" -source = "git+https://github.com/AleoNet/snarkVM?rev=4eb83d7#4eb83d7f7276514baa9c44b920155750cfe855e7" +source = "git+https://github.com/AleoNet/snarkVM?rev=0b391d2#0b391d2a0fad8c1ee49b3fc64e6a64d4b801ad2d" dependencies = [ "snarkvm-circuit-environment", "snarkvm-circuit-types-boolean", @@ -3916,7 +3918,7 @@ dependencies = [ [[package]] name = "snarkvm-console" version = "1.0.0" -source = "git+https://github.com/AleoNet/snarkVM?rev=4eb83d7#4eb83d7f7276514baa9c44b920155750cfe855e7" +source = "git+https://github.com/AleoNet/snarkVM?rev=0b391d2#0b391d2a0fad8c1ee49b3fc64e6a64d4b801ad2d" dependencies = [ "snarkvm-console-account", "snarkvm-console-algorithms", @@ -3929,7 +3931,7 @@ dependencies = [ [[package]] name = "snarkvm-console-account" version = "1.0.0" -source = 
"git+https://github.com/AleoNet/snarkVM?rev=4eb83d7#4eb83d7f7276514baa9c44b920155750cfe855e7" +source = "git+https://github.com/AleoNet/snarkVM?rev=0b391d2#0b391d2a0fad8c1ee49b3fc64e6a64d4b801ad2d" dependencies = [ "bs58", "snarkvm-console-network", @@ -3940,7 +3942,7 @@ dependencies = [ [[package]] name = "snarkvm-console-algorithms" version = "1.0.0" -source = "git+https://github.com/AleoNet/snarkVM?rev=4eb83d7#4eb83d7f7276514baa9c44b920155750cfe855e7" +source = "git+https://github.com/AleoNet/snarkVM?rev=0b391d2#0b391d2a0fad8c1ee49b3fc64e6a64d4b801ad2d" dependencies = [ "blake2s_simd", "smallvec", @@ -3953,7 +3955,7 @@ dependencies = [ [[package]] name = "snarkvm-console-collections" version = "1.0.0" -source = "git+https://github.com/AleoNet/snarkVM?rev=4eb83d7#4eb83d7f7276514baa9c44b920155750cfe855e7" +source = "git+https://github.com/AleoNet/snarkVM?rev=0b391d2#0b391d2a0fad8c1ee49b3fc64e6a64d4b801ad2d" dependencies = [ "aleo-std", "rayon", @@ -3964,7 +3966,7 @@ dependencies = [ [[package]] name = "snarkvm-console-network" version = "1.0.0" -source = "git+https://github.com/AleoNet/snarkVM?rev=4eb83d7#4eb83d7f7276514baa9c44b920155750cfe855e7" +source = "git+https://github.com/AleoNet/snarkVM?rev=0b391d2#0b391d2a0fad8c1ee49b3fc64e6a64d4b801ad2d" dependencies = [ "anyhow", "indexmap 2.6.0", @@ -3987,7 +3989,7 @@ dependencies = [ [[package]] name = "snarkvm-console-network-environment" version = "1.0.0" -source = "git+https://github.com/AleoNet/snarkVM?rev=4eb83d7#4eb83d7f7276514baa9c44b920155750cfe855e7" +source = "git+https://github.com/AleoNet/snarkVM?rev=0b391d2#0b391d2a0fad8c1ee49b3fc64e6a64d4b801ad2d" dependencies = [ "anyhow", "bech32", @@ -4005,7 +4007,7 @@ dependencies = [ [[package]] name = "snarkvm-console-program" version = "1.0.0" -source = "git+https://github.com/AleoNet/snarkVM?rev=4eb83d7#4eb83d7f7276514baa9c44b920155750cfe855e7" +source = "git+https://github.com/AleoNet/snarkVM?rev=0b391d2#0b391d2a0fad8c1ee49b3fc64e6a64d4b801ad2d" dependencies = [ "enum-iterator", "enum_index", @@ -4027,7 +4029,7 @@ dependencies = [ [[package]] name = "snarkvm-console-types" version = "1.0.0" -source = "git+https://github.com/AleoNet/snarkVM?rev=4eb83d7#4eb83d7f7276514baa9c44b920155750cfe855e7" +source = "git+https://github.com/AleoNet/snarkVM?rev=0b391d2#0b391d2a0fad8c1ee49b3fc64e6a64d4b801ad2d" dependencies = [ "snarkvm-console-network-environment", "snarkvm-console-types-address", @@ -4042,7 +4044,7 @@ dependencies = [ [[package]] name = "snarkvm-console-types-address" version = "1.0.0" -source = "git+https://github.com/AleoNet/snarkVM?rev=4eb83d7#4eb83d7f7276514baa9c44b920155750cfe855e7" +source = "git+https://github.com/AleoNet/snarkVM?rev=0b391d2#0b391d2a0fad8c1ee49b3fc64e6a64d4b801ad2d" dependencies = [ "snarkvm-console-network-environment", "snarkvm-console-types-boolean", @@ -4053,7 +4055,7 @@ dependencies = [ [[package]] name = "snarkvm-console-types-boolean" version = "1.0.0" -source = "git+https://github.com/AleoNet/snarkVM?rev=4eb83d7#4eb83d7f7276514baa9c44b920155750cfe855e7" +source = "git+https://github.com/AleoNet/snarkVM?rev=0b391d2#0b391d2a0fad8c1ee49b3fc64e6a64d4b801ad2d" dependencies = [ "snarkvm-console-network-environment", ] @@ -4061,7 +4063,7 @@ dependencies = [ [[package]] name = "snarkvm-console-types-field" version = "1.0.0" -source = "git+https://github.com/AleoNet/snarkVM?rev=4eb83d7#4eb83d7f7276514baa9c44b920155750cfe855e7" +source = "git+https://github.com/AleoNet/snarkVM?rev=0b391d2#0b391d2a0fad8c1ee49b3fc64e6a64d4b801ad2d" dependencies = [ 
"snarkvm-console-network-environment", "snarkvm-console-types-boolean", @@ -4071,7 +4073,7 @@ dependencies = [ [[package]] name = "snarkvm-console-types-group" version = "1.0.0" -source = "git+https://github.com/AleoNet/snarkVM?rev=4eb83d7#4eb83d7f7276514baa9c44b920155750cfe855e7" +source = "git+https://github.com/AleoNet/snarkVM?rev=0b391d2#0b391d2a0fad8c1ee49b3fc64e6a64d4b801ad2d" dependencies = [ "snarkvm-console-network-environment", "snarkvm-console-types-boolean", @@ -4082,7 +4084,7 @@ dependencies = [ [[package]] name = "snarkvm-console-types-integers" version = "1.0.0" -source = "git+https://github.com/AleoNet/snarkVM?rev=4eb83d7#4eb83d7f7276514baa9c44b920155750cfe855e7" +source = "git+https://github.com/AleoNet/snarkVM?rev=0b391d2#0b391d2a0fad8c1ee49b3fc64e6a64d4b801ad2d" dependencies = [ "snarkvm-console-network-environment", "snarkvm-console-types-boolean", @@ -4093,7 +4095,7 @@ dependencies = [ [[package]] name = "snarkvm-console-types-scalar" version = "1.0.0" -source = "git+https://github.com/AleoNet/snarkVM?rev=4eb83d7#4eb83d7f7276514baa9c44b920155750cfe855e7" +source = "git+https://github.com/AleoNet/snarkVM?rev=0b391d2#0b391d2a0fad8c1ee49b3fc64e6a64d4b801ad2d" dependencies = [ "snarkvm-console-network-environment", "snarkvm-console-types-boolean", @@ -4104,7 +4106,7 @@ dependencies = [ [[package]] name = "snarkvm-console-types-string" version = "1.0.0" -source = "git+https://github.com/AleoNet/snarkVM?rev=4eb83d7#4eb83d7f7276514baa9c44b920155750cfe855e7" +source = "git+https://github.com/AleoNet/snarkVM?rev=0b391d2#0b391d2a0fad8c1ee49b3fc64e6a64d4b801ad2d" dependencies = [ "snarkvm-console-network-environment", "snarkvm-console-types-boolean", @@ -4115,7 +4117,7 @@ dependencies = [ [[package]] name = "snarkvm-curves" version = "1.0.0" -source = "git+https://github.com/AleoNet/snarkVM?rev=4eb83d7#4eb83d7f7276514baa9c44b920155750cfe855e7" +source = "git+https://github.com/AleoNet/snarkVM?rev=0b391d2#0b391d2a0fad8c1ee49b3fc64e6a64d4b801ad2d" dependencies = [ "rand", "rayon", @@ -4129,7 +4131,7 @@ dependencies = [ [[package]] name = "snarkvm-fields" version = "1.0.0" -source = "git+https://github.com/AleoNet/snarkVM?rev=4eb83d7#4eb83d7f7276514baa9c44b920155750cfe855e7" +source = "git+https://github.com/AleoNet/snarkVM?rev=0b391d2#0b391d2a0fad8c1ee49b3fc64e6a64d4b801ad2d" dependencies = [ "aleo-std", "anyhow", @@ -4146,7 +4148,7 @@ dependencies = [ [[package]] name = "snarkvm-ledger" version = "1.0.0" -source = "git+https://github.com/AleoNet/snarkVM?rev=4eb83d7#4eb83d7f7276514baa9c44b920155750cfe855e7" +source = "git+https://github.com/AleoNet/snarkVM?rev=0b391d2#0b391d2a0fad8c1ee49b3fc64e6a64d4b801ad2d" dependencies = [ "aleo-std", "anyhow", @@ -4170,7 +4172,7 @@ dependencies = [ [[package]] name = "snarkvm-ledger-authority" version = "1.0.0" -source = "git+https://github.com/AleoNet/snarkVM?rev=4eb83d7#4eb83d7f7276514baa9c44b920155750cfe855e7" +source = "git+https://github.com/AleoNet/snarkVM?rev=0b391d2#0b391d2a0fad8c1ee49b3fc64e6a64d4b801ad2d" dependencies = [ "anyhow", "rand", @@ -4182,7 +4184,7 @@ dependencies = [ [[package]] name = "snarkvm-ledger-block" version = "1.0.0" -source = "git+https://github.com/AleoNet/snarkVM?rev=4eb83d7#4eb83d7f7276514baa9c44b920155750cfe855e7" +source = "git+https://github.com/AleoNet/snarkVM?rev=0b391d2#0b391d2a0fad8c1ee49b3fc64e6a64d4b801ad2d" dependencies = [ "indexmap 2.6.0", "rayon", @@ -4202,7 +4204,7 @@ dependencies = [ [[package]] name = "snarkvm-ledger-committee" version = "1.0.0" -source = 
"git+https://github.com/AleoNet/snarkVM?rev=4eb83d7#4eb83d7f7276514baa9c44b920155750cfe855e7" +source = "git+https://github.com/AleoNet/snarkVM?rev=0b391d2#0b391d2a0fad8c1ee49b3fc64e6a64d4b801ad2d" dependencies = [ "indexmap 2.6.0", "rayon", @@ -4215,7 +4217,7 @@ dependencies = [ [[package]] name = "snarkvm-ledger-narwhal" version = "1.0.0" -source = "git+https://github.com/AleoNet/snarkVM?rev=4eb83d7#4eb83d7f7276514baa9c44b920155750cfe855e7" +source = "git+https://github.com/AleoNet/snarkVM?rev=0b391d2#0b391d2a0fad8c1ee49b3fc64e6a64d4b801ad2d" dependencies = [ "snarkvm-ledger-narwhal-batch-certificate", "snarkvm-ledger-narwhal-batch-header", @@ -4228,7 +4230,7 @@ dependencies = [ [[package]] name = "snarkvm-ledger-narwhal-batch-certificate" version = "1.0.0" -source = "git+https://github.com/AleoNet/snarkVM?rev=4eb83d7#4eb83d7f7276514baa9c44b920155750cfe855e7" +source = "git+https://github.com/AleoNet/snarkVM?rev=0b391d2#0b391d2a0fad8c1ee49b3fc64e6a64d4b801ad2d" dependencies = [ "indexmap 2.6.0", "rayon", @@ -4241,7 +4243,7 @@ dependencies = [ [[package]] name = "snarkvm-ledger-narwhal-batch-header" version = "1.0.0" -source = "git+https://github.com/AleoNet/snarkVM?rev=4eb83d7#4eb83d7f7276514baa9c44b920155750cfe855e7" +source = "git+https://github.com/AleoNet/snarkVM?rev=0b391d2#0b391d2a0fad8c1ee49b3fc64e6a64d4b801ad2d" dependencies = [ "indexmap 2.6.0", "rayon", @@ -4253,7 +4255,7 @@ dependencies = [ [[package]] name = "snarkvm-ledger-narwhal-data" version = "1.0.0" -source = "git+https://github.com/AleoNet/snarkVM?rev=4eb83d7#4eb83d7f7276514baa9c44b920155750cfe855e7" +source = "git+https://github.com/AleoNet/snarkVM?rev=0b391d2#0b391d2a0fad8c1ee49b3fc64e6a64d4b801ad2d" dependencies = [ "bytes", "serde_json", @@ -4264,7 +4266,7 @@ dependencies = [ [[package]] name = "snarkvm-ledger-narwhal-subdag" version = "1.0.0" -source = "git+https://github.com/AleoNet/snarkVM?rev=4eb83d7#4eb83d7f7276514baa9c44b920155750cfe855e7" +source = "git+https://github.com/AleoNet/snarkVM?rev=0b391d2#0b391d2a0fad8c1ee49b3fc64e6a64d4b801ad2d" dependencies = [ "indexmap 2.6.0", "rayon", @@ -4279,7 +4281,7 @@ dependencies = [ [[package]] name = "snarkvm-ledger-narwhal-transmission" version = "1.0.0" -source = "git+https://github.com/AleoNet/snarkVM?rev=4eb83d7#4eb83d7f7276514baa9c44b920155750cfe855e7" +source = "git+https://github.com/AleoNet/snarkVM?rev=0b391d2#0b391d2a0fad8c1ee49b3fc64e6a64d4b801ad2d" dependencies = [ "bytes", "serde_json", @@ -4292,7 +4294,7 @@ dependencies = [ [[package]] name = "snarkvm-ledger-narwhal-transmission-id" version = "1.0.0" -source = "git+https://github.com/AleoNet/snarkVM?rev=4eb83d7#4eb83d7f7276514baa9c44b920155750cfe855e7" +source = "git+https://github.com/AleoNet/snarkVM?rev=0b391d2#0b391d2a0fad8c1ee49b3fc64e6a64d4b801ad2d" dependencies = [ "snarkvm-console", "snarkvm-ledger-puzzle", @@ -4301,7 +4303,7 @@ dependencies = [ [[package]] name = "snarkvm-ledger-puzzle" version = "1.0.0" -source = "git+https://github.com/AleoNet/snarkVM?rev=4eb83d7#4eb83d7f7276514baa9c44b920155750cfe855e7" +source = "git+https://github.com/AleoNet/snarkVM?rev=0b391d2#0b391d2a0fad8c1ee49b3fc64e6a64d4b801ad2d" dependencies = [ "aleo-std", "anyhow", @@ -4321,7 +4323,7 @@ dependencies = [ [[package]] name = "snarkvm-ledger-puzzle-epoch" version = "1.0.0" -source = "git+https://github.com/AleoNet/snarkVM?rev=4eb83d7#4eb83d7f7276514baa9c44b920155750cfe855e7" +source = "git+https://github.com/AleoNet/snarkVM?rev=0b391d2#0b391d2a0fad8c1ee49b3fc64e6a64d4b801ad2d" dependencies = [ "aleo-std", "anyhow", @@ 
-4342,7 +4344,7 @@ dependencies = [ [[package]] name = "snarkvm-ledger-query" version = "1.0.0" -source = "git+https://github.com/AleoNet/snarkVM?rev=4eb83d7#4eb83d7f7276514baa9c44b920155750cfe855e7" +source = "git+https://github.com/AleoNet/snarkVM?rev=0b391d2#0b391d2a0fad8c1ee49b3fc64e6a64d4b801ad2d" dependencies = [ "async-trait", "reqwest 0.11.27", @@ -4355,7 +4357,7 @@ dependencies = [ [[package]] name = "snarkvm-ledger-store" version = "1.0.0" -source = "git+https://github.com/AleoNet/snarkVM?rev=4eb83d7#4eb83d7f7276514baa9c44b920155750cfe855e7" +source = "git+https://github.com/AleoNet/snarkVM?rev=0b391d2#0b391d2a0fad8c1ee49b3fc64e6a64d4b801ad2d" dependencies = [ "aleo-std-storage", "anyhow", @@ -4382,7 +4384,7 @@ dependencies = [ [[package]] name = "snarkvm-metrics" version = "1.0.0" -source = "git+https://github.com/AleoNet/snarkVM?rev=4eb83d7#4eb83d7f7276514baa9c44b920155750cfe855e7" +source = "git+https://github.com/AleoNet/snarkVM?rev=0b391d2#0b391d2a0fad8c1ee49b3fc64e6a64d4b801ad2d" dependencies = [ "metrics", "metrics-exporter-prometheus", @@ -4391,7 +4393,7 @@ dependencies = [ [[package]] name = "snarkvm-parameters" version = "1.0.0" -source = "git+https://github.com/AleoNet/snarkVM?rev=4eb83d7#4eb83d7f7276514baa9c44b920155750cfe855e7" +source = "git+https://github.com/AleoNet/snarkVM?rev=0b391d2#0b391d2a0fad8c1ee49b3fc64e6a64d4b801ad2d" dependencies = [ "aleo-std", "anyhow", @@ -4416,7 +4418,7 @@ dependencies = [ [[package]] name = "snarkvm-synthesizer" version = "1.0.0" -source = "git+https://github.com/AleoNet/snarkVM?rev=4eb83d7#4eb83d7f7276514baa9c44b920155750cfe855e7" +source = "git+https://github.com/AleoNet/snarkVM?rev=0b391d2#0b391d2a0fad8c1ee49b3fc64e6a64d4b801ad2d" dependencies = [ "aleo-std", "anyhow", @@ -4447,7 +4449,7 @@ dependencies = [ [[package]] name = "snarkvm-synthesizer-process" version = "1.0.0" -source = "git+https://github.com/AleoNet/snarkVM?rev=4eb83d7#4eb83d7f7276514baa9c44b920155750cfe855e7" +source = "git+https://github.com/AleoNet/snarkVM?rev=0b391d2#0b391d2a0fad8c1ee49b3fc64e6a64d4b801ad2d" dependencies = [ "aleo-std", "colored", @@ -4473,7 +4475,7 @@ dependencies = [ [[package]] name = "snarkvm-synthesizer-program" version = "1.0.0" -source = "git+https://github.com/AleoNet/snarkVM?rev=4eb83d7#4eb83d7f7276514baa9c44b920155750cfe855e7" +source = "git+https://github.com/AleoNet/snarkVM?rev=0b391d2#0b391d2a0fad8c1ee49b3fc64e6a64d4b801ad2d" dependencies = [ "indexmap 2.6.0", "paste", @@ -4487,7 +4489,7 @@ dependencies = [ [[package]] name = "snarkvm-synthesizer-snark" version = "1.0.0" -source = "git+https://github.com/AleoNet/snarkVM?rev=4eb83d7#4eb83d7f7276514baa9c44b920155750cfe855e7" +source = "git+https://github.com/AleoNet/snarkVM?rev=0b391d2#0b391d2a0fad8c1ee49b3fc64e6a64d4b801ad2d" dependencies = [ "bincode", "once_cell", @@ -4500,7 +4502,7 @@ dependencies = [ [[package]] name = "snarkvm-utilities" version = "1.0.0" -source = "git+https://github.com/AleoNet/snarkVM?rev=4eb83d7#4eb83d7f7276514baa9c44b920155750cfe855e7" +source = "git+https://github.com/AleoNet/snarkVM?rev=0b391d2#0b391d2a0fad8c1ee49b3fc64e6a64d4b801ad2d" dependencies = [ "aleo-std", "anyhow", @@ -4521,7 +4523,7 @@ dependencies = [ [[package]] name = "snarkvm-utilities-derives" version = "1.0.0" -source = "git+https://github.com/AleoNet/snarkVM?rev=4eb83d7#4eb83d7f7276514baa9c44b920155750cfe855e7" +source = "git+https://github.com/AleoNet/snarkVM?rev=0b391d2#0b391d2a0fad8c1ee49b3fc64e6a64d4b801ad2d" dependencies = [ "proc-macro2", "quote 1.0.37", diff --git a/Cargo.toml 
b/Cargo.toml index b8a30609..43ca943d 100644 --- a/Cargo.toml +++ b/Cargo.toml @@ -129,9 +129,9 @@ snops-common = { path = "./crates/common" } # snarkos-node-metrics = { version = "3.0" } # snarkvm = { version = "1.0", features = ["rocks"] } -snarkos-account = { git = "https://github.com/AleoNet/snarkOS", rev = "c6de459" } -snarkos-node = { git = "https://github.com/AleoNet/snarkOS", rev = "c6de459" } -snarkos-node-metrics = { git = "https://github.com/AleoNet/snarkOS", rev = "c6de459" } -snarkvm = { git = "https://github.com/AleoNet/snarkVM", rev = "4eb83d7", default-features = false, features = [ +snarkos-account = { git = "https://github.com/AleoNet/snarkOS", rev = "6cce476" } +snarkos-node = { git = "https://github.com/AleoNet/snarkOS", rev = "6cce476" } +snarkos-node-metrics = { git = "https://github.com/AleoNet/snarkOS", rev = "6cce476" } +snarkvm = { git = "https://github.com/AleoNet/snarkVM", rev = "0b391d2", default-features = false, features = [ "rocks", ] } diff --git a/crates/aot/src/auth/auth_fee.rs b/crates/aot/src/auth/auth_fee.rs index 8e01255c..794c3c2e 100644 --- a/crates/aot/src/auth/auth_fee.rs +++ b/crates/aot/src/auth/auth_fee.rs @@ -6,7 +6,7 @@ use snarkvm::{ ledger::Deployment, prelude::Field, synthesizer::{ - process::{cost_in_microcredits, deployment_cost}, + process::{cost_in_microcredits_v2, deployment_cost}, Process, }, utilities::ToBytes, @@ -181,7 +181,7 @@ pub fn estimate_cost(process: &Process, func: &Authorization) // Retrieve the function name, program id, and program. let function_name = *transition.function_name(); let stack = process.get_stack(transition.program_id())?; - let cost = cost_in_microcredits(&stack, &function_name)?; + let cost = cost_in_microcredits_v2(&stack, &function_name)?; // Accumulate the finalize cost. 
if let Some(cost) = finalize_cost.checked_add(cost) { diff --git a/crates/aot/src/genesis.rs b/crates/aot/src/genesis.rs index 065e6469..bd4b5c4b 100644 --- a/crates/aot/src/genesis.rs +++ b/crates/aot/src/genesis.rs @@ -158,6 +158,7 @@ pub fn genesis_quorum( let (ratifications, transactions, aborted_transaction_ids, ratified_finalize_operations) = vm .speculate( state, + 0, None, ratifications, &solutions, diff --git a/crates/aot/src/ledger/util.rs b/crates/aot/src/ledger/util.rs index bfb535d5..1529d725 100644 --- a/crates/aot/src/ledger/util.rs +++ b/crates/aot/src/ledger/util.rs @@ -10,7 +10,7 @@ use snarkvm::{ types::{Address, Field, U64}, }, ledger::{query::Query, store::ConsensusStorage, Block, Execution, Fee, Ledger, Transaction}, - prelude::{execution_cost, Network}, + prelude::{execution_cost_v2, Network}, synthesizer::VM, }; @@ -90,7 +90,7 @@ pub fn public_transaction, A: Aleo(vm, &private_key_fee, min_fee, execution.to_execution_id()?)?; @@ -170,7 +170,7 @@ pub fn _make_transaction_proof_private, A: Al )?; // compute fee for the execution - let (min_fee, _) = execution_cost(&vm.process().read(), &execution)?; + let (min_fee, _) = execution_cost_v2(&vm.process().read(), &execution)?; // proof for the fee, authorizing the execution let fee = From cc1f188eabec905b5cc6da2d21ed65dfa1576d16 Mon Sep 17 00:00:00 2001 From: Meshiest Date: Fri, 15 Nov 2024 02:26:17 -0500 Subject: [PATCH 02/68] feat(agent): WIP offline node resurrection --- Cargo.lock | 1 + crates/agent/Cargo.toml | 1 + crates/agent/src/db.rs | 74 ++++++- crates/agent/src/main.rs | 25 ++- crates/agent/src/rpc/control.rs | 29 ++- crates/agent/src/state.rs | 8 +- crates/common/src/api.rs | 196 +++++++++++++++++++ crates/common/src/format/impl_collections.rs | 31 +++ crates/common/src/format/mod.rs | 1 + crates/common/src/state/agent_state.rs | 46 +++++ crates/common/src/state/agent_status.rs | 42 ++++ 11 files changed, 441 insertions(+), 13 deletions(-) diff --git a/Cargo.lock b/Cargo.lock index b417dadf..d0226c84 100644 --- a/Cargo.lock +++ b/Cargo.lock @@ -4581,6 +4581,7 @@ dependencies = [ "anyhow", "axum", "bincode", + "bytes", "chrono", "clap", "dashmap 6.1.0", diff --git a/crates/agent/Cargo.toml b/crates/agent/Cargo.toml index ccab7017..5cfb670d 100644 --- a/crates/agent/Cargo.toml +++ b/crates/agent/Cargo.toml @@ -15,6 +15,7 @@ mangen = ["snops-common/mangen"] anyhow.workspace = true axum = { workspace = true, features = ["http2", "json", "tokio", "ws"] } bincode.workspace = true +bytes.workspace = true chrono.workspace = true clap.workspace = true dashmap.workspace = true diff --git a/crates/agent/src/db.rs b/crates/agent/src/db.rs index bfbb3952..0aa46ca4 100644 --- a/crates/agent/src/db.rs +++ b/crates/agent/src/db.rs @@ -4,10 +4,14 @@ use std::{ sync::Mutex, }; +use bytes::Buf; use snops_common::{ + api::EnvInfo, db::{error::DatabaseError, tree::DbTree, Database as DatabaseTrait}, - format::{DataFormat, DataReadError, DataWriteError}, + format::{self, read_dataformat, DataFormat, DataReadError, DataWriteError}, + state::{AgentState, EnvId}, }; +use url::Url; #[derive(Debug, Clone, Copy, Hash, PartialEq, Eq)] #[repr(u8)] @@ -16,6 +20,12 @@ pub enum AgentDbString { Jwt, /// Process ID of node. Used to keep track of zombie node processes. NodePid, + // Url to Loki instance, configured by the endpoint. + LokiUrl, + /// Current state of the agent. + AgentState, + /// Latest stored environment info. 
+ EnvInfo, } impl DataFormat for AgentDbString { @@ -34,6 +44,9 @@ impl DataFormat for AgentDbString { Ok(match u8::read_data(reader, &())? { 0 => Self::Jwt, 1 => Self::NodePid, + 2 => Self::LokiUrl, + 3 => Self::AgentState, + 4 => Self::EnvInfo, _ => return Err(DataReadError::custom("invalid agent DB string type")), }) } @@ -49,18 +62,21 @@ pub struct Database { pub jwt_mutex: Mutex>, pub strings: DbTree, + pub documents: DbTree, } impl DatabaseTrait for Database { fn open(path: &Path) -> Result { let db = sled::open(path)?; let strings = DbTree::new(db.open_tree(b"v1/strings")?); + let documents = DbTree::new(db.open_tree(b"v1/documents")?); let jwt_mutex = Mutex::new(strings.restore(&AgentDbString::Jwt)?); Ok(Self { db, jwt_mutex, strings, + documents, }) } } @@ -77,4 +93,60 @@ impl Database { *lock = jwt; Ok(()) } + + pub fn set_loki_url(&self, url: Option) -> Result<(), DatabaseError> { + self.strings + .save_option(&AgentDbString::LokiUrl, url.as_ref()) + } + + pub fn loki_url(&self) -> Option { + self.strings + .restore(&AgentDbString::LokiUrl) + .ok()? + .and_then(|url| url.parse::().ok()) + } + + pub fn env_info(&self) -> Result, DatabaseError> { + self.documents + .restore(&AgentDbString::EnvInfo)? + .map(|format::BinaryData(bytes)| read_dataformat(&mut bytes.reader())) + .transpose() + .map_err(DatabaseError::from) + } + + pub fn set_env_info(&self, info: Option<&(EnvId, EnvInfo)>) -> Result<(), DatabaseError> { + if let Some(info) = info { + self.documents.save( + &AgentDbString::EnvInfo, + &format::BinaryData(info.to_byte_vec()?), + ) + } else { + self.documents.delete(&AgentDbString::EnvInfo).map(|_| ()) + } + } + + pub fn agent_state(&self) -> Result { + Ok( + if let Some(format::BinaryData(bytes)) = + self.documents.restore(&AgentDbString::AgentState)? + { + read_dataformat(&mut bytes.reader())? 
+ } else { + AgentState::default() + }, + ) + } + + pub fn set_agent_state(&self, state: Option<&AgentState>) -> Result<(), DatabaseError> { + if let Some(state) = state { + self.documents.save( + &AgentDbString::AgentState, + &format::BinaryData(state.to_byte_vec()?), + ) + } else { + self.documents + .delete(&AgentDbString::AgentState) + .map(|_| ()) + } + } } diff --git a/crates/agent/src/main.rs b/crates/agent/src/main.rs index 2b1b3cae..88cbe5f0 100644 --- a/crates/agent/src/main.rs +++ b/crates/agent/src/main.rs @@ -35,6 +35,7 @@ use tarpc::server::Channel; use tokio::{ select, signal::unix::{signal, Signal, SignalKind}, + sync::RwLock, }; use tokio_tungstenite::{ connect_async, @@ -152,16 +153,27 @@ async fn main() { // create the client state let state = Arc::new(GlobalState { client, - db: OpaqueDebug(db), _started: Instant::now(), connected: Mutex::new(Instant::now()), external_addr, internal_addrs, cli: args, endpoint, - loki: Default::default(), - env_info: Default::default(), - agent_state: Default::default(), + loki: Mutex::new(db.loki_url()), + env_info: RwLock::new( + db.env_info() + .inspect_err(|e| { + error!("failed to load env info from db: {e}"); + }) + .unwrap_or_default(), + ), + agent_state: RwLock::new( + db.agent_state() + .inspect_err(|e| { + error!("failed to load agent state from db: {e}"); + }) + .unwrap_or_default(), + ), reconcilation_handle: Default::default(), child: Default::default(), resolved_addrs: Default::default(), @@ -171,6 +183,7 @@ async fn main() { transfers, node_client: Default::default(), log_level_handler: reload_handler, + db: OpaqueDebug(db), }); // start the metrics watcher @@ -211,6 +224,10 @@ async fn main() { // invalidate env info cache state.env_info.write().await.take(); + state + .db + .set_env_info(None) + .expect("failed to clear env info"); // attach JWT if we have one if let Some(jwt) = state.db.jwt() { diff --git a/crates/agent/src/rpc/control.rs b/crates/agent/src/rpc/control.rs index 19f9f5a6..3d366f93 100644 --- a/crates/agent/src/rpc/control.rs +++ b/crates/agent/src/rpc/control.rs @@ -69,12 +69,23 @@ impl AgentService for AgentRpcServer { } // store loki server URL - if let Some(loki) = handshake.loki.and_then(|l| l.parse::().ok()) { - self.state - .loki - .lock() - .expect("failed to acquire loki URL lock") - .replace(loki); + let loki_url = handshake.loki.and_then(|l| l.parse::().ok()); + + if let Err(e) = self + .state + .db + .set_loki_url(loki_url.as_ref().map(|u| u.to_string())) + { + error!("failed to save loki URL to db: {e}"); + } + + match self.state.loki.lock() { + Ok(mut guard) => { + *guard = loki_url; + } + Err(e) => { + error!("failed to acquire loki URL lock: {e}"); + } } // emit the transfer statuses @@ -191,6 +202,9 @@ impl AgentService for AgentRpcServer { AgentState::Inventory => { // wipe the env info cache. 
don't want to have stale storage info state.env_info.write().await.take(); + if let Err(e) = state.db.set_env_info(None) { + error!("failed to clear env info from db: {e}"); + } } // start snarkOS node when node @@ -364,6 +378,9 @@ impl AgentService for AgentRpcServer { // After completing the reconcilation, update the agent state let mut agent_state = state.agent_state.write().await; + if let Err(e) = state.db.set_agent_state(Some(&target)) { + error!("failed to save agent state to db: {e}"); + } *agent_state = target; Ok(()) diff --git a/crates/agent/src/state.rs b/crates/agent/src/state.rs index 49897aaa..e277771c 100644 --- a/crates/agent/src/state.rs +++ b/crates/agent/src/state.rs @@ -21,7 +21,7 @@ use tokio::{ sync::{Mutex as AsyncMutex, RwLock}, task::AbortHandle, }; -use tracing::info; +use tracing::{error, info}; use crate::{cli::Cli, db::Database, metrics::Metrics, transfers::TransferTx, ReloadHandler}; @@ -86,7 +86,11 @@ impl GlobalState { bail!("failed to get env info: env not found {env_id}"); }; - *self.env_info.write().await = Some((env_id, info.clone())); + let env_info = (env_id, info.clone()); + if let Err(e) = self.db.set_env_info(Some(&env_info)) { + error!("failed to save env info to db: {e}"); + } + *self.env_info.write().await = Some(env_info); Ok(info) } diff --git a/crates/common/src/api.rs b/crates/common/src/api.rs index f7f3f2e5..f11eb69d 100644 --- a/crates/common/src/api.rs +++ b/crates/common/src/api.rs @@ -4,6 +4,7 @@ use snops_checkpoint::RetentionPolicy; use crate::{ binaries::BinaryEntry, + format::{DataFormat, DataHeaderOf}, prelude::StorageId, state::{InternedId, LatestBlockInfo, NetworkId}, }; @@ -41,3 +42,198 @@ pub struct StorageInfo { /// download from the control plane) pub binaries: IndexMap, } + +#[derive(Debug, Clone)] +pub struct EnvInfoHeader { + pub version: u8, + pub network: DataHeaderOf, + pub storage: DataHeaderOf, + pub block: DataHeaderOf, +} + +impl DataFormat for EnvInfoHeader { + type Header = (u8, DataHeaderOf>); + const LATEST_HEADER: Self::Header = (1, DataHeaderOf::::LATEST_HEADER); + + fn write_data( + &self, + writer: &mut W, + ) -> Result { + let mut written = self.version.write_data(writer)?; + written += self.network.write_data(writer)?; + written += self.storage.write_data(writer)?; + written += self.block.write_data(writer)?; + Ok(written) + } + + fn read_data( + reader: &mut R, + header: &Self::Header, + ) -> Result { + if header.0 != Self::LATEST_HEADER.0 { + return Err(crate::format::DataReadError::unsupported( + "EnvInfoHeader", + Self::LATEST_HEADER.0, + header.0, + )); + } + Ok(Self { + version: u8::read_data(reader, &())?, + network: DataHeaderOf::::read_data(reader, &())?, + storage: DataHeaderOf::::read_data(reader, &header.1)?, + block: DataHeaderOf::::read_data(reader, &())?, + }) + } +} + +impl DataFormat for EnvInfo { + type Header = EnvInfoHeader; + const LATEST_HEADER: Self::Header = EnvInfoHeader { + version: 1, + network: NetworkId::LATEST_HEADER, + storage: StorageInfo::LATEST_HEADER, + block: LatestBlockInfo::LATEST_HEADER, + }; + + fn write_data( + &self, + writer: &mut W, + ) -> Result { + let mut written = self.network.write_data(writer)?; + written += self.storage.write_data(writer)?; + written += self.block.write_data(writer)?; + Ok(written) + } + + fn read_data( + reader: &mut R, + header: &Self::Header, + ) -> Result { + if header.version != 1 { + return Err(crate::format::DataReadError::unsupported( + "EnvInfo", + 1, + header.version, + )); + } + Ok(Self { + network: NetworkId::read_data(reader, 
&header.network)?, + storage: StorageInfo::read_data(reader, &header.storage)?, + block: Option::::read_data(reader, &header.block)?, + }) + } +} + +#[derive(Debug, Clone)] +pub struct StorageInfoHeader { + pub version: u8, + pub retention_policy: DataHeaderOf, + pub binaries: DataHeaderOf, +} + +impl DataFormat for StorageInfoHeader { + type Header = u8; + const LATEST_HEADER: Self::Header = 1; + + fn write_data( + &self, + writer: &mut W, + ) -> Result { + let mut written = self.version.write_data(writer)?; + written += self.retention_policy.write_data(writer)?; + written += self.binaries.write_data(writer)?; + Ok(written) + } + + fn read_data( + reader: &mut R, + header: &Self::Header, + ) -> Result { + if *header != Self::LATEST_HEADER { + return Err(crate::format::DataReadError::unsupported( + "StorageInfoHeader", + Self::LATEST_HEADER, + header, + )); + } + Ok(Self { + version: u8::read_data(reader, &())?, + retention_policy: DataHeaderOf::::read_data(reader, &((), ()))?, + binaries: DataHeaderOf::::read_data(reader, &())?, + }) + } +} + +impl DataFormat for StorageInfo { + type Header = StorageInfoHeader; + + const LATEST_HEADER: Self::Header = StorageInfoHeader { + version: 1, + retention_policy: RetentionPolicy::LATEST_HEADER, + binaries: BinaryEntry::LATEST_HEADER, + }; + + fn write_data( + &self, + writer: &mut W, + ) -> Result { + let mut written = self.id.write_data(writer)?; + written += self.retention_policy.write_data(writer)?; + written += self + .checkpoints + .iter() + .map( + |CheckpointMeta { + height, + timestamp, + filename, + }| (*height, *timestamp, filename.to_owned()), + ) + .collect::>() + .write_data(writer)?; + written += self.persist.write_data(writer)?; + written += self.version.write_data(writer)?; + written += self.native_genesis.write_data(writer)?; + written += self.binaries.write_data(writer)?; + Ok(written) + } + + fn read_data( + reader: &mut R, + header: &Self::Header, + ) -> Result { + if header.version != 1 { + return Err(crate::format::DataReadError::unsupported( + "StorageInfo", + 1, + header.version, + )); + } + + let id = StorageId::read_data(reader, &())?; + let retention_policy = + Option::::read_data(reader, &header.retention_policy)?; + let checkpoints = Vec::<(u32, i64, String)>::read_data(reader, &((), (), ()))? 
+ .into_iter() + .map(|(height, timestamp, filename)| CheckpointMeta { + height, + timestamp, + filename, + }) + .collect(); + let persist = bool::read_data(reader, &())?; + let version = u16::read_data(reader, &())?; + let native_genesis = bool::read_data(reader, &())?; + let binaries = + IndexMap::::read_data(reader, &((), header.binaries))?; + Ok(Self { + id, + retention_policy, + checkpoints, + persist, + version, + native_genesis, + binaries, + }) + } +} diff --git a/crates/common/src/format/impl_collections.rs b/crates/common/src/format/impl_collections.rs index 9939cdf9..bef6fceb 100644 --- a/crates/common/src/format/impl_collections.rs +++ b/crates/common/src/format/impl_collections.rs @@ -10,6 +10,37 @@ use super::{ DataWriteError, }; +#[derive(Debug, Clone)] +pub struct BinaryData(pub Vec); +impl From> for BinaryData { + fn from(data: Vec) -> Self { + Self(data) + } +} +impl From for Vec { + fn from(data: BinaryData) -> Self { + data.0 + } +} + +impl DataFormat for BinaryData { + type Header = (); + const LATEST_HEADER: Self::Header = (); + + fn write_data(&self, writer: &mut W) -> Result { + let written = PackedUint::from(self.0.len()).write_data(writer)?; + writer.write_all(&self.0)?; + Ok(written + self.0.len()) + } + + fn read_data(reader: &mut R, _header: &Self::Header) -> Result { + let len = usize::from(PackedUint::read_data(reader, &())?); + let mut data = Vec::with_capacity(len); + reader.read_exact(&mut data)?; + Ok(Self(data)) + } +} + impl DataFormat for [T; N] { type Header = T::Header; const LATEST_HEADER: Self::Header = T::LATEST_HEADER; diff --git a/crates/common/src/format/mod.rs b/crates/common/src/format/mod.rs index 6f698154..7078fcd1 100644 --- a/crates/common/src/format/mod.rs +++ b/crates/common/src/format/mod.rs @@ -14,6 +14,7 @@ mod impl_strings; mod impl_tuples; mod packed_int; +pub use impl_collections::BinaryData; pub use packed_int::*; use thiserror::Error; diff --git a/crates/common/src/state/agent_state.rs b/crates/common/src/state/agent_state.rs index 8a443ce6..861f7e11 100644 --- a/crates/common/src/state/agent_state.rs +++ b/crates/common/src/state/agent_state.rs @@ -1,4 +1,5 @@ use super::{EnvId, NodeState}; +use crate::format::{DataFormat, DataHeaderOf}; #[derive(Debug, Default, Clone, PartialEq, Eq, serde::Serialize, serde::Deserialize)] pub enum AgentState { @@ -20,3 +21,48 @@ impl AgentState { } } } + +impl DataFormat for AgentState { + type Header = (u8, DataHeaderOf); + const LATEST_HEADER: Self::Header = (1, NodeState::LATEST_HEADER); + + fn write_data( + &self, + writer: &mut W, + ) -> Result { + match self { + Self::Inventory => Ok(0u8.write_data(writer)?), + Self::Node(id, state) => { + let mut written = 1u8.write_data(writer)?; + written += id.write_data(writer)?; + written += state.write_data(writer)?; + Ok(written) + } + } + } + + fn read_data( + reader: &mut R, + header: &Self::Header, + ) -> Result { + if header.0 != Self::LATEST_HEADER.0 { + return Err(crate::format::DataReadError::unsupported( + "AgentState", + Self::LATEST_HEADER.0, + header.0, + )); + } + + match u8::read_data(reader, &())? 
{ + 0 => Ok(Self::Inventory), + 1 => { + let id = EnvId::read_data(reader, &())?; + let state = NodeState::read_data(reader, &header.1)?; + Ok(Self::Node(id, Box::new(state))) + } + n => Err(crate::format::DataReadError::custom(format!( + "Invalid AgentState variant {n}", + ))), + } + } +} diff --git a/crates/common/src/state/agent_status.rs b/crates/common/src/state/agent_status.rs index 1916991e..f61c3787 100644 --- a/crates/common/src/state/agent_status.rs +++ b/crates/common/src/state/agent_status.rs @@ -3,6 +3,7 @@ use indexmap::IndexMap; use serde::{Deserialize, Serialize}; use super::snarkos_status::SnarkOSStatus; +use crate::format::DataFormat; #[derive(Debug, Default, Clone, PartialEq, Eq, Serialize, Deserialize)] pub enum NodeStatus { @@ -145,3 +146,44 @@ pub struct AgentStatus { /// A map of transfers in progress pub transfers: IndexMap, } + +impl DataFormat for LatestBlockInfo { + type Header = u8; + + const LATEST_HEADER: Self::Header = 1; + + fn write_data( + &self, + writer: &mut W, + ) -> Result { + let mut written = self.height.write_data(writer)?; + written += self.state_root.write_data(writer)?; + written += self.block_hash.write_data(writer)?; + written += self.previous_hash.write_data(writer)?; + written += self.block_timestamp.write_data(writer)?; + written += self.update_time.write_data(writer)?; + Ok(written) + } + + fn read_data( + reader: &mut R, + header: &Self::Header, + ) -> Result { + if *header != Self::LATEST_HEADER { + return Err(crate::format::DataReadError::unsupported( + "LatestBlockInfo", + Self::LATEST_HEADER, + *header, + )); + } + + Ok(LatestBlockInfo { + height: u32::read_data(reader, &())?, + state_root: String::read_data(reader, &())?, + block_hash: String::read_data(reader, &())?, + previous_hash: String::read_data(reader, &())?, + block_timestamp: i64::read_data(reader, &())?, + update_time: DateTime::read_data(reader, &())?, + }) + } +} From b006beb6305c335ab52221cf5ccfdaf8e541a933 Mon Sep 17 00:00:00 2001 From: Meshiest Date: Fri, 15 Nov 2024 22:56:42 -0500 Subject: [PATCH 03/68] feat(agent): WIP reconcile 2.0 --- Cargo.lock | 1 + crates/agent/Cargo.toml | 1 + crates/agent/src/cli.rs | 22 ++ crates/agent/src/client.rs | 196 ++++++++++++ crates/agent/src/db.rs | 34 +- crates/agent/src/log.rs | 53 ++++ crates/agent/src/main.rs | 300 ++---------------- crates/agent/src/reconcile/agent.rs | 267 ++++++++++++++++ crates/agent/src/reconcile/checkpoint.rs | 108 +++++++ .../src/{reconcile.rs => reconcile/files.rs} | 110 +------ crates/agent/src/reconcile/mod.rs | 91 ++++++ crates/agent/src/rpc/agent.rs | 14 +- crates/agent/src/rpc/control.rs | 18 +- crates/agent/src/state.rs | 37 ++- crates/agent/src/transfers.rs | 21 +- crates/common/src/rpc/error.rs | 18 ++ crates/common/src/state/port_config.rs | 2 +- 17 files changed, 884 insertions(+), 409 deletions(-) create mode 100644 crates/agent/src/client.rs create mode 100644 crates/agent/src/log.rs create mode 100644 crates/agent/src/reconcile/agent.rs create mode 100644 crates/agent/src/reconcile/checkpoint.rs rename crates/agent/src/{reconcile.rs => reconcile/files.rs} (76%) create mode 100644 crates/agent/src/reconcile/mod.rs diff --git a/Cargo.lock b/Cargo.lock index d0226c84..1dc29f6b 100644 --- a/Cargo.lock +++ b/Cargo.lock @@ -4589,6 +4589,7 @@ dependencies = [ "futures-util", "http 1.1.0", "httpdate", + "indexmap 2.6.0", "local-ip-address", "nix", "reqwest 0.12.8", diff --git a/crates/agent/Cargo.toml b/crates/agent/Cargo.toml index 5cfb670d..282cbc4e 100644 --- a/crates/agent/Cargo.toml +++ 
b/crates/agent/Cargo.toml @@ -23,6 +23,7 @@ futures.workspace = true futures-util.workspace = true http.workspace = true httpdate.workspace = true +indexmap.workspace = true local-ip-address.workspace = true nix = { workspace = true, features = ["signal"] } reqwest = { workspace = true, features = ["json", "stream"] } diff --git a/crates/agent/src/cli.rs b/crates/agent/src/cli.rs index 033b1619..9b9d7111 100644 --- a/crates/agent/src/cli.rs +++ b/crates/agent/src/cli.rs @@ -13,6 +13,8 @@ use http::Uri; use snops_common::state::{AgentId, AgentModeOptions, PortConfig}; use tracing::{info, warn}; +use crate::net; + pub const ENV_ENDPOINT: &str = "SNOPS_ENDPOINT"; pub const ENV_ENDPOINT_DEFAULT: &str = "127.0.0.1:1234"; @@ -167,4 +169,24 @@ impl Cli { ws_uri, ) } + + pub fn addrs(&self) -> (Vec, Option) { + let internal_addrs = match (self.internal, self.external) { + // use specified internal address + (Some(internal), _) => vec![internal], + // use no internal address if the external address is loopback + (None, Some(external)) if external.is_loopback() => vec![], + // otherwise, get the local network interfaces available to this node + (None, _) => net::get_internal_addrs().expect("failed to get network interfaces"), + }; + + let external_addr = self.external; + if let Some(addr) = external_addr { + info!("using external addr: {}", addr); + } else { + info!("skipping external addr"); + } + + (internal_addrs, external_addr) + } } diff --git a/crates/agent/src/client.rs b/crates/agent/src/client.rs new file mode 100644 index 00000000..d66493af --- /dev/null +++ b/crates/agent/src/client.rs @@ -0,0 +1,196 @@ +use std::{ + sync::Arc, + time::{Duration, Instant}, +}; + +use futures::{SinkExt, StreamExt}; +use http::{HeaderValue, Uri}; +use snops_common::{ + constant::{ENV_AGENT_KEY, HEADER_AGENT_KEY}, + rpc::{ + control::{agent::AgentService, ControlServiceClient, PING_HEADER}, + RpcTransport, PING_INTERVAL_SEC, PING_LENGTH, + }, +}; +use tarpc::server::Channel; +use tokio::select; +use tokio_tungstenite::{ + connect_async, + tungstenite::{self, client::IntoClientRequest, handshake::client::Request}, +}; +use tracing::{error, info, warn}; + +use crate::{ + rpc::control::{self, AgentRpcServer}, + state::GlobalState, +}; + +pub fn new_ws_request(ws_uri: &Uri, jwt: Option) -> Request { + let mut req = ws_uri.to_owned().into_client_request().unwrap(); + + // attach JWT if we have one + if let Some(jwt) = jwt { + req.headers_mut().insert( + "Authorization", + HeaderValue::from_bytes(format!("Bearer {jwt}").as_bytes()) + .expect("attach authorization header"), + ); + } + + // attach agent key if one is set in env vars + if let Ok(key) = std::env::var(ENV_AGENT_KEY) { + req.headers_mut().insert( + HEADER_AGENT_KEY, + HeaderValue::from_bytes(key.as_bytes()).expect("attach agent key header"), + ); + } + + req +} + +pub async fn ws_connection(ws_req: Request, state: Arc) { + let (mut stream, _response) = match connect_async(ws_req).await { + Ok(res) => res, + Err(e) => { + error!("failed to connect to websocket: {e}"); + return; + } + }; + + info!("Connection established with the control plane"); + + // Clear old info cache. 
we will get new info from the control plane + state.set_env_info(None).await; + // TODO: fetch latest info from controlplane rather than clearing + + // create rpc channels + let (client_response_in, client_transport, mut client_request_out) = RpcTransport::new(); + let (server_request_in, server_transport, mut server_response_out) = RpcTransport::new(); + + // set up the client, facing the control plane + let client = + ControlServiceClient::new(tarpc::client::Config::default(), client_transport).spawn(); + state.client.write().await.replace(client.clone()); + + let start_time = Instant::now(); + let mut interval = tokio::time::interval(Duration::from_secs(PING_INTERVAL_SEC)); + let mut num_pings: u32 = 0; + + // initialize and start the rpc server + let mut server_handle = Box::pin( + tarpc::server::BaseChannel::with_defaults(server_transport).execute( + AgentRpcServer { + client, + state: Arc::clone(&state), + version: env!("CARGO_PKG_VERSION"), + } + .serve(), + ), + ); + + loop { + select! { + _ = interval.tick() => { + // ping payload contains "snops-agent", number of pings, and uptime + let mut payload = Vec::from(PING_HEADER); + payload.extend_from_slice(&num_pings.to_le_bytes()); + payload.extend_from_slice(&start_time.elapsed().as_micros().to_le_bytes()); + + let send = stream.send(tungstenite::Message::Ping(payload)); + if tokio::time::timeout(Duration::from_secs(10), send).await.is_err() { + error!("The connection to the control plane was interrupted while sending ping"); + break + } + } + + // handle outgoing responses + msg = server_response_out.recv() => { + let msg = msg.expect("internal RPC channel closed"); + let bin = bincode::serialize(&control::MuxedMessageOutgoing::Child(msg)).expect("failed to serialize response"); + let send = stream.send(tungstenite::Message::Binary(bin)); + if tokio::time::timeout(Duration::from_secs(10), send).await.is_err() { + error!("The connection to the control plane was interrupted while sending agent message"); + break; + } + } + + // handle outgoing requests + msg = client_request_out.recv() => { + let msg = msg.expect("internal RPC channel closed"); + let bin = bincode::serialize(&control::MuxedMessageOutgoing::Parent(msg)).expect("failed to serialize request"); + let send = stream.send(tungstenite::Message::Binary(bin)); + if tokio::time::timeout(Duration::from_secs(10), send).await.is_err() { + error!("The connection to the control plane was interrupted while sending control message"); + break; + } + } + + // handle incoming messages + msg = stream.next() => match msg { + Some(Ok(tungstenite::Message::Close(frame))) => { + if let Some(frame) = frame { + info!("The control plane has closed the connection: {frame}"); + } else { + info!("The control plane has closed the connection"); + } + break; + } + + Some(Ok(tungstenite::Message::Pong(payload))) => { + let mut payload = payload.as_slice(); + // check the header + if !payload.starts_with(PING_HEADER) { + warn!("Received a pong payload with an invalid header prefix"); + continue; + } + payload = &payload[PING_HEADER.len()..]; + if payload.len() != PING_LENGTH { + warn!("Received a pong payload with an invalid length {}, expected {PING_LENGTH}", payload.len()); + continue; + } + let (left, right) = payload.split_at(size_of::()); + let ping_index = u32::from_le_bytes(left.try_into().unwrap()); + let _uptime_start = u128::from_le_bytes(right.try_into().unwrap()); + + if ping_index != num_pings { + warn!("Received a pong payload with an invalid index {ping_index}, expected {num_pings}"); + 
continue; + } + + num_pings += 1; + + // when desired, we can add this as a metric + // let uptime_now = start_time.elapsed().as_micros(); + // let uptime_diff = uptime_now - uptime_start; + } + + Some(Ok(tungstenite::Message::Binary(bin))) => { + let msg = match bincode::deserialize(&bin) { + Ok(msg) => msg, + Err(e) => { + error!("failed to deserialize a message from the control plane: {e}"); + continue; + } + }; + + match msg { + control::MuxedMessageIncoming::Child(msg) => server_request_in.send(msg).expect("internal RPC channel closed"), + control::MuxedMessageIncoming::Parent(msg) => client_response_in.send(msg).expect("internal RPC channel closed"), + } + } + + None | Some(Err(_)) => { + error!("The connection to the control plane was interrupted"); + break; + } + + Some(Ok(o)) => println!("{o:#?}"), + }, + + // handle server requests + Some(r) = server_handle.next() => { + tokio::spawn(r); + } + } + } +} diff --git a/crates/agent/src/db.rs b/crates/agent/src/db.rs index 0aa46ca4..c7d8b8cb 100644 --- a/crates/agent/src/db.rs +++ b/crates/agent/src/db.rs @@ -1,15 +1,17 @@ use std::{ io::{Read, Write}, + net::IpAddr, path::Path, sync::Mutex, }; use bytes::Buf; +use indexmap::IndexMap; use snops_common::{ api::EnvInfo, db::{error::DatabaseError, tree::DbTree, Database as DatabaseTrait}, format::{self, read_dataformat, DataFormat, DataReadError, DataWriteError}, - state::{AgentState, EnvId}, + state::{AgentId, AgentState, EnvId}, }; use url::Url; @@ -26,6 +28,8 @@ pub enum AgentDbString { AgentState, /// Latest stored environment info. EnvInfo, + /// Agent addresses resolved by the controlplane. + ResolvedAddrs, } impl DataFormat for AgentDbString { @@ -149,4 +153,32 @@ impl Database { .map(|_| ()) } } + + pub fn resolved_addrs(&self) -> Result, DatabaseError> { + Ok( + if let Some(format::BinaryData(bytes)) = + self.documents.restore(&AgentDbString::ResolvedAddrs)? + { + read_dataformat(&mut bytes.reader())? 
+ } else { + IndexMap::new() + }, + ) + } + + pub fn set_resolved_addrs( + &self, + addrs: Option<&IndexMap>, + ) -> Result<(), DatabaseError> { + if let Some(addrs) = addrs { + self.documents.save( + &AgentDbString::ResolvedAddrs, + &format::BinaryData(addrs.to_byte_vec()?), + ) + } else { + self.documents + .delete(&AgentDbString::ResolvedAddrs) + .map(|_| ()) + } + } } diff --git a/crates/agent/src/log.rs b/crates/agent/src/log.rs new file mode 100644 index 00000000..9d87492b --- /dev/null +++ b/crates/agent/src/log.rs @@ -0,0 +1,53 @@ +use tracing::level_filters::LevelFilter; +use tracing_appender::non_blocking::WorkerGuard; +use tracing_subscriber::{layer::SubscriberExt, reload, util::SubscriberInitExt, EnvFilter}; + +pub type ReloadHandler = reload::Handle; + +pub fn make_env_filter(level: LevelFilter) -> EnvFilter { + EnvFilter::builder() + .with_env_var("SNOPS_AGENT_LOG") + .with_default_directive(level.into()) + .from_env_lossy() + .add_directive(level.into()) + .add_directive("neli=off".parse().unwrap()) + .add_directive("hyper_util=off".parse().unwrap()) + .add_directive("reqwest=off".parse().unwrap()) + .add_directive("tungstenite=off".parse().unwrap()) + .add_directive("tokio_tungstenite=off".parse().unwrap()) + .add_directive("tarpc::client=ERROR".parse().unwrap()) + .add_directive("tarpc::server=ERROR".parse().unwrap()) +} + +pub fn init_logging() -> (WorkerGuard, ReloadHandler) { + let (stdout, guard) = tracing_appender::non_blocking(std::io::stdout()); + + let output: tracing_subscriber::fmt::Layer< + _, + tracing_subscriber::fmt::format::DefaultFields, + tracing_subscriber::fmt::format::Format, + tracing_appender::non_blocking::NonBlocking, + > = tracing_subscriber::fmt::layer().with_writer(stdout); + + let output = if cfg!(debug_assertions) { + output.with_file(true).with_line_number(true) + } else { + output + }; + + let filter_level = if cfg!(debug_assertions) { + LevelFilter::TRACE + } else { + LevelFilter::INFO + }; + + let (env_filter, reload_handler) = reload::Layer::new(make_env_filter(filter_level)); + + tracing_subscriber::registry() + .with(env_filter) + .with(output) + .try_init() + .unwrap(); + + (guard, reload_handler) +} diff --git a/crates/agent/src/main.rs b/crates/agent/src/main.rs index 88cbe5f0..2bcab7b4 100644 --- a/crates/agent/src/main.rs +++ b/crates/agent/src/main.rs @@ -1,5 +1,6 @@ mod api; mod cli; +mod client; mod db; mod metrics; mod net; @@ -10,7 +11,6 @@ mod state; mod transfers; use std::{ - mem::size_of, net::Ipv4Addr, sync::{Arc, Mutex}, time::{Duration, Instant}, @@ -18,50 +18,18 @@ use std::{ use clap::Parser; use cli::Cli; -use futures::SinkExt; use futures_util::stream::{FuturesUnordered, StreamExt}; -use http::HeaderValue; -use rpc::control::{self, AgentRpcServer}; -use snops_common::{ - constant::{ENV_AGENT_KEY, HEADER_AGENT_KEY}, - db::Database, - rpc::{ - control::{agent::AgentService, ControlServiceClient, PING_HEADER}, - RpcTransport, PING_INTERVAL_SEC, PING_LENGTH, - }, - util::OpaqueDebug, -}; -use tarpc::server::Channel; +use log::init_logging; +use snops_common::{db::Database, util::OpaqueDebug}; use tokio::{ select, signal::unix::{signal, Signal, SignalKind}, sync::RwLock, }; -use tokio_tungstenite::{ - connect_async, - tungstenite::{self, client::IntoClientRequest}, -}; -use tracing::{error, info, level_filters::LevelFilter, warn}; -use tracing_subscriber::{layer::SubscriberExt, reload, util::SubscriberInitExt, EnvFilter}; +use tracing::{error, info}; use crate::state::GlobalState; - -type ReloadHandler = reload::Handle; - 
-fn make_env_filter(level: LevelFilter) -> EnvFilter { - EnvFilter::builder() - .with_env_var("SNOPS_AGENT_LOG") - .with_default_directive(level.into()) - .from_env_lossy() - .add_directive(level.into()) - .add_directive("neli=off".parse().unwrap()) - .add_directive("hyper_util=off".parse().unwrap()) - .add_directive("reqwest=off".parse().unwrap()) - .add_directive("tungstenite=off".parse().unwrap()) - .add_directive("tokio_tungstenite=off".parse().unwrap()) - .add_directive("tarpc::client=ERROR".parse().unwrap()) - .add_directive("tarpc::server=ERROR".parse().unwrap()) -} +mod log; #[tokio::main] async fn main() { @@ -69,59 +37,18 @@ async fn main() { .install_default() .expect("Failed to install rustls crypto provider"); - let (stdout, _guard) = tracing_appender::non_blocking(std::io::stdout()); - let start_time = Instant::now(); - - let output: tracing_subscriber::fmt::Layer< - _, - tracing_subscriber::fmt::format::DefaultFields, - tracing_subscriber::fmt::format::Format, - tracing_appender::non_blocking::NonBlocking, - > = tracing_subscriber::fmt::layer().with_writer(stdout); - - let output = if cfg!(debug_assertions) { - output.with_file(true).with_line_number(true) - } else { - output - }; - - let filter_level = if cfg!(debug_assertions) { - LevelFilter::TRACE - } else { - LevelFilter::INFO - }; - - let (env_filter, reload_handler) = reload::Layer::new(make_env_filter(filter_level)); - - tracing_subscriber::registry() - .with(env_filter) - .with(output) - .try_init() - .unwrap(); - // For documentation purposes will exit after running the command. #[cfg(any(feature = "clipages", feature = "mangen"))] Cli::parse().run(); + + let (_guard, reload_handler) = init_logging(); + let args = Cli::parse(); - let internal_addrs = match (args.internal, args.external) { - // use specified internal address - (Some(internal), _) => vec![internal], - // use no internal address if the external address is loopback - (None, Some(external)) if external.is_loopback() => vec![], - // otherwise, get the local network interfaces available to this node - (None, _) => net::get_internal_addrs().expect("failed to get network interfaces"), - }; - let external_addr = args.external; - if let Some(addr) = external_addr { - info!("using external addr: {}", addr); - } else { - info!("skipping external addr"); - } + let (internal_addrs, external_addr) = args.addrs(); - // get the endpoint let (endpoint, ws_uri) = args.endpoint_and_uri(); - info!("connecting to {endpoint}"); + info!("Using endpoint {endpoint}"); // create the data directory tokio::fs::create_dir_all(&args.path) @@ -131,16 +58,10 @@ async fn main() { // open the database let db = db::Database::open(&args.path.join("store")).expect("failed to open database"); - // create rpc channels - let (client_response_in, client_transport, mut client_request_out) = RpcTransport::new(); - let (server_request_in, server_transport, mut server_response_out) = RpcTransport::new(); - - // set up the client, facing the control plane - let client = - ControlServiceClient::new(tarpc::client::Config::default(), client_transport).spawn(); + let client = Default::default(); // start transfer monitor - let (transfer_tx, transfers) = transfers::start_monitor(client.clone()); + let (transfer_tx, transfers) = transfers::start_monitor(Arc::clone(&client)); let agent_rpc_listener = tokio::net::TcpListener::bind((Ipv4Addr::LOCALHOST, 0)) .await @@ -154,7 +75,6 @@ async fn main() { let state = Arc::new(GlobalState { client, _started: Instant::now(), - connected: 
Mutex::new(Instant::now()), external_addr, internal_addrs, cli: args, @@ -199,195 +119,25 @@ async fn main() { } }); - // initialize and start the rpc server - let rpc_server = tarpc::server::BaseChannel::with_defaults(server_transport); - tokio::spawn( - rpc_server - .execute( - AgentRpcServer { - state: state.to_owned(), - version: env!("CARGO_PKG_VERSION"), - } - .serve(), - ) - .for_each(|r| async move { - tokio::spawn(r); - }), - ); - // get the interrupt signals to break the stream connection let mut interrupt = Signals::new(&[SignalKind::terminate(), SignalKind::interrupt()]); - 'process: loop { - 'connection: { - let mut req = ws_uri.to_owned().into_client_request().unwrap(); - - // invalidate env info cache - state.env_info.write().await.take(); - state - .db - .set_env_info(None) - .expect("failed to clear env info"); - - // attach JWT if we have one - if let Some(jwt) = state.db.jwt() { - req.headers_mut().insert( - "Authorization", - HeaderValue::from_bytes(format!("Bearer {jwt}").as_bytes()) - .expect("attach authorization header"), - ); - } - - // attach agent key if one is set in env vars - if let Ok(key) = std::env::var(ENV_AGENT_KEY) { - req.headers_mut().insert( - HEADER_AGENT_KEY, - HeaderValue::from_bytes(key.as_bytes()).expect("attach agent key header"), - ); - } - - let (mut ws_stream, _) = select! { - _ = interrupt.recv_any() => break 'process, - - res = connect_async(req) => match res { - Ok(c) => c, - Err(e) => { - error!("An error occurred establishing the connection: {e}"); - break 'connection; - }, - }, - }; - - *state.connected.lock().unwrap() = Instant::now(); - - info!("Connection established with the control plane"); - - let mut terminating = false; - let mut interval = tokio::time::interval(Duration::from_secs(PING_INTERVAL_SEC)); - let mut num_pings: u32 = 0; - - 'event: loop { - select! 
{ - // terminate if an interrupt was triggered - _ = interrupt.recv_any() => { - terminating = true; - break 'event; - } - - _ = interval.tick() => { - // ping payload contains "snops-agent", number of pings, and uptime - let mut payload = Vec::from(PING_HEADER); - payload.extend_from_slice(&num_pings.to_le_bytes()); - payload.extend_from_slice(&start_time.elapsed().as_micros().to_le_bytes()); - - let send = ws_stream.send(tungstenite::Message::Ping(payload)); - if tokio::time::timeout(Duration::from_secs(10), send).await.is_err() { - error!("The connection to the control plane was interrupted while sending ping"); - break 'event; - } - } - - // handle outgoing responses - msg = server_response_out.recv() => { - let msg = msg.expect("internal RPC channel closed"); - let bin = bincode::serialize(&control::MuxedMessageOutgoing::Child(msg)).expect("failed to serialize response"); - let send = ws_stream.send(tungstenite::Message::Binary(bin)); - if tokio::time::timeout(Duration::from_secs(10), send).await.is_err() { - error!("The connection to the control plane was interrupted while sending agent message"); - break 'event; - } - } - - // handle outgoing requests - msg = client_request_out.recv() => { - let msg = msg.expect("internal RPC channel closed"); - let bin = bincode::serialize(&control::MuxedMessageOutgoing::Parent(msg)).expect("failed to serialize request"); - let send = ws_stream.send(tungstenite::Message::Binary(bin)); - if tokio::time::timeout(Duration::from_secs(10), send).await.is_err() { - error!("The connection to the control plane was interrupted while sending control message"); - break 'event; - } - } - - // handle incoming messages - msg = ws_stream.next() => match msg { - Some(Ok(tungstenite::Message::Close(frame))) => { - if let Some(frame) = frame { - info!("The control plane has closed the connection: {frame}"); - } else { - info!("The control plane has closed the connection"); - } - break 'event; - } - - Some(Ok(tungstenite::Message::Pong(payload))) => { - let mut payload = payload.as_slice(); - // check the header - if !payload.starts_with(PING_HEADER) { - warn!("Received a pong payload with an invalid header prefix"); - continue; - } - payload = &payload[PING_HEADER.len()..]; - if payload.len() != PING_LENGTH { - warn!("Received a pong payload with an invalid length {}, expected {PING_LENGTH}", payload.len()); - continue; - } - let (left, right) = payload.split_at(size_of::()); - let ping_index = u32::from_le_bytes(left.try_into().unwrap()); - let _uptime_start = u128::from_le_bytes(right.try_into().unwrap()); - - if ping_index != num_pings { - warn!("Received a pong payload with an invalid index {ping_index}, expected {num_pings}"); - continue; - } - - num_pings += 1; - - // when desired, we can add this as a metric - // let uptime_now = start_time.elapsed().as_micros(); - // let uptime_diff = uptime_now - uptime_start; - - } - - Some(Ok(tungstenite::Message::Binary(bin))) => { - let msg = match bincode::deserialize(&bin) { - Ok(msg) => msg, - Err(e) => { - error!("failed to deserialize a message from the control plane: {e}"); - continue; - } - }; - - match msg { - control::MuxedMessageIncoming::Child(msg) => server_request_in.send(msg).expect("internal RPC channel closed"), - control::MuxedMessageIncoming::Parent(msg) => client_response_in.send(msg).expect("internal RPC channel closed"), - } - } - - None | Some(Err(_)) => { - error!("The connection to the control plane was interrupted"); - break 'event; - } - - Some(Ok(o)) => println!("{o:#?}"), - }, - }; - } - - if 
terminating { - break 'process; - } + let state2 = Arc::clone(&state); + let connection_loop = Box::pin(async move { + loop { + let req = client::new_ws_request(&ws_uri, state2.db.jwt()); + client::ws_connection(req, Arc::clone(&state2)).await; + info!("Attempting to reconnect..."); + tokio::time::sleep(Duration::from_secs(5)).await; } + }); - // wait some time before attempting to reconnect - select! { - _ = interrupt.recv_any() => break, + select! { + _ = interrupt.recv_any() => { + info!("Received interrupt signal, shutting down..."); + }, - // TODO: dynamic time - _ = tokio::time::sleep(Duration::from_secs(5)) => { - info!("Attempting to reconnect..."); - }, - } + _ = connection_loop => unreachable!() } state.node_graceful_shutdown().await; diff --git a/crates/agent/src/reconcile/agent.rs b/crates/agent/src/reconcile/agent.rs new file mode 100644 index 00000000..5c1a653e --- /dev/null +++ b/crates/agent/src/reconcile/agent.rs @@ -0,0 +1,267 @@ +use std::{collections::HashSet, ops::Deref, process::Stdio, sync::Arc}; + +use snops_common::{ + constant::{ + LEDGER_BASE_DIR, LEDGER_PERSIST_DIR, SNARKOS_FILE, SNARKOS_GENESIS_FILE, SNARKOS_LOG_FILE, + }, + rpc::error::ReconcileError2, + state::{AgentId, AgentPeer, AgentState, EnvId, KeyState, NodeState}, +}; +use tarpc::context; +use tokio::process::Command; +use tracing::{error, warn}; + +use super::{Reconcile, ReconcileStatus}; +use crate::state::GlobalState; + +struct AgentStateReconciler { + agent_state: AgentState, + state: Arc, +} + +impl Reconcile<(), ReconcileError2> for AgentStateReconciler { + async fn reconcile(&self) -> Result, ReconcileError2> { + match &self.agent_state { + AgentState::Inventory => { + // TODO: cleanup child process + // TODO: cleanup other things + return Ok(ReconcileStatus::empty()); + } + AgentState::Node(env_id, node) => { + // node is offline, no need to reconcile + if !node.online { + return Ok(ReconcileStatus::empty()); + } + + let command_res = NodeCommandReconciler { + env_id: *env_id, + node: Arc::new(*node.clone()), + state: Arc::clone(&self.state), + } + .reconcile() + .await?; + + if command_res.is_requeue() { + return Ok(command_res.emptied()); + } + + let Some(_command) = command_res.take() else { + return Ok(ReconcileStatus::default()); + }; + + // TODO: spawn the command, manage its state + } + } + + Ok(ReconcileStatus::empty()) + } +} + +struct NodeCommandReconciler { + node: Arc, + state: Arc, + env_id: EnvId, +} + +impl Reconcile for NodeCommandReconciler { + async fn reconcile(&self) -> Result, ReconcileError2> { + let NodeCommandReconciler { + node, + state, + env_id, + } = self; + let info = state.get_env_info(*env_id).await?; + + // Resolve the addresses of the peers and validators + let res = AddressResolveReconciler { + node: Arc::clone(node), + state: Arc::clone(state), + } + .reconcile() + .await?; + + if res.is_requeue() { + return Ok(res.emptied()); + } + + let mut command = Command::new(state.cli.path.join(SNARKOS_FILE)); + + // set stdio + if state.cli.quiet { + command.stdout(Stdio::null()); + } else { + command.stdout(std::io::stdout()); + } + command.stderr(std::io::stderr()); + + let storage_path = state + .cli + .path + .join("storage") + .join(info.network.to_string()) + .join(info.storage.id.to_string()); + + let ledger_path = if info.storage.persist { + storage_path.join(LEDGER_PERSIST_DIR) + } else { + state.cli.path.join(LEDGER_BASE_DIR) + }; + + // add loki URL if one is set + if let Some(loki) = state.loki.lock().unwrap().deref() { + command + .env( + 
"SNOPS_LOKI_LABELS", + format!("env_id={},node_key={}", env_id, node.node_key), + ) + .arg("--loki") + .arg(loki.as_str()); + } + + // setup the run command + command + .stderr(std::io::stderr()) + .envs(&node.env) + .env("NETWORK", info.network.to_string()) + .env("HOME", &ledger_path) + .arg("--log") + .arg(state.cli.path.join(SNARKOS_LOG_FILE)) + .arg("run") + .arg("--agent-rpc-port") + .arg(state.agent_rpc_port.to_string()) + .arg("--type") + .arg(node.node_key.ty.to_string()) + .arg("--ledger") + .arg(ledger_path); + + if !info.storage.native_genesis { + command + .arg("--genesis") + .arg(storage_path.join(SNARKOS_GENESIS_FILE)); + } + + // storage configuration + command + // port configuration + .arg("--bind") + .arg(state.cli.bind_addr.to_string()) + .arg("--bft") + .arg(state.cli.ports.bft.to_string()) + .arg("--rest") + .arg(state.cli.ports.rest.to_string()) + .arg("--metrics") + .arg(state.cli.ports.metrics.to_string()) + .arg("--node") + .arg(state.cli.ports.node.to_string()); + + match &node.private_key { + KeyState::None => {} + KeyState::Local => { + command.arg("--private-key-file").arg( + state + .cli + .private_key_file + .as_ref() + .ok_or(ReconcileError2::MissingLocalPrivateKey)?, + ); + } + KeyState::Literal(pk) => { + command.arg("--private-key").arg(pk); + } + } + + // conditionally add retention policy + if let Some(policy) = &info.storage.retention_policy { + command.arg("--retention-policy").arg(policy.to_string()); + } + + if !node.peers.is_empty() { + command + .arg("--peers") + .arg(state.agentpeers_to_cli(&node.peers).await.join(",")); + } + + if !node.validators.is_empty() { + command + .arg("--validators") + .arg(state.agentpeers_to_cli(&node.validators).await.join(",")); + } + + Ok(ReconcileStatus::new(Some(command))) + } +} + +struct AddressResolveReconciler { + state: Arc, + node: Arc, +} + +impl Reconcile<(), ReconcileError2> for AddressResolveReconciler { + async fn reconcile(&self) -> Result, ReconcileError2> { + let AddressResolveReconciler { state, node } = self; + + // Find agents that do not have cached addresses + let unresolved_addrs: HashSet = { + let resolved_addrs = state.resolved_addrs.read().await; + node.peers + .iter() + .chain(node.validators.iter()) + .filter_map(|p| { + if let AgentPeer::Internal(id, _) = p { + (!resolved_addrs.contains_key(id)).then_some(*id) + } else { + None + } + }) + .collect() + }; + + // All addrs have been resolved. + // TODO: May need to mark some of these as stale at some point. + if unresolved_addrs.is_empty() { + return Ok(ReconcileStatus::default()); + } + + let Some(client) = state.client.read().await.clone() else { + warn!("Agent state contains {} addresses that need to be resolved, but client is not connected", unresolved_addrs.len()); + + // Client is offline so new addrs cannot be requested + return Ok(ReconcileStatus::default()); + }; + + // Fetch all unresolved addresses and update the cache + tracing::debug!( + "need to resolve addrs: {}", + unresolved_addrs + .iter() + .map(|id| id.to_string()) + .collect::>() + .join(",") + ); + + // Resolve the addresses + let new_addrs = client + .resolve_addrs(context::current(), unresolved_addrs) + .await + .map_err(|e| ReconcileError2::RpcError(e.to_string()))? 
+ .map_err(ReconcileError2::AddressResolve)?; + + tracing::debug!( + "resolved new addrs: {}", + new_addrs + .iter() + .map(|(id, addr)| format!("{}: {}", id, addr)) + .collect::>() + .join(", ") + ); + + // Extend the cache with the new addresses + let mut lock = state.resolved_addrs.write().await; + lock.extend(new_addrs); + if let Err(e) = state.db.set_resolved_addrs(Some(&lock)) { + error!("failed to save resolved addrs to db: {e}"); + } + + Ok(ReconcileStatus::default()) + } +} diff --git a/crates/agent/src/reconcile/checkpoint.rs b/crates/agent/src/reconcile/checkpoint.rs new file mode 100644 index 00000000..b8003e2b --- /dev/null +++ b/crates/agent/src/reconcile/checkpoint.rs @@ -0,0 +1,108 @@ +use std::{ + collections::BTreeMap, + path::{Path, PathBuf}, +}; + +use snops_checkpoint::{CheckpointHeader, CheckpointManager, RetentionSpan}; +use snops_common::{ + api::CheckpointMeta, + rpc::error::ReconcileError, + state::{NetworkId, StorageId}, +}; +use tracing::{error, info}; + +use crate::{api, state::GlobalState}; + +pub enum CheckpointSource<'a> { + Manager(&'a CheckpointHeader, &'a PathBuf), + Meta(&'a CheckpointMeta), +} + +impl<'a> CheckpointSource<'a> { + pub async fn acquire( + self, + state: &GlobalState, + storage_path: &Path, + storage_id: StorageId, + network: NetworkId, + ) -> Result { + Ok(match self { + CheckpointSource::Meta(meta) => { + info!( + "using checkpoint from control plane with height {} and time {}", + meta.height, meta.timestamp + ); + let checkpoint_url = format!( + "{}/content/storage/{network}/{storage_id}/{}", + &state.endpoint, meta.filename + ); + let path = storage_path.join(&meta.filename); + info!("downloading {} from {checkpoint_url}...", meta.filename); + + api::check_file(checkpoint_url, &path, state.transfer_tx()) + .await + .map_err(|e| { + error!( + "failed to download {} from the control plane: {e}", + meta.filename + ); + ReconcileError::StorageAcquireError(meta.filename.clone()) + })?; + + path + } + CheckpointSource::Manager(header, path) => { + info!( + "using checkpoint from manager with height {} and time {}", + header.block_height, + header.time() + ); + path.clone() + } + }) + } +} + +pub fn find_by_height<'a>( + manager: &'a CheckpointManager, + checkpoints: &'a [CheckpointMeta], + height: u32, +) -> Option> { + let sorted: BTreeMap<_, _> = manager + .checkpoints() + .map(|(c, p)| (c.block_height, CheckpointSource::Manager(c, p))) + .chain( + checkpoints + .iter() + .map(|c| (c.height, CheckpointSource::Meta(c))), + ) + .collect(); + + sorted + .into_iter() + .rev() + .find_map(|(h, c)| if h <= height { Some(c) } else { None }) +} + +pub fn find_by_span<'a>( + manager: &'a CheckpointManager, + checkpoints: &'a [CheckpointMeta], + span: RetentionSpan, +) -> Option> { + let timestamp = span.as_timestamp()?; + + let sorted: BTreeMap<_, _> = manager + .checkpoints() + .map(|(c, p)| (c.timestamp, CheckpointSource::Manager(c, p))) + .chain( + checkpoints + .iter() + .map(|c| (c.timestamp, CheckpointSource::Meta(c))), + ) + .collect(); + + sorted + .into_iter() + .rev() + .find_map(|(t, c)| if t <= timestamp { Some(c) } else { None }) +} diff --git a/crates/agent/src/reconcile.rs b/crates/agent/src/reconcile/files.rs similarity index 76% rename from crates/agent/src/reconcile.rs rename to crates/agent/src/reconcile/files.rs index 693c593a..d2b37fdb 100644 --- a/crates/agent/src/reconcile.rs +++ b/crates/agent/src/reconcile/files.rs @@ -1,22 +1,20 @@ -use std::{ - collections::BTreeMap, - path::{Path, PathBuf}, -}; +use std::path::PathBuf; 
-use snops_checkpoint::{CheckpointHeader, CheckpointManager, RetentionSpan}; +use snops_checkpoint::CheckpointManager; use snops_common::{ - api::{CheckpointMeta, EnvInfo}, + api::EnvInfo, binaries::{BinaryEntry, BinarySource}, constant::{ LEDGER_BASE_DIR, LEDGER_PERSIST_DIR, LEDGER_STORAGE_FILE, SNARKOS_FILE, SNARKOS_GENESIS_FILE, VERSION_FILE, }, rpc::error::ReconcileError, - state::{HeightRequest, InternedId, NetworkId, StorageId}, + state::{HeightRequest, InternedId}, }; use tokio::process::Command; use tracing::{debug, error, info, trace}; +use super::checkpoint; use crate::{api, state::GlobalState}; /// Ensure the correct binary is present for running snarkos @@ -273,10 +271,10 @@ pub async fn load_ledger( // determine which checkpoint to use by the next available height/time let checkpoint = match height { HeightRequest::Absolute(block_height) => { - find_checkpoint_by_height(manager, &info.storage.checkpoints, *block_height) + checkpoint::find_by_height(manager, &info.storage.checkpoints, *block_height) } HeightRequest::Checkpoint(span) => { - find_checkpoint_by_span(manager, &info.storage.checkpoints, *span) + checkpoint::find_by_span(manager, &info.storage.checkpoints, *span) } _ => unreachable!("handled by previous match"), } @@ -327,100 +325,6 @@ pub async fn load_ledger( Ok(true) } -enum CheckpointSource<'a> { - Manager(&'a CheckpointHeader, &'a PathBuf), - Meta(&'a CheckpointMeta), -} - -impl<'a> CheckpointSource<'a> { - async fn acquire( - self, - state: &GlobalState, - storage_path: &Path, - storage_id: StorageId, - network: NetworkId, - ) -> Result { - Ok(match self { - CheckpointSource::Meta(meta) => { - info!( - "using checkpoint from control plane with height {} and time {}", - meta.height, meta.timestamp - ); - let checkpoint_url = format!( - "{}/content/storage/{network}/{storage_id}/{}", - &state.endpoint, meta.filename - ); - let path = storage_path.join(&meta.filename); - info!("downloading {} from {checkpoint_url}...", meta.filename); - - api::check_file(checkpoint_url, &path, state.transfer_tx()) - .await - .map_err(|e| { - error!( - "failed to download {} from the control plane: {e}", - meta.filename - ); - ReconcileError::StorageAcquireError(meta.filename.clone()) - })?; - - path - } - CheckpointSource::Manager(header, path) => { - info!( - "using checkpoint from manager with height {} and time {}", - header.block_height, - header.time() - ); - path.clone() - } - }) - } -} - -fn find_checkpoint_by_height<'a>( - manager: &'a CheckpointManager, - checkpoints: &'a [CheckpointMeta], - height: u32, -) -> Option> { - let sorted: BTreeMap<_, _> = manager - .checkpoints() - .map(|(c, p)| (c.block_height, CheckpointSource::Manager(c, p))) - .chain( - checkpoints - .iter() - .map(|c| (c.height, CheckpointSource::Meta(c))), - ) - .collect(); - - sorted - .into_iter() - .rev() - .find_map(|(h, c)| if h <= height { Some(c) } else { None }) -} - -fn find_checkpoint_by_span<'a>( - manager: &'a CheckpointManager, - checkpoints: &'a [CheckpointMeta], - span: RetentionSpan, -) -> Option> { - let timestamp = span.as_timestamp()?; - - let sorted: BTreeMap<_, _> = manager - .checkpoints() - .map(|(c, p)| (c.timestamp, CheckpointSource::Manager(c, p))) - .chain( - checkpoints - .iter() - .map(|c| (c.timestamp, CheckpointSource::Meta(c))), - ) - .collect(); - - sorted - .into_iter() - .rev() - .find_map(|(t, c)| if t <= timestamp { Some(c) } else { None }) -} - async fn get_version_from_path(path: &PathBuf) -> Result, ReconcileError> { if !path.exists() { return Ok(None); diff --git 
a/crates/agent/src/reconcile/mod.rs b/crates/agent/src/reconcile/mod.rs new file mode 100644 index 00000000..facc9a7c --- /dev/null +++ b/crates/agent/src/reconcile/mod.rs @@ -0,0 +1,91 @@ +use std::{collections::HashSet, time::Duration}; + +use indexmap::IndexSet; + +mod agent; +mod checkpoint; +mod files; +pub use files::*; + +#[derive(Debug, Clone, PartialEq, Eq, PartialOrd, Ord, Hash)] +pub enum ReconcileCondition { + /// A file is being downloaded. + PendingDownload(String), + /// A file is being unpacked. + PendingUnpack(String), + /// A process is being spawned / confirmed + PendingProcess(String), +} + +trait Reconcile { + async fn reconcile(&self) -> Result, E>; +} + +pub struct ReconcileStatus { + pub inner: Option, + pub requeue_after: Option, + pub conditions: IndexSet, +} + +impl Default for ReconcileStatus { + fn default() -> Self { + Self::new(Some(Default::default())) + } +} + +impl ReconcileStatus { + pub fn new(inner: Option) -> Self { + Self { + inner, + requeue_after: None, + conditions: IndexSet::new(), + } + } + + pub fn empty() -> Self { + Self::new(None) + } + + pub fn is_requeue(&self) -> bool { + self.requeue_after.is_some() + } + + pub fn replace(&self, inner: Option) -> ReconcileStatus { + ReconcileStatus { + inner, + requeue_after: self.requeue_after, + conditions: self.conditions.clone(), + } + } + + pub fn emptied(&self) -> ReconcileStatus { + ReconcileStatus { + inner: None, + requeue_after: self.requeue_after, + conditions: self.conditions.clone(), + } + } + + pub fn take(self) -> Option { + self.inner + } + + pub fn take_conditions(&mut self) -> IndexSet { + std::mem::take(&mut self.conditions) + } + + pub fn requeue_after(mut self, duration: Duration) -> Self { + self.requeue_after = Some(duration); + self + } + + pub fn add_condition(mut self, condition: ReconcileCondition) -> Self { + self.conditions.insert(condition); + self + } + + pub fn add_conditions(mut self, conditions: HashSet) -> Self { + self.conditions.extend(conditions); + self + } +} diff --git a/crates/agent/src/rpc/agent.rs b/crates/agent/src/rpc/agent.rs index 7b668654..0af005a2 100644 --- a/crates/agent/src/rpc/agent.rs +++ b/crates/agent/src/rpc/agent.rs @@ -34,8 +34,11 @@ impl AgentNodeService for AgentNodeRpcServer { block_timestamp, }: SnarkOSBlockInfo, ) -> Result<(), ()> { - self.state - .client + let Some(client) = self.state.client.read().await.clone() else { + return Ok(()); // ignore if client is not available + }; + + client .post_block_status( context::current(), height, @@ -50,8 +53,11 @@ impl AgentNodeService for AgentNodeRpcServer { } async fn post_status(self, _: context::Context, status: SnarkOSStatus) -> Result<(), ()> { - self.state - .client + let Some(client) = self.state.client.read().await.clone() else { + return Ok(()); // ignore if client is not available + }; + + client .post_node_status(context::current(), status.into()) .await .inspect_err(|err| tracing::error!("failed to post node status: {err}")) diff --git a/crates/agent/src/rpc/control.rs b/crates/agent/src/rpc/control.rs index 3d366f93..5aafaf29 100644 --- a/crates/agent/src/rpc/control.rs +++ b/crates/agent/src/rpc/control.rs @@ -18,7 +18,7 @@ use snops_common::{ AgentMetric, AgentService, AgentServiceRequest, AgentServiceResponse, AgentStatus, Handshake, }, - ControlServiceRequest, ControlServiceResponse, + ControlServiceClient, ControlServiceRequest, ControlServiceResponse, }, error::{AgentError, ReconcileError, SnarkosRequestError}, }, @@ -29,7 +29,8 @@ use tokio::process::Command; use 
tracing::{debug, error, info, trace, warn}; use crate::{ - api, make_env_filter, + api, + log::make_env_filter, metrics::MetricComputer, reconcile::{self, ensure_correct_binary}, state::AppState, @@ -42,6 +43,7 @@ define_rpc_mux!(child; #[derive(Clone)] pub struct AgentRpcServer { + pub client: ControlServiceClient, pub state: AppState, pub version: &'static str, } @@ -78,7 +80,6 @@ impl AgentService for AgentRpcServer { { error!("failed to save loki URL to db: {e}"); } - match self.state.loki.lock() { Ok(mut guard) => { *guard = loki_url; @@ -90,7 +91,6 @@ impl AgentService for AgentRpcServer { // emit the transfer statuses if let Err(err) = self - .state .client .post_transfer_statuses( context, @@ -329,7 +329,7 @@ impl AgentService for AgentRpcServer { .collect::>() .join(",") ); - let new_addrs = state + let new_addrs = self .client .resolve_addrs(context::current(), unresolved_addrs) .await @@ -346,7 +346,13 @@ impl AgentService for AgentRpcServer { .collect::>() .join(", ") ); - state.resolved_addrs.write().await.extend(new_addrs); + { + let mut guard = state.resolved_addrs.write().await; + guard.extend(new_addrs); + if let Err(e) = state.db.set_resolved_addrs(Some(&guard)) { + error!("failed to save resolved addrs to db: {e}"); + } + } } if !node.peers.is_empty() { diff --git a/crates/agent/src/state.rs b/crates/agent/src/state.rs index e277771c..e8f2c459 100644 --- a/crates/agent/src/state.rs +++ b/crates/agent/src/state.rs @@ -1,16 +1,15 @@ use std::{ - collections::HashMap, net::IpAddr, sync::{Arc, Mutex}, time::{Duration, Instant}, }; -use anyhow::bail; use dashmap::DashMap; +use indexmap::IndexMap; use reqwest::Url; use snops_common::{ api::EnvInfo, - rpc::{agent::node::NodeServiceClient, control::ControlServiceClient}, + rpc::{agent::node::NodeServiceClient, control::ControlServiceClient, error::ReconcileError2}, state::{AgentId, AgentPeer, AgentState, EnvId, TransferId, TransferStatus}, util::OpaqueDebug, }; @@ -23,18 +22,18 @@ use tokio::{ }; use tracing::{error, info}; -use crate::{cli::Cli, db::Database, metrics::Metrics, transfers::TransferTx, ReloadHandler}; +use crate::{cli::Cli, db::Database, log::ReloadHandler, metrics::Metrics, transfers::TransferTx}; pub const NODE_GRACEFUL_SHUTDOWN_TIMEOUT: Duration = Duration::from_secs(10); pub type AppState = Arc; +pub type ClientLock = Arc>>; /// Global state for this agent runner. pub struct GlobalState { - pub client: ControlServiceClient, + pub client: ClientLock, pub db: OpaqueDebug, pub _started: Instant, - pub connected: Mutex, pub external_addr: Option, pub internal_addrs: Vec, @@ -48,7 +47,7 @@ pub struct GlobalState { pub child: RwLock>, /* TODO: this may need to be handled by an owning thread, * not sure yet */ // Map of agent IDs to their resolved addresses. - pub resolved_addrs: RwLock>, + pub resolved_addrs: RwLock>, pub metrics: RwLock, pub transfer_tx: TransferTx, @@ -76,15 +75,31 @@ impl GlobalState { .collect::>() } - pub async fn get_env_info(&self, env_id: EnvId) -> anyhow::Result { + pub async fn set_env_info(&self, info: Option<(EnvId, EnvInfo)>) { + if let Err(e) = self.db.set_env_info(info.as_ref()) { + error!("failed to save env info to db: {e}"); + } + *self.env_info.write().await = info; + } + + pub async fn get_env_info(&self, env_id: EnvId) -> Result { match self.env_info.read().await.as_ref() { Some((id, info)) if *id == env_id => return Ok(info.clone()), _ => {} } - let Some(info) = self.client.get_env_info(context::current(), env_id).await? 
else { - bail!("failed to get env info: env not found {env_id}"); - }; + let client = self + .client + .read() + .await + .clone() + .ok_or(ReconcileError2::Offline)?; + + let info = client + .get_env_info(context::current(), env_id) + .await + .map_err(|e| ReconcileError2::RpcError(e.to_string()))? + .ok_or(ReconcileError2::MissingEnv(env_id))?; let env_info = (env_id, info.clone()); if let Err(e) = self.db.set_env_info(Some(&env_info)) { diff --git a/crates/agent/src/transfers.rs b/crates/agent/src/transfers.rs index 2d95cc28..8926d1fc 100644 --- a/crates/agent/src/transfers.rs +++ b/crates/agent/src/transfers.rs @@ -5,13 +5,12 @@ use std::sync::{ use chrono::{TimeDelta, Utc}; use dashmap::{mapref::entry::Entry, DashMap}; -use snops_common::{ - rpc::control::ControlServiceClient, - state::{TransferId, TransferStatus, TransferStatusUpdate}, -}; +use snops_common::state::{TransferId, TransferStatus, TransferStatusUpdate}; use tarpc::context; use tokio::{select, sync::mpsc}; +use crate::state::ClientLock; + pub type TransferTx = mpsc::UnboundedSender<(TransferId, TransferStatusUpdate)>; // how long to wait before cleaning up a transfer that has ended @@ -23,9 +22,7 @@ pub fn next_id() -> TransferId { TRANSFER_ID_CTR.fetch_add(1, Ordering::AcqRel) } -pub fn start_monitor( - client: ControlServiceClient, -) -> (TransferTx, Arc>) { +pub fn start_monitor(client: ClientLock) -> (TransferTx, Arc>) { let (tx, mut rx) = mpsc::unbounded_channel::<(TransferId, TransferStatusUpdate)>(); let state_transfers = Arc::new(DashMap::new()); @@ -39,7 +36,7 @@ pub fn start_monitor( // cleanup transfers that have ended _ = interval.tick() => { let now = Utc::now(); - let client = client.clone(); + let client = Arc::clone(&client); transfers.retain(|&id, transfer: &mut TransferStatus| { let is_done = transfer.total_bytes == transfer.downloaded_bytes; let is_error = transfer.interruption.is_some(); @@ -57,6 +54,10 @@ pub fn start_monitor( // send the update to the control plane let client = client.clone(); tokio::spawn(async move { + let Some(client) = client.read().await.clone() else { + return + }; + if let Err(e) = client.post_transfer_status(context::current(), id, TransferStatusUpdate::Cleanup).await { tracing::error!("failed to send transfer cleanup update: {e}"); } @@ -105,6 +106,10 @@ pub fn start_monitor( // send the update to the control plane let client = client.clone(); tokio::spawn(async move { + let Some(client) = client.read().await.clone() else { + return + }; + if let Err(e) = client.post_transfer_status(context::current(), id, message).await { tracing::error!("failed to send transfer status update: {e}"); } diff --git a/crates/common/src/rpc/error.rs b/crates/common/src/rpc/error.rs index cc7c856a..de6810cb 100644 --- a/crates/common/src/rpc/error.rs +++ b/crates/common/src/rpc/error.rs @@ -2,6 +2,8 @@ use serde::{Deserialize, Serialize}; use strum_macros::AsRefStr; use thiserror::Error; +use crate::state::EnvId; + #[macro_export] macro_rules! 
impl_into_type_str { ($name:path) => { @@ -146,3 +148,19 @@ pub enum ReconcileError { #[error("unknown error")] Unknown, } + +#[derive(Debug, Error, Serialize, Deserialize, AsRefStr)] +pub enum ReconcileError2 { + #[error("node is not connected to the controlplane")] + Offline, + #[error("env {0} not found")] + MissingEnv(EnvId), + #[error("unknown error")] + Unknown, + #[error("rpc error: {0}")] + RpcError(String), + #[error(transparent)] + AddressResolve(#[from] ResolveError), + #[error("missing local private key")] + MissingLocalPrivateKey, +} diff --git a/crates/common/src/state/port_config.rs b/crates/common/src/state/port_config.rs index e47c4ecf..222349ef 100644 --- a/crates/common/src/state/port_config.rs +++ b/crates/common/src/state/port_config.rs @@ -1,6 +1,6 @@ use crate::format::{DataFormat, DataFormatReader}; -#[derive(Debug, Clone, serde::Serialize, serde::Deserialize, clap::Parser)] +#[derive(Debug, Copy, Clone, serde::Serialize, serde::Deserialize, clap::Parser)] pub struct PortConfig { /// Specify the IP address and port for the node server #[clap(long = "node", default_value_t = 4130)] From 5db510f41fdbc403395ef545de60b237773ace7b Mon Sep 17 00:00:00 2001 From: Meshiest Date: Sat, 16 Nov 2024 16:06:55 -0500 Subject: [PATCH 04/68] feat(agent): WIP reconcile 2.0 loop, reconciler context and command equality testing --- crates/agent/src/main.rs | 79 +++++- crates/agent/src/reconcile/agent.rs | 319 ++++++++++++++++++------- crates/agent/src/reconcile/mod.rs | 21 +- crates/agent/src/rpc/control.rs | 32 ++- crates/agent/src/state.rs | 15 +- crates/common/src/state/port_config.rs | 2 +- 6 files changed, 352 insertions(+), 116 deletions(-) diff --git a/crates/agent/src/main.rs b/crates/agent/src/main.rs index 2bcab7b4..aecc3ed9 100644 --- a/crates/agent/src/main.rs +++ b/crates/agent/src/main.rs @@ -12,6 +12,7 @@ mod transfers; use std::{ net::Ipv4Addr, + ops::Deref, sync::{Arc, Mutex}, time::{Duration, Instant}, }; @@ -20,13 +21,14 @@ use clap::Parser; use cli::Cli; use futures_util::stream::{FuturesUnordered, StreamExt}; use log::init_logging; +use reconcile::{agent::AgentStateReconciler, Reconcile}; use snops_common::{db::Database, util::OpaqueDebug}; use tokio::{ select, signal::unix::{signal, Signal, SignalKind}, - sync::RwLock, + sync::{mpsc, RwLock}, }; -use tracing::{error, info}; +use tracing::{error, info, trace}; use crate::state::GlobalState; mod log; @@ -71,6 +73,8 @@ async fn main() { .expect("failed to get status server port") .port(); + let (queue_reconcile_tx, mut reconcile_requests) = mpsc::channel(5); + // create the client state let state = Arc::new(GlobalState { client, @@ -79,6 +83,7 @@ async fn main() { internal_addrs, cli: args, endpoint, + queue_reconcile_tx, loki: Mutex::new(db.loki_url()), env_info: RwLock::new( db.env_info() @@ -89,6 +94,7 @@ async fn main() { ), agent_state: RwLock::new( db.agent_state() + .map(Arc::new) .inspect_err(|e| { error!("failed to load agent state from db: {e}"); }) @@ -96,7 +102,13 @@ async fn main() { ), reconcilation_handle: Default::default(), child: Default::default(), - resolved_addrs: Default::default(), + resolved_addrs: RwLock::new( + db.resolved_addrs() + .inspect_err(|e| { + error!("failed to load resolved addrs from db: {e}"); + }) + .unwrap_or_default(), + ), metrics: Default::default(), agent_rpc_port, transfer_tx, @@ -132,12 +144,71 @@ async fn main() { } }); + let state3 = Arc::clone(&state); + let reconcile_loop = Box::pin(async move { + let mut err_backoff = 0; + let mut reconcile_ctx = Default::default(); 
+ + // The first reconcile is scheduled for 5 seconds after startup. + // Connecting to the controlplane will likely trigger a reconcile sooner. + let mut next_reconcile_at = Instant::now() + Duration::from_secs(5); + let mut wait = Box::pin(tokio::time::sleep_until(next_reconcile_at.into())); + + loop { + // await for the next reconcile, allowing for it to be moved up sooner + select! { + // replace the next_reconcile_at with the soonest reconcile time + Some(new_reconcile_at) = reconcile_requests.recv() => { + next_reconcile_at = next_reconcile_at.min(new_reconcile_at); + wait = Box::pin(tokio::time::sleep_until(next_reconcile_at.into())); + }, + _ = &mut wait => {} + } + + // drain the reconcile request queue + while reconcile_requests.try_recv().is_ok() {} + // schedule the next reconcile for 5 minutes from now + next_reconcile_at = Instant::now() + Duration::from_secs(5 * 60); + + trace!("reconciling agent state..."); + match (AgentStateReconciler { + agent_state: Arc::clone(state3.agent_state.read().await.deref()), + state: Arc::clone(&state3), + context: std::mem::take(&mut reconcile_ctx), + }) + .reconcile() + .await + { + Ok(mut status) => { + if let Some(context) = status.inner.take() { + trace!("reconcile completed"); + reconcile_ctx = context; + } + if !status.conditions.is_empty() { + trace!("reconcile conditions: {:?}", status.conditions); + } + if let Some(requeue_after) = status.requeue_after { + next_reconcile_at = Instant::now() + requeue_after; + } + } + Err(e) => { + error!("failed to reconcile agent state: {e}"); + err_backoff = (err_backoff + 5).min(30); + next_reconcile_at = Instant::now() + Duration::from_secs(err_backoff); + } + } + + // TODO: announce reconcile status to the server, throttled + } + }); + select! { _ = interrupt.recv_any() => { info!("Received interrupt signal, shutting down..."); }, - _ = connection_loop => unreachable!() + _ = connection_loop => unreachable!(), + _ = reconcile_loop => unreachable!(), } state.node_graceful_shutdown().await; diff --git a/crates/agent/src/reconcile/agent.rs b/crates/agent/src/reconcile/agent.rs index 5c1a653e..89db84cb 100644 --- a/crates/agent/src/reconcile/agent.rs +++ b/crates/agent/src/reconcile/agent.rs @@ -1,38 +1,71 @@ -use std::{collections::HashSet, ops::Deref, process::Stdio, sync::Arc}; +use std::{ + collections::HashSet, net::IpAddr, ops::Deref, path::PathBuf, process::Stdio, sync::Arc, +}; +use indexmap::IndexMap; +use snops_checkpoint::RetentionPolicy; use snops_common::{ + api::EnvInfo, constant::{ LEDGER_BASE_DIR, LEDGER_PERSIST_DIR, SNARKOS_FILE, SNARKOS_GENESIS_FILE, SNARKOS_LOG_FILE, }, rpc::error::ReconcileError2, - state::{AgentId, AgentPeer, AgentState, EnvId, KeyState, NodeState}, + state::{ + AgentId, AgentPeer, AgentState, EnvId, InternedId, KeyState, NetworkId, NodeKey, NodeState, + PortConfig, + }, }; use tarpc::context; use tokio::process::Command; use tracing::{error, warn}; +use url::Url; use super::{Reconcile, ReconcileStatus}; use crate::state::GlobalState; -struct AgentStateReconciler { - agent_state: AgentState, - state: Arc, +/// Attempt to reconcile the agent's current state. 
+/// This will download files and start/stop the node +pub struct AgentStateReconciler { + pub agent_state: Arc, + pub state: Arc, + pub context: AgentStateReconcilerContext, } -impl Reconcile<(), ReconcileError2> for AgentStateReconciler { - async fn reconcile(&self) -> Result, ReconcileError2> { - match &self.agent_state { +#[derive(Default)] +pub struct AgentStateReconcilerContext { + /// All parameters needed to build the command to start the node + command: Option, + // TODO: store active transfers here for monitoring + // TODO: update api::download_file to receive a transfer id +} + +impl Reconcile for AgentStateReconciler { + async fn reconcile( + self, + ) -> Result, ReconcileError2> { + match self.agent_state.as_ref() { AgentState::Inventory => { // TODO: cleanup child process // TODO: cleanup other things - return Ok(ReconcileStatus::empty()); + + // return a default context because the node, in inventory, has no state + return Ok(ReconcileStatus::default().add_scope("agent_state/inventory")); } AgentState::Node(env_id, node) => { // node is offline, no need to reconcile if !node.online { - return Ok(ReconcileStatus::empty()); + // TODO: tear down the node if it is running + return Ok( + ReconcileStatus::with(self.context).add_scope("agent_state/node/offline") + ); } + // TODO: download binaries + // TODO: restart the node if the binaries changed. this means storing the hashes + // of the downloaded files + + // TODO: requeue if the binaries are not ready + let command_res = NodeCommandReconciler { env_id: *env_id, node: Arc::new(*node.clone()), @@ -42,14 +75,21 @@ impl Reconcile<(), ReconcileError2> for AgentStateReconciler { .await?; if command_res.is_requeue() { - return Ok(command_res.emptied()); + return Ok(command_res.emptied().add_scope("agent_state/node/requeue")); } - let Some(_command) = command_res.take() else { - return Ok(ReconcileStatus::default()); + let Some(command) = command_res.take() else { + return Ok(ReconcileStatus::default().add_scope("agent_state/node/no_command")); }; - // TODO: spawn the command, manage its state + if self.context.command.as_ref() != Some(&command) { + // TODO: OK to restart the node -- command has changed + } + + // TODO: spawn the command, manage its state, check that it's up + // TODO: if possible, use the NodeCommand as configuration for a node service to + // allow running the node outside of the agent + let _cmd = command.build(); } } @@ -57,62 +97,71 @@ impl Reconcile<(), ReconcileError2> for AgentStateReconciler { } } +/// Given a node state, construct the command needed to start the node struct NodeCommandReconciler { node: Arc, state: Arc, env_id: EnvId, } -impl Reconcile for NodeCommandReconciler { - async fn reconcile(&self) -> Result, ReconcileError2> { - let NodeCommandReconciler { - node, - state, - env_id, - } = self; - let info = state.get_env_info(*env_id).await?; - - // Resolve the addresses of the peers and validators - let res = AddressResolveReconciler { - node: Arc::clone(node), - state: Arc::clone(state), - } - .reconcile() - .await?; - - if res.is_requeue() { - return Ok(res.emptied()); - } +#[derive(Debug, Clone, Eq, PartialEq)] +struct NodeCommand { + /// Path to the snarkos binary + command_path: PathBuf, + /// If true, do not print stdout + quiet: bool, + /// Environment ID (used in loki) + env_id: EnvId, + /// Node key (drives NETWORK env) + network: NetworkId, + /// Node key (derives node type and loki) + node_key: NodeKey, + /// URL for sending logs to loki + loki: Option, + /// Path to the ledger 
directory + ledger_path: PathBuf, + /// Path to place the log file + log_path: PathBuf, + /// Path to genesis block. When absent, use the network's genesis block. + genesis_path: Option, + /// Env variables to pass to the node + env: IndexMap, + /// Port to bind the agent's RPC server for node status + agent_rpc_port: u16, + /// Address to bind the node to + bind_addr: IpAddr, + /// Port configuration for the node + ports: PortConfig, + /// Private key to use for the node + private_key: Option, + /// Path to a file containing the private key + private_key_file: Option, + /// Retention policy for the node + retention_policy: Option, + /// Resolved peer addresses for the node + peers: Vec, + /// Resolved validator addresses for the node + validators: Vec, +} - let mut command = Command::new(state.cli.path.join(SNARKOS_FILE)); +impl NodeCommand { + fn build(&self) -> Command { + let mut command = Command::new(&self.command_path); // set stdio - if state.cli.quiet { + if self.quiet { command.stdout(Stdio::null()); } else { command.stdout(std::io::stdout()); } command.stderr(std::io::stderr()); - let storage_path = state - .cli - .path - .join("storage") - .join(info.network.to_string()) - .join(info.storage.id.to_string()); - - let ledger_path = if info.storage.persist { - storage_path.join(LEDGER_PERSIST_DIR) - } else { - state.cli.path.join(LEDGER_BASE_DIR) - }; - // add loki URL if one is set - if let Some(loki) = state.loki.lock().unwrap().deref() { + if let Some(loki) = &self.loki { command .env( "SNOPS_LOKI_LABELS", - format!("env_id={},node_key={}", env_id, node.node_key), + format!("env_id={},node_key={}", self.env_id, self.node_key), ) .arg("--loki") .arg(loki.as_str()); @@ -121,83 +170,148 @@ impl Reconcile for NodeCommandReconciler { // setup the run command command .stderr(std::io::stderr()) - .envs(&node.env) - .env("NETWORK", info.network.to_string()) - .env("HOME", &ledger_path) + .envs(&self.env) + .env("NETWORK", self.network.to_string()) + .env("HOME", &self.ledger_path) .arg("--log") - .arg(state.cli.path.join(SNARKOS_LOG_FILE)) + .arg(&self.log_path) .arg("run") .arg("--agent-rpc-port") - .arg(state.agent_rpc_port.to_string()) + .arg(self.agent_rpc_port.to_string()) .arg("--type") - .arg(node.node_key.ty.to_string()) + .arg(self.node_key.ty.to_string()) .arg("--ledger") - .arg(ledger_path); + .arg(&self.ledger_path); - if !info.storage.native_genesis { - command - .arg("--genesis") - .arg(storage_path.join(SNARKOS_GENESIS_FILE)); + if let Some(genesis) = &self.genesis_path { + command.arg("--genesis").arg(genesis); } // storage configuration command // port configuration .arg("--bind") - .arg(state.cli.bind_addr.to_string()) + .arg(self.bind_addr.to_string()) .arg("--bft") - .arg(state.cli.ports.bft.to_string()) + .arg(self.ports.bft.to_string()) .arg("--rest") - .arg(state.cli.ports.rest.to_string()) + .arg(self.ports.rest.to_string()) .arg("--metrics") - .arg(state.cli.ports.metrics.to_string()) + .arg(self.ports.metrics.to_string()) .arg("--node") - .arg(state.cli.ports.node.to_string()); + .arg(self.ports.node.to_string()); - match &node.private_key { - KeyState::None => {} - KeyState::Local => { - command.arg("--private-key-file").arg( - state - .cli - .private_key_file - .as_ref() - .ok_or(ReconcileError2::MissingLocalPrivateKey)?, - ); - } - KeyState::Literal(pk) => { - command.arg("--private-key").arg(pk); - } + if let Some(pk) = &self.private_key { + command.arg("--private-key").arg(pk); + } + + if let Some(pk_file) = &self.private_key_file { + 
command.arg("--private-key-file").arg(pk_file); } // conditionally add retention policy - if let Some(policy) = &info.storage.retention_policy { + if let Some(policy) = &self.retention_policy { command.arg("--retention-policy").arg(policy.to_string()); } - if !node.peers.is_empty() { - command - .arg("--peers") - .arg(state.agentpeers_to_cli(&node.peers).await.join(",")); + if !self.peers.is_empty() { + command.arg("--peers").arg(self.peers.join(",")); } - if !node.validators.is_empty() { - command - .arg("--validators") - .arg(state.agentpeers_to_cli(&node.validators).await.join(",")); + if !self.validators.is_empty() { + command.arg("--validators").arg(self.validators.join(",")); + } + + command + } +} + +impl Reconcile for NodeCommandReconciler { + async fn reconcile(self) -> Result, ReconcileError2> { + let NodeCommandReconciler { + node, + state, + env_id, + } = self; + let info = state.get_env_info(env_id).await?; + + // Resolve the addresses of the peers and validators + let res = AddressResolveReconciler { + node: Arc::clone(&node), + state: Arc::clone(&state), + } + .reconcile() + .await?; + + if res.is_requeue() { + return Ok(res + .emptied() + .add_scope("node_command/address_resolve/requeue")); } - Ok(ReconcileStatus::new(Some(command))) + let storage_path = state + .cli + .path + .join("storage") + .join(info.network.to_string()) + .join(info.storage.id.to_string()); + + let ledger_path = if info.storage.persist { + storage_path.join(LEDGER_PERSIST_DIR) + } else { + state.cli.path.join(LEDGER_BASE_DIR) + }; + + let run = NodeCommand { + command_path: state.cli.path.join(SNARKOS_FILE), + quiet: state.cli.quiet, + env_id, + node_key: node.node_key.clone(), + loki: state.loki.lock().ok().and_then(|l| l.deref().clone()), + ledger_path, + log_path: state.cli.path.join(SNARKOS_LOG_FILE), + genesis_path: (!info.storage.native_genesis) + .then(|| storage_path.join(SNARKOS_GENESIS_FILE)), + network: info.network, + env: node.env.clone(), + agent_rpc_port: state.agent_rpc_port, + bind_addr: state.cli.bind_addr, + ports: state.cli.ports, + private_key: if let KeyState::Literal(pk) = &node.private_key { + Some(pk.clone()) + } else { + None + }, + private_key_file: if let KeyState::Local = &node.private_key { + Some( + state + .cli + .private_key_file + .clone() + .ok_or(ReconcileError2::MissingLocalPrivateKey)?, + ) + } else { + None + }, + peers: state.agentpeers_to_cli(&node.peers).await, + validators: state.agentpeers_to_cli(&node.validators).await, + retention_policy: info.storage.retention_policy.clone(), + }; + + Ok(ReconcileStatus::new(Some(run))) } } +/// Given a node state, resolve the addresses of the agent based peers and +/// validators. Non-agent based peers have their addresses within the state +/// already. struct AddressResolveReconciler { state: Arc, node: Arc, } impl Reconcile<(), ReconcileError2> for AddressResolveReconciler { - async fn reconcile(&self) -> Result, ReconcileError2> { + async fn reconcile(self) -> Result, ReconcileError2> { let AddressResolveReconciler { state, node } = self; // Find agents that do not have cached addresses @@ -265,3 +379,24 @@ impl Reconcile<(), ReconcileError2> for AddressResolveReconciler { Ok(ReconcileStatus::default()) } } + +/// Download a specific binary file needed to run the node +struct BinaryReconciler { + binary_id: Option, + state: Arc, + info: EnvInfo, +} + +// TODO: binary reconcile behavior: +// 1. check if the file exists. +// 2. if not, start downloading the file +// 3. 
if the file is already downloading, requeue if not done +// 4. when the transfer is done, check the sha256 hash and size + +// TODO: large file download behavior (ledgers): +// same as above, except maybe chunk the downloads or + +// TODO: support ledger.aleo.network snapshots: +// https://ledger.aleo.network/mainnet/snapshot/latest.txt +// https://ledger.aleo.network/testnet/snapshot/latest.txt +// https://ledger.aleo.network/canarynet/snapshot/latest.txt diff --git a/crates/agent/src/reconcile/mod.rs b/crates/agent/src/reconcile/mod.rs index facc9a7c..fecf680d 100644 --- a/crates/agent/src/reconcile/mod.rs +++ b/crates/agent/src/reconcile/mod.rs @@ -1,8 +1,8 @@ -use std::{collections::HashSet, time::Duration}; +use std::{collections::HashSet, fmt::Display, time::Duration}; use indexmap::IndexSet; -mod agent; +pub mod agent; mod checkpoint; mod files; pub use files::*; @@ -17,11 +17,12 @@ pub enum ReconcileCondition { PendingProcess(String), } -trait Reconcile { - async fn reconcile(&self) -> Result, E>; +pub trait Reconcile { + async fn reconcile(self) -> Result, E>; } pub struct ReconcileStatus { + pub scopes: Vec, pub inner: Option, pub requeue_after: Option, pub conditions: IndexSet, @@ -36,12 +37,17 @@ impl Default for ReconcileStatus { impl ReconcileStatus { pub fn new(inner: Option) -> Self { Self { + scopes: Vec::new(), inner, requeue_after: None, conditions: IndexSet::new(), } } + pub fn with(inner: T) -> Self { + Self::new(Some(inner)) + } + pub fn empty() -> Self { Self::new(None) } @@ -52,6 +58,7 @@ impl ReconcileStatus { pub fn replace(&self, inner: Option) -> ReconcileStatus { ReconcileStatus { + scopes: self.scopes.clone(), inner, requeue_after: self.requeue_after, conditions: self.conditions.clone(), @@ -61,6 +68,7 @@ impl ReconcileStatus { pub fn emptied(&self) -> ReconcileStatus { ReconcileStatus { inner: None, + scopes: self.scopes.clone(), requeue_after: self.requeue_after, conditions: self.conditions.clone(), } @@ -79,6 +87,11 @@ impl ReconcileStatus { self } + pub fn add_scope(mut self, scope: impl Display) -> Self { + self.scopes.push(scope.to_string()); + self + } + pub fn add_condition(mut self, condition: ReconcileCondition) -> Self { self.conditions.insert(condition); self diff --git a/crates/agent/src/rpc/control.rs b/crates/agent/src/rpc/control.rs index 5aafaf29..f49a7bfc 100644 --- a/crates/agent/src/rpc/control.rs +++ b/crates/agent/src/rpc/control.rs @@ -2,6 +2,7 @@ use std::{ collections::HashSet, net::IpAddr, ops::Deref, path::PathBuf, process::Stdio, sync::Arc, + time::Duration, }; use snops_common::{ @@ -105,11 +106,12 @@ impl AgentService for AgentRpcServer { error!("failed to send transfer statuses: {err}"); } - // reconcile if state has changed - let needs_reconcile = *self.state.agent_state.read().await != handshake.state; - if needs_reconcile { - Self::reconcile(self, context, handshake.state).await?; - } + info!("queing reconcilation on handshake..."); + + // Queue a reconcile immediately as we have received new state. 
+        // The reconciler will decide if anything has actually changed
+        *self.state.agent_state.write().await = Arc::new(handshake.state);
+        self.state.queue_reconcile(Duration::ZERO).await;
 
         Ok(())
     }
@@ -119,7 +121,11 @@ impl AgentService for AgentRpcServer {
         _: context::Context,
         target: AgentState,
     ) -> Result<(), ReconcileError> {
-        info!("beginning reconcilation...");
+        info!("queuing reconciliation...");
+        *self.state.agent_state.write().await = Arc::new(target.clone());
+        self.state.queue_reconcile(Duration::ZERO).await;
+
+        // TODO: remove the following code, handled entirely by the reconciler logic
 
         // acquire the handle lock
         let mut handle_container = self.state.reconcilation_handle.lock().await;
@@ -136,7 +142,7 @@ impl AgentService for AgentRpcServer {
         // previous state cleanup
         let old_state = {
             let agent_state_lock = state.agent_state.read().await;
-            match agent_state_lock.deref() {
+            match agent_state_lock.as_ref() {
                 // kill existing child if running
                 AgentState::Node(_, node) if node.online => {
                     info!("cleaning up snarkos process...");
@@ -151,7 +157,7 @@ impl AgentService for AgentRpcServer {
 
         // download new storage if storage_id changed
         'storage: {
-            let (is_same_env, is_same_index) = match (&old_state, &target) {
+            let (is_same_env, is_same_index) = match (old_state.as_ref(), &target) {
                 (AgentState::Node(old_env, old_node), AgentState::Node(new_env, new_node)) => {
                     (old_env == new_env, old_node.height.0 == new_node.height.0)
                 }
@@ -383,11 +389,11 @@ impl AgentService for AgentRpcServer {
             }
 
             // After completing the reconcilation, update the agent state
-            let mut agent_state = state.agent_state.write().await;
+            let target = Arc::new(target);
             if let Err(e) = state.db.set_agent_state(Some(&target)) {
                 error!("failed to save agent state to db: {e}");
             }
-            *agent_state = target;
+            *state.agent_state.write().await = target;
 
             Ok(())
         });
@@ -421,7 +427,7 @@ impl AgentService for AgentRpcServer {
 
     async fn get_addrs(self, _: context::Context) -> (PortConfig, Option, Vec) {
         (
-            self.state.cli.ports.clone(),
+            self.state.cli.ports,
             self.state.external_addr,
             self.state.internal_addrs.clone(),
         )
@@ -433,7 +439,7 @@ impl AgentService for AgentRpcServer {
         route: String,
     ) -> Result {
         let env_id =
-            if let AgentState::Node(env_id, state) = self.state.agent_state.read().await.deref() {
+            if let AgentState::Node(env_id, state) = self.state.agent_state.read().await.as_ref() {
                 if !state.online {
                     return Err(SnarkosRequestError::OfflineNode);
                 }
@@ -472,7 +478,7 @@ impl AgentService for AgentRpcServer {
 
     async fn broadcast_tx(self, _: context::Context, tx: String) -> Result<(), AgentError> {
         let env_id =
-            if let AgentState::Node(env_id, _) = self.state.agent_state.read().await.deref() {
+            if let AgentState::Node(env_id, _) = self.state.agent_state.read().await.as_ref() {
                 *env_id
             } else {
                 return Err(AgentError::InvalidState);
diff --git a/crates/agent/src/state.rs b/crates/agent/src/state.rs
index e8f2c459..9f13e348 100644
--- a/crates/agent/src/state.rs
+++ b/crates/agent/src/state.rs
@@ -17,7 +17,7 @@ use tarpc::context;
 use tokio::{
     process::Child,
     select,
-    sync::{Mutex as AsyncMutex, RwLock},
+    sync::{mpsc::Sender, Mutex as AsyncMutex, RwLock},
     task::AbortHandle,
 };
 use tracing::{error, info};
@@ -41,7 +41,12 @@ pub struct GlobalState {
     pub cli: Cli,
     pub endpoint: String,
     pub loki: Mutex>,
-    pub agent_state: RwLock,
+    /// Desired state the agent should be in. After each reconciliation, the
+    /// agent will attempt to transition to this state.
+ pub agent_state: RwLock>, + /// A sender for emitting the next time to reconcile the agent. + /// Helpful for scheduling the next reconciliation. + pub queue_reconcile_tx: Sender, pub env_info: RwLock>, pub reconcilation_handle: AsyncMutex>, pub child: RwLock>, /* TODO: this may need to be handled by an owning thread, @@ -75,6 +80,12 @@ impl GlobalState { .collect::>() } + pub async fn queue_reconcile(&self, duration: Duration) -> bool { + self.queue_reconcile_tx + .try_send(Instant::now() + duration) + .is_ok() + } + pub async fn set_env_info(&self, info: Option<(EnvId, EnvInfo)>) { if let Err(e) = self.db.set_env_info(info.as_ref()) { error!("failed to save env info to db: {e}"); diff --git a/crates/common/src/state/port_config.rs b/crates/common/src/state/port_config.rs index 222349ef..675dbd8c 100644 --- a/crates/common/src/state/port_config.rs +++ b/crates/common/src/state/port_config.rs @@ -1,6 +1,6 @@ use crate::format::{DataFormat, DataFormatReader}; -#[derive(Debug, Copy, Clone, serde::Serialize, serde::Deserialize, clap::Parser)] +#[derive(Debug, Copy, Clone, serde::Serialize, serde::Deserialize, clap::Parser, Eq, PartialEq)] pub struct PortConfig { /// Specify the IP address and port for the node server #[clap(long = "node", default_value_t = 4130)] From 0f85e91e50287ba11731f2e17b6a25c3fae8e607 Mon Sep 17 00:00:00 2001 From: Meshiest Date: Sat, 16 Nov 2024 16:21:09 -0500 Subject: [PATCH 05/68] refactor(agent): cleanup agent reconciler mutability --- crates/agent/src/main.rs | 28 ++++++++++++++++------------ crates/agent/src/reconcile/agent.rs | 25 +++++++++++-------------- crates/agent/src/reconcile/mod.rs | 10 ++++++---- 3 files changed, 33 insertions(+), 30 deletions(-) diff --git a/crates/agent/src/main.rs b/crates/agent/src/main.rs index aecc3ed9..3627672c 100644 --- a/crates/agent/src/main.rs +++ b/crates/agent/src/main.rs @@ -147,7 +147,15 @@ async fn main() { let state3 = Arc::clone(&state); let reconcile_loop = Box::pin(async move { let mut err_backoff = 0; - let mut reconcile_ctx = Default::default(); + + // Root reconciler that walks through configuring the agent. + // The context is mutated while reconciling to keep track of things + // like downloads, ledger manipulations, node command, and more. + let mut root = AgentStateReconciler { + agent_state: Arc::clone(state3.agent_state.read().await.deref()), + state: Arc::clone(&state3), + context: Default::default(), + }; // The first reconcile is scheduled for 5 seconds after startup. // Connecting to the controlplane will likely trigger a reconcile sooner. 
@@ -170,19 +178,15 @@ async fn main() { // schedule the next reconcile for 5 minutes from now next_reconcile_at = Instant::now() + Duration::from_secs(5 * 60); + // update the reconciler with the latest agent state + // this prevents the agent state from changing during reconciliation + root.agent_state = state3.agent_state.read().await.deref().clone(); + trace!("reconciling agent state..."); - match (AgentStateReconciler { - agent_state: Arc::clone(state3.agent_state.read().await.deref()), - state: Arc::clone(&state3), - context: std::mem::take(&mut reconcile_ctx), - }) - .reconcile() - .await - { - Ok(mut status) => { - if let Some(context) = status.inner.take() { + match root.reconcile().await { + Ok(status) => { + if status.inner.is_some() { trace!("reconcile completed"); - reconcile_ctx = context; } if !status.conditions.is_empty() { trace!("reconcile conditions: {:?}", status.conditions); diff --git a/crates/agent/src/reconcile/agent.rs b/crates/agent/src/reconcile/agent.rs index 89db84cb..cce2a139 100644 --- a/crates/agent/src/reconcile/agent.rs +++ b/crates/agent/src/reconcile/agent.rs @@ -37,27 +37,24 @@ pub struct AgentStateReconcilerContext { command: Option, // TODO: store active transfers here for monitoring // TODO: update api::download_file to receive a transfer id + // TODO: allow transfers to be interrupted. potentially allow them to be resumed by using the + // file range feature. } -impl Reconcile for AgentStateReconciler { - async fn reconcile( - self, - ) -> Result, ReconcileError2> { +impl Reconcile<(), ReconcileError2> for AgentStateReconciler { + async fn reconcile(&mut self) -> Result, ReconcileError2> { match self.agent_state.as_ref() { AgentState::Inventory => { // TODO: cleanup child process // TODO: cleanup other things - // return a default context because the node, in inventory, has no state return Ok(ReconcileStatus::default().add_scope("agent_state/inventory")); } AgentState::Node(env_id, node) => { // node is offline, no need to reconcile if !node.online { // TODO: tear down the node if it is running - return Ok( - ReconcileStatus::with(self.context).add_scope("agent_state/node/offline") - ); + return Ok(ReconcileStatus::default().add_scope("agent_state/node/offline")); } // TODO: download binaries @@ -227,18 +224,18 @@ impl NodeCommand { } impl Reconcile for NodeCommandReconciler { - async fn reconcile(self) -> Result, ReconcileError2> { + async fn reconcile(&mut self) -> Result, ReconcileError2> { let NodeCommandReconciler { node, state, env_id, } = self; - let info = state.get_env_info(env_id).await?; + let info = state.get_env_info(*env_id).await?; // Resolve the addresses of the peers and validators let res = AddressResolveReconciler { - node: Arc::clone(&node), - state: Arc::clone(&state), + node: Arc::clone(node), + state: Arc::clone(state), } .reconcile() .await?; @@ -265,7 +262,7 @@ impl Reconcile for NodeCommandReconciler { let run = NodeCommand { command_path: state.cli.path.join(SNARKOS_FILE), quiet: state.cli.quiet, - env_id, + env_id: *env_id, node_key: node.node_key.clone(), loki: state.loki.lock().ok().and_then(|l| l.deref().clone()), ledger_path, @@ -311,7 +308,7 @@ struct AddressResolveReconciler { } impl Reconcile<(), ReconcileError2> for AddressResolveReconciler { - async fn reconcile(self) -> Result, ReconcileError2> { + async fn reconcile(&mut self) -> Result, ReconcileError2> { let AddressResolveReconciler { state, node } = self; // Find agents that do not have cached addresses diff --git a/crates/agent/src/reconcile/mod.rs 
b/crates/agent/src/reconcile/mod.rs index fecf680d..ea375592 100644 --- a/crates/agent/src/reconcile/mod.rs +++ b/crates/agent/src/reconcile/mod.rs @@ -6,19 +6,21 @@ pub mod agent; mod checkpoint; mod files; pub use files::*; +use snops_common::state::TransferId; #[derive(Debug, Clone, PartialEq, Eq, PartialOrd, Ord, Hash)] pub enum ReconcileCondition { - /// A file is being downloaded. - PendingDownload(String), + /// A file is being transferred. + PendingTransfer(String, TransferId), /// A file is being unpacked. PendingUnpack(String), - /// A process is being spawned / confirmed + /// A process is being spawned / confirmed. Could be starting the node or + /// manipulating the ledger PendingProcess(String), } pub trait Reconcile { - async fn reconcile(self) -> Result, E>; + async fn reconcile(&mut self) -> Result, E>; } pub struct ReconcileStatus { From 22eab784a0fdb1a7120825551cdbb5e7da0ba71b Mon Sep 17 00:00:00 2001 From: Meshiest Date: Sun, 17 Nov 2024 00:49:44 -0500 Subject: [PATCH 06/68] feat(agent): WIP file and binary reconcilers --- crates/agent/src/api.rs | 100 +++-- crates/agent/src/db.rs | 6 +- crates/agent/src/reconcile/agent.rs | 399 ++++++++------------ crates/agent/src/reconcile/command.rs | 197 ++++++++++ crates/agent/src/reconcile/files.rs | 242 ++++++++++-- crates/agent/src/reconcile/mod.rs | 7 + crates/agent/src/state.rs | 16 +- crates/agent/src/transfers.rs | 6 + crates/common/src/binaries.rs | 9 +- crates/common/src/format/impl_containers.rs | 18 +- crates/common/src/rpc/error.rs | 18 + crates/common/src/state/agent_status.rs | 20 + 12 files changed, 740 insertions(+), 298 deletions(-) create mode 100644 crates/agent/src/reconcile/command.rs diff --git a/crates/agent/src/api.rs b/crates/agent/src/api.rs index 412e3458..c4bbfa7a 100644 --- a/crates/agent/src/api.rs +++ b/crates/agent/src/api.rs @@ -12,7 +12,8 @@ use reqwest::IntoUrl; use sha2::{Digest, Sha256}; use snops_common::{ binaries::{BinaryEntry, BinarySource}, - state::TransferStatusUpdate, + rpc::error::ReconcileError2, + state::{TransferId, TransferStatusUpdate}, util::sha256_file, }; use tokio::{fs::File, io::AsyncWriteExt}; @@ -24,6 +25,7 @@ const TRANSFER_UPDATE_RATE: Duration = Duration::from_secs(2); /// Download a file. Returns a None if 404. 
pub async fn download_file( + tx_id: TransferId, client: &reqwest::Client, url: impl IntoUrl, to: impl AsRef, @@ -35,8 +37,7 @@ pub async fn download_file( return Ok(None); } - // create a new transfer - let tx_id = transfers::next_id(); + // start a new transfer transfer_tx.send(( tx_id, TransferStatusUpdate::Start { @@ -105,7 +106,7 @@ pub async fn check_file( ) -> anyhow::Result<()> { let client = reqwest::Client::new(); - if !should_download_file(&client, url.as_str(), to, None) + if !should_download_file(&client, url.as_str(), to, None, None, false) .await .unwrap_or(true) { @@ -113,7 +114,9 @@ pub async fn check_file( } info!("downloading {to:?}"); - download_file(&client, url, to, transfer_tx).await?; + + let tx_id = transfers::next_id(); + download_file(tx_id, &client, url, to, transfer_tx).await?; Ok(()) } @@ -136,9 +139,16 @@ pub async fn check_binary( // this also checks for sha256 differences, along with last modified time // against the target - if !should_download_file(&client, &source_url, path, Some(binary)) - .await - .unwrap_or(true) + if !should_download_file( + &client, + &source_url, + path, + binary.size, + binary.sha256.as_deref(), + false, + ) + .await + .unwrap_or(true) { // check permissions and ensure 0o755 let perms = path.metadata()?.permissions(); @@ -152,7 +162,9 @@ pub async fn check_binary( } info!("downloading binary update to {}: {binary}", path.display()); - let Some((file, sha256, size)) = download_file(&client, &source_url, path, transfer_tx).await? + let tx_id = transfers::next_id(); + let Some((file, sha256, size)) = + download_file(tx_id, &client, &source_url, path, transfer_tx).await? else { bail!("downloading binary returned 404"); }; @@ -190,43 +202,73 @@ pub async fn should_download_file( client: &reqwest::Client, loc: &str, path: &Path, - binary: Option<&BinaryEntry>, -) -> anyhow::Result { + size: Option, + sha256: Option<&str>, + offline: bool, +) -> Result { if !path.exists() { return Ok(true); } - let meta = tokio::fs::metadata(&path).await?; + let meta = tokio::fs::metadata(&path) + .await + .map_err(|e| ReconcileError2::FileStatError(path.to_path_buf(), e.to_string()))?; let local_content_length = meta.len(); // if the binary entry is provided, check if the file size and sha256 match - if let Some(binary) = binary { - // file size is incorrect - if binary.size.is_some_and(|s| s != local_content_length) { - return Ok(true); - } + // file size is incorrect + if size.is_some_and(|s| s != local_content_length) { + return Ok(true); + } - // if sha256 is present, only download if the sha256 is different - if let Some(sha256) = binary.sha256.as_ref() { - return Ok(sha256_file(&path.to_path_buf())? != sha256.to_ascii_lowercase()); - } + // if sha256 is present, only download if the sha256 is different + if let Some(sha256) = sha256 { + return Ok(sha256_file(&path.to_path_buf()) + .map_err(|e| ReconcileError2::FileReadError(path.to_path_buf(), e.to_string()))? 
+ != sha256.to_ascii_lowercase()); + } + + // if we're offline, don't download + if offline { + return Ok(false); } // check last modified - let res = client.head(loc).send().await?; + let res = client + .head(loc) + .send() + .await + .map_err(|e| ReconcileError2::HttpError { + method: String::from("HEAD"), + url: loc.to_owned(), + error: e.to_string(), + })?; - let Some(last_modified_header) = res.headers().get(http::header::LAST_MODIFIED) else { + let Some(last_modified_header) = res + .headers() + .get(http::header::LAST_MODIFIED) + // parse as a string + .and_then(|e| e.to_str().ok()) + else { return Ok(true); }; - let Some(content_length_header) = res.headers().get(http::header::CONTENT_LENGTH) else { + let Some(remote_content_length) = res + .headers() + .get(http::header::CONTENT_LENGTH) + // parse the header as a u64 + .and_then(|e| e.to_str().ok().and_then(|s| s.parse::().ok())) + else { return Ok(true); }; - let remote_last_modified = httpdate::parse_http_date(last_modified_header.to_str()?)?; - let local_last_modified = meta.modified()?; + let remote_last_modified = httpdate::parse_http_date(last_modified_header); + let local_last_modified = meta + .modified() + .map_err(|e| ReconcileError2::FileStatError(path.to_path_buf(), e.to_string()))?; - let remote_content_length = content_length_header.to_str()?.parse::()?; - - Ok(remote_last_modified > local_last_modified || remote_content_length != local_content_length) + Ok(remote_last_modified + .map(|res| res > local_last_modified) + .unwrap_or(true) + || remote_content_length != local_content_length) } diff --git a/crates/agent/src/db.rs b/crates/agent/src/db.rs index c7d8b8cb..ab360b9f 100644 --- a/crates/agent/src/db.rs +++ b/crates/agent/src/db.rs @@ -2,7 +2,7 @@ use std::{ io::{Read, Write}, net::IpAddr, path::Path, - sync::Mutex, + sync::{Arc, Mutex}, }; use bytes::Buf; @@ -110,7 +110,7 @@ impl Database { .and_then(|url| url.parse::().ok()) } - pub fn env_info(&self) -> Result, DatabaseError> { + pub fn env_info(&self) -> Result)>, DatabaseError> { self.documents .restore(&AgentDbString::EnvInfo)? 
.map(|format::BinaryData(bytes)| read_dataformat(&mut bytes.reader())) @@ -118,7 +118,7 @@ impl Database { .map_err(DatabaseError::from) } - pub fn set_env_info(&self, info: Option<&(EnvId, EnvInfo)>) -> Result<(), DatabaseError> { + pub fn set_env_info(&self, info: Option<(EnvId, Arc)>) -> Result<(), DatabaseError> { if let Some(info) = info { self.documents.save( &AgentDbString::EnvInfo, diff --git a/crates/agent/src/reconcile/agent.rs b/crates/agent/src/reconcile/agent.rs index cce2a139..2f5e000d 100644 --- a/crates/agent/src/reconcile/agent.rs +++ b/crates/agent/src/reconcile/agent.rs @@ -1,26 +1,25 @@ use std::{ - collections::HashSet, net::IpAddr, ops::Deref, path::PathBuf, process::Stdio, sync::Arc, + collections::HashSet, + sync::Arc, + time::{Duration, Instant}, }; -use indexmap::IndexMap; -use snops_checkpoint::RetentionPolicy; use snops_common::{ api::EnvInfo, - constant::{ - LEDGER_BASE_DIR, LEDGER_PERSIST_DIR, SNARKOS_FILE, SNARKOS_GENESIS_FILE, SNARKOS_LOG_FILE, - }, + binaries::{BinaryEntry, BinarySource}, + constant::SNARKOS_FILE, rpc::error::ReconcileError2, state::{ - AgentId, AgentPeer, AgentState, EnvId, InternedId, KeyState, NetworkId, NodeKey, NodeState, - PortConfig, + AgentId, AgentPeer, AgentState, InternedId, NetworkId, NodeState, StorageId, TransferId, }, }; use tarpc::context; -use tokio::process::Command; -use tracing::{error, warn}; -use url::Url; +use tracing::{error, trace, warn}; -use super::{Reconcile, ReconcileStatus}; +use super::{ + command::NodeCommand, default_binary, FileReconciler, Reconcile, ReconcileCondition, + ReconcileStatus, +}; use crate::state::GlobalState; /// Attempt to reconcile the agent's current state. @@ -31,12 +30,40 @@ pub struct AgentStateReconciler { pub context: AgentStateReconcilerContext, } +#[derive(Default)] +struct TransfersContext { + // TODO: persist network_id, storage_id, and storage_version + network_id: NetworkId, + storage_id: StorageId, + storage_version: u16, + /// Metadata about an active binary transfer + binary_transfer: Option<(TransferId, BinaryEntry)>, + /// Time the binary was marked as OK + binary_ok_at: Option, + /// Metadata about an active genesis block transfer + genesis_transfer: Option, + /// Time the genesis block was marked as OK + genesis_ok_at: Option, + /// Metadata about an active ledger transfer + ledger_transfer: Option, + /// Time the ledger was marked as OK + ledger_ok_at: Option, +} + +impl TransfersContext { + pub fn changed(&self, env_info: &EnvInfo) -> bool { + env_info.storage.version != self.storage_version + || env_info.storage.id != self.storage_id + || env_info.network != self.network_id + } +} + #[derive(Default)] pub struct AgentStateReconcilerContext { /// All parameters needed to build the command to start the node command: Option, - // TODO: store active transfers here for monitoring - // TODO: update api::download_file to receive a transfer id + /// Information about active transfers + transfers: Option, // TODO: allow transfers to be interrupted. potentially allow them to be resumed by using the // file range feature. 
} @@ -51,33 +78,145 @@ impl Reconcile<(), ReconcileError2> for AgentStateReconciler { return Ok(ReconcileStatus::default().add_scope("agent_state/inventory")); } AgentState::Node(env_id, node) => { + let env_info = self.state.get_env_info(*env_id).await?; + + // Check if the storage version, storage id, or network id has changed + let storage_has_changed = self + .context + .transfers + .as_ref() + .map(|t| t.changed(&env_info)) + .unwrap_or(true); + + // If the node should be torn down, or the storage has changed, we need to + // gracefully shut down the node. + let shutdown_pending = !node.online || storage_has_changed; + // node is offline, no need to reconcile if !node.online { // TODO: tear down the node if it is running return Ok(ReconcileStatus::default().add_scope("agent_state/node/offline")); } - // TODO: download binaries - // TODO: restart the node if the binaries changed. this means storing the hashes - // of the downloaded files + let node_arc = Arc::new(*node.clone()); - // TODO: requeue if the binaries are not ready + if storage_has_changed { + // TODO: abort any ongoing transfers, then requeue + } - let command_res = NodeCommandReconciler { - env_id: *env_id, - node: Arc::new(*node.clone()), + // initialize the transfers context with the current status + if self.context.transfers.is_none() { + // TODO: write this to the db + self.context.transfers = Some(TransfersContext { + network_id: env_info.network, + storage_id: env_info.storage.id, + storage_version: env_info.storage.version, + ..Default::default() + }); + } + let transfers = self.context.transfers.as_mut().unwrap(); + + // Resolve the node's binary + // TODO: move into BinaryReconciler + 'binary: { + // Binary entry for the node + let default_binary = default_binary(&env_info); + let target_binary = env_info + .storage + .binaries + .get(&node.binary.unwrap_or_default()) + .unwrap_or(&default_binary); + + // Check if the binary has changed + let binary_has_changed = transfers + .binary_transfer + .as_ref() + .map(|(_, b)| b != target_binary) + .unwrap_or(true); + let binary_is_ok = transfers + .binary_ok_at + .map(|ok| ok.elapsed().as_secs() < 300) // check if the binary has been OK for 5 minutes + .unwrap_or(false); + + // If the binary has not changed and has not expired, we can skip the binary + // reconciler + if !binary_has_changed && binary_is_ok { + break 'binary; + } + + let src = match &target_binary.source { + BinarySource::Url(url) => url.clone(), + BinarySource::Path(path) => { + let url = format!("{}{}", &self.state.endpoint, path.display()); + url.parse::() + .map_err(|e| ReconcileError2::UrlParseError(url, e.to_string()))? + } + }; + let dst = self.state.cli.path.join(SNARKOS_FILE); + + let is_api_offline = self.state.client.read().await.is_none(); + + let binary_res = FileReconciler::new(Arc::clone(&self.state), src, dst) + .with_offline(target_binary.is_api_file() && is_api_offline) + .with_binary(target_binary) + .with_tx_id(transfers.binary_transfer.as_ref().map(|(tx, _)| *tx)) + .reconcile() + .await?; + + // transfer is pending or a failure occurred + if binary_res.is_requeue() { + return Ok(binary_res.emptied().add_scope("binary_reconcile/requeue")); + } + + match binary_res.inner { + // If the binary is OK, update the context + Some(true) => { + transfers.binary_ok_at = Some(Instant::now()); + } + // If the binary is not OK, we will wait for the endpoint to come back + // online... + Some(false) => { + trace!( + "binary is not OK, waiting for the endpoint to come back online..." 
+ ); + return Ok(ReconcileStatus::empty() + .add_condition(ReconcileCondition::PendingConnection) + .add_scope("binary_reconcile/offline") + .requeue_after(Duration::from_secs(5))); + } + None => unreachable!("file reconciler returns a result when not requeued"), + } + } + + // Resolve the addresses of the peers and validators + // TODO: Set an expiry for resolved addresses + let addr_res = AddressResolveReconciler { + node: Arc::clone(&node_arc), state: Arc::clone(&self.state), } .reconcile() .await?; - if command_res.is_requeue() { - return Ok(command_res.emptied().add_scope("agent_state/node/requeue")); + if addr_res.is_requeue() { + return Ok(addr_res.add_scope("address_resolve/requeue")); } - let Some(command) = command_res.take() else { - return Ok(ReconcileStatus::default().add_scope("agent_state/node/no_command")); - }; + // TODO: download binaries + // TODO: restart the node if the binaries changed. this means storing the hashes + // of the downloaded files + + // TODO: requeue if the binaries are not ready + + // Accumulate all the fields that are used to derive the command that starts + // the node. + // This will be used to determine if the command has changed at all. + let command = NodeCommand::new( + Arc::clone(&self.state), + node_arc, + *env_id, + Arc::clone(&env_info), + ) + .await?; if self.context.command.as_ref() != Some(&command) { // TODO: OK to restart the node -- command has changed @@ -94,211 +233,6 @@ impl Reconcile<(), ReconcileError2> for AgentStateReconciler { } } -/// Given a node state, construct the command needed to start the node -struct NodeCommandReconciler { - node: Arc, - state: Arc, - env_id: EnvId, -} - -#[derive(Debug, Clone, Eq, PartialEq)] -struct NodeCommand { - /// Path to the snarkos binary - command_path: PathBuf, - /// If true, do not print stdout - quiet: bool, - /// Environment ID (used in loki) - env_id: EnvId, - /// Node key (drives NETWORK env) - network: NetworkId, - /// Node key (derives node type and loki) - node_key: NodeKey, - /// URL for sending logs to loki - loki: Option, - /// Path to the ledger directory - ledger_path: PathBuf, - /// Path to place the log file - log_path: PathBuf, - /// Path to genesis block. When absent, use the network's genesis block. 
- genesis_path: Option, - /// Env variables to pass to the node - env: IndexMap, - /// Port to bind the agent's RPC server for node status - agent_rpc_port: u16, - /// Address to bind the node to - bind_addr: IpAddr, - /// Port configuration for the node - ports: PortConfig, - /// Private key to use for the node - private_key: Option, - /// Path to a file containing the private key - private_key_file: Option, - /// Retention policy for the node - retention_policy: Option, - /// Resolved peer addresses for the node - peers: Vec, - /// Resolved validator addresses for the node - validators: Vec, -} - -impl NodeCommand { - fn build(&self) -> Command { - let mut command = Command::new(&self.command_path); - - // set stdio - if self.quiet { - command.stdout(Stdio::null()); - } else { - command.stdout(std::io::stdout()); - } - command.stderr(std::io::stderr()); - - // add loki URL if one is set - if let Some(loki) = &self.loki { - command - .env( - "SNOPS_LOKI_LABELS", - format!("env_id={},node_key={}", self.env_id, self.node_key), - ) - .arg("--loki") - .arg(loki.as_str()); - } - - // setup the run command - command - .stderr(std::io::stderr()) - .envs(&self.env) - .env("NETWORK", self.network.to_string()) - .env("HOME", &self.ledger_path) - .arg("--log") - .arg(&self.log_path) - .arg("run") - .arg("--agent-rpc-port") - .arg(self.agent_rpc_port.to_string()) - .arg("--type") - .arg(self.node_key.ty.to_string()) - .arg("--ledger") - .arg(&self.ledger_path); - - if let Some(genesis) = &self.genesis_path { - command.arg("--genesis").arg(genesis); - } - - // storage configuration - command - // port configuration - .arg("--bind") - .arg(self.bind_addr.to_string()) - .arg("--bft") - .arg(self.ports.bft.to_string()) - .arg("--rest") - .arg(self.ports.rest.to_string()) - .arg("--metrics") - .arg(self.ports.metrics.to_string()) - .arg("--node") - .arg(self.ports.node.to_string()); - - if let Some(pk) = &self.private_key { - command.arg("--private-key").arg(pk); - } - - if let Some(pk_file) = &self.private_key_file { - command.arg("--private-key-file").arg(pk_file); - } - - // conditionally add retention policy - if let Some(policy) = &self.retention_policy { - command.arg("--retention-policy").arg(policy.to_string()); - } - - if !self.peers.is_empty() { - command.arg("--peers").arg(self.peers.join(",")); - } - - if !self.validators.is_empty() { - command.arg("--validators").arg(self.validators.join(",")); - } - - command - } -} - -impl Reconcile for NodeCommandReconciler { - async fn reconcile(&mut self) -> Result, ReconcileError2> { - let NodeCommandReconciler { - node, - state, - env_id, - } = self; - let info = state.get_env_info(*env_id).await?; - - // Resolve the addresses of the peers and validators - let res = AddressResolveReconciler { - node: Arc::clone(node), - state: Arc::clone(state), - } - .reconcile() - .await?; - - if res.is_requeue() { - return Ok(res - .emptied() - .add_scope("node_command/address_resolve/requeue")); - } - - let storage_path = state - .cli - .path - .join("storage") - .join(info.network.to_string()) - .join(info.storage.id.to_string()); - - let ledger_path = if info.storage.persist { - storage_path.join(LEDGER_PERSIST_DIR) - } else { - state.cli.path.join(LEDGER_BASE_DIR) - }; - - let run = NodeCommand { - command_path: state.cli.path.join(SNARKOS_FILE), - quiet: state.cli.quiet, - env_id: *env_id, - node_key: node.node_key.clone(), - loki: state.loki.lock().ok().and_then(|l| l.deref().clone()), - ledger_path, - log_path: state.cli.path.join(SNARKOS_LOG_FILE), - 
genesis_path: (!info.storage.native_genesis) - .then(|| storage_path.join(SNARKOS_GENESIS_FILE)), - network: info.network, - env: node.env.clone(), - agent_rpc_port: state.agent_rpc_port, - bind_addr: state.cli.bind_addr, - ports: state.cli.ports, - private_key: if let KeyState::Literal(pk) = &node.private_key { - Some(pk.clone()) - } else { - None - }, - private_key_file: if let KeyState::Local = &node.private_key { - Some( - state - .cli - .private_key_file - .clone() - .ok_or(ReconcileError2::MissingLocalPrivateKey)?, - ) - } else { - None - }, - peers: state.agentpeers_to_cli(&node.peers).await, - validators: state.agentpeers_to_cli(&node.validators).await, - retention_policy: info.storage.retention_policy.clone(), - }; - - Ok(ReconcileStatus::new(Some(run))) - } -} - /// Given a node state, resolve the addresses of the agent based peers and /// validators. Non-agent based peers have their addresses within the state /// already. @@ -351,6 +285,8 @@ impl Reconcile<(), ReconcileError2> for AddressResolveReconciler { ); // Resolve the addresses + // TODO: turn this into a background process so the reconcile operation can run + // instantly let new_addrs = client .resolve_addrs(context::current(), unresolved_addrs) .await @@ -379,7 +315,6 @@ impl Reconcile<(), ReconcileError2> for AddressResolveReconciler { /// Download a specific binary file needed to run the node struct BinaryReconciler { - binary_id: Option, state: Arc, info: EnvInfo, } diff --git a/crates/agent/src/reconcile/command.rs b/crates/agent/src/reconcile/command.rs new file mode 100644 index 00000000..818ecb2c --- /dev/null +++ b/crates/agent/src/reconcile/command.rs @@ -0,0 +1,197 @@ +use std::{net::IpAddr, ops::Deref, path::PathBuf, process::Stdio, sync::Arc}; + +use indexmap::IndexMap; +use snops_checkpoint::RetentionPolicy; +use snops_common::{ + api::EnvInfo, + constant::{ + LEDGER_BASE_DIR, LEDGER_PERSIST_DIR, SNARKOS_FILE, SNARKOS_GENESIS_FILE, SNARKOS_LOG_FILE, + }, + rpc::error::ReconcileError2, + state::{EnvId, KeyState, NetworkId, NodeKey, NodeState, PortConfig}, +}; +use tokio::process::Command; +use url::Url; + +use crate::state::GlobalState; + +#[derive(Debug, Clone, Eq, PartialEq)] +pub struct NodeCommand { + /// Path to the snarkos binary + command_path: PathBuf, + /// If true, do not print stdout + quiet: bool, + /// Environment ID (used in loki) + env_id: EnvId, + /// Node key (drives NETWORK env) + network: NetworkId, + /// Node key (derives node type and loki) + node_key: NodeKey, + /// URL for sending logs to loki + loki: Option, + /// Path to the ledger directory + ledger_path: PathBuf, + /// Path to place the log file + log_path: PathBuf, + /// Path to genesis block. When absent, use the network's genesis block. 
+ genesis_path: Option, + /// Env variables to pass to the node + env: IndexMap, + /// Port to bind the agent's RPC server for node status + agent_rpc_port: u16, + /// Address to bind the node to + bind_addr: IpAddr, + /// Port configuration for the node + ports: PortConfig, + /// Private key to use for the node + private_key: Option, + /// Path to a file containing the private key + private_key_file: Option, + /// Retention policy for the node + retention_policy: Option, + /// Resolved peer addresses for the node + peers: Vec, + /// Resolved validator addresses for the node + validators: Vec, +} + +impl NodeCommand { + pub async fn new( + state: Arc, + node: Arc, + env_id: EnvId, + env_info: Arc, + ) -> Result { + let storage_path = state + .cli + .path + .join("storage") + .join(env_info.network.to_string()) + .join(env_info.storage.id.to_string()); + + let ledger_path = if env_info.storage.persist { + storage_path.join(LEDGER_PERSIST_DIR) + } else { + state.cli.path.join(LEDGER_BASE_DIR) + }; + + Ok(NodeCommand { + command_path: state.cli.path.join(SNARKOS_FILE), + quiet: state.cli.quiet, + env_id, + node_key: node.node_key.clone(), + loki: state.loki.lock().ok().and_then(|l| l.deref().clone()), + ledger_path, + log_path: state.cli.path.join(SNARKOS_LOG_FILE), + genesis_path: (!env_info.storage.native_genesis) + .then(|| storage_path.join(SNARKOS_GENESIS_FILE)), + network: env_info.network, + env: node.env.clone(), + agent_rpc_port: state.agent_rpc_port, + bind_addr: state.cli.bind_addr, + ports: state.cli.ports, + private_key: if let KeyState::Literal(pk) = &node.private_key { + Some(pk.clone()) + } else { + None + }, + // Ensure the private key file can be resolved. + // This is only reachable when an agent is referred to by its + // id in an environment spec. 
+ private_key_file: if let KeyState::Local = &node.private_key { + Some( + state + .cli + .private_key_file + .clone() + .ok_or(ReconcileError2::MissingLocalPrivateKey)?, + ) + } else { + None + }, + peers: state.agentpeers_to_cli(&node.peers).await, + validators: state.agentpeers_to_cli(&node.validators).await, + retention_policy: env_info.storage.retention_policy.clone(), + }) + } + + pub fn build(&self) -> Command { + let mut command = Command::new(&self.command_path); + + // set stdio + if self.quiet { + command.stdout(Stdio::null()); + } else { + command.stdout(std::io::stdout()); + } + command.stderr(std::io::stderr()); + + // add loki URL if one is set + if let Some(loki) = &self.loki { + command + .env( + "SNOPS_LOKI_LABELS", + format!("env_id={},node_key={}", self.env_id, self.node_key), + ) + .arg("--loki") + .arg(loki.as_str()); + } + + // setup the run command + command + .stderr(std::io::stderr()) + .envs(&self.env) + .env("NETWORK", self.network.to_string()) + .env("HOME", &self.ledger_path) + .arg("--log") + .arg(&self.log_path) + .arg("run") + .arg("--agent-rpc-port") + .arg(self.agent_rpc_port.to_string()) + .arg("--type") + .arg(self.node_key.ty.to_string()) + .arg("--ledger") + .arg(&self.ledger_path); + + if let Some(genesis) = &self.genesis_path { + command.arg("--genesis").arg(genesis); + } + + // storage configuration + command + // port configuration + .arg("--bind") + .arg(self.bind_addr.to_string()) + .arg("--bft") + .arg(self.ports.bft.to_string()) + .arg("--rest") + .arg(self.ports.rest.to_string()) + .arg("--metrics") + .arg(self.ports.metrics.to_string()) + .arg("--node") + .arg(self.ports.node.to_string()); + + if let Some(pk) = &self.private_key { + command.arg("--private-key").arg(pk); + } + + if let Some(pk_file) = &self.private_key_file { + command.arg("--private-key-file").arg(pk_file); + } + + // conditionally add retention policy + if let Some(policy) = &self.retention_policy { + command.arg("--retention-policy").arg(policy.to_string()); + } + + if !self.peers.is_empty() { + command.arg("--peers").arg(self.peers.join(",")); + } + + if !self.validators.is_empty() { + command.arg("--validators").arg(self.validators.join(",")); + } + + command + } +} diff --git a/crates/agent/src/reconcile/files.rs b/crates/agent/src/reconcile/files.rs index d2b37fdb..8d187a57 100644 --- a/crates/agent/src/reconcile/files.rs +++ b/crates/agent/src/reconcile/files.rs @@ -1,5 +1,8 @@ -use std::path::PathBuf; +use std::{ + fs::Permissions, os::unix::fs::PermissionsExt, path::PathBuf, sync::Arc, time::Duration, +}; +use chrono::{DateTime, TimeDelta, Utc}; use snops_checkpoint::CheckpointManager; use snops_common::{ api::EnvInfo, @@ -8,14 +11,30 @@ use snops_common::{ LEDGER_BASE_DIR, LEDGER_PERSIST_DIR, LEDGER_STORAGE_FILE, SNARKOS_FILE, SNARKOS_GENESIS_FILE, VERSION_FILE, }, - rpc::error::ReconcileError, - state::{HeightRequest, InternedId}, + rpc::error::{ReconcileError, ReconcileError2}, + state::{HeightRequest, InternedId, NetworkId, StorageId, TransferId, TransferStatusUpdate}, }; use tokio::process::Command; use tracing::{debug, error, info, trace}; +use url::Url; -use super::checkpoint; -use crate::{api, state::GlobalState}; +use super::{checkpoint, Reconcile, ReconcileCondition, ReconcileStatus}; +use crate::{ + api::{self, download_file, should_download_file}, + state::GlobalState, + transfers, +}; + +pub fn default_binary(info: &EnvInfo) -> BinaryEntry { + BinaryEntry { + source: BinarySource::Path(PathBuf::from(format!( + "/content/storage/{}/{}/binaries/default", 
+ info.network, info.storage.id + ))), + sha256: None, + size: None, + } +} /// Ensure the correct binary is present for running snarkos pub async fn ensure_correct_binary( @@ -25,22 +44,13 @@ pub async fn ensure_correct_binary( ) -> Result<(), ReconcileError> { let base_path = &state.cli.path; - let default_entry = BinaryEntry { - source: BinarySource::Path(PathBuf::from(format!( - "/content/storage/{}/{}/binaries/default", - info.network, info.storage.id - ))), - sha256: None, - size: None, - }; - // TODO: store binary based on binary id // download the snarkOS binary api::check_binary( info.storage .binaries .get(&binary_id.unwrap_or_default()) - .unwrap_or(&default_entry), + .unwrap_or(&default_binary(info)), &state.endpoint, &base_path.join(SNARKOS_FILE), state.transfer_tx(), @@ -51,6 +61,14 @@ pub async fn ensure_correct_binary( Ok(()) } +pub fn get_genesis_route(endpoint: &str, network: NetworkId, storage_id: StorageId) -> String { + format!("{endpoint}/content/storage/{network}/{storage_id}/{SNARKOS_GENESIS_FILE}") +} + +pub fn get_ledger_route(endpoint: &str, network: NetworkId, storage_id: StorageId) -> String { + format!("{endpoint}/content/storage/{network}/{storage_id}/{LEDGER_STORAGE_FILE}") +} + /// Ensure all required files are present in the storage directory pub async fn check_files( state: &GlobalState, @@ -85,15 +103,9 @@ pub async fn check_files( })?; let genesis_path = storage_path.join(SNARKOS_GENESIS_FILE); - let genesis_url = format!( - "{}/content/storage/{network}/{storage_id}/{SNARKOS_GENESIS_FILE}", - &state.endpoint - ); + let genesis_url = get_genesis_route(&state.endpoint, network, *storage_id); let ledger_path = storage_path.join(LEDGER_STORAGE_FILE); - let ledger_url = format!( - "{}/content/storage/{network}/{storage_id}/{LEDGER_STORAGE_FILE}", - &state.endpoint - ); + let ledger_url = get_ledger_route(&state.endpoint, network, *storage_id); // skip genesis download for native genesis storage if !info.storage.native_genesis { @@ -131,6 +143,190 @@ pub async fn check_files( Ok(()) } +/// This reconciler creates a directory if it does not exist +pub struct DirectoryReconciler(pub PathBuf); +impl Reconcile<(), ReconcileError2> for DirectoryReconciler { + async fn reconcile(&mut self) -> Result, ReconcileError2> { + std::fs::create_dir_all(&self.0) + .map(ReconcileStatus::with) + .map_err(|e| ReconcileError2::CreateDirectory(self.0.clone(), e.to_string())) + } +} + +/// The FileReconciler will download a file from a URL and place it in a local +/// directory. It will also check the file's size and sha256 hash if provided, +/// and set the file's permissions. If the file already exists, it will not be +/// downloaded again. +/// +/// The reconciler will return true when the file is ready, and false when the +/// file cannot be obtained (offline controlplane). 
+pub struct FileReconciler { + pub state: Arc, + pub src: Url, + pub dst: PathBuf, + pub offline: bool, + pub tx_id: Option, + pub permissions: Option, + pub check_sha256: Option, + pub check_size: Option, +} +impl FileReconciler { + pub fn new(state: Arc, src: Url, dst: PathBuf) -> Self { + Self { + state, + src, + dst, + offline: false, + tx_id: None, + permissions: None, + check_sha256: None, + check_size: None, + } + } + + pub fn with_offline(mut self, offline: bool) -> Self { + self.offline = offline; + self + } + + pub fn with_tx_id(mut self, tx_id: Option) -> Self { + self.tx_id = tx_id; + self + } + + pub fn with_binary(mut self, binary: &BinaryEntry) -> Self { + self.permissions = Some(0o755); + self.check_sha256 = binary.sha256.clone(); + self.check_size = binary.size; + self + } + + pub fn check_and_set_mode(&self) -> Result<(), ReconcileError2> { + // ensure the file has the correct permissions + let Some(check_perms) = self.permissions else { + return Ok(()); + }; + + let perms = self + .dst + .metadata() + .map_err(|e| ReconcileError2::FileStatError(self.dst.clone(), e.to_string()))? + .permissions(); + + if perms.mode() != check_perms { + std::fs::set_permissions(&self.dst, std::fs::Permissions::from_mode(check_perms)) + .map_err(|e| { + ReconcileError2::FilePermissionError(self.dst.clone(), e.to_string()) + })?; + } + + Ok(()) + } +} + +impl Reconcile for FileReconciler { + async fn reconcile(&mut self) -> Result, ReconcileError2> { + let client = reqwest::Client::new(); + + // Create a transfer id if one is not provided + if self.tx_id.is_none() { + self.tx_id = Some(transfers::next_id()); + } + + let tx_id = self.tx_id.unwrap(); + + // transfer is pending + match self.state.transfers.entry(tx_id) { + dashmap::Entry::Occupied(occupied_entry) => { + let entry = occupied_entry.get(); + + if entry.is_pending() { + return Ok(ReconcileStatus::empty() + .add_condition(ReconcileCondition::PendingTransfer( + self.src.to_string(), + tx_id, + )) + .requeue_after(Duration::from_secs(1))); + } + + if entry.is_interrupted() { + // if the failure is within the last 60 seconds, requeue + if Utc::now().signed_duration_since(entry.updated_at).abs() + < TimeDelta::seconds(60) + { + return Ok(ReconcileStatus::empty() + .add_condition(ReconcileCondition::InterruptedTransfer( + self.src.to_string(), + tx_id, + entry.interruption.clone().unwrap_or_default(), + )) + .requeue_after(Duration::from_secs(60))); + } + + // if the failure is older than 60 seconds, remove the pending transfer and + // start over. + occupied_entry.remove(); + return Ok(ReconcileStatus::empty() + .add_scope("file/interrupt/restart") + .requeue_after(Duration::from_secs(1))); + } + + // entry is complete + } + dashmap::Entry::Vacant(_) => {} + } + + let is_file_ready = !should_download_file( + &client, + self.src.as_str(), + self.dst.as_path(), + self.check_size, + self.check_sha256.as_deref(), + self.offline, + ) + .await?; + + // Everything is good. 
Ensure file permissions
+        if is_file_ready {
+            self.check_and_set_mode()?;
+            return Ok(ReconcileStatus::with(true));
+        }
+
+        // file does not exist and cannot be downloaded right now
+        if !self.dst.exists() && self.offline {
+            return Ok(ReconcileStatus::with(false));
+        }
+
+        let src = self.src.clone();
+        let dst = self.dst.clone();
+        let transfer_tx = self.state.transfer_tx.clone();
+
+        // download the file
+        let handle =
+            tokio::spawn(
+                async move { download_file(tx_id, &client, src, &dst, transfer_tx).await },
+            )
+            .abort_handle();
+
+        // update the transfer with the handle (so it can be canceled if necessary)
+        if let Err(e) = self
+            .state
+            .transfer_tx
+            .send((tx_id, TransferStatusUpdate::Handle(handle)))
+        {
+            error!("failed to send transfer handle: {e}");
+        }
+
+        // transfer is pending - requeue after 1 second with the pending condition
+        Ok(ReconcileStatus::empty()
+            .add_condition(ReconcileCondition::PendingTransfer(
+                self.src.to_string(),
+                tx_id,
+            ))
+            .requeue_after(Duration::from_secs(1)))
+    }
+}
+
 /// Untar the ledger file into the storage directory
 pub async fn load_ledger(
     state: &GlobalState,
diff --git a/crates/agent/src/reconcile/mod.rs b/crates/agent/src/reconcile/mod.rs
index ea375592..42bbfcb2 100644
--- a/crates/agent/src/reconcile/mod.rs
+++ b/crates/agent/src/reconcile/mod.rs
@@ -4,6 +4,7 @@ use indexmap::IndexSet;
 
 pub mod agent;
 mod checkpoint;
+pub mod command;
 mod files;
 pub use files::*;
 use snops_common::state::TransferId;
@@ -17,6 +18,12 @@ pub enum ReconcileCondition {
     /// A process is being spawned / confirmed. Could be starting the node or
     /// manipulating the ledger
     PendingProcess(String),
+    /// A transfer was started and interrupted.
+    InterruptedTransfer(String, TransferId, String),
+    /// A file is missing and cannot be downloaded at the moment.
+    MissingFile(String),
+    /// Waiting to reconnect to the controlplane
+    PendingConnection,
 }
 
 pub trait Reconcile {
diff --git a/crates/agent/src/state.rs b/crates/agent/src/state.rs
index 9f13e348..4d5d5518 100644
--- a/crates/agent/src/state.rs
+++ b/crates/agent/src/state.rs
@@ -47,7 +47,7 @@ pub struct GlobalState {
     /// A sender for emitting the next time to reconcile the agent.
     /// Helpful for scheduling the next reconciliation.
     pub queue_reconcile_tx: Sender,
-    pub env_info: RwLock>,
+    pub env_info: RwLock)>>,
     pub reconcilation_handle: AsyncMutex>,
     pub child: RwLock>, /* TODO: this may need to be handled by an owning thread,
                          * not sure yet */
@@ -86,14 +86,14 @@ impl GlobalState {
         .is_ok()
     }
 
-    pub async fn set_env_info(&self, info: Option<(EnvId, EnvInfo)>) {
-        if let Err(e) = self.db.set_env_info(info.as_ref()) {
+    pub async fn set_env_info(&self, info: Option<(EnvId, Arc)>) {
+        if let Err(e) = self.db.set_env_info(info.clone()) {
             error!("failed to save env info to db: {e}");
         }
         *self.env_info.write().await = info;
     }
 
-    pub async fn get_env_info(&self, env_id: EnvId) -> Result {
+    pub async fn get_env_info(&self, env_id: EnvId) -> Result, ReconcileError2> {
         match self.env_info.read().await.as_ref() {
             Some((id, info)) if *id == env_id => return Ok(info.clone()),
             _ => {}
@@ -112,13 +112,13 @@ impl GlobalState {
             .map_err(|e| ReconcileError2::RpcError(e.to_string()))?
.ok_or(ReconcileError2::MissingEnv(env_id))?; - let env_info = (env_id, info.clone()); - if let Err(e) = self.db.set_env_info(Some(&env_info)) { + let env_info = (env_id, Arc::new(info)); + if let Err(e) = self.db.set_env_info(Some(env_info.clone())) { error!("failed to save env info to db: {e}"); } - *self.env_info.write().await = Some(env_info); + *self.env_info.write().await = Some(env_info.clone()); - Ok(info) + Ok(env_info.1) } /// Attempt to gracefully shutdown the node if one is running. diff --git a/crates/agent/src/transfers.rs b/crates/agent/src/transfers.rs index 8926d1fc..ef2ef7fa 100644 --- a/crates/agent/src/transfers.rs +++ b/crates/agent/src/transfers.rs @@ -80,6 +80,7 @@ pub fn start_monitor(client: ClientLock) -> (TransferTx, Arc (TransferTx, Arc { + let transfer = ent.get_mut(); + transfer.handle = Some(handle); + }, + _ => continue, } diff --git a/crates/common/src/binaries.rs b/crates/common/src/binaries.rs index 4813eb46..9d94c75d 100644 --- a/crates/common/src/binaries.rs +++ b/crates/common/src/binaries.rs @@ -15,7 +15,7 @@ use crate::{ }; /// A BinaryEntry is the location to a binary with an optional shasum -#[derive(Serialize, Deserialize, Debug, Clone)] +#[derive(Serialize, Deserialize, Debug, Clone, Eq, PartialEq)] pub struct BinaryEntry { pub source: BinarySource, #[serde(default)] @@ -43,6 +43,11 @@ impl BinaryEntry { } } + /// Determines if the file is fetched from the control plane + pub fn is_api_file(&self) -> bool { + matches!(self.source, BinarySource::Path(_)) + } + /// Check if the sha256 is a valid sha256 hash pub fn check_sha256(&self) -> bool { self.sha256 @@ -92,7 +97,7 @@ impl Display for BinaryEntry { } } -#[derive(Debug, Clone)] +#[derive(Debug, Clone, Eq, PartialEq)] pub enum BinarySource { Url(url::Url), Path(PathBuf), diff --git a/crates/common/src/format/impl_containers.rs b/crates/common/src/format/impl_containers.rs index 3707ce05..de567668 100644 --- a/crates/common/src/format/impl_containers.rs +++ b/crates/common/src/format/impl_containers.rs @@ -1,4 +1,7 @@ -use std::io::{Read, Write}; +use std::{ + io::{Read, Write}, + sync::Arc, +}; use super::{DataFormat, DataFormatReader, DataFormatWriter, DataReadError, DataWriteError}; @@ -37,6 +40,19 @@ impl DataFormat for Box { } } +impl DataFormat for Arc { + type Header = T::Header; + const LATEST_HEADER: Self::Header = T::LATEST_HEADER; + + fn write_data(&self, writer: &mut W) -> Result { + self.as_ref().write_data(writer) + } + + fn read_data(reader: &mut R, header: &Self::Header) -> Result { + Ok(Arc::new(reader.read_data(header)?)) + } +} + #[cfg(test)] #[rustfmt::skip] mod test { diff --git a/crates/common/src/rpc/error.rs b/crates/common/src/rpc/error.rs index de6810cb..5948f6cc 100644 --- a/crates/common/src/rpc/error.rs +++ b/crates/common/src/rpc/error.rs @@ -1,3 +1,5 @@ +use std::path::PathBuf; + use serde::{Deserialize, Serialize}; use strum_macros::AsRefStr; use thiserror::Error; @@ -163,4 +165,20 @@ pub enum ReconcileError2 { AddressResolve(#[from] ResolveError), #[error("missing local private key")] MissingLocalPrivateKey, + #[error("failed to create directory {0}: {1}")] + CreateDirectory(PathBuf, String), + #[error("failed to get metadata for {0}: {1}")] + FileStatError(PathBuf, String), + #[error("failed to read file {0}: {1}")] + FileReadError(PathBuf, String), + #[error("failed to make {method} request {url}: {error}")] + HttpError { + method: String, + url: String, + error: String, + }, + #[error("failed to set file permissions {0}: {1}")] + FilePermissionError(PathBuf, 
String), + #[error("failed to parse {0} as a url: {1}")] + UrlParseError(String, String), } diff --git a/crates/common/src/state/agent_status.rs b/crates/common/src/state/agent_status.rs index f61c3787..b066b3fa 100644 --- a/crates/common/src/state/agent_status.rs +++ b/crates/common/src/state/agent_status.rs @@ -1,6 +1,7 @@ use chrono::{DateTime, Utc}; use indexmap::IndexMap; use serde::{Deserialize, Serialize}; +use tokio::task::AbortHandle; use super::snarkos_status::SnarkOSStatus; use crate::format::DataFormat; @@ -100,7 +101,11 @@ pub enum TransferStatusUpdate { total: u64, /// The time the transfer started. time: DateTime, + // The transfer's abort handle, if any. }, + // Client only - specifies a handle to abort the transfer task + #[serde(skip)] + Handle(AbortHandle), /// The transfer has made progress. Progress { /// The current number of bytes transferred. @@ -129,6 +134,21 @@ pub struct TransferStatus { pub total_bytes: u64, /// A transfer interruption reason, if any. pub interruption: Option, + /// The transfer's abort handle, if any. + #[serde(skip)] + pub handle: Option, +} + +impl TransferStatus { + pub fn is_pending(&self) -> bool { + self.interruption.is_none() && self.downloaded_bytes < self.total_bytes + } + pub fn is_interrupted(&self) -> bool { + self.interruption.is_some() + } + pub fn is_complete(&self) -> bool { + self.downloaded_bytes >= self.total_bytes + } } #[derive(Debug, Default, Clone, Serialize, Deserialize)] From f2f6138f0c03734e02a1b547cfd1c14eebf7e483 Mon Sep 17 00:00:00 2001 From: Meshiest Date: Sun, 17 Nov 2024 15:48:51 -0500 Subject: [PATCH 07/68] feat(agent): storage version reconciler, preparing for ledger and genesis block reconciler --- crates/agent/src/cli.rs | 9 +- crates/agent/src/reconcile/agent.rs | 237 +++++++++++++++++--------- crates/agent/src/reconcile/command.rs | 5 +- crates/agent/src/reconcile/files.rs | 27 +-- 4 files changed, 181 insertions(+), 97 deletions(-) diff --git a/crates/agent/src/cli.rs b/crates/agent/src/cli.rs index 9b9d7111..7ee34a5c 100644 --- a/crates/agent/src/cli.rs +++ b/crates/agent/src/cli.rs @@ -10,7 +10,7 @@ use std::{ use clap::CommandFactory; use clap::Parser; use http::Uri; -use snops_common::state::{AgentId, AgentModeOptions, PortConfig}; +use snops_common::state::{AgentId, AgentModeOptions, NetworkId, PortConfig, StorageId}; use tracing::{info, warn}; use crate::net; @@ -189,4 +189,11 @@ impl Cli { (internal_addrs, external_addr) } + + pub fn storage_path(&self, network: NetworkId, storage_id: StorageId) -> PathBuf { + let mut path = self.path.join("storage"); + path.push(network.to_string()); + path.push(storage_id.to_string()); + path + } } diff --git a/crates/agent/src/reconcile/agent.rs b/crates/agent/src/reconcile/agent.rs index 2f5e000d..1c98e8cd 100644 --- a/crates/agent/src/reconcile/agent.rs +++ b/crates/agent/src/reconcile/agent.rs @@ -1,24 +1,27 @@ use std::{ collections::HashSet, + path::Path, sync::Arc, time::{Duration, Instant}, }; +use futures::stream::AbortHandle; use snops_common::{ api::EnvInfo, binaries::{BinaryEntry, BinarySource}, - constant::SNARKOS_FILE, + constant::{SNARKOS_FILE, VERSION_FILE}, rpc::error::ReconcileError2, state::{ AgentId, AgentPeer, AgentState, InternedId, NetworkId, NodeState, StorageId, TransferId, }, }; use tarpc::context; +use tokio::sync::{Mutex, Semaphore}; use tracing::{error, trace, warn}; use super::{ - command::NodeCommand, default_binary, FileReconciler, Reconcile, ReconcileCondition, - ReconcileStatus, + command::NodeCommand, default_binary, 
get_version_from_path, DirectoryReconciler, + FileReconciler, Reconcile, ReconcileCondition, ReconcileStatus, }; use crate::state::GlobalState; @@ -30,6 +33,8 @@ pub struct AgentStateReconciler { pub context: AgentStateReconcilerContext, } +type LedgerModifyResult = Result; + #[derive(Default)] struct TransfersContext { // TODO: persist network_id, storage_id, and storage_version @@ -46,6 +51,14 @@ struct TransfersContext { genesis_ok_at: Option, /// Metadata about an active ledger transfer ledger_transfer: Option, + /// A handle containing the task that modifies the ledger. + /// The mutex is held until the task is complete, and the bool is set to + /// true when the task is successful. + ledger_modify_handle: Option<(AbortHandle, Arc>>)>, + /// A handle containing the task that unzips the ledger. + /// The mutex is held until the task is complete, and the bool is set to + /// true when the task is successful. + ledger_unpack_handle: Option<(AbortHandle, Arc>>)>, /// Time the ledger was marked as OK ledger_ok_at: Option, } @@ -92,6 +105,8 @@ impl Reconcile<(), ReconcileError2> for AgentStateReconciler { // gracefully shut down the node. let shutdown_pending = !node.online || storage_has_changed; + // TODO: check if addrs have changed, and update shutdown_pending + // node is offline, no need to reconcile if !node.online { // TODO: tear down the node if it is running @@ -116,76 +131,33 @@ impl Reconcile<(), ReconcileError2> for AgentStateReconciler { } let transfers = self.context.transfers.as_mut().unwrap(); - // Resolve the node's binary - // TODO: move into BinaryReconciler - 'binary: { - // Binary entry for the node - let default_binary = default_binary(&env_info); - let target_binary = env_info - .storage - .binaries - .get(&node.binary.unwrap_or_default()) - .unwrap_or(&default_binary); - - // Check if the binary has changed - let binary_has_changed = transfers - .binary_transfer - .as_ref() - .map(|(_, b)| b != target_binary) - .unwrap_or(true); - let binary_is_ok = transfers - .binary_ok_at - .map(|ok| ok.elapsed().as_secs() < 300) // check if the binary has been OK for 5 minutes - .unwrap_or(false); - - // If the binary has not changed and has not expired, we can skip the binary - // reconciler - if !binary_has_changed && binary_is_ok { - break 'binary; - } + let storage_path = self + .state + .cli + .storage_path(env_info.network, env_info.storage.id); - let src = match &target_binary.source { - BinarySource::Url(url) => url.clone(), - BinarySource::Path(path) => { - let url = format!("{}{}", &self.state.endpoint, path.display()); - url.parse::() - .map_err(|e| ReconcileError2::UrlParseError(url, e.to_string()))? - } - }; - let dst = self.state.cli.path.join(SNARKOS_FILE); - - let is_api_offline = self.state.client.read().await.is_none(); - - let binary_res = FileReconciler::new(Arc::clone(&self.state), src, dst) - .with_offline(target_binary.is_api_file() && is_api_offline) - .with_binary(target_binary) - .with_tx_id(transfers.binary_transfer.as_ref().map(|(tx, _)| *tx)) - .reconcile() - .await?; - - // transfer is pending or a failure occurred - if binary_res.is_requeue() { - return Ok(binary_res.emptied().add_scope("binary_reconcile/requeue")); - } + // Ensure the storage version is correct, deleting the storage path + // the version changes. 
+ StorageVersionReconciler(&storage_path, env_info.storage.version) + .reconcile() + .await?; - match binary_res.inner { - // If the binary is OK, update the context - Some(true) => { - transfers.binary_ok_at = Some(Instant::now()); - } - // If the binary is not OK, we will wait for the endpoint to come back - // online... - Some(false) => { - trace!( - "binary is not OK, waiting for the endpoint to come back online..." - ); - return Ok(ReconcileStatus::empty() - .add_condition(ReconcileCondition::PendingConnection) - .add_scope("binary_reconcile/offline") - .requeue_after(Duration::from_secs(5))); - } - None => unreachable!("file reconciler returns a result when not requeued"), - } + // Create the storage path if it does not exist + DirectoryReconciler(&storage_path).reconcile().await?; + + // Resolve the node's binary + let binary_res = BinaryReconciler { + state: Arc::clone(&self.state), + env_info: Arc::clone(&env_info), + node_binary: node.binary, + binary_transfer: &mut transfers.binary_transfer, + binary_ok_at: &mut transfers.binary_ok_at, + } + .reconcile() + .await?; + + if binary_res.is_requeue() { + return Ok(binary_res.add_scope("binary_reconcile/requeue")); } // Resolve the addresses of the peers and validators @@ -201,7 +173,6 @@ impl Reconcile<(), ReconcileError2> for AgentStateReconciler { return Ok(addr_res.add_scope("address_resolve/requeue")); } - // TODO: download binaries // TODO: restart the node if the binaries changed. this means storing the hashes // of the downloaded files @@ -314,16 +285,121 @@ impl Reconcile<(), ReconcileError2> for AddressResolveReconciler { } /// Download a specific binary file needed to run the node -struct BinaryReconciler { +struct BinaryReconciler<'a> { state: Arc, - info: EnvInfo, + env_info: Arc, + node_binary: Option, + /// Metadata about an active binary transfer + binary_transfer: &'a mut Option<(TransferId, BinaryEntry)>, + /// Time the binary was marked as OK + binary_ok_at: &'a mut Option, } -// TODO: binary reconcile behavior: -// 1. check if the file exists. -// 2. if not, start downloading the file -// 3. if the file is already downloading, requeue if not done -// 4. when the transfer is done, check the sha256 hash and size +impl<'a> Reconcile<(), ReconcileError2> for BinaryReconciler<'a> { + async fn reconcile(&mut self) -> Result, ReconcileError2> { + let BinaryReconciler { + state, + env_info, + node_binary, + binary_transfer, + binary_ok_at, + } = self; + + // Binary entry for the node + let default_binary = default_binary(env_info); + let target_binary = env_info + .storage + .binaries + .get(&node_binary.unwrap_or_default()) + .unwrap_or(&default_binary); + + // Check if the binary has changed + let binary_has_changed = binary_transfer + .as_ref() + .map(|(_, b)| b != target_binary) + .unwrap_or(true); + let binary_is_ok = binary_ok_at + .map(|ok| ok.elapsed().as_secs() < 300) // check if the binary has been OK for 5 minutes + .unwrap_or(false); + + // If the binary has not changed and has not expired, we can skip the binary + // reconciler + if !binary_has_changed && binary_is_ok { + return Ok(ReconcileStatus::default()); + } + + let src = match &target_binary.source { + BinarySource::Url(url) => url.clone(), + BinarySource::Path(path) => { + let url = format!("{}{}", &state.endpoint, path.display()); + url.parse::() + .map_err(|e| ReconcileError2::UrlParseError(url, e.to_string()))? 
+ } + }; + let dst = state.cli.path.join(SNARKOS_FILE); + + let is_api_offline = state.client.read().await.is_none(); + + let file_res = FileReconciler::new(Arc::clone(state), src, dst) + .with_offline(target_binary.is_api_file() && is_api_offline) + .with_binary(target_binary) + .with_tx_id(binary_transfer.as_ref().map(|(tx, _)| *tx)) + .reconcile() + .await?; + + // transfer is pending or a failure occurred + if file_res.is_requeue() { + return Ok(file_res.emptied().add_scope("file_reconcile/requeue")); + } + + match file_res.inner { + // If the binary is OK, update the context + Some(true) => { + **binary_ok_at = Some(Instant::now()); + Ok(ReconcileStatus::default()) + } + // If the binary is not OK, we will wait for the endpoint to come back + // online... + Some(false) => { + trace!("binary is not OK, waiting for the endpoint to come back online..."); + Ok(ReconcileStatus::empty() + .add_condition(ReconcileCondition::PendingConnection) + .add_scope("agent_state/binary/offline") + .requeue_after(Duration::from_secs(5))) + } + None => unreachable!("file reconciler returns a result when not requeued"), + } + } +} + +struct StorageVersionReconciler<'a>(&'a Path, u16); + +impl<'a> Reconcile<(), ReconcileError2> for StorageVersionReconciler<'a> { + async fn reconcile(&mut self) -> Result, ReconcileError2> { + let StorageVersionReconciler(path, version) = self; + + let version_file = path.join(VERSION_FILE); + + let version_file_data = if !version_file.exists() { + None + } else { + tokio::fs::read_to_string(&version_file) + .await + .map_err(|e| ReconcileError2::FileReadError(version_file.clone(), e.to_string()))? + .parse() + .ok() + }; + + // wipe old storage when the version changes + Ok(if version_file_data != Some(*version) && path.exists() { + let _ = tokio::fs::remove_dir_all(&path).await; + ReconcileStatus::default() + } else { + // return an empty status if the version is the same + ReconcileStatus::empty() + }) + } +} // TODO: large file download behavior (ledgers): // same as above, except maybe chunk the downloads or @@ -332,3 +408,6 @@ struct BinaryReconciler { // https://ledger.aleo.network/mainnet/snapshot/latest.txt // https://ledger.aleo.network/testnet/snapshot/latest.txt // https://ledger.aleo.network/canarynet/snapshot/latest.txt + +// TODO: some kind of reconciler iterator that attempts to reconcile a chain +// until hitting a requeue diff --git a/crates/agent/src/reconcile/command.rs b/crates/agent/src/reconcile/command.rs index 818ecb2c..94bd95b5 100644 --- a/crates/agent/src/reconcile/command.rs +++ b/crates/agent/src/reconcile/command.rs @@ -64,10 +64,7 @@ impl NodeCommand { ) -> Result { let storage_path = state .cli - .path - .join("storage") - .join(env_info.network.to_string()) - .join(env_info.storage.id.to_string()); + .storage_path(env_info.network, env_info.storage.id); let ledger_path = if env_info.storage.persist { storage_path.join(LEDGER_PERSIST_DIR) diff --git a/crates/agent/src/reconcile/files.rs b/crates/agent/src/reconcile/files.rs index 8d187a57..0227caff 100644 --- a/crates/agent/src/reconcile/files.rs +++ b/crates/agent/src/reconcile/files.rs @@ -1,5 +1,9 @@ use std::{ - fs::Permissions, os::unix::fs::PermissionsExt, path::PathBuf, sync::Arc, time::Duration, + fs::Permissions, + os::unix::fs::PermissionsExt, + path::{Path, PathBuf}, + sync::Arc, + time::Duration, }; use chrono::{DateTime, TimeDelta, Utc}; @@ -76,12 +80,9 @@ pub async fn check_files( height: &HeightRequest, ) -> Result<(), ReconcileError> { let base_path = &state.cli.path; - let 
storage_id = &info.storage.id; + let storage_id = info.storage.id; let network = info.network; - let storage_path = base_path - .join("storage") - .join(network.to_string()) - .join(storage_id.to_string()); + let storage_path = state.cli.storage_path(network, storage_id); // create the directory containing the storage files tokio::fs::create_dir_all(&storage_path) @@ -103,9 +104,9 @@ pub async fn check_files( })?; let genesis_path = storage_path.join(SNARKOS_GENESIS_FILE); - let genesis_url = get_genesis_route(&state.endpoint, network, *storage_id); + let genesis_url = get_genesis_route(&state.endpoint, network, storage_id); let ledger_path = storage_path.join(LEDGER_STORAGE_FILE); - let ledger_url = get_ledger_route(&state.endpoint, network, *storage_id); + let ledger_url = get_ledger_route(&state.endpoint, network, storage_id); // skip genesis download for native genesis storage if !info.storage.native_genesis { @@ -144,12 +145,12 @@ pub async fn check_files( } /// This reconciler creates a directory if it does not exist -pub struct DirectoryReconciler(pub PathBuf); -impl Reconcile<(), ReconcileError2> for DirectoryReconciler { +pub struct DirectoryReconciler<'a>(pub &'a Path); +impl<'a> Reconcile<(), ReconcileError2> for DirectoryReconciler<'a> { async fn reconcile(&mut self) -> Result, ReconcileError2> { - std::fs::create_dir_all(&self.0) + std::fs::create_dir_all(self.0) .map(ReconcileStatus::with) - .map_err(|e| ReconcileError2::CreateDirectory(self.0.clone(), e.to_string())) + .map_err(|e| ReconcileError2::CreateDirectory(self.0.to_path_buf(), e.to_string())) } } @@ -521,7 +522,7 @@ pub async fn load_ledger( Ok(true) } -async fn get_version_from_path(path: &PathBuf) -> Result, ReconcileError> { +pub async fn get_version_from_path(path: &PathBuf) -> Result, ReconcileError> { if !path.exists() { return Ok(None); } From b87750ccdc5d5ec2189e7107529f3166399e89a5 Mon Sep 17 00:00:00 2001 From: Meshiest Date: Mon, 18 Nov 2024 22:39:38 -0500 Subject: [PATCH 08/68] feat(agent): Process context and reorganized storage reconcilers --- crates/agent/src/reconcile/agent.rs | 155 +++----------------------- crates/agent/src/reconcile/mod.rs | 2 + crates/agent/src/reconcile/process.rs | 84 ++++++++++++++ crates/agent/src/reconcile/storage.rs | 134 ++++++++++++++++++++++ crates/common/src/rpc/error.rs | 2 + 5 files changed, 240 insertions(+), 137 deletions(-) create mode 100644 crates/agent/src/reconcile/process.rs create mode 100644 crates/agent/src/reconcile/storage.rs diff --git a/crates/agent/src/reconcile/agent.rs b/crates/agent/src/reconcile/agent.rs index 1c98e8cd..8725044d 100644 --- a/crates/agent/src/reconcile/agent.rs +++ b/crates/agent/src/reconcile/agent.rs @@ -1,27 +1,21 @@ -use std::{ - collections::HashSet, - path::Path, - sync::Arc, - time::{Duration, Instant}, -}; +use std::{collections::HashSet, sync::Arc, time::Instant}; use futures::stream::AbortHandle; use snops_common::{ api::EnvInfo, - binaries::{BinaryEntry, BinarySource}, - constant::{SNARKOS_FILE, VERSION_FILE}, + binaries::BinaryEntry, rpc::error::ReconcileError2, - state::{ - AgentId, AgentPeer, AgentState, InternedId, NetworkId, NodeState, StorageId, TransferId, - }, + state::{AgentId, AgentPeer, AgentState, NetworkId, NodeState, StorageId, TransferId}, }; use tarpc::context; -use tokio::sync::{Mutex, Semaphore}; -use tracing::{error, trace, warn}; +use tokio::sync::Mutex; +use tracing::{error, warn}; use super::{ - command::NodeCommand, default_binary, get_version_from_path, DirectoryReconciler, - FileReconciler, 
Reconcile, ReconcileCondition, ReconcileStatus, + command::NodeCommand, + process::ProcessContext, + storage::{BinaryReconciler, StorageVersionReconciler}, + DirectoryReconciler, Reconcile, ReconcileStatus, }; use crate::state::GlobalState; @@ -73,12 +67,12 @@ impl TransfersContext { #[derive(Default)] pub struct AgentStateReconcilerContext { - /// All parameters needed to build the command to start the node - command: Option, - /// Information about active transfers - transfers: Option, // TODO: allow transfers to be interrupted. potentially allow them to be resumed by using the // file range feature. + /// Information about active transfers + transfers: Option, + /// Information about the node process + process: Option, } impl Reconcile<(), ReconcileError2> for AgentStateReconciler { @@ -105,6 +99,10 @@ impl Reconcile<(), ReconcileError2> for AgentStateReconciler { // gracefully shut down the node. let shutdown_pending = !node.online || storage_has_changed; + if let (true, Some(process)) = (shutdown_pending, self.context.process.as_ref()) { + // TODO: reconcile process destruction + } + // TODO: check if addrs have changed, and update shutdown_pending // node is offline, no need to reconcile @@ -189,7 +187,7 @@ impl Reconcile<(), ReconcileError2> for AgentStateReconciler { ) .await?; - if self.context.command.as_ref() != Some(&command) { + if self.context.process.as_ref().map(|p| &p.command) != Some(&command) { // TODO: OK to restart the node -- command has changed } @@ -284,123 +282,6 @@ impl Reconcile<(), ReconcileError2> for AddressResolveReconciler { } } -/// Download a specific binary file needed to run the node -struct BinaryReconciler<'a> { - state: Arc, - env_info: Arc, - node_binary: Option, - /// Metadata about an active binary transfer - binary_transfer: &'a mut Option<(TransferId, BinaryEntry)>, - /// Time the binary was marked as OK - binary_ok_at: &'a mut Option, -} - -impl<'a> Reconcile<(), ReconcileError2> for BinaryReconciler<'a> { - async fn reconcile(&mut self) -> Result, ReconcileError2> { - let BinaryReconciler { - state, - env_info, - node_binary, - binary_transfer, - binary_ok_at, - } = self; - - // Binary entry for the node - let default_binary = default_binary(env_info); - let target_binary = env_info - .storage - .binaries - .get(&node_binary.unwrap_or_default()) - .unwrap_or(&default_binary); - - // Check if the binary has changed - let binary_has_changed = binary_transfer - .as_ref() - .map(|(_, b)| b != target_binary) - .unwrap_or(true); - let binary_is_ok = binary_ok_at - .map(|ok| ok.elapsed().as_secs() < 300) // check if the binary has been OK for 5 minutes - .unwrap_or(false); - - // If the binary has not changed and has not expired, we can skip the binary - // reconciler - if !binary_has_changed && binary_is_ok { - return Ok(ReconcileStatus::default()); - } - - let src = match &target_binary.source { - BinarySource::Url(url) => url.clone(), - BinarySource::Path(path) => { - let url = format!("{}{}", &state.endpoint, path.display()); - url.parse::() - .map_err(|e| ReconcileError2::UrlParseError(url, e.to_string()))? 
- } - }; - let dst = state.cli.path.join(SNARKOS_FILE); - - let is_api_offline = state.client.read().await.is_none(); - - let file_res = FileReconciler::new(Arc::clone(state), src, dst) - .with_offline(target_binary.is_api_file() && is_api_offline) - .with_binary(target_binary) - .with_tx_id(binary_transfer.as_ref().map(|(tx, _)| *tx)) - .reconcile() - .await?; - - // transfer is pending or a failure occurred - if file_res.is_requeue() { - return Ok(file_res.emptied().add_scope("file_reconcile/requeue")); - } - - match file_res.inner { - // If the binary is OK, update the context - Some(true) => { - **binary_ok_at = Some(Instant::now()); - Ok(ReconcileStatus::default()) - } - // If the binary is not OK, we will wait for the endpoint to come back - // online... - Some(false) => { - trace!("binary is not OK, waiting for the endpoint to come back online..."); - Ok(ReconcileStatus::empty() - .add_condition(ReconcileCondition::PendingConnection) - .add_scope("agent_state/binary/offline") - .requeue_after(Duration::from_secs(5))) - } - None => unreachable!("file reconciler returns a result when not requeued"), - } - } -} - -struct StorageVersionReconciler<'a>(&'a Path, u16); - -impl<'a> Reconcile<(), ReconcileError2> for StorageVersionReconciler<'a> { - async fn reconcile(&mut self) -> Result, ReconcileError2> { - let StorageVersionReconciler(path, version) = self; - - let version_file = path.join(VERSION_FILE); - - let version_file_data = if !version_file.exists() { - None - } else { - tokio::fs::read_to_string(&version_file) - .await - .map_err(|e| ReconcileError2::FileReadError(version_file.clone(), e.to_string()))? - .parse() - .ok() - }; - - // wipe old storage when the version changes - Ok(if version_file_data != Some(*version) && path.exists() { - let _ = tokio::fs::remove_dir_all(&path).await; - ReconcileStatus::default() - } else { - // return an empty status if the version is the same - ReconcileStatus::empty() - }) - } -} - // TODO: large file download behavior (ledgers): // same as above, except maybe chunk the downloads or diff --git a/crates/agent/src/reconcile/mod.rs b/crates/agent/src/reconcile/mod.rs index 42bbfcb2..693bdbfb 100644 --- a/crates/agent/src/reconcile/mod.rs +++ b/crates/agent/src/reconcile/mod.rs @@ -7,6 +7,8 @@ mod checkpoint; pub mod command; mod files; pub use files::*; +pub mod process; +pub mod storage; use snops_common::state::TransferId; #[derive(Debug, Clone, PartialEq, Eq, PartialOrd, Ord, Hash)] diff --git a/crates/agent/src/reconcile/process.rs b/crates/agent/src/reconcile/process.rs new file mode 100644 index 00000000..d7c8d5ee --- /dev/null +++ b/crates/agent/src/reconcile/process.rs @@ -0,0 +1,84 @@ +use std::time::Instant; + +use snops_common::rpc::error::ReconcileError2; +use tokio::process::Child; +use tracing::error; + +use super::command::NodeCommand; + +/// Information about the current process +pub struct ProcessContext { + /// The command used to start the node. 
If the next command is different, + /// the node should be restarted + pub command: NodeCommand, + /// The child process that is running the node + child: Child, + /// Time the child process was started + started_at: Instant, + /// Time a sigint was sent to the child process + sigint_at: Option, + /// Time a sigkill was sent to the child process + sigkill_at: Option, +} + +impl ProcessContext { + pub fn new(command: NodeCommand) -> Result { + command + .build() + .spawn() + .map(|child| Self { + command, + child, + started_at: Instant::now(), + sigint_at: None, + sigkill_at: None, + }) + .map_err(|e| { + error!("failed to start node process: {e:?}"); + ReconcileError2::SpawnError(e.to_string()) + }) + } + + /// Returns true when the child process has not exited + pub fn is_running(&self) -> bool { + self.child.id().is_some() + } + + /// Send a SIGINT to the child process + pub fn send_sigint(&mut self) -> bool { + use nix::{ + sys::signal::{self, Signal}, + unistd::Pid, + }; + + // prevent multiple sigints + if self.sigint_at.is_some() { + return false; + } + + // obtain the id, or return false if the child is not running + let Some(id) = self.child.id() else { + return false; + }; + + // send SIGINT to the child process + signal::kill(Pid::from_raw(id as i32), Signal::SIGINT) + .inspect(|_| { + // update the sigint time if the sigint was successful + self.sigint_at = Some(Instant::now()); + }) + .is_ok() + } + + /// Send a SIGKILL to the child process + pub fn send_sigkill(&mut self) -> bool { + // start_kill return Err if the process is already killed + self.child + .start_kill() + .inspect(|_| { + // update the kill time if the kill was successful + self.sigkill_at = Some(Instant::now()); + }) + .is_ok() + } +} diff --git a/crates/agent/src/reconcile/storage.rs b/crates/agent/src/reconcile/storage.rs new file mode 100644 index 00000000..e08bf7c6 --- /dev/null +++ b/crates/agent/src/reconcile/storage.rs @@ -0,0 +1,134 @@ +use std::{ + path::Path, + sync::Arc, + time::{Duration, Instant}, +}; + +use snops_common::{ + api::EnvInfo, + binaries::{BinaryEntry, BinarySource}, + constant::{SNARKOS_FILE, VERSION_FILE}, + rpc::error::ReconcileError2, + state::{InternedId, TransferId}, +}; +use tracing::trace; + +use super::{default_binary, FileReconciler, Reconcile, ReconcileCondition, ReconcileStatus}; +use crate::state::GlobalState; + +/// Download a specific binary file needed to run the node +pub struct BinaryReconciler<'a> { + pub state: Arc, + pub env_info: Arc, + pub node_binary: Option, + /// Metadata about an active binary transfer + pub binary_transfer: &'a mut Option<(TransferId, BinaryEntry)>, + /// Time the binary was marked as OK + pub binary_ok_at: &'a mut Option, +} + +impl<'a> Reconcile<(), ReconcileError2> for BinaryReconciler<'a> { + async fn reconcile(&mut self) -> Result, ReconcileError2> { + let BinaryReconciler { + state, + env_info, + node_binary, + binary_transfer, + binary_ok_at, + } = self; + + // Binary entry for the node + let default_binary = default_binary(env_info); + let target_binary = env_info + .storage + .binaries + .get(&node_binary.unwrap_or_default()) + .unwrap_or(&default_binary); + + // Check if the binary has changed + let binary_has_changed = binary_transfer + .as_ref() + .map(|(_, b)| b != target_binary) + .unwrap_or(true); + let binary_is_ok = binary_ok_at + .map(|ok| ok.elapsed().as_secs() < 300) // check if the binary has been OK for 5 minutes + .unwrap_or(false); + + // If the binary has not changed and has not expired, we can skip the binary + // 
reconciler + if !binary_has_changed && binary_is_ok { + return Ok(ReconcileStatus::default()); + } + + let src = match &target_binary.source { + BinarySource::Url(url) => url.clone(), + BinarySource::Path(path) => { + let url = format!("{}{}", &state.endpoint, path.display()); + url.parse::() + .map_err(|e| ReconcileError2::UrlParseError(url, e.to_string()))? + } + }; + let dst = state.cli.path.join(SNARKOS_FILE); + + let is_api_offline = state.client.read().await.is_none(); + + let file_res = FileReconciler::new(Arc::clone(state), src, dst) + .with_offline(target_binary.is_api_file() && is_api_offline) + .with_binary(target_binary) + .with_tx_id(binary_transfer.as_ref().map(|(tx, _)| *tx)) + .reconcile() + .await?; + + // transfer is pending or a failure occurred + if file_res.is_requeue() { + return Ok(file_res.emptied().add_scope("file_reconcile/requeue")); + } + + match file_res.inner { + // If the binary is OK, update the context + Some(true) => { + **binary_ok_at = Some(Instant::now()); + Ok(ReconcileStatus::default()) + } + // If the binary is not OK, we will wait for the endpoint to come back + // online... + Some(false) => { + trace!("binary is not OK, waiting for the endpoint to come back online..."); + Ok(ReconcileStatus::empty() + .add_condition(ReconcileCondition::PendingConnection) + .add_scope("agent_state/binary/offline") + .requeue_after(Duration::from_secs(5))) + } + None => unreachable!("file reconciler returns a result when not requeued"), + } + } +} + +pub struct StorageVersionReconciler<'a>(pub &'a Path, pub u16); + +impl<'a> Reconcile<(), ReconcileError2> for StorageVersionReconciler<'a> { + async fn reconcile(&mut self) -> Result, ReconcileError2> { + let StorageVersionReconciler(path, version) = self; + + let version_file = path.join(VERSION_FILE); + + let version_file_data = if !version_file.exists() { + None + } else { + tokio::fs::read_to_string(&version_file) + .await + .map_err(|e| ReconcileError2::FileReadError(version_file.clone(), e.to_string()))? 
+ .parse() + .ok() + }; + + // wipe old storage when the version changes + Ok(if version_file_data != Some(*version) && path.exists() { + let _ = tokio::fs::remove_dir_all(&path).await; + ReconcileStatus::default() + } else { + // return an empty status if the version is the same + ReconcileStatus::empty() + }) + } +} diff --git a/crates/common/src/rpc/error.rs b/crates/common/src/rpc/error.rs index 5948f6cc..a4e635e8 100644 --- a/crates/common/src/rpc/error.rs +++ b/crates/common/src/rpc/error.rs @@ -177,6 +177,8 @@ pub enum ReconcileError2 { url: String, error: String, }, + #[error("failed to spawn process: {0}")] + SpawnError(String), #[error("failed to set file permissions {0}: {1}")] FilePermissionError(PathBuf, String), #[error("failed to parse {0} as a url: {1}")] From b787da5cbdfad7032158f81af15e121e7882e13a Mon Sep 17 00:00:00 2001 From: Meshiest Date: Tue, 19 Nov 2024 00:51:20 -0500 Subject: [PATCH 09/68] feat(agent): genesis reconciler, WIP ledger reconciler --- crates/agent/src/main.rs | 28 ++-- crates/agent/src/reconcile/agent.rs | 99 ++++++++------ crates/agent/src/reconcile/mod.rs | 2 +- crates/agent/src/reconcile/storage.rs | 181 +++++++++++++++++++++++--- crates/agent/src/state.rs | 4 + crates/controlplane/src/server/rpc.rs | 1 + 6 files changed, 243 insertions(+), 72 deletions(-) diff --git a/crates/agent/src/main.rs b/crates/agent/src/main.rs index 3627672c..b3a16feb 100644 --- a/crates/agent/src/main.rs +++ b/crates/agent/src/main.rs @@ -52,17 +52,17 @@ async fn main() { let (endpoint, ws_uri) = args.endpoint_and_uri(); info!("Using endpoint {endpoint}"); - // create the data directory + // Create the data directory tokio::fs::create_dir_all(&args.path) .await .expect("failed to create data path"); - // open the database + // Open the database let db = db::Database::open(&args.path.join("store")).expect("failed to open database"); let client = Default::default(); - // start transfer monitor + // Start transfer monitor let (transfer_tx, transfers) = transfers::start_monitor(Arc::clone(&client)); let agent_rpc_listener = tokio::net::TcpListener::bind((Ipv4Addr::LOCALHOST, 0)) @@ -75,7 +75,7 @@ async fn main() { let (queue_reconcile_tx, mut reconcile_requests) = mpsc::channel(5); - // create the client state + // Create the client state let state = Arc::new(GlobalState { client, _started: Instant::now(), @@ -118,10 +118,10 @@ async fn main() { db: OpaqueDebug(db), }); - // start the metrics watcher + // Start the metrics watcher metrics::init(Arc::clone(&state)); - // start the status server + // Start the status server let status_state = Arc::clone(&state); tokio::spawn(async move { info!("starting status API server on port {agent_rpc_port}"); @@ -131,7 +131,7 @@ async fn main() { } }); - // get the interrupt signals to break the stream connection + // Get the interrupt signals to break the stream connection let mut interrupt = Signals::new(&[SignalKind::terminate(), SignalKind::interrupt()]); let state2 = Arc::clone(&state); @@ -163,9 +163,9 @@ async fn main() { let mut wait = Box::pin(tokio::time::sleep_until(next_reconcile_at.into())); loop { - // await for the next reconcile, allowing for it to be moved up sooner + // Await for the next reconcile, allowing for it to be moved up sooner select! 
{ - // replace the next_reconcile_at with the soonest reconcile time + // Replace the next_reconcile_at with the soonest reconcile time Some(new_reconcile_at) = reconcile_requests.recv() => { next_reconcile_at = next_reconcile_at.min(new_reconcile_at); wait = Box::pin(tokio::time::sleep_until(next_reconcile_at.into())); @@ -173,13 +173,13 @@ async fn main() { _ = &mut wait => {} } - // drain the reconcile request queue + // Drain the reconcile request queue while reconcile_requests.try_recv().is_ok() {} - // schedule the next reconcile for 5 minutes from now - next_reconcile_at = Instant::now() + Duration::from_secs(5 * 60); + // Schedule the next reconcile for 1 week. + next_reconcile_at = Instant::now() + Duration::from_secs(60 * 60 * 24 * 7); - // update the reconciler with the latest agent state - // this prevents the agent state from changing during reconciliation + // Update the reconciler with the latest agent state + // This prevents the agent state from changing during reconciliation root.agent_state = state3.agent_state.read().await.deref().clone(); trace!("reconciling agent state..."); diff --git a/crates/agent/src/reconcile/agent.rs b/crates/agent/src/reconcile/agent.rs index 8725044d..d22477ee 100644 --- a/crates/agent/src/reconcile/agent.rs +++ b/crates/agent/src/reconcile/agent.rs @@ -5,7 +5,9 @@ use snops_common::{ api::EnvInfo, binaries::BinaryEntry, rpc::error::ReconcileError2, - state::{AgentId, AgentPeer, AgentState, NetworkId, NodeState, StorageId, TransferId}, + state::{ + AgentId, AgentPeer, AgentState, HeightRequest, NetworkId, NodeState, StorageId, TransferId, + }, }; use tarpc::context; use tokio::sync::Mutex; @@ -14,7 +16,7 @@ use tracing::{error, warn}; use super::{ command::NodeCommand, process::ProcessContext, - storage::{BinaryReconciler, StorageVersionReconciler}, + storage::{BinaryReconciler, GenesisReconciler, LedgerModifyResult, StorageVersionReconciler}, DirectoryReconciler, Reconcile, ReconcileStatus, }; use crate::state::GlobalState; @@ -27,24 +29,32 @@ pub struct AgentStateReconciler { pub context: AgentStateReconcilerContext, } -type LedgerModifyResult = Result; - #[derive(Default)] struct TransfersContext { // TODO: persist network_id, storage_id, and storage_version network_id: NetworkId, storage_id: StorageId, storage_version: u16, + /// Metadata about an active binary transfer binary_transfer: Option<(TransferId, BinaryEntry)>, /// Time the binary was marked as OK binary_ok_at: Option, + /// Metadata about an active genesis block transfer genesis_transfer: Option, /// Time the genesis block was marked as OK genesis_ok_at: Option, - /// Metadata about an active ledger transfer + + /// The last ledger height that was successfully configured + ledger_last_height: Option, + /// The height that is currently being configured + ledger_pending_height: Option, + + /// Metadata about an active ledger tar file transfer ledger_transfer: Option, + /// Time the ledger tar file was marked as OK + ledger_ok_at: Option, /// A handle containing the task that modifies the ledger. /// The mutex is held until the task is complete, and the bool is set to /// true when the task is successful. @@ -53,8 +63,6 @@ struct TransfersContext { /// The mutex is held until the task is complete, and the bool is set to /// true when the task is successful. 
ledger_unpack_handle: Option<(AbortHandle, Arc>>)>, - /// Time the ledger was marked as OK - ledger_ok_at: Option, } impl TransfersContext { @@ -75,6 +83,18 @@ pub struct AgentStateReconcilerContext { process: Option, } +/// Run a reconciler and return early if a requeue is needed. A condition is +/// added to the scope when a requeue is needed to provide more context when +/// monitoring the agent. +macro_rules! reconcile { + ($id:ident, $e:expr) => { + let res = $e.reconcile().await?; + if res.is_requeue() { + return Ok(res.add_scope(concat!(stringify!($id), "/requeue"))); + } + }; +} + impl Reconcile<(), ReconcileError2> for AgentStateReconciler { async fn reconcile(&mut self) -> Result, ReconcileError2> { match self.agent_state.as_ref() { @@ -108,7 +128,7 @@ impl Reconcile<(), ReconcileError2> for AgentStateReconciler { // node is offline, no need to reconcile if !node.online { // TODO: tear down the node if it is running - return Ok(ReconcileStatus::default().add_scope("agent_state/node/offline")); + return Ok(ReconcileStatus::default().add_scope("agent_state/offline")); } let node_arc = Arc::new(*node.clone()); @@ -136,40 +156,46 @@ impl Reconcile<(), ReconcileError2> for AgentStateReconciler { // Ensure the storage version is correct, deleting the storage path // the version changes. - StorageVersionReconciler(&storage_path, env_info.storage.version) - .reconcile() - .await?; + reconcile!( + storage, + StorageVersionReconciler(&storage_path, env_info.storage.version) + ); // Create the storage path if it does not exist - DirectoryReconciler(&storage_path).reconcile().await?; + reconcile!(dir, DirectoryReconciler(&storage_path)); + + // Resolve the genesis block + reconcile!( + genesis, + GenesisReconciler { + state: Arc::clone(&self.state), + env_info: Arc::clone(&env_info), + transfer: &mut transfers.genesis_transfer, + ok_at: &mut transfers.genesis_ok_at, + } + ); // Resolve the node's binary - let binary_res = BinaryReconciler { - state: Arc::clone(&self.state), - env_info: Arc::clone(&env_info), - node_binary: node.binary, - binary_transfer: &mut transfers.binary_transfer, - binary_ok_at: &mut transfers.binary_ok_at, - } - .reconcile() - .await?; - - if binary_res.is_requeue() { - return Ok(binary_res.add_scope("binary_reconcile/requeue")); - } + reconcile!( + binary, + BinaryReconciler { + state: Arc::clone(&self.state), + env_info: Arc::clone(&env_info), + node_binary: node.binary, + transfer: &mut transfers.binary_transfer, + ok_at: &mut transfers.binary_ok_at, + } + ); // Resolve the addresses of the peers and validators // TODO: Set an expiry for resolved addresses - let addr_res = AddressResolveReconciler { - node: Arc::clone(&node_arc), - state: Arc::clone(&self.state), - } - .reconcile() - .await?; - - if addr_res.is_requeue() { - return Ok(addr_res.add_scope("address_resolve/requeue")); - } + reconcile!( + address_resolve, + AddressResolveReconciler { + node: Arc::clone(&node_arc), + state: Arc::clone(&self.state), + } + ); // TODO: restart the node if the binaries changed. 
this means storing the hashes // of the downloaded files @@ -289,6 +315,3 @@ impl Reconcile<(), ReconcileError2> for AddressResolveReconciler { // https://ledger.aleo.network/mainnet/snapshot/latest.txt // https://ledger.aleo.network/testnet/snapshot/latest.txt // https://ledger.aleo.network/canarynet/snapshot/latest.txt - -// TODO: some kind of reconciler iterator that attempts to reconcile a chain -// until hitting a requeue diff --git a/crates/agent/src/reconcile/mod.rs b/crates/agent/src/reconcile/mod.rs index 693bdbfb..9f60d075 100644 --- a/crates/agent/src/reconcile/mod.rs +++ b/crates/agent/src/reconcile/mod.rs @@ -9,7 +9,7 @@ mod files; pub use files::*; pub mod process; pub mod storage; -use snops_common::state::TransferId; +use snops_common::{rpc::error::ReconcileError2, state::TransferId}; #[derive(Debug, Clone, PartialEq, Eq, PartialOrd, Ord, Hash)] pub enum ReconcileCondition { diff --git a/crates/agent/src/reconcile/storage.rs b/crates/agent/src/reconcile/storage.rs index e08bf7c6..a6fae92a 100644 --- a/crates/agent/src/reconcile/storage.rs +++ b/crates/agent/src/reconcile/storage.rs @@ -7,13 +7,20 @@ use std::{ use snops_common::{ api::EnvInfo, binaries::{BinaryEntry, BinarySource}, - constant::{SNARKOS_FILE, VERSION_FILE}, + constant::{ + LEDGER_BASE_DIR, LEDGER_PERSIST_DIR, SNARKOS_FILE, SNARKOS_GENESIS_FILE, VERSION_FILE, + }, rpc::error::ReconcileError2, - state::{InternedId, TransferId}, + state::{HeightRequest, InternedId, TransferId}, }; +use tokio::{sync::Mutex, task::AbortHandle}; use tracing::trace; +use url::Url; -use super::{default_binary, FileReconciler, Reconcile, ReconcileCondition, ReconcileStatus}; +use super::{ + default_binary, get_genesis_route, DirectoryReconciler, FileReconciler, Reconcile, + ReconcileCondition, ReconcileStatus, +}; use crate::state::GlobalState; /// Download a specific binary file needed to run the node @@ -22,9 +29,9 @@ pub struct BinaryReconciler<'a> { pub env_info: Arc, pub node_binary: Option, /// Metadata about an active binary transfer - pub binary_transfer: &'a mut Option<(TransferId, BinaryEntry)>, + pub transfer: &'a mut Option<(TransferId, BinaryEntry)>, /// Time the binary was marked as OK - pub binary_ok_at: &'a mut Option, + pub ok_at: &'a mut Option, } impl<'a> Reconcile<(), ReconcileError2> for BinaryReconciler<'a> { @@ -33,8 +40,8 @@ impl<'a> Reconcile<(), ReconcileError2> for BinaryReconciler<'a> { state, env_info, node_binary, - binary_transfer, - binary_ok_at, + transfer, + ok_at, } = self; // Binary entry for the node @@ -46,11 +53,11 @@ impl<'a> Reconcile<(), ReconcileError2> for BinaryReconciler<'a> { .unwrap_or(&default_binary); // Check if the binary has changed - let binary_has_changed = binary_transfer + let binary_has_changed = transfer .as_ref() .map(|(_, b)| b != target_binary) .unwrap_or(true); - let binary_is_ok = binary_ok_at + let binary_is_ok = ok_at .map(|ok| ok.elapsed().as_secs() < 300) // check if the binary has been OK for 5 minutes .unwrap_or(false); @@ -70,24 +77,24 @@ impl<'a> Reconcile<(), ReconcileError2> for BinaryReconciler<'a> { }; let dst = state.cli.path.join(SNARKOS_FILE); - let is_api_offline = state.client.read().await.is_none(); - - let file_res = FileReconciler::new(Arc::clone(state), src, dst) - .with_offline(target_binary.is_api_file() && is_api_offline) + let mut file_rec = FileReconciler::new(Arc::clone(state), src, dst) + .with_offline(target_binary.is_api_file() && !state.is_ws_online()) .with_binary(target_binary) - .with_tx_id(binary_transfer.as_ref().map(|(tx, _)| *tx)) - 
.reconcile() - .await?; + .with_tx_id(transfer.as_ref().map(|(tx, _)| *tx)); + let file_res = file_rec.reconcile().await?; + if let Some(tx_id) = file_rec.tx_id { + **transfer = Some((tx_id, target_binary.clone())); + } // transfer is pending or a failure occurred if file_res.is_requeue() { - return Ok(file_res.emptied().add_scope("file_reconcile/requeue")); + return Ok(file_res.emptied().add_scope("file/requeue")); } match file_res.inner { // If the binary is OK, update the context Some(true) => { - **binary_ok_at = Some(Instant::now()); + **ok_at = Some(Instant::now()); Ok(ReconcileStatus::default()) } // If the binary is not OK, we will wait for the endpoint to come back @@ -96,7 +103,83 @@ impl<'a> Reconcile<(), ReconcileError2> for BinaryReconciler<'a> { trace!("binary is not OK, waiting for the endpoint to come back online..."); Ok(ReconcileStatus::empty() .add_condition(ReconcileCondition::PendingConnection) - .add_scope("agent_state/binary/offline") + .add_condition(ReconcileCondition::MissingFile(SNARKOS_FILE.to_string())) + .add_scope("binary/offline") + .requeue_after(Duration::from_secs(5))) + } + None => unreachable!("file reconciler returns a result when not requeued"), + } + } +} + +/// Download the genesis block needed to run the node +pub struct GenesisReconciler<'a> { + pub state: Arc, + pub env_info: Arc, + /// Metadata about an active genesis transfer + pub transfer: &'a mut Option, + /// Time the genesis was marked as OK + pub ok_at: &'a mut Option, +} + +impl<'a> Reconcile<(), ReconcileError2> for GenesisReconciler<'a> { + async fn reconcile(&mut self) -> Result, ReconcileError2> { + let GenesisReconciler { + state, + env_info, + transfer, + ok_at, + } = self; + + let storage_path = state + .cli + .storage_path(env_info.network, env_info.storage.id); + + // Genesis block file has been checked within 5 minutes + let genesis_file_ok = ok_at + .map(|ok| ok.elapsed().as_secs() < 300) + .unwrap_or(false); + + if env_info.storage.native_genesis || !genesis_file_ok { + return Ok(ReconcileStatus::default()); + } + + let genesis_url = get_genesis_route(&state.endpoint, env_info.network, env_info.storage.id); + let mut file_rec = FileReconciler::new( + Arc::clone(&self.state), + genesis_url.parse::().map_err(|e| { + ReconcileError2::UrlParseError(genesis_url.to_string(), e.to_string()) + })?, + storage_path.join(SNARKOS_GENESIS_FILE), + ) + .with_offline(!self.state.is_ws_online()) + .with_tx_id(**transfer); + let file_res = file_rec.reconcile().await?; + + if let Some(tx_id) = file_rec.tx_id { + **transfer = Some(tx_id); + } + + if file_res.is_requeue() { + return Ok(file_res.emptied().add_scope("file/requeue")); + } + + match file_res.inner { + // If the binary is OK, update the context + Some(true) => { + **ok_at = Some(Instant::now()); + Ok(ReconcileStatus::default()) + } + // If the binary is not OK, we will wait for the endpoint to come back + // online... 
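+            // (the requeue below retries every 5 seconds with a PendingConnection
+            // condition until the control plane endpoint is reachable again)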
+ Some(false) => { + trace!("genesis is not OK, waiting for the endpoint to come back online..."); + Ok(ReconcileStatus::empty() + .add_condition(ReconcileCondition::PendingConnection) + .add_condition(ReconcileCondition::MissingFile( + SNARKOS_GENESIS_FILE.to_string(), + )) + .add_scope("genesis/offline") .requeue_after(Duration::from_secs(5))) } None => unreachable!("file reconciler returns a result when not requeued"), @@ -104,6 +187,66 @@ impl<'a> Reconcile<(), ReconcileError2> for BinaryReconciler<'a> { } } +pub type LedgerModifyResult = Result; + +pub struct LedgerReconciler<'a> { + pub state: Arc, + pub env_info: Arc, + pub target_height: HeightRequest, + pub last_height: &'a mut Option, + pub pending_height: &'a mut Option, + pub ok_at: &'a mut Option, + pub transfer: &'a mut Option, + pub modify_handle: &'a mut Option<(AbortHandle, Arc>>)>, + pub unpack_handle: &'a mut Option<(AbortHandle, Arc>>)>, +} + +impl<'a> Reconcile<(), ReconcileError2> for LedgerReconciler<'a> { + async fn reconcile(&mut self) -> Result, ReconcileError2> { + let LedgerReconciler { + state, + env_info, + ok_at, + transfer, + modify_handle, + unpack_handle, + target_height, + last_height, + pending_height, + } = self; + + let network = env_info.network; + let storage_id = env_info.storage.id; + + let (untar_base, untar_dir) = if env_info.storage.persist { + ( + state.cli.storage_path(network, storage_id), + LEDGER_PERSIST_DIR, + ) + } else { + (state.cli.path.clone(), LEDGER_BASE_DIR) + }; + + let ledger_path = untar_base.join(untar_dir); + + DirectoryReconciler(&ledger_path.join(".aleo")) + .reconcile() + .await?; + + // If the ledger is OK and the target height is the top, we can skip the ledger + // reconciler + if env_info.storage.persist && target_height.is_top() && ledger_path.exists() { + return Ok(ReconcileStatus::default()); + } + + // TODO: if pending_height - check unpack/modify handles + + let is_new_env = last_height.is_none(); + + Ok(ReconcileStatus::empty()) + } +} + pub struct StorageVersionReconciler<'a>(pub &'a Path, pub u16); impl<'a> Reconcile<(), ReconcileError2> for StorageVersionReconciler<'a> { diff --git a/crates/agent/src/state.rs b/crates/agent/src/state.rs index 4d5d5518..1c8820e8 100644 --- a/crates/agent/src/state.rs +++ b/crates/agent/src/state.rs @@ -64,6 +64,10 @@ pub struct GlobalState { } impl GlobalState { + pub fn is_ws_online(&self) -> bool { + self.client.try_read().is_ok_and(|c| c.is_some()) + } + // Resolve the addresses of the given agents. 
// Locks resolve_addrs pub async fn agentpeers_to_cli(&self, peers: &[AgentPeer]) -> Vec { diff --git a/crates/controlplane/src/server/rpc.rs b/crates/controlplane/src/server/rpc.rs index cbadd2b3..8ca05d97 100644 --- a/crates/controlplane/src/server/rpc.rs +++ b/crates/controlplane/src/server/rpc.rs @@ -81,6 +81,7 @@ impl ControlService for ControlRpcServer { downloaded_bytes: 0, total_bytes: total, interruption: None, + handle: None, }, ); } From 84a0e7a299ebe858c7bfb6d2b8c353c0cd52c46a Mon Sep 17 00:00:00 2001 From: Meshiest Date: Fri, 22 Nov 2024 22:29:41 -0500 Subject: [PATCH 10/68] feat(agent): WIP continued ledger reconciler --- crates/agent/src/reconcile/agent.rs | 40 +++++--- crates/agent/src/reconcile/mod.rs | 4 + crates/agent/src/reconcile/storage.rs | 108 ++++++++++++++++++++-- crates/common/src/rpc/error.rs | 6 +- crates/common/src/state/height_request.rs | 12 ++- 5 files changed, 149 insertions(+), 21 deletions(-) diff --git a/crates/agent/src/reconcile/agent.rs b/crates/agent/src/reconcile/agent.rs index d22477ee..b70d936a 100644 --- a/crates/agent/src/reconcile/agent.rs +++ b/crates/agent/src/reconcile/agent.rs @@ -1,6 +1,5 @@ use std::{collections::HashSet, sync::Arc, time::Instant}; -use futures::stream::AbortHandle; use snops_common::{ api::EnvInfo, binaries::BinaryEntry, @@ -10,7 +9,7 @@ use snops_common::{ }, }; use tarpc::context; -use tokio::sync::Mutex; +use tokio::{sync::Mutex, task::AbortHandle}; use tracing::{error, warn}; use super::{ @@ -19,7 +18,7 @@ use super::{ storage::{BinaryReconciler, GenesisReconciler, LedgerModifyResult, StorageVersionReconciler}, DirectoryReconciler, Reconcile, ReconcileStatus, }; -use crate::state::GlobalState; +use crate::{reconcile::storage::LedgerReconciler, state::GlobalState}; /// Attempt to reconcile the agent's current state. /// This will download files and start/stop the node @@ -31,10 +30,12 @@ pub struct AgentStateReconciler { #[derive(Default)] struct TransfersContext { - // TODO: persist network_id, storage_id, and storage_version + // TODO: persist network_id, storage_id, storage_version, and ledger_last_height network_id: NetworkId, storage_id: StorageId, storage_version: u16, + /// The last ledger height that was successfully configured + ledger_last_height: Option<(usize, HeightRequest)>, /// Metadata about an active binary transfer binary_transfer: Option<(TransferId, BinaryEntry)>, @@ -46,19 +47,18 @@ struct TransfersContext { /// Time the genesis block was marked as OK genesis_ok_at: Option, - /// The last ledger height that was successfully configured - ledger_last_height: Option, /// The height that is currently being configured - ledger_pending_height: Option, + ledger_pending_height: Option<(usize, HeightRequest)>, - /// Metadata about an active ledger tar file transfer - ledger_transfer: Option, - /// Time the ledger tar file was marked as OK - ledger_ok_at: Option, /// A handle containing the task that modifies the ledger. /// The mutex is held until the task is complete, and the bool is set to /// true when the task is successful. ledger_modify_handle: Option<(AbortHandle, Arc>>)>, + + /// Time the ledger tar file was marked as OK + ledger_ok_at: Option, + /// Metadata about an active ledger tar file transfer + ledger_transfer: Option, /// A handle containing the task that unzips the ledger. /// The mutex is held until the task is complete, and the bool is set to /// true when the task is successful. 
@@ -134,7 +134,8 @@ impl Reconcile<(), ReconcileError2> for AgentStateReconciler { let node_arc = Arc::new(*node.clone()); if storage_has_changed { - // TODO: abort any ongoing transfers, then requeue + // TODO: abort any ongoing transfers (binary/file), then + // requeue } // initialize the transfers context with the current status @@ -187,6 +188,21 @@ impl Reconcile<(), ReconcileError2> for AgentStateReconciler { } ); + reconcile!( + ledger, + LedgerReconciler { + state: Arc::clone(&self.state), + env_info: Arc::clone(&env_info), + ok_at: &mut transfers.ledger_ok_at, + transfer: &mut transfers.ledger_transfer, + modify_handle: &mut transfers.ledger_modify_handle, + unpack_handle: &mut transfers.ledger_unpack_handle, + target_height: node.height, + last_height: &mut transfers.ledger_last_height, + pending_height: &mut transfers.ledger_pending_height, + } + ); + // Resolve the addresses of the peers and validators // TODO: Set an expiry for resolved addresses reconcile!( diff --git a/crates/agent/src/reconcile/mod.rs b/crates/agent/src/reconcile/mod.rs index 9f60d075..24d746f1 100644 --- a/crates/agent/src/reconcile/mod.rs +++ b/crates/agent/src/reconcile/mod.rs @@ -22,6 +22,10 @@ pub enum ReconcileCondition { PendingProcess(String), /// A tranfer was started and interrupted. InterruptedTransfer(String, TransferId, String), + /// An unpack operation was started and interrupted. + InterruptedUnpack(String), + /// A modify operation was started and interrupted. + InterruptedModify(String), /// A file is missing and cannot be downloaded at the moment. MissingFile(String), /// Waiting to reconnect to the controlplane diff --git a/crates/agent/src/reconcile/storage.rs b/crates/agent/src/reconcile/storage.rs index a6fae92a..356e5ee0 100644 --- a/crates/agent/src/reconcile/storage.rs +++ b/crates/agent/src/reconcile/storage.rs @@ -4,17 +4,19 @@ use std::{ time::{Duration, Instant}, }; +use snops_checkpoint::CheckpointManager; use snops_common::{ api::EnvInfo, binaries::{BinaryEntry, BinarySource}, constant::{ LEDGER_BASE_DIR, LEDGER_PERSIST_DIR, SNARKOS_FILE, SNARKOS_GENESIS_FILE, VERSION_FILE, }, - rpc::error::ReconcileError2, + db::error, + rpc::error::{ReconcileError, ReconcileError2}, state::{HeightRequest, InternedId, TransferId}, }; use tokio::{sync::Mutex, task::AbortHandle}; -use tracing::trace; +use tracing::{error, trace}; use url::Url; use super::{ @@ -192,9 +194,9 @@ pub type LedgerModifyResult = Result; pub struct LedgerReconciler<'a> { pub state: Arc, pub env_info: Arc, - pub target_height: HeightRequest, - pub last_height: &'a mut Option, - pub pending_height: &'a mut Option, + pub target_height: (usize, HeightRequest), + pub last_height: &'a mut Option<(usize, HeightRequest)>, + pub pending_height: &'a mut Option<(usize, HeightRequest)>, pub ok_at: &'a mut Option, pub transfer: &'a mut Option, pub modify_handle: &'a mut Option<(AbortHandle, Arc>>)>, @@ -217,6 +219,7 @@ impl<'a> Reconcile<(), ReconcileError2> for LedgerReconciler<'a> { let network = env_info.network; let storage_id = env_info.storage.id; + let is_persist = env_info.storage.persist; let (untar_base, untar_dir) = if env_info.storage.persist { ( @@ -229,19 +232,110 @@ impl<'a> Reconcile<(), ReconcileError2> for LedgerReconciler<'a> { let ledger_path = untar_base.join(untar_dir); + // TODO: implement a heightrequest that downloads a remote ledger + + // TODO: only call this after unpacking the ledger DirectoryReconciler(&ledger_path.join(".aleo")) .reconcile() .await?; + // defaulting the initial height allows the 
reconciler to treat + // a persisted env with non-top target heights as a request to delete + // the ledger + if last_height.is_none() { + // the default last height is the top when persisting + // and 0 when not persisting (clean ledger) + **last_height = Some(( + 0, + if is_persist { + HeightRequest::Top + } else { + HeightRequest::Absolute(0) + }, + )); + + // delete ledger because no last_height indicates a fresh env + if !is_persist { + let _ = tokio::fs::remove_dir_all(&ledger_path).await; + } + } + let last_height = last_height.as_mut().unwrap(); + + // If there is no pending height, check if there should be a pending height + if pending_height.is_none() { + // target height has been realized + if last_height == target_height { + return Ok(ReconcileStatus::default()); + } + + // If the target height is the top, we can skip the ledger reconciler + if target_height.1.is_top() { + *last_height = *target_height; + // ledger operation is complete + return Ok(ReconcileStatus::default()); + } + + // If the target height is 0, we can delete the ledger + if target_height.1.reset() { + let _ = tokio::fs::remove_dir_all(&ledger_path).await; + *last_height = *target_height; + // ledger operation is complete + return Ok(ReconcileStatus::default()); + } + + // TODO: ledger URL handling here instead of retention policy + + // Target height is guaranteed to be different, not top, and not 0, which means + // it's up to the retention policies + + // If there's a retention policy, load the checkpoint manager + // this is so we can wipe all leftover checkpoints for non-persisted storage + // after resets or new environments + let Some(mut manager) = env_info + .storage + .retention_policy + .clone() + .map(|policy| { + trace!("loading checkpoints from {untar_base:?}..."); + CheckpointManager::load(ledger_path.clone(), policy).map_err(|e| { + error!("failed to load checkpoints: {e}"); + ReconcileError2::CheckpointLoadError(e.to_string()) + }) + }) + .transpose()? + else { + // if there is no retention policy, this height request cannot be fulfilled + return Err(ReconcileError2::MissingRetentionPolicy(target_height.1)); + }; + + // TODO: find_by_span logic + } + let pending = pending_height.unwrap(); + + // If the target height changed while processing the last target height + // wait for the previous procedure to complete before starting a new one. 
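+        // (for now the pending height is simply cleared and the reconcile requeues
+        // after a second; finishing the in-flight modify first is still a TODO below)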
+ if *target_height != pending { + // TODO: complete current procedure before starting a new one + + // clear current pending height + **pending_height = None; + + return Ok(ReconcileStatus::empty() + .add_condition(ReconcileCondition::InterruptedModify(String::from( + "target height changed", + ))) + .requeue_after(Duration::from_secs(1))); + } + // If the ledger is OK and the target height is the top, we can skip the ledger // reconciler - if env_info.storage.persist && target_height.is_top() && ledger_path.exists() { + if is_persist && target_height.1.is_top() && ledger_path.exists() { return Ok(ReconcileStatus::default()); } // TODO: if pending_height - check unpack/modify handles - let is_new_env = last_height.is_none(); + // let is_new_env = last_height.is_none(); Ok(ReconcileStatus::empty()) } diff --git a/crates/common/src/rpc/error.rs b/crates/common/src/rpc/error.rs index a4e635e8..a201a225 100644 --- a/crates/common/src/rpc/error.rs +++ b/crates/common/src/rpc/error.rs @@ -4,7 +4,7 @@ use serde::{Deserialize, Serialize}; use strum_macros::AsRefStr; use thiserror::Error; -use crate::state::EnvId; +use crate::state::{EnvId, HeightRequest}; #[macro_export] macro_rules! impl_into_type_str { @@ -183,4 +183,8 @@ pub enum ReconcileError2 { FilePermissionError(PathBuf, String), #[error("failed to parse {0} as a url: {1}")] UrlParseError(String, String), + #[error("error loading checkpoints: {0}")] + CheckpointLoadError(String), + #[error("missing retention policy for request: {0}")] + MissingRetentionPolicy(HeightRequest), } diff --git a/crates/common/src/state/height_request.rs b/crates/common/src/state/height_request.rs index c66ec7d5..e882ab9f 100644 --- a/crates/common/src/state/height_request.rs +++ b/crates/common/src/state/height_request.rs @@ -1,4 +1,4 @@ -use std::str::FromStr; +use std::{fmt::Display, str::FromStr}; use snops_checkpoint::RetentionSpan; @@ -45,6 +45,16 @@ impl FromStr for DocHeightRequest { } } +impl Display for HeightRequest { + fn fmt(&self, f: &mut std::fmt::Formatter<'_>) -> std::fmt::Result { + match self { + HeightRequest::Top => write!(f, "top"), + HeightRequest::Absolute(h) => write!(f, "{h}"), + HeightRequest::Checkpoint(c) => write!(f, "{c}"), + } + } +} + impl DataFormat for DocHeightRequest { type Header = (u8, DataHeaderOf); const LATEST_HEADER: Self::Header = (1, RetentionSpan::LATEST_HEADER); From 4d6173b4a4e432ceb93be85d62ab03c6061274a7 Mon Sep 17 00:00:00 2001 From: Meshiest Date: Sat, 23 Nov 2024 14:32:40 -0500 Subject: [PATCH 11/68] feat(agent): ledger reconciler modification task --- Cargo.lock | 1 + crates/agent/Cargo.toml | 1 + crates/agent/src/reconcile/agent.rs | 12 - crates/agent/src/reconcile/checkpoint.rs | 11 +- crates/agent/src/reconcile/mod.rs | 4 - crates/agent/src/reconcile/storage.rs | 296 +++++++++++++++------- crates/common/src/rpc/error.rs | 8 +- crates/common/src/state/height_request.rs | 4 +- 8 files changed, 225 insertions(+), 112 deletions(-) diff --git a/Cargo.lock b/Cargo.lock index 1dc29f6b..70d6642c 100644 --- a/Cargo.lock +++ b/Cargo.lock @@ -4590,6 +4590,7 @@ dependencies = [ "http 1.1.0", "httpdate", "indexmap 2.6.0", + "lazysort", "local-ip-address", "nix", "reqwest 0.12.8", diff --git a/crates/agent/Cargo.toml b/crates/agent/Cargo.toml index 282cbc4e..7a5f2a76 100644 --- a/crates/agent/Cargo.toml +++ b/crates/agent/Cargo.toml @@ -24,6 +24,7 @@ futures-util.workspace = true http.workspace = true httpdate.workspace = true indexmap.workspace = true +lazysort.workspace = true local-ip-address.workspace = true nix = { 
workspace = true, features = ["signal"] } reqwest = { workspace = true, features = ["json", "stream"] } diff --git a/crates/agent/src/reconcile/agent.rs b/crates/agent/src/reconcile/agent.rs index b70d936a..8ab0f36b 100644 --- a/crates/agent/src/reconcile/agent.rs +++ b/crates/agent/src/reconcile/agent.rs @@ -54,15 +54,6 @@ struct TransfersContext { /// The mutex is held until the task is complete, and the bool is set to /// true when the task is successful. ledger_modify_handle: Option<(AbortHandle, Arc>>)>, - - /// Time the ledger tar file was marked as OK - ledger_ok_at: Option, - /// Metadata about an active ledger tar file transfer - ledger_transfer: Option, - /// A handle containing the task that unzips the ledger. - /// The mutex is held until the task is complete, and the bool is set to - /// true when the task is successful. - ledger_unpack_handle: Option<(AbortHandle, Arc>>)>, } impl TransfersContext { @@ -193,10 +184,7 @@ impl Reconcile<(), ReconcileError2> for AgentStateReconciler { LedgerReconciler { state: Arc::clone(&self.state), env_info: Arc::clone(&env_info), - ok_at: &mut transfers.ledger_ok_at, - transfer: &mut transfers.ledger_transfer, modify_handle: &mut transfers.ledger_modify_handle, - unpack_handle: &mut transfers.ledger_unpack_handle, target_height: node.height, last_height: &mut transfers.ledger_last_height, pending_height: &mut transfers.ledger_pending_height, diff --git a/crates/agent/src/reconcile/checkpoint.rs b/crates/agent/src/reconcile/checkpoint.rs index b8003e2b..9166d633 100644 --- a/crates/agent/src/reconcile/checkpoint.rs +++ b/crates/agent/src/reconcile/checkpoint.rs @@ -9,7 +9,7 @@ use snops_common::{ rpc::error::ReconcileError, state::{NetworkId, StorageId}, }; -use tracing::{error, info}; +use tracing::{error, info, trace}; use crate::{api, state::GlobalState}; @@ -28,16 +28,17 @@ impl<'a> CheckpointSource<'a> { ) -> Result { Ok(match self { CheckpointSource::Meta(meta) => { - info!( + trace!( "using checkpoint from control plane with height {} and time {}", - meta.height, meta.timestamp + meta.height, + meta.timestamp ); let checkpoint_url = format!( "{}/content/storage/{network}/{storage_id}/{}", &state.endpoint, meta.filename ); let path = storage_path.join(&meta.filename); - info!("downloading {} from {checkpoint_url}...", meta.filename); + trace!("downloading {} from {checkpoint_url}...", meta.filename); api::check_file(checkpoint_url, &path, state.transfer_tx()) .await @@ -52,7 +53,7 @@ impl<'a> CheckpointSource<'a> { path } CheckpointSource::Manager(header, path) => { - info!( + trace!( "using checkpoint from manager with height {} and time {}", header.block_height, header.time() diff --git a/crates/agent/src/reconcile/mod.rs b/crates/agent/src/reconcile/mod.rs index 24d746f1..650be3ab 100644 --- a/crates/agent/src/reconcile/mod.rs +++ b/crates/agent/src/reconcile/mod.rs @@ -15,15 +15,11 @@ use snops_common::{rpc::error::ReconcileError2, state::TransferId}; pub enum ReconcileCondition { /// A file is being transferred. PendingTransfer(String, TransferId), - /// A file is being unpacked. - PendingUnpack(String), /// A process is being spawned / confirmed. Could be starting the node or /// manipulating the ledger PendingProcess(String), /// A tranfer was started and interrupted. InterruptedTransfer(String, TransferId, String), - /// An unpack operation was started and interrupted. - InterruptedUnpack(String), /// A modify operation was started and interrupted. 
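     /// (e.g. the ledger target height changed while a checkpoint apply was in flight)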
InterruptedModify(String), /// A file is missing and cannot be downloaded at the moment. diff --git a/crates/agent/src/reconcile/storage.rs b/crates/agent/src/reconcile/storage.rs index 356e5ee0..63707955 100644 --- a/crates/agent/src/reconcile/storage.rs +++ b/crates/agent/src/reconcile/storage.rs @@ -1,9 +1,10 @@ use std::{ - path::Path, + path::{Path, PathBuf}, sync::Arc, time::{Duration, Instant}, }; +use lazysort::SortedBy; use snops_checkpoint::CheckpointManager; use snops_common::{ api::EnvInfo, @@ -11,11 +12,10 @@ use snops_common::{ constant::{ LEDGER_BASE_DIR, LEDGER_PERSIST_DIR, SNARKOS_FILE, SNARKOS_GENESIS_FILE, VERSION_FILE, }, - db::error, - rpc::error::{ReconcileError, ReconcileError2}, + rpc::error::ReconcileError2, state::{HeightRequest, InternedId, TransferId}, }; -use tokio::{sync::Mutex, task::AbortHandle}; +use tokio::{process::Command, sync::Mutex, task::AbortHandle}; use tracing::{error, trace}; use url::Url; @@ -88,7 +88,7 @@ impl<'a> Reconcile<(), ReconcileError2> for BinaryReconciler<'a> { **transfer = Some((tx_id, target_binary.clone())); } - // transfer is pending or a failure occurred + // Transfer is pending or a failure occurred if file_res.is_requeue() { return Ok(file_res.emptied().add_scope("file/requeue")); } @@ -197,55 +197,157 @@ pub struct LedgerReconciler<'a> { pub target_height: (usize, HeightRequest), pub last_height: &'a mut Option<(usize, HeightRequest)>, pub pending_height: &'a mut Option<(usize, HeightRequest)>, - pub ok_at: &'a mut Option, - pub transfer: &'a mut Option, pub modify_handle: &'a mut Option<(AbortHandle, Arc>>)>, - pub unpack_handle: &'a mut Option<(AbortHandle, Arc>>)>, } -impl<'a> Reconcile<(), ReconcileError2> for LedgerReconciler<'a> { - async fn reconcile(&mut self) -> Result, ReconcileError2> { - let LedgerReconciler { - state, - env_info, - ok_at, - transfer, - modify_handle, - unpack_handle, - target_height, - last_height, - pending_height, - } = self; - - let network = env_info.network; - let storage_id = env_info.storage.id; - let is_persist = env_info.storage.persist; - - let (untar_base, untar_dir) = if env_info.storage.persist { +impl<'a> LedgerReconciler<'a> { + pub fn untar_paths(&self) -> (PathBuf, &'static str) { + if self.env_info.storage.persist { ( - state.cli.storage_path(network, storage_id), + self.state + .cli + .storage_path(self.env_info.network, self.env_info.storage.id), LEDGER_PERSIST_DIR, ) } else { - (state.cli.path.clone(), LEDGER_BASE_DIR) - }; + (self.state.cli.path.clone(), LEDGER_BASE_DIR) + } + } - let ledger_path = untar_base.join(untar_dir); + pub fn ledger_path(&self) -> PathBuf { + let (path, dir) = self.untar_paths(); + path.join(dir) + } - // TODO: implement a heightrequest that downloads a remote ledger + /// Find the checkpoint to apply to the ledger + /// Guaranteed error when target height is not the top, 0, or unlimited span + pub fn find_checkpoint(&self) -> Result { + let (untar_base, ledger_dir) = self.untar_paths(); + let ledger_path = untar_base.join(ledger_dir); + + // If there's a retention policy, load the checkpoint manager + // this is so we can wipe all leftover checkpoints for non-persisted storage + // after resets or new environments + let manager = self + .env_info + .storage + .retention_policy + .clone() + .map(|policy| { + trace!("loading checkpoints from {untar_base:?}..."); + CheckpointManager::load(ledger_path.clone(), policy).map_err(|e| { + error!("failed to load checkpoints: {e}"); + ReconcileError2::CheckpointLoadError(e.to_string()) + }) + }) + 
.transpose()? + .ok_or(ReconcileError2::MissingRetentionPolicy( + self.target_height.1, + ))?; + + // Determine which checkpoint to use by the next available height/time + match self.target_height.1 { + HeightRequest::Absolute(height) => manager + .checkpoints() + .sorted_by(|(a, _), (b, _)| b.block_height.cmp(&a.block_height)) + .find_map(|(c, path)| (c.block_height <= height).then_some(path)), + HeightRequest::Checkpoint(span) => span.as_timestamp().and_then(|timestamp| { + manager + .checkpoints() + .sorted_by(|(a, _), (b, _)| b.timestamp.cmp(&a.timestamp)) + .find_map(|(c, path)| (c.timestamp <= timestamp).then_some(path)) + }), + // top cannot be a target height + _ => None, + } + .ok_or(ReconcileError2::NoAvailableCheckpoints( + self.target_height.1, + )) + .cloned() + } - // TODO: only call this after unpacking the ledger - DirectoryReconciler(&ledger_path.join(".aleo")) - .reconcile() - .await?; + pub fn spawn_modify( + &self, + checkpoint: PathBuf, + ) -> (AbortHandle, Arc>>) { + let result = Arc::new(Mutex::new(None)); + let result2 = Arc::clone(&result); + + let is_native_genesis = self.env_info.storage.native_genesis; + let snarkos_path = self.state.cli.path.join(SNARKOS_FILE); + let network = self.env_info.network; + let storage_path = self + .state + .cli + .storage_path(network, self.env_info.storage.id); + let ledger_path = self.ledger_path(); + + // apply the checkpoint to the ledger + let mut command = Command::new(snarkos_path); + command + .stdout(std::io::stdout()) + .stderr(std::io::stderr()) + .env("NETWORK", network.to_string()) + .arg("ledger") + .arg("--ledger") + .arg(&ledger_path); + + if !is_native_genesis { + command + .arg("--genesis") + .arg(storage_path.join(SNARKOS_GENESIS_FILE)); + } + + command.arg("checkpoint").arg("apply").arg(checkpoint); + + let handle = tokio::spawn(async move { + let mut mutex = result.lock().await; + + let res = command + .spawn() + .map_err(|e| { + error!("failed to spawn checkpoint apply process: {e}"); + mutex.replace(Err(ReconcileError2::CheckpointApplyError(String::from( + "spawn checkpoint apply process", + )))); + })? + .wait() + .await + .map_err(|e| { + error!("failed to await checkpoint apply process: {e}"); + mutex.replace(Err(ReconcileError2::CheckpointApplyError(String::from( + "await checkpoint apply process", + )))); + })?; + + mutex.replace(Ok(res.success())); + + Ok::<(), ()>(()) + }) + .abort_handle(); + + (handle, result2) + } +} + +impl<'a> Reconcile<(), ReconcileError2> for LedgerReconciler<'a> { + async fn reconcile(&mut self) -> Result, ReconcileError2> { + let env_info = self.env_info.clone(); + let target_height = self.target_height; + + let ledger_path = self.ledger_path(); - // defaulting the initial height allows the reconciler to treat + // Ledger reconcile behavior is different depending on whether the storage is + // persistent. 
+ let is_persist = env_info.storage.persist; + + // Defaulting the initial height allows the reconciler to treat // a persisted env with non-top target heights as a request to delete // the ledger - if last_height.is_none() { - // the default last height is the top when persisting + if self.last_height.is_none() { + // The default last height is the top when persisting // and 0 when not persisting (clean ledger) - **last_height = Some(( + *self.last_height = Some(( 0, if is_persist { HeightRequest::Top @@ -259,18 +361,24 @@ impl<'a> Reconcile<(), ReconcileError2> for LedgerReconciler<'a> { let _ = tokio::fs::remove_dir_all(&ledger_path).await; } } - let last_height = last_height.as_mut().unwrap(); + let last_height = self.last_height.as_mut().unwrap(); + + // TODO: only call this after unpacking the ledger + // create the ledger path if it doesn't exist + DirectoryReconciler(&ledger_path.join(".aleo")) + .reconcile() + .await?; // If there is no pending height, check if there should be a pending height - if pending_height.is_none() { + if self.pending_height.is_none() { // target height has been realized - if last_height == target_height { + if *last_height == target_height { return Ok(ReconcileStatus::default()); } // If the target height is the top, we can skip the ledger reconciler if target_height.1.is_top() { - *last_height = *target_height; + *last_height = target_height; // ledger operation is complete return Ok(ReconcileStatus::default()); } @@ -278,66 +386,78 @@ impl<'a> Reconcile<(), ReconcileError2> for LedgerReconciler<'a> { // If the target height is 0, we can delete the ledger if target_height.1.reset() { let _ = tokio::fs::remove_dir_all(&ledger_path).await; - *last_height = *target_height; - // ledger operation is complete - return Ok(ReconcileStatus::default()); + *last_height = target_height; + // Ledger operation is complete... immediately requeue because the ledger was + // wiped + return Ok(ReconcileStatus::default().requeue_after(Duration::from_secs(0))); } - // TODO: ledger URL handling here instead of retention policy - // Target height is guaranteed to be different, not top, and not 0, which means // it's up to the retention policies - // If there's a retention policy, load the checkpoint manager - // this is so we can wipe all leftover checkpoints for non-persisted storage - // after resets or new environments - let Some(mut manager) = env_info - .storage - .retention_policy - .clone() - .map(|policy| { - trace!("loading checkpoints from {untar_base:?}..."); - CheckpointManager::load(ledger_path.clone(), policy).map_err(|e| { - error!("failed to load checkpoints: {e}"); - ReconcileError2::CheckpointLoadError(e.to_string()) - }) - }) - .transpose()? - else { - // if there is no retention policy, this height request cannot be fulfilled - return Err(ReconcileError2::MissingRetentionPolicy(target_height.1)); - }; - - // TODO: find_by_span logic - } - let pending = pending_height.unwrap(); + // TODO: implement a heightrequest that downloads a remote ledger + // TODO: ledger URL handling here instead of retention policy + // TODO: ledger downloading would enter a new code path that downloads a new one - // If the target height changed while processing the last target height - // wait for the previous procedure to complete before starting a new one. 
-        if *target_height != pending {
-            // TODO: complete current procedure before starting a new one
+            // Find the checkpoint for the reconciler's target height
+            let checkpoint = self.find_checkpoint()?;
+            // Start a task to modify the ledger with the checkpoint
+            *self.modify_handle = Some(self.spawn_modify(checkpoint));
+            // Now that a task is running, set the pending height
+            *self.pending_height = Some(target_height);
 
-            // clear current pending height
-            **pending_height = None;
+            return Ok(ReconcileStatus::empty()
+                .add_condition(ReconcileCondition::PendingProcess(format!(
+                    "ledger modification to height {}",
+                    target_height.1
+                )))
+                .requeue_after(Duration::from_secs(5)));
+        }
+        let pending = self.pending_height.unwrap();
 
+        let Some(modify_handle) = self.modify_handle.as_mut() else {
+            // This should be an unreachable condition, but may not be unreachable
+            // when more complex ledger operations are implemented
+            error!("modify handle missing for pending height");
+            *self.pending_height = None;
             return Ok(ReconcileStatus::empty()
                 .add_condition(ReconcileCondition::InterruptedModify(String::from(
-                    "target height changed",
+                    "modify handle missing",
                 )))
                 .requeue_after(Duration::from_secs(1)));
-        }
+        };
 
-        // If the ledger is OK and the target height is the top, we can skip the ledger
-        // reconciler
-        if is_persist && target_height.1.is_top() && ledger_path.exists() {
-            return Ok(ReconcileStatus::default());
-        }
+        // If the modify handle is locked, requeue until it's unlocked
+        let Ok(Some(handle)) = modify_handle.1.try_lock().map(|r| r.clone()) else {
+            return Ok(ReconcileStatus::empty()
+                .add_condition(ReconcileCondition::PendingProcess(format!(
+                    "ledger modification to height {}",
+                    target_height.1
+                )))
+                .requeue_after(Duration::from_secs(1)));
+        };
 
-        // TODO: if pending_height - check unpack/modify handles
+        match handle {
+            // If the ledger was modified successfully, update the last height
+            Ok(true) => {
+                *last_height = pending;
+            }
+            // A failure in the ledger modification process is not handled at the
+            // moment...
+            Ok(false) => {
+                error!("ledger modification to height {} failed", target_height.1);
+                // TODO: handle this failure
+            }
+            // Bubble an actual error up to the caller
+            Err(err) => return Err(err.clone()),
+        };
 
-        // let is_new_env = last_height.is_none();
+        // Modification is complete. The last height is changed when the modification
+        // succeeds (above)
+        *self.pending_height = None;
+        *self.modify_handle = None;
 
-        Ok(ReconcileStatus::empty())
+        Ok(ReconcileStatus::default())
     }
 }
diff --git a/crates/common/src/rpc/error.rs b/crates/common/src/rpc/error.rs
index a201a225..d83234bd 100644
--- a/crates/common/src/rpc/error.rs
+++ b/crates/common/src/rpc/error.rs
@@ -115,7 +115,7 @@ pub enum SnarkosRequestError {
     TimedOut,
 }
 
-#[derive(Debug, Error, Serialize, Deserialize, AsRefStr)]
+#[derive(Debug, Clone, Error, Serialize, Deserialize, AsRefStr)]
 pub enum ResolveError {
     #[error("source agent not found")]
     SourceAgentNotFound,
@@ -151,7 +151,7 @@ pub enum ReconcileError {
     Unknown,
 }
 
-#[derive(Debug, Error, Serialize, Deserialize, AsRefStr)]
+#[derive(Debug, Clone, Error, Serialize, Deserialize, AsRefStr)]
 pub enum ReconcileError2 {
     #[error("node is not connected to the controlplane")]
     Offline,
@@ -187,4 +187,8 @@ pub enum ReconcileError2 {
     CheckpointLoadError(String),
     #[error("missing retention policy for request: {0}")]
     MissingRetentionPolicy(HeightRequest),
+    #[error("no available checkpoints for request: {0}")]
+    NoAvailableCheckpoints(HeightRequest),
+    #[error("failed to apply checkpoint: {0}")]
+    CheckpointApplyError(String),
 }
diff --git a/crates/common/src/state/height_request.rs b/crates/common/src/state/height_request.rs
index e882ab9f..30ec7484 100644
--- a/crates/common/src/state/height_request.rs
+++ b/crates/common/src/state/height_request.rs
@@ -161,7 +161,9 @@ impl HeightRequest {
     }
 
     pub fn reset(&self) -> bool {
-        *self == Self::Absolute(0)
+        // height 0 = genesis block
+        // checkpoint an unlimited time in the past is also a reset
+        *self == Self::Absolute(0) || *self == Self::Checkpoint(RetentionSpan::Unlimited)
     }
 }

From 9f87fe87e15fec8a9c88472a4f970e7a9c368973 Mon Sep 17 00:00:00 2001
From: Meshiest
Date: Sat, 23 Nov 2024 23:00:41 -0500
Subject: [PATCH 12/68] feat(agent): WIP process shutdown reconciler

---
 crates/agent/src/main.rs              |  83 ++++---------------
 crates/agent/src/reconcile/agent.rs   | 111 ++++++++++++++++++++----
 crates/agent/src/reconcile/mod.rs     |  15 ++--
 crates/agent/src/reconcile/process.rs |  42 +++++++++-
 crates/agent/src/reconcile/storage.rs |   4 +-
 5 files changed, 155 insertions(+), 100 deletions(-)

diff --git a/crates/agent/src/main.rs b/crates/agent/src/main.rs
index b3a16feb..6adbcfe1 100644
--- a/crates/agent/src/main.rs
+++ b/crates/agent/src/main.rs
@@ -21,7 +21,7 @@ use clap::Parser;
 use cli::Cli;
 use futures_util::stream::{FuturesUnordered, StreamExt};
 use log::init_logging;
-use reconcile::{agent::AgentStateReconciler, Reconcile};
+use reconcile::{agent::AgentStateReconciler, process::EndProcessReconciler, Reconcile};
 use snops_common::{db::Database, util::OpaqueDebug};
 use tokio::{
     select,
     signal::unix::{signal, Signal, SignalKind},
     sync::{mpsc, RwLock},
 };
@@ -73,7 +73,7 @@ async fn main() {
         .expect("failed to get status server port")
         .port();
 
-    let (queue_reconcile_tx, mut reconcile_requests) = mpsc::channel(5);
+    let (queue_reconcile_tx, reconcile_requests) = mpsc::channel(5);
 
     // Create the client state
     let state = Arc::new(GlobalState {
@@ -135,7 +135,7 @@ async fn main() {
     let mut interrupt = Signals::new(&[SignalKind::terminate(), SignalKind::interrupt()]);
 
     let state2 = Arc::clone(&state);
-    let connection_loop = Box::pin(async move {
+    tokio::spawn(async move {
         loop {
             let req = client::new_ws_request(&ws_uri, state2.db.jwt());
             client::ws_connection(req, Arc::clone(&state2)).await;
@@ -144,75 +144,24 @@ async fn main() {
         }
     });
 
-    let state3 = Arc::clone(&state);
-    let reconcile_loop = 
Box::pin(async move { - let mut err_backoff = 0; - - // Root reconciler that walks through configuring the agent. - // The context is mutated while reconciling to keep track of things - // like downloads, ledger manipulations, node command, and more. - let mut root = AgentStateReconciler { - agent_state: Arc::clone(state3.agent_state.read().await.deref()), - state: Arc::clone(&state3), - context: Default::default(), - }; - - // The first reconcile is scheduled for 5 seconds after startup. - // Connecting to the controlplane will likely trigger a reconcile sooner. - let mut next_reconcile_at = Instant::now() + Duration::from_secs(5); - let mut wait = Box::pin(tokio::time::sleep_until(next_reconcile_at.into())); - - loop { - // Await for the next reconcile, allowing for it to be moved up sooner - select! { - // Replace the next_reconcile_at with the soonest reconcile time - Some(new_reconcile_at) = reconcile_requests.recv() => { - next_reconcile_at = next_reconcile_at.min(new_reconcile_at); - wait = Box::pin(tokio::time::sleep_until(next_reconcile_at.into())); - }, - _ = &mut wait => {} - } - - // Drain the reconcile request queue - while reconcile_requests.try_recv().is_ok() {} - // Schedule the next reconcile for 1 week. - next_reconcile_at = Instant::now() + Duration::from_secs(60 * 60 * 24 * 7); - - // Update the reconciler with the latest agent state - // This prevents the agent state from changing during reconciliation - root.agent_state = state3.agent_state.read().await.deref().clone(); - - trace!("reconciling agent state..."); - match root.reconcile().await { - Ok(status) => { - if status.inner.is_some() { - trace!("reconcile completed"); - } - if !status.conditions.is_empty() { - trace!("reconcile conditions: {:?}", status.conditions); - } - if let Some(requeue_after) = status.requeue_after { - next_reconcile_at = Instant::now() + requeue_after; - } - } - Err(e) => { - error!("failed to reconcile agent state: {e}"); - err_backoff = (err_backoff + 5).min(30); - next_reconcile_at = Instant::now() + Duration::from_secs(err_backoff); - } - } - - // TODO: announce reconcile status to the server, throttled - } - }); + // Root reconciler that walks through configuring the agent. + // The context is mutated while reconciling to keep track of things + // like downloads, ledger manipulations, node command, and more. + let mut root = AgentStateReconciler { + agent_state: Arc::clone(state.agent_state.read().await.deref()), + state: Arc::clone(&state), + context: Default::default(), + }; select! 
{ + _ = root.loop_forever(reconcile_requests) => unreachable!(), _ = interrupt.recv_any() => { + if let Some(process) = root.context.process.as_mut() { + EndProcessReconciler(process).reconcile().await; + + } info!("Received interrupt signal, shutting down..."); }, - - _ = connection_loop => unreachable!(), - _ = reconcile_loop => unreachable!(), } state.node_graceful_shutdown().await; diff --git a/crates/agent/src/reconcile/agent.rs b/crates/agent/src/reconcile/agent.rs index 8ab0f36b..32860f0e 100644 --- a/crates/agent/src/reconcile/agent.rs +++ b/crates/agent/src/reconcile/agent.rs @@ -1,4 +1,9 @@ -use std::{collections::HashSet, sync::Arc, time::Instant}; +use std::{ + collections::HashSet, + ops::Deref, + sync::Arc, + time::{Duration, Instant}, +}; use snops_common::{ api::EnvInfo, @@ -9,8 +14,12 @@ use snops_common::{ }, }; use tarpc::context; -use tokio::{sync::Mutex, task::AbortHandle}; -use tracing::{error, warn}; +use tokio::{ + select, + sync::{mpsc::Receiver, Mutex}, + task::AbortHandle, +}; +use tracing::{error, info, trace, warn}; use super::{ command::NodeCommand, @@ -18,7 +27,10 @@ use super::{ storage::{BinaryReconciler, GenesisReconciler, LedgerModifyResult, StorageVersionReconciler}, DirectoryReconciler, Reconcile, ReconcileStatus, }; -use crate::{reconcile::storage::LedgerReconciler, state::GlobalState}; +use crate::{ + reconcile::{process::EndProcessReconciler, storage::LedgerReconciler}, + state::GlobalState, +}; /// Attempt to reconcile the agent's current state. /// This will download files and start/stop the node @@ -64,6 +76,60 @@ impl TransfersContext { } } +impl AgentStateReconciler { + pub async fn loop_forever(&mut self, mut reconcile_requests: Receiver) { + let mut err_backoff = 0; + + // The first reconcile is scheduled for 5 seconds after startup. + // Connecting to the controlplane will likely trigger a reconcile sooner. + let mut next_reconcile_at = Instant::now() + Duration::from_secs(5); + let mut wait = Box::pin(tokio::time::sleep_until(next_reconcile_at.into())); + + loop { + // Await for the next reconcile, allowing for it to be moved up sooner + select! { + // Replace the next_reconcile_at with the soonest reconcile time + Some(new_reconcile_at) = reconcile_requests.recv() => { + next_reconcile_at = next_reconcile_at.min(new_reconcile_at); + wait = Box::pin(tokio::time::sleep_until(next_reconcile_at.into())); + }, + _ = &mut wait => {} + } + + // Drain the reconcile request queue + while reconcile_requests.try_recv().is_ok() {} + // Schedule the next reconcile for 1 week. 
+ next_reconcile_at = Instant::now() + Duration::from_secs(60 * 60 * 24 * 7); + + // Update the reconciler with the latest agent state + // This prevents the agent state from changing during reconciliation + self.agent_state = self.state.agent_state.read().await.deref().clone(); + + trace!("reconciling agent state..."); + match self.reconcile().await { + Ok(status) => { + if status.inner.is_some() { + trace!("reconcile completed"); + } + if !status.conditions.is_empty() { + trace!("reconcile conditions: {:?}", status.conditions); + } + if let Some(requeue_after) = status.requeue_after { + next_reconcile_at = Instant::now() + requeue_after; + } + } + Err(e) => { + error!("failed to reconcile agent state: {e}"); + err_backoff = (err_backoff + 5).min(30); + next_reconcile_at = Instant::now() + Duration::from_secs(err_backoff); + } + } + + // TODO: announce reconcile status to the server, throttled + } + } +} + #[derive(Default)] pub struct AgentStateReconcilerContext { // TODO: allow transfers to be interrupted. potentially allow them to be resumed by using the @@ -71,7 +137,8 @@ pub struct AgentStateReconcilerContext { /// Information about active transfers transfers: Option, /// Information about the node process - process: Option, + pub process: Option, + pub shutdown_pending: bool, } /// Run a reconciler and return early if a requeue is needed. A condition is @@ -79,10 +146,15 @@ pub struct AgentStateReconcilerContext { /// monitoring the agent. macro_rules! reconcile { ($id:ident, $e:expr) => { - let res = $e.reconcile().await?; - if res.is_requeue() { - return Ok(res.add_scope(concat!(stringify!($id), "/requeue"))); + reconcile!($id, $e, res => {}) + }; + ($id:ident, $e:expr, $v:ident => $rest:expr) => { + + let $v = $e.reconcile().await?; + if $v.is_requeue() { + return Ok($v.add_scope(concat!(stringify!($id), "/requeue"))); } + $rest }; } @@ -110,8 +182,16 @@ impl Reconcile<(), ReconcileError2> for AgentStateReconciler { // gracefully shut down the node. let shutdown_pending = !node.online || storage_has_changed; - if let (true, Some(process)) = (shutdown_pending, self.context.process.as_ref()) { - // TODO: reconcile process destruction + if let (true, Some(process)) = ( + shutdown_pending || self.context.shutdown_pending, + self.context.process.as_mut(), + ) { + reconcile!(end_process, EndProcessReconciler(process), res => { + // If the process has exited, clear the process context + if res.inner.is_some() { + self.context.process = None; + } + }); } // TODO: check if addrs have changed, and update shutdown_pending @@ -201,11 +281,6 @@ impl Reconcile<(), ReconcileError2> for AgentStateReconciler { } ); - // TODO: restart the node if the binaries changed. this means storing the hashes - // of the downloaded files - - // TODO: requeue if the binaries are not ready - // Accumulate all the fields that are used to derive the command that starts // the node. // This will be used to determine if the command has changed at all. 
@@ -224,7 +299,11 @@ impl Reconcile<(), ReconcileError2> for AgentStateReconciler { // TODO: spawn the command, manage its state, check that it's up // TODO: if possible, use the NodeCommand as configuration for a node service to // allow running the node outside of the agent - let _cmd = command.build(); + + if self.context.process.is_none() { + info!("Starting node process"); + self.context.process = Some(ProcessContext::new(command)?); + } } } diff --git a/crates/agent/src/reconcile/mod.rs b/crates/agent/src/reconcile/mod.rs index 650be3ab..6f02f0f0 100644 --- a/crates/agent/src/reconcile/mod.rs +++ b/crates/agent/src/reconcile/mod.rs @@ -9,7 +9,7 @@ mod files; pub use files::*; pub mod process; pub mod storage; -use snops_common::{rpc::error::ReconcileError2, state::TransferId}; +use snops_common::state::TransferId; #[derive(Debug, Clone, PartialEq, Eq, PartialOrd, Ord, Hash)] pub enum ReconcileCondition { @@ -26,6 +26,10 @@ pub enum ReconcileCondition { MissingFile(String), /// Waiting to reconnect to the controlplane PendingConnection, + /// Waiting for the node to be shut down + PendingShutdown, + /// Waiting for the node to be gracefully shut down + RequestedShutdown, } pub trait Reconcile { @@ -89,10 +93,6 @@ impl ReconcileStatus { self.inner } - pub fn take_conditions(&mut self) -> IndexSet { - std::mem::take(&mut self.conditions) - } - pub fn requeue_after(mut self, duration: Duration) -> Self { self.requeue_after = Some(duration); self @@ -107,9 +107,4 @@ impl ReconcileStatus { self.conditions.insert(condition); self } - - pub fn add_conditions(mut self, conditions: HashSet) -> Self { - self.conditions.extend(conditions); - self - } } diff --git a/crates/agent/src/reconcile/process.rs b/crates/agent/src/reconcile/process.rs index d7c8d5ee..66141677 100644 --- a/crates/agent/src/reconcile/process.rs +++ b/crates/agent/src/reconcile/process.rs @@ -1,10 +1,11 @@ -use std::time::Instant; +use std::time::{Duration, Instant}; use snops_common::rpc::error::ReconcileError2; use tokio::process::Child; -use tracing::error; +use tracing::{error, info}; -use super::command::NodeCommand; +use super::{command::NodeCommand, Reconcile, ReconcileCondition, ReconcileStatus}; +use crate::state::NODE_GRACEFUL_SHUTDOWN_TIMEOUT; /// Information about the current process pub struct ProcessContext { @@ -14,6 +15,7 @@ pub struct ProcessContext { /// The child process that is running the node child: Child, /// Time the child process was started + #[allow(dead_code)] started_at: Instant, /// Time a sigint was sent to the child process sigint_at: Option, @@ -43,7 +45,6 @@ impl ProcessContext { pub fn is_running(&self) -> bool { self.child.id().is_some() } - /// Send a SIGINT to the child process pub fn send_sigint(&mut self) -> bool { use nix::{ @@ -82,3 +83,36 @@ impl ProcessContext { .is_ok() } } + +/// The EndProcessReconciler will return true when the child process has exited. 
+/// It will wait NODE_GRACEFUL_SHUTDOWN_TIMEOUT seconds after sending a SIGINT
+/// before sending a SIGKILL (if the child process has not exited).
+pub struct EndProcessReconciler<'a>(pub &'a mut ProcessContext);
+
+impl<'a> Reconcile<(), ReconcileError2> for EndProcessReconciler<'a> {
+    async fn reconcile(&mut self) -> Result<ReconcileStatus<()>, ReconcileError2> {
+        if !self.0.is_running() {
+            return Ok(ReconcileStatus::default());
+        }
+
+        let Some(sigint_at) = self.0.sigint_at else {
+            if self.0.send_sigint() {
+                info!("sent SIGINT to node process");
+            }
+            return Ok(ReconcileStatus::empty()
+                .add_condition(ReconcileCondition::PendingShutdown)
+                .requeue_after(Duration::from_secs(1)));
+        };
+
+        if sigint_at.elapsed() > NODE_GRACEFUL_SHUTDOWN_TIMEOUT
+            && self.0.sigkill_at.is_none()
+            && self.0.send_sigkill()
+        {
+            info!("sent SIGKILL to node process");
+        }
+
+        Ok(ReconcileStatus::empty()
+            .add_condition(ReconcileCondition::PendingShutdown)
+            .requeue_after(Duration::from_secs(1)))
+    }
+}
diff --git a/crates/agent/src/reconcile/storage.rs b/crates/agent/src/reconcile/storage.rs
index 63707955..98d6a115 100644
--- a/crates/agent/src/reconcile/storage.rs
+++ b/crates/agent/src/reconcile/storage.rs
@@ -59,9 +59,7 @@ impl<'a> Reconcile<(), ReconcileError2> for BinaryReconciler<'a> {
             .as_ref()
             .map(|(_, b)| b != target_binary)
             .unwrap_or(true);
-        let binary_is_ok = ok_at
-            .map(|ok| ok.elapsed().as_secs() < 300) // check if the binary has been OK for 5 minutes
-            .unwrap_or(false);
+        let binary_is_ok = ok_at.is_some();
 
         // If the binary has not changed and has not expired, we can skip the binary
         // reconciler

From c42add30d8925ae8e08708c87b9857cf6d597028 Mon Sep 17 00:00:00 2001
From: Meshiest
Date: Sun, 24 Nov 2024 16:54:34 -0500
Subject: [PATCH 13/68] feat(agent): graceful shutdown outside of reconciler

---
 crates/agent/src/main.rs                 | 14 +++++++----
 crates/agent/src/reconcile/agent.rs      | 11 ++++++++-
 crates/agent/src/reconcile/checkpoint.rs |  2 +-
 crates/agent/src/reconcile/mod.rs        |  4 +--
 crates/agent/src/reconcile/process.rs    | 31 +++++++++++++++++++++++-
 5 files changed, 51 insertions(+), 11 deletions(-)

diff --git a/crates/agent/src/main.rs b/crates/agent/src/main.rs
index 6adbcfe1..a0787082 100644
--- a/crates/agent/src/main.rs
+++ b/crates/agent/src/main.rs
@@ -21,14 +21,14 @@ use clap::Parser;
 use cli::Cli;
 use futures_util::stream::{FuturesUnordered, StreamExt};
 use log::init_logging;
-use reconcile::{agent::AgentStateReconciler, process::EndProcessReconciler, Reconcile};
+use reconcile::agent::AgentStateReconciler;
 use snops_common::{db::Database, util::OpaqueDebug};
 use tokio::{
     select,
     signal::unix::{signal, Signal, SignalKind},
     sync::{mpsc, RwLock},
 };
-use tracing::{error, info, trace};
+use tracing::{error, info};
 
 use crate::state::GlobalState;
 mod log;
@@ -132,7 +132,7 @@ async fn main() {
     });
 
     // Get the interrupt signals to break the stream connection
-    let mut interrupt = Signals::new(&[SignalKind::terminate(), SignalKind::interrupt()]);
+    let mut interrupt = Signals::term_or_interrupt();
 
     let state2 = Arc::clone(&state);
     tokio::spawn(async move {
@@ -156,11 +156,11 @@ async fn main() {
     select! 
{ _ = root.loop_forever(reconcile_requests) => unreachable!(), _ = interrupt.recv_any() => { + info!("Received interrupt signal, shutting down..."); if let Some(process) = root.context.process.as_mut() { - EndProcessReconciler(process).reconcile().await; + process.graceful_shutdown().await; } - info!("Received interrupt signal, shutting down..."); }, } @@ -179,6 +179,10 @@ impl Signals { } } + pub fn term_or_interrupt() -> Self { + Self::new(&[SignalKind::terminate(), SignalKind::interrupt()]) + } + async fn recv_any(&mut self) { let mut futs = FuturesUnordered::new(); diff --git a/crates/agent/src/reconcile/agent.rs b/crates/agent/src/reconcile/agent.rs index 32860f0e..ba96f6e4 100644 --- a/crates/agent/src/reconcile/agent.rs +++ b/crates/agent/src/reconcile/agent.rs @@ -162,9 +162,18 @@ impl Reconcile<(), ReconcileError2> for AgentStateReconciler { async fn reconcile(&mut self) -> Result, ReconcileError2> { match self.agent_state.as_ref() { AgentState::Inventory => { - // TODO: cleanup child process // TODO: cleanup other things + // end the process if it is running + if let Some(process) = self.context.process.as_mut() { + reconcile!(end_process, EndProcessReconciler(process), res => { + // If the process has exited, clear the process context + if res.inner.is_some() { + self.context.process = None; + } + }); + } + return Ok(ReconcileStatus::default().add_scope("agent_state/inventory")); } AgentState::Node(env_id, node) => { diff --git a/crates/agent/src/reconcile/checkpoint.rs b/crates/agent/src/reconcile/checkpoint.rs index 9166d633..0be9d3f3 100644 --- a/crates/agent/src/reconcile/checkpoint.rs +++ b/crates/agent/src/reconcile/checkpoint.rs @@ -9,7 +9,7 @@ use snops_common::{ rpc::error::ReconcileError, state::{NetworkId, StorageId}, }; -use tracing::{error, info, trace}; +use tracing::{error, trace}; use crate::{api, state::GlobalState}; diff --git a/crates/agent/src/reconcile/mod.rs b/crates/agent/src/reconcile/mod.rs index 6f02f0f0..ca8c3c96 100644 --- a/crates/agent/src/reconcile/mod.rs +++ b/crates/agent/src/reconcile/mod.rs @@ -1,4 +1,4 @@ -use std::{collections::HashSet, fmt::Display, time::Duration}; +use std::{fmt::Display, time::Duration}; use indexmap::IndexSet; @@ -28,8 +28,6 @@ pub enum ReconcileCondition { PendingConnection, /// Waiting for the node to be shut down PendingShutdown, - /// Waiting for the node to be gracefully shut down - RequestedShutdown, } pub trait Reconcile { diff --git a/crates/agent/src/reconcile/process.rs b/crates/agent/src/reconcile/process.rs index 66141677..296b2ff3 100644 --- a/crates/agent/src/reconcile/process.rs +++ b/crates/agent/src/reconcile/process.rs @@ -1,7 +1,7 @@ use std::time::{Duration, Instant}; use snops_common::rpc::error::ReconcileError2; -use tokio::process::Child; +use tokio::{process::Child, select}; use tracing::{error, info}; use super::{command::NodeCommand, Reconcile, ReconcileCondition, ReconcileStatus}; @@ -45,6 +45,35 @@ impl ProcessContext { pub fn is_running(&self) -> bool { self.child.id().is_some() } + + /// A helper function to gracefully shutdown the node process without + /// a reconciler + pub async fn graceful_shutdown(&mut self) { + if !self.is_running() { + return; + } + + self.send_sigint(); + + select! 
{ + _ = tokio::time::sleep(NODE_GRACEFUL_SHUTDOWN_TIMEOUT) => { + info!("sending SIGKILL to node process"); + self.send_sigkill(); + }, + _ = tokio::signal::ctrl_c() => { + info!("received SIGINT, sending SIGKILL to node process"); + self.send_sigkill(); + }, + _ = self.child.wait() => { + info!("node process has exited gracefully"); + return; + } + } + + let _ = self.child.wait().await; + info!("node process has exited"); + } + /// Send a SIGINT to the child process pub fn send_sigint(&mut self) -> bool { use nix::{ From a38ab53e65b0d7f2e060c0ac619f9a14ba6bf789 Mon Sep 17 00:00:00 2001 From: Meshiest Date: Sun, 24 Nov 2024 21:19:30 -0500 Subject: [PATCH 14/68] refactor(agent): remove dead code --- crates/agent/src/api.rs | 22 -- crates/agent/src/reconcile/checkpoint.rs | 109 ------- crates/agent/src/reconcile/files.rs | 327 +------------------ crates/agent/src/reconcile/mod.rs | 14 - crates/agent/src/rpc/control.rs | 379 +++-------------------- 5 files changed, 47 insertions(+), 804 deletions(-) delete mode 100644 crates/agent/src/reconcile/checkpoint.rs diff --git a/crates/agent/src/api.rs b/crates/agent/src/api.rs index c4bbfa7a..0193cdd3 100644 --- a/crates/agent/src/api.rs +++ b/crates/agent/src/api.rs @@ -99,28 +99,6 @@ pub async fn download_file( Ok(Some((file, sha256, downloaded))) } -pub async fn check_file( - url: impl IntoUrl, - to: &Path, - transfer_tx: TransferTx, -) -> anyhow::Result<()> { - let client = reqwest::Client::new(); - - if !should_download_file(&client, url.as_str(), to, None, None, false) - .await - .unwrap_or(true) - { - return Ok(()); - } - - info!("downloading {to:?}"); - - let tx_id = transfers::next_id(); - download_file(tx_id, &client, url, to, transfer_tx).await?; - - Ok(()) -} - pub async fn check_binary( binary: &BinaryEntry, base_url: &str, diff --git a/crates/agent/src/reconcile/checkpoint.rs b/crates/agent/src/reconcile/checkpoint.rs deleted file mode 100644 index 0be9d3f3..00000000 --- a/crates/agent/src/reconcile/checkpoint.rs +++ /dev/null @@ -1,109 +0,0 @@ -use std::{ - collections::BTreeMap, - path::{Path, PathBuf}, -}; - -use snops_checkpoint::{CheckpointHeader, CheckpointManager, RetentionSpan}; -use snops_common::{ - api::CheckpointMeta, - rpc::error::ReconcileError, - state::{NetworkId, StorageId}, -}; -use tracing::{error, trace}; - -use crate::{api, state::GlobalState}; - -pub enum CheckpointSource<'a> { - Manager(&'a CheckpointHeader, &'a PathBuf), - Meta(&'a CheckpointMeta), -} - -impl<'a> CheckpointSource<'a> { - pub async fn acquire( - self, - state: &GlobalState, - storage_path: &Path, - storage_id: StorageId, - network: NetworkId, - ) -> Result { - Ok(match self { - CheckpointSource::Meta(meta) => { - trace!( - "using checkpoint from control plane with height {} and time {}", - meta.height, - meta.timestamp - ); - let checkpoint_url = format!( - "{}/content/storage/{network}/{storage_id}/{}", - &state.endpoint, meta.filename - ); - let path = storage_path.join(&meta.filename); - trace!("downloading {} from {checkpoint_url}...", meta.filename); - - api::check_file(checkpoint_url, &path, state.transfer_tx()) - .await - .map_err(|e| { - error!( - "failed to download {} from the control plane: {e}", - meta.filename - ); - ReconcileError::StorageAcquireError(meta.filename.clone()) - })?; - - path - } - CheckpointSource::Manager(header, path) => { - trace!( - "using checkpoint from manager with height {} and time {}", - header.block_height, - header.time() - ); - path.clone() - } - }) - } -} - -pub fn find_by_height<'a>( - manager: &'a 
CheckpointManager, - checkpoints: &'a [CheckpointMeta], - height: u32, -) -> Option> { - let sorted: BTreeMap<_, _> = manager - .checkpoints() - .map(|(c, p)| (c.block_height, CheckpointSource::Manager(c, p))) - .chain( - checkpoints - .iter() - .map(|c| (c.height, CheckpointSource::Meta(c))), - ) - .collect(); - - sorted - .into_iter() - .rev() - .find_map(|(h, c)| if h <= height { Some(c) } else { None }) -} - -pub fn find_by_span<'a>( - manager: &'a CheckpointManager, - checkpoints: &'a [CheckpointMeta], - span: RetentionSpan, -) -> Option> { - let timestamp = span.as_timestamp()?; - - let sorted: BTreeMap<_, _> = manager - .checkpoints() - .map(|(c, p)| (c.timestamp, CheckpointSource::Manager(c, p))) - .chain( - checkpoints - .iter() - .map(|c| (c.timestamp, CheckpointSource::Meta(c))), - ) - .collect(); - - sorted - .into_iter() - .rev() - .find_map(|(t, c)| if t <= timestamp { Some(c) } else { None }) -} diff --git a/crates/agent/src/reconcile/files.rs b/crates/agent/src/reconcile/files.rs index 0227caff..b17db0f5 100644 --- a/crates/agent/src/reconcile/files.rs +++ b/crates/agent/src/reconcile/files.rs @@ -1,30 +1,24 @@ use std::{ - fs::Permissions, os::unix::fs::PermissionsExt, path::{Path, PathBuf}, sync::Arc, time::Duration, }; -use chrono::{DateTime, TimeDelta, Utc}; -use snops_checkpoint::CheckpointManager; +use chrono::{TimeDelta, Utc}; use snops_common::{ api::EnvInfo, binaries::{BinaryEntry, BinarySource}, - constant::{ - LEDGER_BASE_DIR, LEDGER_PERSIST_DIR, LEDGER_STORAGE_FILE, SNARKOS_FILE, - SNARKOS_GENESIS_FILE, VERSION_FILE, - }, - rpc::error::{ReconcileError, ReconcileError2}, - state::{HeightRequest, InternedId, NetworkId, StorageId, TransferId, TransferStatusUpdate}, + constant::SNARKOS_GENESIS_FILE, + rpc::error::ReconcileError2, + state::{NetworkId, StorageId, TransferId, TransferStatusUpdate}, }; -use tokio::process::Command; -use tracing::{debug, error, info, trace}; +use tracing::error; use url::Url; -use super::{checkpoint, Reconcile, ReconcileCondition, ReconcileStatus}; +use super::{Reconcile, ReconcileCondition, ReconcileStatus}; use crate::{ - api::{self, download_file, should_download_file}, + api::{download_file, should_download_file}, state::GlobalState, transfers, }; @@ -40,110 +34,10 @@ pub fn default_binary(info: &EnvInfo) -> BinaryEntry { } } -/// Ensure the correct binary is present for running snarkos -pub async fn ensure_correct_binary( - binary_id: Option, - state: &GlobalState, - info: &EnvInfo, -) -> Result<(), ReconcileError> { - let base_path = &state.cli.path; - - // TODO: store binary based on binary id - // download the snarkOS binary - api::check_binary( - info.storage - .binaries - .get(&binary_id.unwrap_or_default()) - .unwrap_or(&default_binary(info)), - &state.endpoint, - &base_path.join(SNARKOS_FILE), - state.transfer_tx(), - ) - .await - .map_err(|e| ReconcileError::BinaryAcquireError(e.to_string()))?; - - Ok(()) -} - pub fn get_genesis_route(endpoint: &str, network: NetworkId, storage_id: StorageId) -> String { format!("{endpoint}/content/storage/{network}/{storage_id}/{SNARKOS_GENESIS_FILE}") } -pub fn get_ledger_route(endpoint: &str, network: NetworkId, storage_id: StorageId) -> String { - format!("{endpoint}/content/storage/{network}/{storage_id}/{LEDGER_STORAGE_FILE}") -} - -/// Ensure all required files are present in the storage directory -pub async fn check_files( - state: &GlobalState, - info: &EnvInfo, - height: &HeightRequest, -) -> Result<(), ReconcileError> { - let base_path = &state.cli.path; - let storage_id = 
info.storage.id; - let network = info.network; - let storage_path = state.cli.storage_path(network, storage_id); - - // create the directory containing the storage files - tokio::fs::create_dir_all(&storage_path) - .await - .map_err(|_| ReconcileError::StorageSetupError("create storage directory".to_string()))?; - - let version_file = storage_path.join(VERSION_FILE); - - // wipe old storage when the version changes - if get_version_from_path(&version_file).await? != Some(info.storage.version) - && storage_path.exists() - { - let _ = tokio::fs::remove_dir_all(&storage_path).await; - } - - std::fs::create_dir_all(&storage_path).map_err(|e| { - error!("failed to create storage directory: {e}"); - ReconcileError::StorageSetupError("create storage directory".to_string()) - })?; - - let genesis_path = storage_path.join(SNARKOS_GENESIS_FILE); - let genesis_url = get_genesis_route(&state.endpoint, network, storage_id); - let ledger_path = storage_path.join(LEDGER_STORAGE_FILE); - let ledger_url = get_ledger_route(&state.endpoint, network, storage_id); - - // skip genesis download for native genesis storage - if !info.storage.native_genesis { - // download the genesis block - api::check_file(genesis_url, &genesis_path, state.transfer_tx()) - .await - .map_err(|e| { - error!("failed to download {SNARKOS_GENESIS_FILE} from the control plane: {e}"); - ReconcileError::StorageAcquireError(SNARKOS_GENESIS_FILE.to_owned()) - })?; - } - - // don't download - if height.reset() { - info!("skipping ledger check due to 0 height request"); - return Ok(()); - } - - // download the ledger file - api::check_file(ledger_url, &ledger_path, state.transfer_tx()) - .await - .map_err(|e| { - error!("failed to download {SNARKOS_GENESIS_FILE} from the control plane: {e}"); - ReconcileError::StorageAcquireError(LEDGER_STORAGE_FILE.to_owned()) - })?; - - // write the regen version to a "version" file - tokio::fs::write(&version_file, info.storage.version.to_string()) - .await - .map_err(|e| { - error!("failed to write storage version: {e}"); - ReconcileError::StorageSetupError("write storage version".to_string()) - })?; - - Ok(()) -} - /// This reconciler creates a directory if it does not exist pub struct DirectoryReconciler<'a>(pub &'a Path); impl<'a> Reconcile<(), ReconcileError2> for DirectoryReconciler<'a> { @@ -327,210 +221,3 @@ impl Reconcile for FileReconciler { .requeue_after(Duration::from_secs(1))) } } - -/// Untar the ledger file into the storage directory -pub async fn load_ledger( - state: &GlobalState, - info: &EnvInfo, - height: &HeightRequest, - is_new_env: bool, -) -> Result { - let base_path = &state.cli.path; - let storage_id = &info.storage.id; - let storage_path = base_path - .join("storage") - .join(info.network.to_string()) - .join(storage_id.to_string()); - - // use a persisted directory for the untar when configured - let (untar_base, untar_dir) = if info.storage.persist { - info!("using persisted ledger for {storage_id}"); - (&storage_path, LEDGER_PERSIST_DIR) - } else { - info!("using fresh ledger for {storage_id}"); - (base_path, LEDGER_BASE_DIR) - }; - - let ledger_dir = untar_base.join(untar_dir); - - tokio::fs::create_dir_all(&ledger_dir.join(".aleo")) - .await - .map_err(|_| ReconcileError::StorageSetupError("create local aleo home".to_string()))?; - - // skip the top request if the persisted ledger already exists - // this will prevent the ledger from getting wiped in the next step - if info.storage.persist && height.is_top() && ledger_dir.exists() { - info!("persisted ledger already exists 
for {storage_id}"); - return Ok(false); - } - - let mut changed = false; - - // If there's a retention policy, load the checkpoint manager - // this is so we can wipe all leftover checkpoints for non-persisted storage - // after resets or new environments - let mut manager = info - .storage - .retention_policy - .clone() - .map(|policy| { - debug!("loading checkpoints from {untar_base:?}..."); - CheckpointManager::load(ledger_dir.clone(), policy).map_err(|e| { - error!("failed to load checkpoints: {e}"); - ReconcileError::CheckpointLoadError - }) - }) - .transpose()?; - - if let Some(manager) = &manager { - info!("discovered checkpoints: {manager}"); - } - - // reload the storage if the height is reset or a new environment is created - if height.reset() || is_new_env { - // clean up old storage - if ledger_dir.exists() { - changed = true; - if let Err(err) = tokio::fs::remove_dir_all(&ledger_dir).await { - error!("failed to remove old ledger: {err}"); - } - } - - // cleanup old checkpoints for non-persisted ledgers as they are - // stored in a common location - // - // this also forces the rewind checkpoints to be fetched from the - // control plane - if !info.storage.persist { - if let Some(manager) = manager.as_mut() { - info!("wiping old checkpoints for {storage_id}"); - manager.wipe(); - } - } - } - - let tar_path = storage_path.join(LEDGER_STORAGE_FILE); - - // A reset height will not require untarring the ledger because it is - // created from the genesis block - if is_new_env && !height.reset() && tar_path.exists() { - changed = true; - - // ensure the storage directory exists - tokio::fs::create_dir_all(&ledger_dir) - .await - .map_err(|err| { - error!("failed to create storage directory: {err}"); - ReconcileError::StorageSetupError("create ledger directory".to_string()) - })?; - - trace!("untarring ledger..."); - - // use `tar` to decompress the storage to the untar dir - let status = Command::new("tar") - .current_dir(untar_base) - .arg("xzf") - .arg(&tar_path) - .arg("-C") // the untar_dir must exist. this will extract the contents of the tar to the - // directory - .arg(untar_dir) - .arg("--strip-components") // remove the parent "ledger" directory within the tar - .arg("1") - .kill_on_drop(true) - .spawn() - .map_err(|err| { - error!("failed to spawn tar process: {err}"); - ReconcileError::StorageSetupError("spawn tar process".to_string()) - })? 
- .wait() - .await - .map_err(|err| { - error!("failed to await tar process: {err}"); - ReconcileError::StorageSetupError("await tar process".to_string()) - })?; - - if !status.success() { - return Err(ReconcileError::StorageSetupError(format!( - "tar failed: {status}" - ))); - } - } - - if matches!(height, HeightRequest::Top | HeightRequest::Absolute(0)) { - return Ok(changed); - } - - // retention policies are required for the rewind operations - let Some(manager) = &manager.as_mut() else { - return Err(ReconcileError::MissingRetentionPolicy); - }; - - // determine which checkpoint to use by the next available height/time - let checkpoint = match height { - HeightRequest::Absolute(block_height) => { - checkpoint::find_by_height(manager, &info.storage.checkpoints, *block_height) - } - HeightRequest::Checkpoint(span) => { - checkpoint::find_by_span(manager, &info.storage.checkpoints, *span) - } - _ => unreachable!("handled by previous match"), - } - .ok_or(ReconcileError::CheckpointAcquireError)?; - - // download checkpoint if necessary, and get the path - let path = checkpoint - .acquire(state, &storage_path, *storage_id, info.network) - .await?; - - // apply the checkpoint to the ledger - let mut command = Command::new(state.cli.path.join(SNARKOS_FILE)); - command - .stdout(std::io::stdout()) - .stderr(std::io::stderr()) - .env("NETWORK", info.network.to_string()) - .arg("ledger") - .arg("--ledger") - .arg(&ledger_dir); - - if !info.storage.native_genesis { - command - .arg("--genesis") - .arg(storage_path.join(SNARKOS_GENESIS_FILE)); - } - - command.arg("checkpoint").arg("apply").arg(path); - - let res = command - .spawn() - .map_err(|e| { - error!("failed to spawn checkpoint apply process: {e}"); - ReconcileError::CheckpointApplyError("spawn checkpoint apply process".to_string()) - })? 
- .wait() - .await - .map_err(|e| { - error!("failed to await checkpoint apply process: {e}"); - ReconcileError::CheckpointApplyError("await checkpoint apply process".to_string()) - })?; - - if !res.success() { - return Err(ReconcileError::CheckpointApplyError(format!( - "checkpoint apply failed: {res}" - ))); - } - - Ok(true) -} - -pub async fn get_version_from_path(path: &PathBuf) -> Result, ReconcileError> { - if !path.exists() { - return Ok(None); - } - - let data = tokio::fs::read_to_string(path).await.map_err(|e| { - error!("failed to read storage version: {e}"); - ReconcileError::StorageSetupError("failed to read storage version".to_string()) - })?; - - Ok(data.parse().ok()) -} diff --git a/crates/agent/src/reconcile/mod.rs b/crates/agent/src/reconcile/mod.rs index ca8c3c96..3ac7f989 100644 --- a/crates/agent/src/reconcile/mod.rs +++ b/crates/agent/src/reconcile/mod.rs @@ -3,7 +3,6 @@ use std::{fmt::Display, time::Duration}; use indexmap::IndexSet; pub mod agent; -mod checkpoint; pub mod command; mod files; pub use files::*; @@ -69,15 +68,6 @@ impl ReconcileStatus { self.requeue_after.is_some() } - pub fn replace(&self, inner: Option) -> ReconcileStatus { - ReconcileStatus { - scopes: self.scopes.clone(), - inner, - requeue_after: self.requeue_after, - conditions: self.conditions.clone(), - } - } - pub fn emptied(&self) -> ReconcileStatus { ReconcileStatus { inner: None, @@ -87,10 +77,6 @@ impl ReconcileStatus { } } - pub fn take(self) -> Option { - self.inner - } - pub fn requeue_after(mut self, duration: Duration) -> Self { self.requeue_after = Some(duration); self diff --git a/crates/agent/src/rpc/control.rs b/crates/agent/src/rpc/control.rs index f49a7bfc..7cc5ce82 100644 --- a/crates/agent/src/rpc/control.rs +++ b/crates/agent/src/rpc/control.rs @@ -1,16 +1,10 @@ //! Control plane-to-agent RPC. -use std::{ - collections::HashSet, net::IpAddr, ops::Deref, path::PathBuf, process::Stdio, sync::Arc, - time::Duration, -}; +use std::{net::IpAddr, path::PathBuf}; use snops_common::{ aot_cmds::AotCmd, binaries::{BinaryEntry, BinarySource}, - constant::{ - LEDGER_BASE_DIR, LEDGER_PERSIST_DIR, SNARKOS_FILE, SNARKOS_GENESIS_FILE, SNARKOS_LOG_FILE, - }, define_rpc_mux, prelude::snarkos_status::SnarkOSLiteBlock, rpc::{ @@ -23,19 +17,12 @@ use snops_common::{ }, error::{AgentError, ReconcileError, SnarkosRequestError}, }, - state::{AgentId, AgentPeer, AgentState, EnvId, InternedId, KeyState, NetworkId, PortConfig}, + state::{AgentState, EnvId, InternedId, NetworkId, PortConfig}, }; use tarpc::context; -use tokio::process::Command; -use tracing::{debug, error, info, trace, warn}; - -use crate::{ - api, - log::make_env_filter, - metrics::MetricComputer, - reconcile::{self, ensure_correct_binary}, - state::AppState, -}; +use tracing::{error, info, trace}; + +use crate::{api, log::make_env_filter, metrics::MetricComputer, state::AppState}; define_rpc_mux!(child; ControlServiceRequest => ControlServiceResponse; @@ -51,11 +38,8 @@ pub struct AgentRpcServer { impl AgentService for AgentRpcServer { async fn kill(self, _: context::Context) { - self.state.node_graceful_shutdown().await; - std::thread::spawn(|| { - std::thread::sleep(std::time::Duration::from_secs(1)); - std::process::exit(0) - }); + info!("Kill RPC invoked..."); + self.state.shutdown().await; } async fn handshake( @@ -110,8 +94,7 @@ impl AgentService for AgentRpcServer { // Queue a reconcile immediately as we have received new state. 
// The reconciler will decide if anything has actually changed - *self.state.agent_state.write().await = Arc::new(handshake.state); - self.state.queue_reconcile(Duration::ZERO).await; + self.state.update_agent_state(handshake.state).await; Ok(()) } @@ -122,307 +105,8 @@ impl AgentService for AgentRpcServer { target: AgentState, ) -> Result<(), ReconcileError> { info!("queing reconcilation..."); - *self.state.agent_state.write().await = Arc::new(target.clone()); - self.state.queue_reconcile(Duration::ZERO).await; - - // TODO: remove the following code, handled entirely by the reconciler logic - - // acquire the handle lock - let mut handle_container = self.state.reconcilation_handle.lock().await; - - // abort if we are already reconciling - if let Some(handle) = handle_container.take() { - info!("aborting previous reconcilation task..."); - handle.abort(); - } - - // perform the reconcilation - let state = Arc::clone(&self.state); - let handle = tokio::spawn(async move { - // previous state cleanup - let old_state = { - let agent_state_lock = state.agent_state.read().await; - match agent_state_lock.as_ref() { - // kill existing child if running - AgentState::Node(_, node) if node.online => { - info!("cleaning up snarkos process..."); - state.node_graceful_shutdown().await; - } - - _ => (), - } - - agent_state_lock.deref().clone() - }; - - // download new storage if storage_id changed - 'storage: { - let (is_same_env, is_same_index) = match (old_state.as_ref(), &target) { - (AgentState::Node(old_env, old_node), AgentState::Node(new_env, new_node)) => { - (old_env == new_env, old_node.height.0 == new_node.height.0) - } - _ => (false, false), - }; - - // skip if we don't need storage - let AgentState::Node(env_id, node) = &target else { - break 'storage; - }; - - // get the storage info for this environment if we don't have it cached - let info = state - .get_env_info(*env_id) - .await - .map_err(|_| ReconcileError::StorageAcquireError("storage info".to_owned()))?; - - // ensure the binary is correct every reconcile (or restart) - ensure_correct_binary(node.binary, &state, &info).await?; - - if is_same_env && is_same_index { - debug!("skipping storage download"); - break 'storage; - } - - // TODO: download storage to a cache directory (~/config/.snops) to prevent - // multiple agents from having to redownload - // can be configurable to also work from a network drive - - // download and decompress the storage - let height = &node.height.1; - - trace!("checking storage files..."); - - // only download storage if it's a new environment - // if a node starts at height: 0, the node will never - // download the ledger - if !is_same_env { - reconcile::check_files(&state, &info, height).await?; - } - reconcile::load_ledger(&state, &info, height, !is_same_env).await?; - // TODO: checkpoint/absolute height request handling - } - - // reconcile towards new state - match target.clone() { - // inventory state is waiting for a node to be started - AgentState::Inventory => { - // wipe the env info cache. 
don't want to have stale storage info - state.env_info.write().await.take(); - if let Err(e) = state.db.set_env_info(None) { - error!("failed to clear env info from db: {e}"); - } - } - - // start snarkOS node when node - AgentState::Node(env_id, node) => { - let mut child_lock = state.child.write().await; - let mut command = Command::new(state.cli.path.join(SNARKOS_FILE)); - - // get the storage info for this environment if we don't have it cached - let info = state.get_env_info(env_id).await.map_err(|_| { - ReconcileError::StorageAcquireError("storage info".to_owned()) - })?; - - let storage_id = &info.storage.id; - let storage_path = state - .cli - .path - .join("storage") - .join(info.network.to_string()) - .join(storage_id.to_string()); - let ledger_path = if info.storage.persist { - storage_path.join(LEDGER_PERSIST_DIR) - } else { - state.cli.path.join(LEDGER_BASE_DIR) - }; - - // add loki URL if one is set - if let Some(loki) = &*state.loki.lock().unwrap() { - command - .env( - "SNOPS_LOKI_LABELS", - format!("env_id={},node_key={}", env_id, node.node_key), - ) - .arg("--loki") - .arg(loki.as_str()); - } - - if state.cli.quiet { - command.stdout(Stdio::null()); - } else { - command.stdout(std::io::stdout()); - } - - command - .stderr(std::io::stderr()) - .envs(&node.env) - .env("NETWORK", info.network.to_string()) - .env("HOME", &ledger_path) - .arg("--log") - .arg(state.cli.path.join(SNARKOS_LOG_FILE)) - .arg("run") - .arg("--agent-rpc-port") - .arg(state.agent_rpc_port.to_string()) - .arg("--type") - .arg(node.node_key.ty.to_string()) - .arg("--ledger") - .arg(ledger_path); - - if !info.storage.native_genesis { - command - .arg("--genesis") - .arg(storage_path.join(SNARKOS_GENESIS_FILE)); - } - - // storage configuration - command - // port configuration - .arg("--bind") - .arg(state.cli.bind_addr.to_string()) - .arg("--bft") - .arg(state.cli.ports.bft.to_string()) - .arg("--rest") - .arg(state.cli.ports.rest.to_string()) - .arg("--metrics") - .arg(state.cli.ports.metrics.to_string()) - .arg("--node") - .arg(state.cli.ports.node.to_string()); - - match node.private_key { - KeyState::None => {} - KeyState::Local => { - command.arg("--private-key-file").arg( - state - .cli - .private_key_file - .as_ref() - .ok_or(ReconcileError::NoLocalPrivateKey)?, - ); - } - KeyState::Literal(pk) => { - command.arg("--private-key").arg(pk); - } - } - - // conditionally add retention policy - if let Some(policy) = &info.storage.retention_policy { - command.arg("--retention-policy").arg(policy.to_string()); - } - - // Find agents that do not have cached addresses - let unresolved_addrs: HashSet = { - let resolved_addrs = state.resolved_addrs.read().await; - node.peers - .iter() - .chain(node.validators.iter()) - .filter_map(|p| { - if let AgentPeer::Internal(id, _) = p { - (!resolved_addrs.contains_key(id)).then_some(*id) - } else { - None - } - }) - .collect() - }; - - // Fetch all unresolved addresses and update the cache - if !unresolved_addrs.is_empty() { - tracing::debug!( - "need to resolve addrs: {}", - unresolved_addrs - .iter() - .map(|id| id.to_string()) - .collect::>() - .join(",") - ); - let new_addrs = self - .client - .resolve_addrs(context::current(), unresolved_addrs) - .await - .map_err(|err| { - error!("rpc error while resolving addresses: {err}"); - ReconcileError::Unknown - })? 
- .map_err(ReconcileError::ResolveAddrError)?; - tracing::debug!( - "resolved new addrs: {}", - new_addrs - .iter() - .map(|(id, addr)| format!("{}: {}", id, addr)) - .collect::>() - .join(", ") - ); - { - let mut guard = state.resolved_addrs.write().await; - guard.extend(new_addrs); - if let Err(e) = state.db.set_resolved_addrs(Some(&guard)) { - error!("failed to save resolved addrs to db: {e}"); - } - } - } - - if !node.peers.is_empty() { - command - .arg("--peers") - .arg(state.agentpeers_to_cli(&node.peers).await.join(",")); - } - - if !node.validators.is_empty() { - command - .arg("--validators") - .arg(state.agentpeers_to_cli(&node.validators).await.join(",")); - } - - if node.online { - tracing::trace!("spawning node process..."); - tracing::debug!("node command: {command:?}"); - let child = command.spawn().expect("failed to start child"); - - *child_lock = Some(child); - - // todo: check to ensure the node actually comes online - // by hitting the REST latest block - } else { - tracing::debug!("skipping node spawn"); - } - } - } - - // After completing the reconcilation, update the agent state - let target = Arc::new(target); - if let Err(e) = state.db.set_agent_state(Some(&target)) { - error!("failed to save agent state to db: {e}"); - } - *state.agent_state.write().await = target; - - Ok(()) - }); - - // update the mutex with our new handle and drop the lock - *handle_container = Some(handle.abort_handle()); - drop(handle_container); - - // await reconcilation completion - let res = match handle.await { - Err(e) if e.is_cancelled() => { - warn!("reconcilation was aborted by a newer reconcilation request"); - - // early return (don't clean up the handle lock) - return Err(ReconcileError::Aborted); - } - - Ok(inner) => inner, - Err(e) => { - warn!("reconcilation task panicked: {e}"); - Err(ReconcileError::Unknown) - } - }; - - // clean up the abort handle - // we can't be here if we were cancelled (see early return above) - self.state.reconcilation_handle.lock().await.take(); - - res + self.state.update_agent_state(target).await; + Ok(()) } async fn get_addrs(self, _: context::Context) -> (PortConfig, Option, Vec) { @@ -438,6 +122,11 @@ impl AgentService for AgentRpcServer { _: context::Context, route: String, ) -> Result { + self.state + .get_node_client() + .await + .ok_or(SnarkosRequestError::OfflineNode)?; + let env_id = if let AgentState::Node(env_id, state) = self.state.agent_state.read().await.as_ref() { if !state.online { @@ -477,6 +166,11 @@ impl AgentService for AgentRpcServer { } async fn broadcast_tx(self, _: context::Context, tx: String) -> Result<(), AgentError> { + self.state + .get_node_client() + .await + .ok_or(AgentError::NodeClientNotReady)?; + let env_id = if let AgentState::Node(env_id, _) = self.state.agent_state.read().await.as_ref() { *env_id @@ -622,9 +316,10 @@ impl AgentService for AgentRpcServer { verbosity: u8, ) -> Result<(), AgentError> { tracing::debug!("agent setting aot log verbosity to {verbosity:?}"); - let lock = self.state.node_client.lock().await; - let node_client = lock.as_ref().ok_or(AgentError::NodeClientNotSet)?; - node_client + self.state + .get_node_client() + .await + .ok_or(AgentError::NodeClientNotSet)? .set_log_level(ctx, verbosity) .await .map_err(|_| AgentError::FailedToChangeLogLevel)? 
@@ -635,9 +330,10 @@ impl AgentService for AgentRpcServer { ctx: context::Context, block_hash: String, ) -> Result, AgentError> { - let lock = self.state.node_client.lock().await; - let node_client = lock.as_ref().ok_or(AgentError::NodeClientNotSet)?; - node_client + self.state + .get_node_client() + .await + .ok_or(AgentError::NodeClientNotSet)? .get_block_lite(ctx, block_hash) .await .map_err(|_| AgentError::FailedToMakeRequest)? @@ -648,20 +344,25 @@ impl AgentService for AgentRpcServer { context: context::Context, tx_id: String, ) -> Result, AgentError> { - let lock = self.state.node_client.lock().await; - let node_client = lock.as_ref().ok_or(AgentError::NodeClientNotSet)?; - node_client + self.state + .get_node_client() + .await + .ok_or(AgentError::NodeClientNotSet)? .find_transaction(context, tx_id) .await .map_err(|_| AgentError::FailedToMakeRequest)? } async fn get_status(self, ctx: context::Context) -> Result { - let lock = self.state.node_client.lock().await; - let node_client = lock.as_ref().ok_or(AgentError::NodeClientNotSet)?; - Ok(AgentStatus { - aot_online: node_client.status(ctx).await.is_ok(), + aot_online: self + .state + .get_node_client() + .await + .ok_or(AgentError::NodeClientNotSet)? + .status(ctx) + .await + .is_ok(), version: self.version.to_string(), }) } From c9f70e636b4a061d2d25f62e3574412f58169563 Mon Sep 17 00:00:00 2001 From: Meshiest Date: Sun, 24 Nov 2024 21:20:34 -0500 Subject: [PATCH 15/68] feat(agent): save reconcile persistence, replace shutdown hook --- crates/agent/src/db.rs | 75 +++++++++-- crates/agent/src/main.rs | 28 ++-- crates/agent/src/metrics/mod.rs | 17 +-- crates/agent/src/reconcile/agent.rs | 182 +++++++++++++++++++++----- crates/agent/src/reconcile/process.rs | 2 +- crates/agent/src/reconcile/storage.rs | 13 +- crates/agent/src/server.rs | 4 +- crates/agent/src/state.rs | 67 ++++------ 8 files changed, 272 insertions(+), 116 deletions(-) diff --git a/crates/agent/src/db.rs b/crates/agent/src/db.rs index ab360b9f..f45b2f7a 100644 --- a/crates/agent/src/db.rs +++ b/crates/agent/src/db.rs @@ -10,11 +10,13 @@ use indexmap::IndexMap; use snops_common::{ api::EnvInfo, db::{error::DatabaseError, tree::DbTree, Database as DatabaseTrait}, - format::{self, read_dataformat, DataFormat, DataReadError, DataWriteError}, - state::{AgentId, AgentState, EnvId}, + format::{self, read_dataformat, DataFormat, DataReadError, DataWriteError, PackedUint}, + state::{AgentId, AgentState, EnvId, HeightRequest}, }; use url::Url; +use crate::reconcile::agent::EnvState; + #[derive(Debug, Clone, Copy, Hash, PartialEq, Eq)] #[repr(u8)] pub enum AgentDbString { @@ -26,10 +28,14 @@ pub enum AgentDbString { LokiUrl, /// Current state of the agent. AgentState, + /// Current environment state. + EnvState, /// Latest stored environment info. EnvInfo, /// Agent addresses resolved by the controlplane. 
ResolvedAddrs, + /// Last height of the agent state + LastHeight, } impl DataFormat for AgentDbString { @@ -141,17 +147,11 @@ impl Database { ) } - pub fn set_agent_state(&self, state: Option<&AgentState>) -> Result<(), DatabaseError> { - if let Some(state) = state { - self.documents.save( - &AgentDbString::AgentState, - &format::BinaryData(state.to_byte_vec()?), - ) - } else { - self.documents - .delete(&AgentDbString::AgentState) - .map(|_| ()) - } + pub fn set_agent_state(&self, state: &AgentState) -> Result<(), DatabaseError> { + self.documents.save( + &AgentDbString::AgentState, + &format::BinaryData(state.to_byte_vec()?), + ) } pub fn resolved_addrs(&self) -> Result, DatabaseError> { @@ -181,4 +181,53 @@ impl Database { .map(|_| ()) } } + + pub fn env_state(&self) -> Result, DatabaseError> { + Ok(self + .documents + .restore(&AgentDbString::EnvState)? + .map(|format::BinaryData(bytes)| read_dataformat(&mut bytes.reader())) + .transpose()?) + } + + pub fn set_env_state(&self, state: Option<&EnvState>) -> Result<(), DatabaseError> { + if let Some(state) = state { + self.documents.save( + &AgentDbString::EnvState, + &format::BinaryData(state.to_byte_vec()?), + ) + } else { + self.documents.delete(&AgentDbString::EnvState).map(|_| ()) + } + } + + pub fn last_height(&self) -> Result, DatabaseError> { + Ok( + if let Some(format::BinaryData(bytes)) = + self.documents.restore(&AgentDbString::LastHeight)? + { + let (counter, req) = + read_dataformat::<_, (PackedUint, HeightRequest)>(&mut bytes.reader())?; + Some((counter.into(), req)) + } else { + None + }, + ) + } + + pub fn set_last_height( + &self, + height: Option<(usize, HeightRequest)>, + ) -> Result<(), DatabaseError> { + if let Some((counter, req)) = height { + self.documents.save( + &AgentDbString::LastHeight, + &format::BinaryData((PackedUint::from(counter), req).to_byte_vec()?), + ) + } else { + self.documents + .delete(&AgentDbString::LastHeight) + .map(|_| ()) + } + } } diff --git a/crates/agent/src/main.rs b/crates/agent/src/main.rs index a0787082..fd7742ed 100644 --- a/crates/agent/src/main.rs +++ b/crates/agent/src/main.rs @@ -21,7 +21,7 @@ use clap::Parser; use cli::Cli; use futures_util::stream::{FuturesUnordered, StreamExt}; use log::init_logging; -use reconcile::agent::AgentStateReconciler; +use reconcile::agent::{AgentStateReconciler, AgentStateReconcilerContext}; use snops_common::{db::Database, util::OpaqueDebug}; use tokio::{ select, @@ -75,6 +75,8 @@ async fn main() { let (queue_reconcile_tx, reconcile_requests) = mpsc::channel(5); + let (shutdown_tx, shutdown_rx) = tokio::sync::oneshot::channel(); + // Create the client state let state = Arc::new(GlobalState { client, @@ -100,8 +102,6 @@ async fn main() { }) .unwrap_or_default(), ), - reconcilation_handle: Default::default(), - child: Default::default(), resolved_addrs: RwLock::new( db.resolved_addrs() .inspect_err(|e| { @@ -116,6 +116,7 @@ async fn main() { node_client: Default::default(), log_level_handler: reload_handler, db: OpaqueDebug(db), + shutdown: RwLock::new(Some(shutdown_tx)), }); // Start the metrics watcher @@ -124,7 +125,7 @@ async fn main() { // Start the status server let status_state = Arc::clone(&state); tokio::spawn(async move { - info!("starting status API server on port {agent_rpc_port}"); + info!("Starting status API server on port {agent_rpc_port}"); if let Err(e) = server::start(agent_rpc_listener, status_state).await { error!("status API server crashed: {e:?}"); std::process::exit(1); @@ -150,22 +151,21 @@ async fn main() { let mut root = 
AgentStateReconciler { agent_state: Arc::clone(state.agent_state.read().await.deref()), state: Arc::clone(&state), - context: Default::default(), + // Recover context from previous state + context: AgentStateReconcilerContext::hydrate(&state.db), }; select! { _ = root.loop_forever(reconcile_requests) => unreachable!(), - _ = interrupt.recv_any() => { - info!("Received interrupt signal, shutting down..."); - if let Some(process) = root.context.process.as_mut() { - process.graceful_shutdown().await; - - } - }, + _ = interrupt.recv_any() => {}, + _ = shutdown_rx => {}, } - state.node_graceful_shutdown().await; - info!("snops agent has shut down gracefully :)"); + info!("Received interrupt signal, shutting down..."); + if let Some(process) = root.context.process.as_mut() { + process.graceful_shutdown().await; + info!("Agent has shut down gracefully"); + } } struct Signals { diff --git a/crates/agent/src/metrics/mod.rs b/crates/agent/src/metrics/mod.rs index 1dd66b21..d2d2b1e7 100644 --- a/crates/agent/src/metrics/mod.rs +++ b/crates/agent/src/metrics/mod.rs @@ -19,26 +19,21 @@ pub fn init(state: Arc) { tokio::spawn(async move { let mut interval = tokio::time::interval(UPDATE_RATE); let client = reqwest::Client::new(); + let route = format!( + "http://{}/", + SocketAddr::new(state.cli.get_local_ip(), state.cli.ports.metrics) + ); loop { interval.tick().await; - // TODO: this could probably be improved, but we want to avoid scraping metrics - // if the child doesn't exist - if state.child.read().await.is_none() { + if !state.is_node_online().await { continue; } // TODO: maybe this should use bind_addr let metrics_text = 'metrics: { - let response = match client - .get(format!( - "http://{}/", - SocketAddr::new(state.cli.get_local_ip(), state.cli.ports.metrics) - )) - .send() - .await - { + let response = match client.get(&route).send().await { Ok(response) => response, Err(_e) => { break 'metrics Default::default(); diff --git a/crates/agent/src/reconcile/agent.rs b/crates/agent/src/reconcile/agent.rs index ba96f6e4..33a49878 100644 --- a/crates/agent/src/reconcile/agent.rs +++ b/crates/agent/src/reconcile/agent.rs @@ -8,6 +8,7 @@ use std::{ use snops_common::{ api::EnvInfo, binaries::BinaryEntry, + format::{DataFormat, DataHeaderOf}, rpc::error::ReconcileError2, state::{ AgentId, AgentPeer, AgentState, HeightRequest, NetworkId, NodeState, StorageId, TransferId, @@ -28,6 +29,7 @@ use super::{ DirectoryReconciler, Reconcile, ReconcileStatus, }; use crate::{ + db::Database, reconcile::{process::EndProcessReconciler, storage::LedgerReconciler}, state::GlobalState, }; @@ -40,12 +42,37 @@ pub struct AgentStateReconciler { pub context: AgentStateReconcilerContext, } -#[derive(Default)] -struct TransfersContext { - // TODO: persist network_id, storage_id, storage_version, and ledger_last_height +pub struct EnvState { network_id: NetworkId, storage_id: StorageId, storage_version: u16, +} + +impl From<&EnvInfo> for EnvState { + fn from(info: &EnvInfo) -> Self { + Self { + network_id: info.network, + storage_id: info.storage.id, + storage_version: info.storage.version, + } + } +} + +impl Default for EnvState { + fn default() -> Self { + Self { + network_id: NetworkId::Mainnet, + storage_id: StorageId::default(), + storage_version: 0, + } + } +} + +#[derive(Default)] +struct TransfersContext { + /// Persisted values that determine if the storage has changed + env_state: EnvState, + /// The last ledger height that was successfully configured ledger_last_height: Option<(usize, HeightRequest)>, @@ -68,11 
+95,38 @@ struct TransfersContext { ledger_modify_handle: Option<(AbortHandle, Arc>>)>, } -impl TransfersContext { - pub fn changed(&self, env_info: &EnvInfo) -> bool { - env_info.storage.version != self.storage_version - || env_info.storage.id != self.storage_id - || env_info.network != self.network_id +#[derive(Default)] +pub struct AgentStateReconcilerContext { + // TODO: allow transfers to be interrupted. potentially allow them to be resumed by using the + // file range feature. + /// Information about active transfers + transfers: Option, + /// Information about the node process + pub process: Option, + pub shutdown_pending: bool, +} + +impl AgentStateReconcilerContext { + pub fn hydrate(db: &Database) -> Self { + let ledger_last_height = db + .last_height() + .inspect_err(|e| error!("failed to restore last height from db: {e}")) + .unwrap_or_default(); + let env_state = db + .env_state() + .inspect_err(|e| error!("failed to restore env state from db: {e}")) + .unwrap_or_default(); + + Self { + transfers: (ledger_last_height.is_some() || env_state.is_some()).then(|| { + TransfersContext { + env_state: env_state.unwrap_or_default(), + ledger_last_height, + ..Default::default() + } + }), + ..Default::default() + } } } @@ -98,8 +152,9 @@ impl AgentStateReconciler { // Drain the reconcile request queue while reconcile_requests.try_recv().is_ok() {} - // Schedule the next reconcile for 1 week. - next_reconcile_at = Instant::now() + Duration::from_secs(60 * 60 * 24 * 7); + // Schedule the next reconcile for 1 minute (to periodically check if the node + // went offline) + next_reconcile_at = Instant::now() + Duration::from_secs(60); // Update the reconciler with the latest agent state // This prevents the agent state from changing during reconciliation @@ -130,17 +185,6 @@ impl AgentStateReconciler { } } -#[derive(Default)] -pub struct AgentStateReconcilerContext { - // TODO: allow transfers to be interrupted. potentially allow them to be resumed by using the - // file range feature. - /// Information about active transfers - transfers: Option, - /// Information about the node process - pub process: Option, - pub shutdown_pending: bool, -} - /// Run a reconciler and return early if a requeue is needed. A condition is /// added to the scope when a requeue is needed to provide more context when /// monitoring the agent. 
@@ -164,7 +208,7 @@ impl Reconcile<(), ReconcileError2> for AgentStateReconciler { AgentState::Inventory => { // TODO: cleanup other things - // end the process if it is running + // End the process if it is running if let Some(process) = self.context.process.as_mut() { reconcile!(end_process, EndProcessReconciler(process), res => { // If the process has exited, clear the process context @@ -174,6 +218,20 @@ impl Reconcile<(), ReconcileError2> for AgentStateReconciler { }); } + if let Some(_transfers) = self.context.transfers.as_mut() { + if let Err(e) = self.state.db.set_env_state(None) { + error!("failed to clear env state from db: {e}"); + } + if let Err(e) = self.state.db.set_last_height(None) { + error!("failed to clear last height from db: {e}"); + } + + // TODO: interrupt/kill off pending downloads + + // Destroy the old transfers context + self.context.transfers = None; + } + return Ok(ReconcileStatus::default().add_scope("agent_state/inventory")); } AgentState::Node(env_id, node) => { @@ -184,17 +242,20 @@ impl Reconcile<(), ReconcileError2> for AgentStateReconciler { .context .transfers .as_ref() - .map(|t| t.changed(&env_info)) + .map(|t| t.env_state.changed(&env_info)) .unwrap_or(true); // If the node should be torn down, or the storage has changed, we need to // gracefully shut down the node. let shutdown_pending = !node.online || storage_has_changed; + // TODO: check if addrs have changed, then update the command + if let (true, Some(process)) = ( shutdown_pending || self.context.shutdown_pending, self.context.process.as_mut(), ) { + self.context.shutdown_pending = true; reconcile!(end_process, EndProcessReconciler(process), res => { // If the process has exited, clear the process context if res.inner.is_some() { @@ -203,28 +264,34 @@ impl Reconcile<(), ReconcileError2> for AgentStateReconciler { }); } - // TODO: check if addrs have changed, and update shutdown_pending - // node is offline, no need to reconcile if !node.online { - // TODO: tear down the node if it is running return Ok(ReconcileStatus::default().add_scope("agent_state/offline")); } - let node_arc = Arc::new(*node.clone()); - - if storage_has_changed { - // TODO: abort any ongoing transfers (binary/file), then - // requeue + // Reconcile behavior while the node is running... 
+ if let Some(process) = self.context.process.as_ref() { + // If the process has exited, clear the process context + if !process.is_running() { + info!("node process has exited..."); + self.context.process = None; + } else { + // Prevent other reconcilers from running while the node is running + return Ok(ReconcileStatus::default().add_scope("agent_state/running")); + } } - // initialize the transfers context with the current status + let node_arc = Arc::new(*node.clone()); + + // Initialize the transfers context with the current status if self.context.transfers.is_none() { // TODO: write this to the db + let env_state = EnvState::from(env_info.as_ref()); + if let Err(e) = self.state.db.set_env_state(Some(&env_state)) { + error!("failed to save env state to db: {e}"); + } self.context.transfers = Some(TransfersContext { - network_id: env_info.network, - storage_id: env_info.storage.id, - storage_version: env_info.storage.version, + env_state, ..Default::default() }); } @@ -311,7 +378,8 @@ impl Reconcile<(), ReconcileError2> for AgentStateReconciler { if self.context.process.is_none() { info!("Starting node process"); - self.context.process = Some(ProcessContext::new(command)?); + let process = ProcessContext::new(command)?; + self.context.process = Some(process); } } } @@ -407,3 +475,45 @@ impl Reconcile<(), ReconcileError2> for AddressResolveReconciler { // https://ledger.aleo.network/mainnet/snapshot/latest.txt // https://ledger.aleo.network/testnet/snapshot/latest.txt // https://ledger.aleo.network/canarynet/snapshot/latest.txt + +impl EnvState { + pub fn changed(&self, env_info: &EnvInfo) -> bool { + env_info.storage.version != self.storage_version + || env_info.storage.id != self.storage_id + || env_info.network != self.network_id + } +} + +impl DataFormat for EnvState { + type Header = (u8, DataHeaderOf); + + const LATEST_HEADER: Self::Header = (1u8, NetworkId::LATEST_HEADER); + + fn write_data( + &self, + writer: &mut W, + ) -> Result { + Ok(self.network_id.write_data(writer)? + + self.storage_id.write_data(writer)? + + self.storage_version.write_data(writer)?) 
+ } + + fn read_data( + reader: &mut R, + header: &Self::Header, + ) -> Result { + if header.0 != Self::LATEST_HEADER.0 { + return Err(snops_common::format::DataReadError::unsupported( + "EnvIdentifier", + Self::LATEST_HEADER.0, + header.0, + )); + } + + Ok(Self { + network_id: NetworkId::read_data(reader, &header.1)?, + storage_id: StorageId::read_data(reader, &())?, + storage_version: u16::read_data(reader, &())?, + }) + } +} diff --git a/crates/agent/src/reconcile/process.rs b/crates/agent/src/reconcile/process.rs index 296b2ff3..ad9fe358 100644 --- a/crates/agent/src/reconcile/process.rs +++ b/crates/agent/src/reconcile/process.rs @@ -13,7 +13,7 @@ pub struct ProcessContext { /// the node should be restarted pub command: NodeCommand, /// The child process that is running the node - child: Child, + pub child: Child, /// Time the child process was started #[allow(dead_code)] started_at: Instant, diff --git a/crates/agent/src/reconcile/storage.rs b/crates/agent/src/reconcile/storage.rs index 98d6a115..cde89fc6 100644 --- a/crates/agent/src/reconcile/storage.rs +++ b/crates/agent/src/reconcile/storage.rs @@ -377,6 +377,10 @@ impl<'a> Reconcile<(), ReconcileError2> for LedgerReconciler<'a> { // If the target height is the top, we can skip the ledger reconciler if target_height.1.is_top() { *last_height = target_height; + if let Err(e) = self.state.db.set_last_height(Some(target_height)) { + error!("failed to save last height to db: {e}"); + } + // ledger operation is complete return Ok(ReconcileStatus::default()); } @@ -385,9 +389,13 @@ impl<'a> Reconcile<(), ReconcileError2> for LedgerReconciler<'a> { if target_height.1.reset() { let _ = tokio::fs::remove_dir_all(&ledger_path).await; *last_height = target_height; + if let Err(e) = self.state.db.set_last_height(Some(target_height)) { + error!("failed to save last height to db: {e}"); + } + // Ledger operation is complete... immediately requeue because the ledger was // wiped - return Ok(ReconcileStatus::default().requeue_after(Duration::from_secs(0))); + return Ok(ReconcileStatus::default().requeue_after(Duration::ZERO)); } // Target height is guaranteed to be different, not top, and not 0, which means @@ -439,6 +447,9 @@ impl<'a> Reconcile<(), ReconcileError2> for LedgerReconciler<'a> { // If the ledger was modified successfully, update the last height Ok(true) => { *last_height = pending; + if let Err(e) = self.state.db.set_last_height(Some(pending)) { + error!("failed to save last height to db: {e}"); + } } // A failure in the ledger modification process is handled at the // moment... 
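A minimal round-trip sketch of the EnvState format introduced above, mirroring how this patch's crates/agent/src/db.rs persists it (serialize with to_byte_vec(), restore with read_dataformat()). It is illustrative only and not part of the patch series; the module/test names are hypothetical, and it assumes to_byte_vec() prepends the format header that read_dataformat() expects and that NetworkId/StorageId expose Debug, PartialEq, and (for StorageId) Default as used elsewhere in this change.

    // Sketch only: exercises the EnvState DataFormat impl defined above.
    // Serialization mirrors Database::set_env_state; decoding mirrors
    // Database::env_state. Helper availability is assumed from how this
    // patch uses to_byte_vec()/read_dataformat(), not verified here.
    #[cfg(test)]
    mod env_state_format_sketch {
        use bytes::Buf;
        use snops_common::format::{read_dataformat, DataFormat};
        use snops_common::state::{NetworkId, StorageId};

        use super::EnvState;

        #[test]
        fn roundtrip() {
            let original = EnvState {
                network_id: NetworkId::Mainnet,
                storage_id: StorageId::default(),
                storage_version: 3,
            };

            // Header + payload: the same bytes Database::set_env_state would store.
            let bytes = original.to_byte_vec().expect("serialize EnvState");

            // Decode header + payload, as Database::env_state does on restore.
            let decoded: EnvState = read_dataformat(&mut bytes.as_slice().reader())
                .expect("deserialize EnvState");

            assert_eq!(decoded.network_id, original.network_id);
            assert_eq!(decoded.storage_id, original.storage_id);
            assert_eq!(decoded.storage_version, original.storage_version);
        }
    }

The version byte in LATEST_HEADER is what lets read_data reject data written by an incompatible format, which is why the restore path above can fail cleanly instead of misreading an older on-disk layout.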
diff --git a/crates/agent/src/server.rs b/crates/agent/src/server.rs index 3ac220b4..1fb18b65 100644 --- a/crates/agent/src/server.rs +++ b/crates/agent/src/server.rs @@ -41,7 +41,7 @@ async fn node_ws_handler(ws: WebSocketUpgrade, State(state): State) -> } async fn handle_socket(mut socket: WebSocket, state: AppState) { - let mut node_client = state.node_client.lock().await; + let mut node_client = state.node_client.write().await; if node_client.is_some() { warn!("a new node RPC connection tried to establish when one was already established"); let _ = socket.send(Message::Close(None)).await; @@ -121,5 +121,5 @@ async fn handle_socket(mut socket: WebSocket, state: AppState) { // abort the RPC server handle server_handle.abort(); - state.node_client.lock().await.take(); + state.node_client.write().await.take(); } diff --git a/crates/agent/src/state.rs b/crates/agent/src/state.rs index 1c8820e8..7169c530 100644 --- a/crates/agent/src/state.rs +++ b/crates/agent/src/state.rs @@ -14,13 +14,8 @@ use snops_common::{ util::OpaqueDebug, }; use tarpc::context; -use tokio::{ - process::Child, - select, - sync::{mpsc::Sender, Mutex as AsyncMutex, RwLock}, - task::AbortHandle, -}; -use tracing::{error, info}; +use tokio::sync::{mpsc::Sender, oneshot, RwLock}; +use tracing::error; use crate::{cli::Cli, db::Database, log::ReloadHandler, metrics::Metrics, transfers::TransferTx}; @@ -48,9 +43,6 @@ pub struct GlobalState { /// Helpful for scheduling the next reconciliation. pub queue_reconcile_tx: Sender, pub env_info: RwLock)>>, - pub reconcilation_handle: AsyncMutex>, - pub child: RwLock>, /* TODO: this may need to be handled by an owning thread, - * not sure yet */ // Map of agent IDs to their resolved addresses. pub resolved_addrs: RwLock>, pub metrics: RwLock, @@ -58,9 +50,10 @@ pub struct GlobalState { pub transfer_tx: TransferTx, pub transfers: Arc>, - pub node_client: AsyncMutex>, - + pub node_client: RwLock>, pub log_level_handler: ReloadHandler, + /// A oneshot sender to shutdown the agent. + pub shutdown: RwLock>>, } impl GlobalState { @@ -125,34 +118,32 @@ impl GlobalState { Ok(env_info.1) } - /// Attempt to gracefully shutdown the node if one is running. - pub async fn node_graceful_shutdown(&self) { - if let Some((mut child, id)) = self.child.write().await.take().and_then(|ch| { - let id = ch.id()?; - Some((ch, id)) - }) { - use nix::{ - sys::signal::{self, Signal}, - unistd::Pid, - }; - - // send SIGINT to the child process - signal::kill(Pid::from_raw(id as i32), Signal::SIGINT).unwrap(); - - // wait for graceful shutdown or kill process after 10 seconds - let timeout = tokio::time::sleep(NODE_GRACEFUL_SHUTDOWN_TIMEOUT); - - select! 
{ - _ = child.wait() => (), - _ = timeout => { - info!("snarkos process did not gracefully shut down, killing..."); - child.kill().await.unwrap(); - } - } + pub fn transfer_tx(&self) -> TransferTx { + self.transfer_tx.clone() + } + + pub async fn shutdown(&self) { + if let Some(tx) = self.shutdown.write().await.take() { + let _ = tx.send(()); } } - pub fn transfer_tx(&self) -> TransferTx { - self.transfer_tx.clone() + pub async fn is_node_online(&self) -> bool { + self.node_client.read().await.is_some() + } + + pub async fn get_node_client(&self) -> Option { + self.node_client.read().await.clone() + } + + pub async fn update_agent_state(&self, state: AgentState) { + if let Err(e) = self.db.set_agent_state(&state) { + error!("failed to save agent state to db: {e}"); + } + let state = Arc::new(state); + *self.agent_state.write().await = state; + + // Queue a reconcile to apply the new state + self.queue_reconcile(Duration::ZERO).await; } } From 376b603d7b3a2c12558d28add29359479249070f Mon Sep 17 00:00:00 2001 From: Meshiest Date: Mon, 25 Nov 2024 01:04:43 -0500 Subject: [PATCH 16/68] chore(aot): update based on canary revert --- Cargo.lock | 152 ++++++++++++++-------------- Cargo.toml | 8 +- crates/aot/src/auth/auth_deploy.rs | 2 +- crates/aot/src/auth/auth_fee.rs | 4 +- crates/aot/src/auth/auth_program.rs | 2 +- crates/aot/src/auth/mod.rs | 2 +- crates/aot/src/lib.rs | 2 +- crates/aot/src/program/cost.rs | 2 +- 8 files changed, 86 insertions(+), 88 deletions(-) diff --git a/Cargo.lock b/Cargo.lock index 70d6642c..0fa4de92 100644 --- a/Cargo.lock +++ b/Cargo.lock @@ -3299,7 +3299,7 @@ checksum = "1b6b67fb9a61334225b5b790716f609cd58395f895b3fe8b328786812a40bc3b" [[package]] name = "snarkos-account" version = "3.0.0" -source = "git+https://github.com/AleoNet/snarkOS?rev=6cce476#6cce476fcb6e056d43ec3811dfd36832641e8caf" +source = "git+https://github.com/AleoNet/snarkOS?rev=ba41197#ba41197d9eb6b5412e6b451508c069baf26f8a0d" dependencies = [ "anyhow", "colored", @@ -3353,7 +3353,7 @@ dependencies = [ [[package]] name = "snarkos-node" version = "3.0.0" -source = "git+https://github.com/AleoNet/snarkOS?rev=6cce476#6cce476fcb6e056d43ec3811dfd36832641e8caf" +source = "git+https://github.com/AleoNet/snarkOS?rev=ba41197#ba41197d9eb6b5412e6b451508c069baf26f8a0d" dependencies = [ "aleo-std", "anyhow", @@ -3387,7 +3387,7 @@ dependencies = [ [[package]] name = "snarkos-node-bft" version = "3.0.0" -source = "git+https://github.com/AleoNet/snarkOS?rev=6cce476#6cce476fcb6e056d43ec3811dfd36832641e8caf" +source = "git+https://github.com/AleoNet/snarkOS?rev=ba41197#ba41197d9eb6b5412e6b451508c069baf26f8a0d" dependencies = [ "aleo-std", "anyhow", @@ -3420,7 +3420,7 @@ dependencies = [ [[package]] name = "snarkos-node-bft-events" version = "3.0.0" -source = "git+https://github.com/AleoNet/snarkOS?rev=6cce476#6cce476fcb6e056d43ec3811dfd36832641e8caf" +source = "git+https://github.com/AleoNet/snarkOS?rev=ba41197#ba41197d9eb6b5412e6b451508c069baf26f8a0d" dependencies = [ "anyhow", "bytes", @@ -3437,7 +3437,7 @@ dependencies = [ [[package]] name = "snarkos-node-bft-ledger-service" version = "3.0.0" -source = "git+https://github.com/AleoNet/snarkOS?rev=6cce476#6cce476fcb6e056d43ec3811dfd36832641e8caf" +source = "git+https://github.com/AleoNet/snarkOS?rev=ba41197#ba41197d9eb6b5412e6b451508c069baf26f8a0d" dependencies = [ "async-trait", "indexmap 2.6.0", @@ -3453,7 +3453,7 @@ dependencies = [ [[package]] name = "snarkos-node-bft-storage-service" version = "3.0.0" -source = 
"git+https://github.com/AleoNet/snarkOS?rev=6cce476#6cce476fcb6e056d43ec3811dfd36832641e8caf" +source = "git+https://github.com/AleoNet/snarkOS?rev=ba41197#ba41197d9eb6b5412e6b451508c069baf26f8a0d" dependencies = [ "aleo-std", "anyhow", @@ -3467,7 +3467,7 @@ dependencies = [ [[package]] name = "snarkos-node-cdn" version = "3.0.0" -source = "git+https://github.com/AleoNet/snarkOS?rev=6cce476#6cce476fcb6e056d43ec3811dfd36832641e8caf" +source = "git+https://github.com/AleoNet/snarkOS?rev=ba41197#ba41197d9eb6b5412e6b451508c069baf26f8a0d" dependencies = [ "anyhow", "bincode", @@ -3486,7 +3486,7 @@ dependencies = [ [[package]] name = "snarkos-node-consensus" version = "3.0.0" -source = "git+https://github.com/AleoNet/snarkOS?rev=6cce476#6cce476fcb6e056d43ec3811dfd36832641e8caf" +source = "git+https://github.com/AleoNet/snarkOS?rev=ba41197#ba41197d9eb6b5412e6b451508c069baf26f8a0d" dependencies = [ "aleo-std", "anyhow", @@ -3508,7 +3508,7 @@ dependencies = [ [[package]] name = "snarkos-node-metrics" version = "3.0.0" -source = "git+https://github.com/AleoNet/snarkOS?rev=6cce476#6cce476fcb6e056d43ec3811dfd36832641e8caf" +source = "git+https://github.com/AleoNet/snarkOS?rev=ba41197#ba41197d9eb6b5412e6b451508c069baf26f8a0d" dependencies = [ "metrics-exporter-prometheus", "parking_lot 0.12.3", @@ -3521,7 +3521,7 @@ dependencies = [ [[package]] name = "snarkos-node-rest" version = "3.0.0" -source = "git+https://github.com/AleoNet/snarkOS?rev=6cce476#6cce476fcb6e056d43ec3811dfd36832641e8caf" +source = "git+https://github.com/AleoNet/snarkOS?rev=ba41197#ba41197d9eb6b5412e6b451508c069baf26f8a0d" dependencies = [ "anyhow", "axum", @@ -3549,7 +3549,7 @@ dependencies = [ [[package]] name = "snarkos-node-router" version = "3.0.0" -source = "git+https://github.com/AleoNet/snarkOS?rev=6cce476#6cce476fcb6e056d43ec3811dfd36832641e8caf" +source = "git+https://github.com/AleoNet/snarkOS?rev=ba41197#ba41197d9eb6b5412e6b451508c069baf26f8a0d" dependencies = [ "anyhow", "async-trait", @@ -3580,7 +3580,7 @@ dependencies = [ [[package]] name = "snarkos-node-router-messages" version = "3.0.0" -source = "git+https://github.com/AleoNet/snarkOS?rev=6cce476#6cce476fcb6e056d43ec3811dfd36832641e8caf" +source = "git+https://github.com/AleoNet/snarkOS?rev=ba41197#ba41197d9eb6b5412e6b451508c069baf26f8a0d" dependencies = [ "anyhow", "bytes", @@ -3598,7 +3598,7 @@ dependencies = [ [[package]] name = "snarkos-node-sync" version = "3.0.0" -source = "git+https://github.com/AleoNet/snarkOS?rev=6cce476#6cce476fcb6e056d43ec3811dfd36832641e8caf" +source = "git+https://github.com/AleoNet/snarkOS?rev=ba41197#ba41197d9eb6b5412e6b451508c069baf26f8a0d" dependencies = [ "anyhow", "indexmap 2.6.0", @@ -3620,7 +3620,7 @@ dependencies = [ [[package]] name = "snarkos-node-sync-communication-service" version = "3.0.0" -source = "git+https://github.com/AleoNet/snarkOS?rev=6cce476#6cce476fcb6e056d43ec3811dfd36832641e8caf" +source = "git+https://github.com/AleoNet/snarkOS?rev=ba41197#ba41197d9eb6b5412e6b451508c069baf26f8a0d" dependencies = [ "async-trait", "tokio", @@ -3629,7 +3629,7 @@ dependencies = [ [[package]] name = "snarkos-node-sync-locators" version = "3.0.0" -source = "git+https://github.com/AleoNet/snarkOS?rev=6cce476#6cce476fcb6e056d43ec3811dfd36832641e8caf" +source = "git+https://github.com/AleoNet/snarkOS?rev=ba41197#ba41197d9eb6b5412e6b451508c069baf26f8a0d" dependencies = [ "anyhow", "indexmap 2.6.0", @@ -3641,7 +3641,7 @@ dependencies = [ [[package]] name = "snarkos-node-tcp" version = "3.0.0" -source = 
"git+https://github.com/AleoNet/snarkOS?rev=6cce476#6cce476fcb6e056d43ec3811dfd36832641e8caf" +source = "git+https://github.com/AleoNet/snarkOS?rev=ba41197#ba41197d9eb6b5412e6b451508c069baf26f8a0d" dependencies = [ "async-trait", "bytes", @@ -3657,7 +3657,7 @@ dependencies = [ [[package]] name = "snarkvm" version = "1.0.0" -source = "git+https://github.com/AleoNet/snarkVM?rev=0b391d2#0b391d2a0fad8c1ee49b3fc64e6a64d4b801ad2d" +source = "git+https://github.com/AleoNet/snarkVM?rev=1de86e7#1de86e7d09b91dd1a78042053697dea80f600d87" dependencies = [ "anstyle", "anyhow", @@ -3688,7 +3688,7 @@ dependencies = [ [[package]] name = "snarkvm-algorithms" version = "1.0.0" -source = "git+https://github.com/AleoNet/snarkVM?rev=0b391d2#0b391d2a0fad8c1ee49b3fc64e6a64d4b801ad2d" +source = "git+https://github.com/AleoNet/snarkVM?rev=1de86e7#1de86e7d09b91dd1a78042053697dea80f600d87" dependencies = [ "aleo-std", "anyhow", @@ -3719,7 +3719,7 @@ dependencies = [ [[package]] name = "snarkvm-algorithms-cuda" version = "1.0.0" -source = "git+https://github.com/AleoNet/snarkVM?rev=0b391d2#0b391d2a0fad8c1ee49b3fc64e6a64d4b801ad2d" +source = "git+https://github.com/AleoNet/snarkVM?rev=1de86e7#1de86e7d09b91dd1a78042053697dea80f600d87" dependencies = [ "blst", "cc", @@ -3730,7 +3730,7 @@ dependencies = [ [[package]] name = "snarkvm-circuit" version = "1.0.0" -source = "git+https://github.com/AleoNet/snarkVM?rev=0b391d2#0b391d2a0fad8c1ee49b3fc64e6a64d4b801ad2d" +source = "git+https://github.com/AleoNet/snarkVM?rev=1de86e7#1de86e7d09b91dd1a78042053697dea80f600d87" dependencies = [ "snarkvm-circuit-account", "snarkvm-circuit-algorithms", @@ -3744,7 +3744,7 @@ dependencies = [ [[package]] name = "snarkvm-circuit-account" version = "1.0.0" -source = "git+https://github.com/AleoNet/snarkVM?rev=0b391d2#0b391d2a0fad8c1ee49b3fc64e6a64d4b801ad2d" +source = "git+https://github.com/AleoNet/snarkVM?rev=1de86e7#1de86e7d09b91dd1a78042053697dea80f600d87" dependencies = [ "snarkvm-circuit-algorithms", "snarkvm-circuit-network", @@ -3755,7 +3755,7 @@ dependencies = [ [[package]] name = "snarkvm-circuit-algorithms" version = "1.0.0" -source = "git+https://github.com/AleoNet/snarkVM?rev=0b391d2#0b391d2a0fad8c1ee49b3fc64e6a64d4b801ad2d" +source = "git+https://github.com/AleoNet/snarkVM?rev=1de86e7#1de86e7d09b91dd1a78042053697dea80f600d87" dependencies = [ "snarkvm-circuit-types", "snarkvm-console-algorithms", @@ -3765,7 +3765,7 @@ dependencies = [ [[package]] name = "snarkvm-circuit-collections" version = "1.0.0" -source = "git+https://github.com/AleoNet/snarkVM?rev=0b391d2#0b391d2a0fad8c1ee49b3fc64e6a64d4b801ad2d" +source = "git+https://github.com/AleoNet/snarkVM?rev=1de86e7#1de86e7d09b91dd1a78042053697dea80f600d87" dependencies = [ "snarkvm-circuit-algorithms", "snarkvm-circuit-types", @@ -3775,7 +3775,7 @@ dependencies = [ [[package]] name = "snarkvm-circuit-environment" version = "1.0.0" -source = "git+https://github.com/AleoNet/snarkVM?rev=0b391d2#0b391d2a0fad8c1ee49b3fc64e6a64d4b801ad2d" +source = "git+https://github.com/AleoNet/snarkVM?rev=1de86e7#1de86e7d09b91dd1a78042053697dea80f600d87" dependencies = [ "indexmap 2.6.0", "itertools 0.11.0", @@ -3793,12 +3793,12 @@ dependencies = [ [[package]] name = "snarkvm-circuit-environment-witness" version = "1.0.0" -source = "git+https://github.com/AleoNet/snarkVM?rev=0b391d2#0b391d2a0fad8c1ee49b3fc64e6a64d4b801ad2d" +source = "git+https://github.com/AleoNet/snarkVM?rev=1de86e7#1de86e7d09b91dd1a78042053697dea80f600d87" [[package]] name = "snarkvm-circuit-network" version = "1.0.0" -source = 
"git+https://github.com/AleoNet/snarkVM?rev=0b391d2#0b391d2a0fad8c1ee49b3fc64e6a64d4b801ad2d" +source = "git+https://github.com/AleoNet/snarkVM?rev=1de86e7#1de86e7d09b91dd1a78042053697dea80f600d87" dependencies = [ "snarkvm-circuit-algorithms", "snarkvm-circuit-collections", @@ -3809,7 +3809,7 @@ dependencies = [ [[package]] name = "snarkvm-circuit-program" version = "1.0.0" -source = "git+https://github.com/AleoNet/snarkVM?rev=0b391d2#0b391d2a0fad8c1ee49b3fc64e6a64d4b801ad2d" +source = "git+https://github.com/AleoNet/snarkVM?rev=1de86e7#1de86e7d09b91dd1a78042053697dea80f600d87" dependencies = [ "paste", "snarkvm-circuit-account", @@ -3824,7 +3824,7 @@ dependencies = [ [[package]] name = "snarkvm-circuit-types" version = "1.0.0" -source = "git+https://github.com/AleoNet/snarkVM?rev=0b391d2#0b391d2a0fad8c1ee49b3fc64e6a64d4b801ad2d" +source = "git+https://github.com/AleoNet/snarkVM?rev=1de86e7#1de86e7d09b91dd1a78042053697dea80f600d87" dependencies = [ "snarkvm-circuit-environment", "snarkvm-circuit-types-address", @@ -3839,7 +3839,7 @@ dependencies = [ [[package]] name = "snarkvm-circuit-types-address" version = "1.0.0" -source = "git+https://github.com/AleoNet/snarkVM?rev=0b391d2#0b391d2a0fad8c1ee49b3fc64e6a64d4b801ad2d" +source = "git+https://github.com/AleoNet/snarkVM?rev=1de86e7#1de86e7d09b91dd1a78042053697dea80f600d87" dependencies = [ "snarkvm-circuit-environment", "snarkvm-circuit-types-boolean", @@ -3852,7 +3852,7 @@ dependencies = [ [[package]] name = "snarkvm-circuit-types-boolean" version = "1.0.0" -source = "git+https://github.com/AleoNet/snarkVM?rev=0b391d2#0b391d2a0fad8c1ee49b3fc64e6a64d4b801ad2d" +source = "git+https://github.com/AleoNet/snarkVM?rev=1de86e7#1de86e7d09b91dd1a78042053697dea80f600d87" dependencies = [ "snarkvm-circuit-environment", "snarkvm-console-types-boolean", @@ -3861,7 +3861,7 @@ dependencies = [ [[package]] name = "snarkvm-circuit-types-field" version = "1.0.0" -source = "git+https://github.com/AleoNet/snarkVM?rev=0b391d2#0b391d2a0fad8c1ee49b3fc64e6a64d4b801ad2d" +source = "git+https://github.com/AleoNet/snarkVM?rev=1de86e7#1de86e7d09b91dd1a78042053697dea80f600d87" dependencies = [ "snarkvm-circuit-environment", "snarkvm-circuit-types-boolean", @@ -3871,7 +3871,7 @@ dependencies = [ [[package]] name = "snarkvm-circuit-types-group" version = "1.0.0" -source = "git+https://github.com/AleoNet/snarkVM?rev=0b391d2#0b391d2a0fad8c1ee49b3fc64e6a64d4b801ad2d" +source = "git+https://github.com/AleoNet/snarkVM?rev=1de86e7#1de86e7d09b91dd1a78042053697dea80f600d87" dependencies = [ "snarkvm-circuit-environment", "snarkvm-circuit-types-boolean", @@ -3883,7 +3883,7 @@ dependencies = [ [[package]] name = "snarkvm-circuit-types-integers" version = "1.0.0" -source = "git+https://github.com/AleoNet/snarkVM?rev=0b391d2#0b391d2a0fad8c1ee49b3fc64e6a64d4b801ad2d" +source = "git+https://github.com/AleoNet/snarkVM?rev=1de86e7#1de86e7d09b91dd1a78042053697dea80f600d87" dependencies = [ "snarkvm-circuit-environment", "snarkvm-circuit-types-boolean", @@ -3895,7 +3895,7 @@ dependencies = [ [[package]] name = "snarkvm-circuit-types-scalar" version = "1.0.0" -source = "git+https://github.com/AleoNet/snarkVM?rev=0b391d2#0b391d2a0fad8c1ee49b3fc64e6a64d4b801ad2d" +source = "git+https://github.com/AleoNet/snarkVM?rev=1de86e7#1de86e7d09b91dd1a78042053697dea80f600d87" dependencies = [ "snarkvm-circuit-environment", "snarkvm-circuit-types-boolean", @@ -3906,7 +3906,7 @@ dependencies = [ [[package]] name = "snarkvm-circuit-types-string" version = "1.0.0" -source = 
"git+https://github.com/AleoNet/snarkVM?rev=0b391d2#0b391d2a0fad8c1ee49b3fc64e6a64d4b801ad2d" +source = "git+https://github.com/AleoNet/snarkVM?rev=1de86e7#1de86e7d09b91dd1a78042053697dea80f600d87" dependencies = [ "snarkvm-circuit-environment", "snarkvm-circuit-types-boolean", @@ -3918,7 +3918,7 @@ dependencies = [ [[package]] name = "snarkvm-console" version = "1.0.0" -source = "git+https://github.com/AleoNet/snarkVM?rev=0b391d2#0b391d2a0fad8c1ee49b3fc64e6a64d4b801ad2d" +source = "git+https://github.com/AleoNet/snarkVM?rev=1de86e7#1de86e7d09b91dd1a78042053697dea80f600d87" dependencies = [ "snarkvm-console-account", "snarkvm-console-algorithms", @@ -3931,7 +3931,7 @@ dependencies = [ [[package]] name = "snarkvm-console-account" version = "1.0.0" -source = "git+https://github.com/AleoNet/snarkVM?rev=0b391d2#0b391d2a0fad8c1ee49b3fc64e6a64d4b801ad2d" +source = "git+https://github.com/AleoNet/snarkVM?rev=1de86e7#1de86e7d09b91dd1a78042053697dea80f600d87" dependencies = [ "bs58", "snarkvm-console-network", @@ -3942,7 +3942,7 @@ dependencies = [ [[package]] name = "snarkvm-console-algorithms" version = "1.0.0" -source = "git+https://github.com/AleoNet/snarkVM?rev=0b391d2#0b391d2a0fad8c1ee49b3fc64e6a64d4b801ad2d" +source = "git+https://github.com/AleoNet/snarkVM?rev=1de86e7#1de86e7d09b91dd1a78042053697dea80f600d87" dependencies = [ "blake2s_simd", "smallvec", @@ -3955,7 +3955,7 @@ dependencies = [ [[package]] name = "snarkvm-console-collections" version = "1.0.0" -source = "git+https://github.com/AleoNet/snarkVM?rev=0b391d2#0b391d2a0fad8c1ee49b3fc64e6a64d4b801ad2d" +source = "git+https://github.com/AleoNet/snarkVM?rev=1de86e7#1de86e7d09b91dd1a78042053697dea80f600d87" dependencies = [ "aleo-std", "rayon", @@ -3966,7 +3966,7 @@ dependencies = [ [[package]] name = "snarkvm-console-network" version = "1.0.0" -source = "git+https://github.com/AleoNet/snarkVM?rev=0b391d2#0b391d2a0fad8c1ee49b3fc64e6a64d4b801ad2d" +source = "git+https://github.com/AleoNet/snarkVM?rev=1de86e7#1de86e7d09b91dd1a78042053697dea80f600d87" dependencies = [ "anyhow", "indexmap 2.6.0", @@ -3989,7 +3989,7 @@ dependencies = [ [[package]] name = "snarkvm-console-network-environment" version = "1.0.0" -source = "git+https://github.com/AleoNet/snarkVM?rev=0b391d2#0b391d2a0fad8c1ee49b3fc64e6a64d4b801ad2d" +source = "git+https://github.com/AleoNet/snarkVM?rev=1de86e7#1de86e7d09b91dd1a78042053697dea80f600d87" dependencies = [ "anyhow", "bech32", @@ -4007,7 +4007,7 @@ dependencies = [ [[package]] name = "snarkvm-console-program" version = "1.0.0" -source = "git+https://github.com/AleoNet/snarkVM?rev=0b391d2#0b391d2a0fad8c1ee49b3fc64e6a64d4b801ad2d" +source = "git+https://github.com/AleoNet/snarkVM?rev=1de86e7#1de86e7d09b91dd1a78042053697dea80f600d87" dependencies = [ "enum-iterator", "enum_index", @@ -4029,7 +4029,7 @@ dependencies = [ [[package]] name = "snarkvm-console-types" version = "1.0.0" -source = "git+https://github.com/AleoNet/snarkVM?rev=0b391d2#0b391d2a0fad8c1ee49b3fc64e6a64d4b801ad2d" +source = "git+https://github.com/AleoNet/snarkVM?rev=1de86e7#1de86e7d09b91dd1a78042053697dea80f600d87" dependencies = [ "snarkvm-console-network-environment", "snarkvm-console-types-address", @@ -4044,7 +4044,7 @@ dependencies = [ [[package]] name = "snarkvm-console-types-address" version = "1.0.0" -source = "git+https://github.com/AleoNet/snarkVM?rev=0b391d2#0b391d2a0fad8c1ee49b3fc64e6a64d4b801ad2d" +source = "git+https://github.com/AleoNet/snarkVM?rev=1de86e7#1de86e7d09b91dd1a78042053697dea80f600d87" dependencies = [ 
"snarkvm-console-network-environment", "snarkvm-console-types-boolean", @@ -4055,7 +4055,7 @@ dependencies = [ [[package]] name = "snarkvm-console-types-boolean" version = "1.0.0" -source = "git+https://github.com/AleoNet/snarkVM?rev=0b391d2#0b391d2a0fad8c1ee49b3fc64e6a64d4b801ad2d" +source = "git+https://github.com/AleoNet/snarkVM?rev=1de86e7#1de86e7d09b91dd1a78042053697dea80f600d87" dependencies = [ "snarkvm-console-network-environment", ] @@ -4063,7 +4063,7 @@ dependencies = [ [[package]] name = "snarkvm-console-types-field" version = "1.0.0" -source = "git+https://github.com/AleoNet/snarkVM?rev=0b391d2#0b391d2a0fad8c1ee49b3fc64e6a64d4b801ad2d" +source = "git+https://github.com/AleoNet/snarkVM?rev=1de86e7#1de86e7d09b91dd1a78042053697dea80f600d87" dependencies = [ "snarkvm-console-network-environment", "snarkvm-console-types-boolean", @@ -4073,7 +4073,7 @@ dependencies = [ [[package]] name = "snarkvm-console-types-group" version = "1.0.0" -source = "git+https://github.com/AleoNet/snarkVM?rev=0b391d2#0b391d2a0fad8c1ee49b3fc64e6a64d4b801ad2d" +source = "git+https://github.com/AleoNet/snarkVM?rev=1de86e7#1de86e7d09b91dd1a78042053697dea80f600d87" dependencies = [ "snarkvm-console-network-environment", "snarkvm-console-types-boolean", @@ -4084,7 +4084,7 @@ dependencies = [ [[package]] name = "snarkvm-console-types-integers" version = "1.0.0" -source = "git+https://github.com/AleoNet/snarkVM?rev=0b391d2#0b391d2a0fad8c1ee49b3fc64e6a64d4b801ad2d" +source = "git+https://github.com/AleoNet/snarkVM?rev=1de86e7#1de86e7d09b91dd1a78042053697dea80f600d87" dependencies = [ "snarkvm-console-network-environment", "snarkvm-console-types-boolean", @@ -4095,7 +4095,7 @@ dependencies = [ [[package]] name = "snarkvm-console-types-scalar" version = "1.0.0" -source = "git+https://github.com/AleoNet/snarkVM?rev=0b391d2#0b391d2a0fad8c1ee49b3fc64e6a64d4b801ad2d" +source = "git+https://github.com/AleoNet/snarkVM?rev=1de86e7#1de86e7d09b91dd1a78042053697dea80f600d87" dependencies = [ "snarkvm-console-network-environment", "snarkvm-console-types-boolean", @@ -4106,7 +4106,7 @@ dependencies = [ [[package]] name = "snarkvm-console-types-string" version = "1.0.0" -source = "git+https://github.com/AleoNet/snarkVM?rev=0b391d2#0b391d2a0fad8c1ee49b3fc64e6a64d4b801ad2d" +source = "git+https://github.com/AleoNet/snarkVM?rev=1de86e7#1de86e7d09b91dd1a78042053697dea80f600d87" dependencies = [ "snarkvm-console-network-environment", "snarkvm-console-types-boolean", @@ -4117,7 +4117,7 @@ dependencies = [ [[package]] name = "snarkvm-curves" version = "1.0.0" -source = "git+https://github.com/AleoNet/snarkVM?rev=0b391d2#0b391d2a0fad8c1ee49b3fc64e6a64d4b801ad2d" +source = "git+https://github.com/AleoNet/snarkVM?rev=1de86e7#1de86e7d09b91dd1a78042053697dea80f600d87" dependencies = [ "rand", "rayon", @@ -4131,7 +4131,7 @@ dependencies = [ [[package]] name = "snarkvm-fields" version = "1.0.0" -source = "git+https://github.com/AleoNet/snarkVM?rev=0b391d2#0b391d2a0fad8c1ee49b3fc64e6a64d4b801ad2d" +source = "git+https://github.com/AleoNet/snarkVM?rev=1de86e7#1de86e7d09b91dd1a78042053697dea80f600d87" dependencies = [ "aleo-std", "anyhow", @@ -4148,7 +4148,7 @@ dependencies = [ [[package]] name = "snarkvm-ledger" version = "1.0.0" -source = "git+https://github.com/AleoNet/snarkVM?rev=0b391d2#0b391d2a0fad8c1ee49b3fc64e6a64d4b801ad2d" +source = "git+https://github.com/AleoNet/snarkVM?rev=1de86e7#1de86e7d09b91dd1a78042053697dea80f600d87" dependencies = [ "aleo-std", "anyhow", @@ -4172,7 +4172,7 @@ dependencies = [ [[package]] name = 
"snarkvm-ledger-authority" version = "1.0.0" -source = "git+https://github.com/AleoNet/snarkVM?rev=0b391d2#0b391d2a0fad8c1ee49b3fc64e6a64d4b801ad2d" +source = "git+https://github.com/AleoNet/snarkVM?rev=1de86e7#1de86e7d09b91dd1a78042053697dea80f600d87" dependencies = [ "anyhow", "rand", @@ -4184,7 +4184,7 @@ dependencies = [ [[package]] name = "snarkvm-ledger-block" version = "1.0.0" -source = "git+https://github.com/AleoNet/snarkVM?rev=0b391d2#0b391d2a0fad8c1ee49b3fc64e6a64d4b801ad2d" +source = "git+https://github.com/AleoNet/snarkVM?rev=1de86e7#1de86e7d09b91dd1a78042053697dea80f600d87" dependencies = [ "indexmap 2.6.0", "rayon", @@ -4204,7 +4204,7 @@ dependencies = [ [[package]] name = "snarkvm-ledger-committee" version = "1.0.0" -source = "git+https://github.com/AleoNet/snarkVM?rev=0b391d2#0b391d2a0fad8c1ee49b3fc64e6a64d4b801ad2d" +source = "git+https://github.com/AleoNet/snarkVM?rev=1de86e7#1de86e7d09b91dd1a78042053697dea80f600d87" dependencies = [ "indexmap 2.6.0", "rayon", @@ -4217,7 +4217,7 @@ dependencies = [ [[package]] name = "snarkvm-ledger-narwhal" version = "1.0.0" -source = "git+https://github.com/AleoNet/snarkVM?rev=0b391d2#0b391d2a0fad8c1ee49b3fc64e6a64d4b801ad2d" +source = "git+https://github.com/AleoNet/snarkVM?rev=1de86e7#1de86e7d09b91dd1a78042053697dea80f600d87" dependencies = [ "snarkvm-ledger-narwhal-batch-certificate", "snarkvm-ledger-narwhal-batch-header", @@ -4230,7 +4230,7 @@ dependencies = [ [[package]] name = "snarkvm-ledger-narwhal-batch-certificate" version = "1.0.0" -source = "git+https://github.com/AleoNet/snarkVM?rev=0b391d2#0b391d2a0fad8c1ee49b3fc64e6a64d4b801ad2d" +source = "git+https://github.com/AleoNet/snarkVM?rev=1de86e7#1de86e7d09b91dd1a78042053697dea80f600d87" dependencies = [ "indexmap 2.6.0", "rayon", @@ -4243,7 +4243,7 @@ dependencies = [ [[package]] name = "snarkvm-ledger-narwhal-batch-header" version = "1.0.0" -source = "git+https://github.com/AleoNet/snarkVM?rev=0b391d2#0b391d2a0fad8c1ee49b3fc64e6a64d4b801ad2d" +source = "git+https://github.com/AleoNet/snarkVM?rev=1de86e7#1de86e7d09b91dd1a78042053697dea80f600d87" dependencies = [ "indexmap 2.6.0", "rayon", @@ -4255,7 +4255,7 @@ dependencies = [ [[package]] name = "snarkvm-ledger-narwhal-data" version = "1.0.0" -source = "git+https://github.com/AleoNet/snarkVM?rev=0b391d2#0b391d2a0fad8c1ee49b3fc64e6a64d4b801ad2d" +source = "git+https://github.com/AleoNet/snarkVM?rev=1de86e7#1de86e7d09b91dd1a78042053697dea80f600d87" dependencies = [ "bytes", "serde_json", @@ -4266,7 +4266,7 @@ dependencies = [ [[package]] name = "snarkvm-ledger-narwhal-subdag" version = "1.0.0" -source = "git+https://github.com/AleoNet/snarkVM?rev=0b391d2#0b391d2a0fad8c1ee49b3fc64e6a64d4b801ad2d" +source = "git+https://github.com/AleoNet/snarkVM?rev=1de86e7#1de86e7d09b91dd1a78042053697dea80f600d87" dependencies = [ "indexmap 2.6.0", "rayon", @@ -4281,7 +4281,7 @@ dependencies = [ [[package]] name = "snarkvm-ledger-narwhal-transmission" version = "1.0.0" -source = "git+https://github.com/AleoNet/snarkVM?rev=0b391d2#0b391d2a0fad8c1ee49b3fc64e6a64d4b801ad2d" +source = "git+https://github.com/AleoNet/snarkVM?rev=1de86e7#1de86e7d09b91dd1a78042053697dea80f600d87" dependencies = [ "bytes", "serde_json", @@ -4294,7 +4294,7 @@ dependencies = [ [[package]] name = "snarkvm-ledger-narwhal-transmission-id" version = "1.0.0" -source = "git+https://github.com/AleoNet/snarkVM?rev=0b391d2#0b391d2a0fad8c1ee49b3fc64e6a64d4b801ad2d" +source = "git+https://github.com/AleoNet/snarkVM?rev=1de86e7#1de86e7d09b91dd1a78042053697dea80f600d87" dependencies 
= [ "snarkvm-console", "snarkvm-ledger-puzzle", @@ -4303,7 +4303,7 @@ dependencies = [ [[package]] name = "snarkvm-ledger-puzzle" version = "1.0.0" -source = "git+https://github.com/AleoNet/snarkVM?rev=0b391d2#0b391d2a0fad8c1ee49b3fc64e6a64d4b801ad2d" +source = "git+https://github.com/AleoNet/snarkVM?rev=1de86e7#1de86e7d09b91dd1a78042053697dea80f600d87" dependencies = [ "aleo-std", "anyhow", @@ -4323,7 +4323,7 @@ dependencies = [ [[package]] name = "snarkvm-ledger-puzzle-epoch" version = "1.0.0" -source = "git+https://github.com/AleoNet/snarkVM?rev=0b391d2#0b391d2a0fad8c1ee49b3fc64e6a64d4b801ad2d" +source = "git+https://github.com/AleoNet/snarkVM?rev=1de86e7#1de86e7d09b91dd1a78042053697dea80f600d87" dependencies = [ "aleo-std", "anyhow", @@ -4344,7 +4344,7 @@ dependencies = [ [[package]] name = "snarkvm-ledger-query" version = "1.0.0" -source = "git+https://github.com/AleoNet/snarkVM?rev=0b391d2#0b391d2a0fad8c1ee49b3fc64e6a64d4b801ad2d" +source = "git+https://github.com/AleoNet/snarkVM?rev=1de86e7#1de86e7d09b91dd1a78042053697dea80f600d87" dependencies = [ "async-trait", "reqwest 0.11.27", @@ -4357,7 +4357,7 @@ dependencies = [ [[package]] name = "snarkvm-ledger-store" version = "1.0.0" -source = "git+https://github.com/AleoNet/snarkVM?rev=0b391d2#0b391d2a0fad8c1ee49b3fc64e6a64d4b801ad2d" +source = "git+https://github.com/AleoNet/snarkVM?rev=1de86e7#1de86e7d09b91dd1a78042053697dea80f600d87" dependencies = [ "aleo-std-storage", "anyhow", @@ -4384,7 +4384,7 @@ dependencies = [ [[package]] name = "snarkvm-metrics" version = "1.0.0" -source = "git+https://github.com/AleoNet/snarkVM?rev=0b391d2#0b391d2a0fad8c1ee49b3fc64e6a64d4b801ad2d" +source = "git+https://github.com/AleoNet/snarkVM?rev=1de86e7#1de86e7d09b91dd1a78042053697dea80f600d87" dependencies = [ "metrics", "metrics-exporter-prometheus", @@ -4393,7 +4393,7 @@ dependencies = [ [[package]] name = "snarkvm-parameters" version = "1.0.0" -source = "git+https://github.com/AleoNet/snarkVM?rev=0b391d2#0b391d2a0fad8c1ee49b3fc64e6a64d4b801ad2d" +source = "git+https://github.com/AleoNet/snarkVM?rev=1de86e7#1de86e7d09b91dd1a78042053697dea80f600d87" dependencies = [ "aleo-std", "anyhow", @@ -4418,7 +4418,7 @@ dependencies = [ [[package]] name = "snarkvm-synthesizer" version = "1.0.0" -source = "git+https://github.com/AleoNet/snarkVM?rev=0b391d2#0b391d2a0fad8c1ee49b3fc64e6a64d4b801ad2d" +source = "git+https://github.com/AleoNet/snarkVM?rev=1de86e7#1de86e7d09b91dd1a78042053697dea80f600d87" dependencies = [ "aleo-std", "anyhow", @@ -4449,12 +4449,11 @@ dependencies = [ [[package]] name = "snarkvm-synthesizer-process" version = "1.0.0" -source = "git+https://github.com/AleoNet/snarkVM?rev=0b391d2#0b391d2a0fad8c1ee49b3fc64e6a64d4b801ad2d" +source = "git+https://github.com/AleoNet/snarkVM?rev=1de86e7#1de86e7d09b91dd1a78042053697dea80f600d87" dependencies = [ "aleo-std", "colored", "indexmap 2.6.0", - "lru", "once_cell", "parking_lot 0.12.3", "rand", @@ -4469,13 +4468,12 @@ dependencies = [ "snarkvm-synthesizer-program", "snarkvm-synthesizer-snark", "snarkvm-utilities", - "tracing", ] [[package]] name = "snarkvm-synthesizer-program" version = "1.0.0" -source = "git+https://github.com/AleoNet/snarkVM?rev=0b391d2#0b391d2a0fad8c1ee49b3fc64e6a64d4b801ad2d" +source = "git+https://github.com/AleoNet/snarkVM?rev=1de86e7#1de86e7d09b91dd1a78042053697dea80f600d87" dependencies = [ "indexmap 2.6.0", "paste", @@ -4489,7 +4487,7 @@ dependencies = [ [[package]] name = "snarkvm-synthesizer-snark" version = "1.0.0" -source = 
"git+https://github.com/AleoNet/snarkVM?rev=0b391d2#0b391d2a0fad8c1ee49b3fc64e6a64d4b801ad2d" +source = "git+https://github.com/AleoNet/snarkVM?rev=1de86e7#1de86e7d09b91dd1a78042053697dea80f600d87" dependencies = [ "bincode", "once_cell", @@ -4502,7 +4500,7 @@ dependencies = [ [[package]] name = "snarkvm-utilities" version = "1.0.0" -source = "git+https://github.com/AleoNet/snarkVM?rev=0b391d2#0b391d2a0fad8c1ee49b3fc64e6a64d4b801ad2d" +source = "git+https://github.com/AleoNet/snarkVM?rev=1de86e7#1de86e7d09b91dd1a78042053697dea80f600d87" dependencies = [ "aleo-std", "anyhow", @@ -4523,7 +4521,7 @@ dependencies = [ [[package]] name = "snarkvm-utilities-derives" version = "1.0.0" -source = "git+https://github.com/AleoNet/snarkVM?rev=0b391d2#0b391d2a0fad8c1ee49b3fc64e6a64d4b801ad2d" +source = "git+https://github.com/AleoNet/snarkVM?rev=1de86e7#1de86e7d09b91dd1a78042053697dea80f600d87" dependencies = [ "proc-macro2", "quote 1.0.37", diff --git a/Cargo.toml b/Cargo.toml index 43ca943d..17527a73 100644 --- a/Cargo.toml +++ b/Cargo.toml @@ -129,9 +129,9 @@ snops-common = { path = "./crates/common" } # snarkos-node-metrics = { version = "3.0" } # snarkvm = { version = "1.0", features = ["rocks"] } -snarkos-account = { git = "https://github.com/AleoNet/snarkOS", rev = "6cce476" } -snarkos-node = { git = "https://github.com/AleoNet/snarkOS", rev = "6cce476" } -snarkos-node-metrics = { git = "https://github.com/AleoNet/snarkOS", rev = "6cce476" } -snarkvm = { git = "https://github.com/AleoNet/snarkVM", rev = "0b391d2", default-features = false, features = [ +snarkos-account = { git = "https://github.com/AleoNet/snarkOS", rev = "ba41197" } +snarkos-node = { git = "https://github.com/AleoNet/snarkOS", rev = "ba41197" } +snarkos-node-metrics = { git = "https://github.com/AleoNet/snarkOS", rev = "ba41197" } +snarkvm = { git = "https://github.com/AleoNet/snarkVM", rev = "1de86e7", default-features = false, features = [ "rocks", ] } diff --git a/crates/aot/src/auth/auth_deploy.rs b/crates/aot/src/auth/auth_deploy.rs index bb0e3592..f8e836aa 100644 --- a/crates/aot/src/auth/auth_deploy.rs +++ b/crates/aot/src/auth/auth_deploy.rs @@ -35,7 +35,7 @@ impl AuthorizeDeploy { pub fn parse(self) -> Result> { // get the program from the file (or stdin) let program = self.options.program.clone().contents()?; - let mut process = Process::load_no_storage()?; + let mut process = Process::load()?; query::get_process_imports(&mut process, &program, self.options.query.as_deref())?; let deployment = diff --git a/crates/aot/src/auth/auth_fee.rs b/crates/aot/src/auth/auth_fee.rs index 794c3c2e..d7260a61 100644 --- a/crates/aot/src/auth/auth_fee.rs +++ b/crates/aot/src/auth/auth_fee.rs @@ -59,7 +59,7 @@ impl AuthorizeFee { let (id, base_fee) = match (self.auth, self.deployment, self.id, self.cost) { (Some(auth), None, None, None) => { let auth = auth.into_inner(); - let mut process = Process::load_no_storage()?; + let mut process = Process::load()?; if let Some(query) = self.query.as_deref() { let programs = query::get_programs_from_auth(&auth); query::add_many_programs_to_process(&mut process, programs, query)?; @@ -181,7 +181,7 @@ pub fn estimate_cost(process: &Process, func: &Authorization) // Retrieve the function name, program id, and program. let function_name = *transition.function_name(); let stack = process.get_stack(transition.program_id())?; - let cost = cost_in_microcredits_v2(&stack, &function_name)?; + let cost = cost_in_microcredits_v2(stack, &function_name)?; // Accumulate the finalize cost. 
if let Some(cost) = finalize_cost.checked_add(cost) { diff --git a/crates/aot/src/auth/auth_program.rs b/crates/aot/src/auth/auth_program.rs index 014787ee..7b941ebd 100644 --- a/crates/aot/src/auth/auth_program.rs +++ b/crates/aot/src/auth/auth_program.rs @@ -33,7 +33,7 @@ impl AuthorizeProgram { pub fn parse(self) -> Result<(Authorization, u64)> { let private_key = self.key.try_get()?; - let mut process = Process::load_no_storage()?; + let mut process = Process::load()?; match (self.options.query, self.options.locator.program_id()) { (_, id) if *id == N::credits() => {} (None, id) => { diff --git a/crates/aot/src/auth/mod.rs b/crates/aot/src/auth/mod.rs index e9e8a7bd..51ea6707 100644 --- a/crates/aot/src/auth/mod.rs +++ b/crates/aot/src/auth/mod.rs @@ -126,7 +126,7 @@ impl AuthCommand { // load the programs the auth references into the process // as cost estimation measures the size of values from within the auth's // transitions - let mut process = Process::load_no_storage()?; + let mut process = Process::load()?; if let Some(query) = query.as_deref() { let programs = query::get_programs_from_auth(&auth); query::add_many_programs_to_process(&mut process, programs, query)?; diff --git a/crates/aot/src/lib.rs b/crates/aot/src/lib.rs index 9dfee0a3..e8fc71b9 100644 --- a/crates/aot/src/lib.rs +++ b/crates/aot/src/lib.rs @@ -92,7 +92,7 @@ macro_rules! network_to_circuit { fn process<'a>() -> &'a Process<$net_name> { static PROCESS: OnceLock> = OnceLock::new(); - PROCESS.get_or_init(|| Process::load_no_storage().unwrap()) + PROCESS.get_or_init(|| Process::load().unwrap()) } fn credits() -> ProgramID<$net_name> { diff --git a/crates/aot/src/program/cost.rs b/crates/aot/src/program/cost.rs index bb20e416..60c86cef 100644 --- a/crates/aot/src/program/cost.rs +++ b/crates/aot/src/program/cost.rs @@ -37,7 +37,7 @@ impl CostCommand { } = self; let program = program.contents()?; - let mut process = Process::load_no_storage()?; + let mut process = Process::load()?; query::get_process_imports(&mut process, &program, query.as_deref())?; if let Some(function) = function { From 4ec39e8166686c9a440ec6cad08b44832eda7569 Mon Sep 17 00:00:00 2001 From: Meshiest Date: Mon, 25 Nov 2024 01:06:03 -0500 Subject: [PATCH 17/68] feat(db): add a generic DbRecords to support arbitrary dataformat values --- crates/agent/src/db.rs | 128 +++++++------------ crates/common/src/db/tree.rs | 45 +++++++ crates/common/src/format/impl_collections.rs | 57 +++++++-- crates/common/src/format/mod.rs | 10 +- 4 files changed, 143 insertions(+), 97 deletions(-) diff --git a/crates/agent/src/db.rs b/crates/agent/src/db.rs index f45b2f7a..01354580 100644 --- a/crates/agent/src/db.rs +++ b/crates/agent/src/db.rs @@ -9,7 +9,11 @@ use bytes::Buf; use indexmap::IndexMap; use snops_common::{ api::EnvInfo, - db::{error::DatabaseError, tree::DbTree, Database as DatabaseTrait}, + db::{ + error::DatabaseError, + tree::{DbRecords, DbTree}, + Database as DatabaseTrait, + }, format::{self, read_dataformat, DataFormat, DataReadError, DataWriteError, PackedUint}, state::{AgentId, AgentState, EnvId, HeightRequest}, }; @@ -21,21 +25,21 @@ use crate::reconcile::agent::EnvState; #[repr(u8)] pub enum AgentDbString { /// JSON web token of agent. - Jwt, + Jwt = 0, /// Process ID of node. Used to keep track of zombie node processes. - NodePid, + NodePid = 1, // Url to Loki instance, configured by the endpoint. - LokiUrl, + LokiUrl = 2, /// Current state of the agent. - AgentState, + AgentState = 3, /// Current environment state. 
- EnvState, + EnvState = 4, /// Latest stored environment info. - EnvInfo, + EnvInfo = 5, /// Agent addresses resolved by the controlplane. - ResolvedAddrs, + ResolvedAddrs = 6, /// Last height of the agent state - LastHeight, + LastHeight = 7, } impl DataFormat for AgentDbString { @@ -57,6 +61,9 @@ impl DataFormat for AgentDbString { 2 => Self::LokiUrl, 3 => Self::AgentState, 4 => Self::EnvInfo, + 5 => Self::EnvState, + 6 => Self::ResolvedAddrs, + 7 => Self::LastHeight, _ => return Err(DataReadError::custom("invalid agent DB string type")), }) } @@ -72,14 +79,14 @@ pub struct Database { pub jwt_mutex: Mutex>, pub strings: DbTree, - pub documents: DbTree, + pub documents: DbRecords, } impl DatabaseTrait for Database { fn open(path: &Path) -> Result { let db = sled::open(path)?; let strings = DbTree::new(db.open_tree(b"v1/strings")?); - let documents = DbTree::new(db.open_tree(b"v1/documents")?); + let documents = DbRecords::new(db.open_tree(b"v1/documents")?); let jwt_mutex = Mutex::new(strings.restore(&AgentDbString::Jwt)?); Ok(Self { @@ -118,116 +125,69 @@ impl Database { pub fn env_info(&self) -> Result)>, DatabaseError> { self.documents - .restore(&AgentDbString::EnvInfo)? - .map(|format::BinaryData(bytes)| read_dataformat(&mut bytes.reader())) - .transpose() + .restore(&AgentDbString::EnvInfo) .map_err(DatabaseError::from) } pub fn set_env_info(&self, info: Option<(EnvId, Arc)>) -> Result<(), DatabaseError> { - if let Some(info) = info { - self.documents.save( - &AgentDbString::EnvInfo, - &format::BinaryData(info.to_byte_vec()?), - ) - } else { - self.documents.delete(&AgentDbString::EnvInfo).map(|_| ()) - } + self.documents + .save_option(&AgentDbString::EnvInfo, info.as_ref()) } pub fn agent_state(&self) -> Result { - Ok( - if let Some(format::BinaryData(bytes)) = - self.documents.restore(&AgentDbString::AgentState)? - { - read_dataformat(&mut bytes.reader())? - } else { - AgentState::default() - }, - ) + Ok(self + .documents + .restore(&AgentDbString::AgentState)? + .unwrap_or_default()) } pub fn set_agent_state(&self, state: &AgentState) -> Result<(), DatabaseError> { - self.documents.save( - &AgentDbString::AgentState, - &format::BinaryData(state.to_byte_vec()?), - ) + self.documents.save(&AgentDbString::AgentState, state) } pub fn resolved_addrs(&self) -> Result, DatabaseError> { - Ok( - if let Some(format::BinaryData(bytes)) = - self.documents.restore(&AgentDbString::ResolvedAddrs)? - { - read_dataformat(&mut bytes.reader())? - } else { - IndexMap::new() - }, - ) + Ok(self + .documents + .restore(&AgentDbString::ResolvedAddrs)? + .unwrap_or_default()) } pub fn set_resolved_addrs( &self, addrs: Option<&IndexMap>, ) -> Result<(), DatabaseError> { - if let Some(addrs) = addrs { - self.documents.save( - &AgentDbString::ResolvedAddrs, - &format::BinaryData(addrs.to_byte_vec()?), - ) - } else { - self.documents - .delete(&AgentDbString::ResolvedAddrs) - .map(|_| ()) - } + self.documents + .save_option(&AgentDbString::ResolvedAddrs, addrs) } pub fn env_state(&self) -> Result, DatabaseError> { Ok(self .documents .restore(&AgentDbString::EnvState)? - .map(|format::BinaryData(bytes)| read_dataformat(&mut bytes.reader())) + .map(|format::BytesFormat(bytes)| read_dataformat(&mut bytes.reader())) .transpose()?) 
} pub fn set_env_state(&self, state: Option<&EnvState>) -> Result<(), DatabaseError> { - if let Some(state) = state { - self.documents.save( - &AgentDbString::EnvState, - &format::BinaryData(state.to_byte_vec()?), - ) - } else { - self.documents.delete(&AgentDbString::EnvState).map(|_| ()) - } + self.documents.save_option(&AgentDbString::EnvState, state) } pub fn last_height(&self) -> Result, DatabaseError> { - Ok( - if let Some(format::BinaryData(bytes)) = - self.documents.restore(&AgentDbString::LastHeight)? - { - let (counter, req) = - read_dataformat::<_, (PackedUint, HeightRequest)>(&mut bytes.reader())?; - Some((counter.into(), req)) - } else { - None - }, - ) + Ok(self + .documents + .restore::<(PackedUint, HeightRequest)>(&AgentDbString::LastHeight)? + .map(|(counter, req)| (counter.into(), req))) } pub fn set_last_height( &self, height: Option<(usize, HeightRequest)>, ) -> Result<(), DatabaseError> { - if let Some((counter, req)) = height { - self.documents.save( - &AgentDbString::LastHeight, - &format::BinaryData((PackedUint::from(counter), req).to_byte_vec()?), - ) - } else { - self.documents - .delete(&AgentDbString::LastHeight) - .map(|_| ()) - } + self.documents.save_option( + &AgentDbString::LastHeight, + height + .map(|(counter, req)| (PackedUint::from(counter), req)) + .as_ref(), + ) } } diff --git a/crates/common/src/db/tree.rs b/crates/common/src/db/tree.rs index 00df8184..113742b2 100644 --- a/crates/common/src/db/tree.rs +++ b/crates/common/src/db/tree.rs @@ -143,3 +143,48 @@ impl DbTree { .sum()) } } + +pub struct DbRecords { + tree: sled::Tree, + _phantom: std::marker::PhantomData, +} + +impl DbRecords { + pub fn new(tree: sled::Tree) -> Self { + Self { + tree, + _phantom: std::marker::PhantomData, + } + } + + pub fn restore(&self, key: &K) -> Result, DatabaseError> { + Ok(self + .tree + .get(key.to_byte_vec()?)? + .map(|value_bytes| read_dataformat(&mut value_bytes.reader())) + .transpose()?) 
+ } + + pub fn save(&self, key: &K, value: &V) -> Result<(), DatabaseError> { + let key_bytes = key.to_byte_vec()?; + let mut value_bytes = Vec::new(); + write_dataformat(&mut value_bytes, value)?; + self.tree.insert(key_bytes, value_bytes)?; + Ok(()) + } + + pub fn save_option( + &self, + key: &K, + value: Option<&V>, + ) -> Result<(), DatabaseError> { + match value { + Some(value) => self.save(key, value), + None => self.delete(key).map(|_| ()), + } + } + + pub fn delete(&self, key: &K) -> Result { + Ok(self.tree.remove(key.to_byte_vec()?)?.is_some()) + } +} diff --git a/crates/common/src/format/impl_collections.rs b/crates/common/src/format/impl_collections.rs index bef6fceb..0ba87e34 100644 --- a/crates/common/src/format/impl_collections.rs +++ b/crates/common/src/format/impl_collections.rs @@ -10,37 +10,62 @@ use super::{ DataWriteError, }; -#[derive(Debug, Clone)] -pub struct BinaryData(pub Vec); -impl From> for BinaryData { +/// BytesFormat is a simple wrapper around a Vec that implements DataFormat +#[derive(Debug, Clone, Eq, PartialEq)] +pub struct BytesFormat(pub Vec); +impl From> for BytesFormat { fn from(data: Vec) -> Self { Self(data) } } -impl From for Vec { - fn from(data: BinaryData) -> Self { +impl From for Vec { + fn from(data: BytesFormat) -> Self { data.0 } } -impl DataFormat for BinaryData { +impl DataFormat for BytesFormat { type Header = (); const LATEST_HEADER: Self::Header = (); fn write_data(&self, writer: &mut W) -> Result { - let written = PackedUint::from(self.0.len()).write_data(writer)?; - writer.write_all(&self.0)?; - Ok(written + self.0.len()) + Ok(PackedUint::from(self.0.len()).write_data(writer)? + writer.write(&self.0)?) } fn read_data(reader: &mut R, _header: &Self::Header) -> Result { - let len = usize::from(PackedUint::read_data(reader, &())?); - let mut data = Vec::with_capacity(len); + let mut data = vec![0; usize::from(PackedUint::read_data(reader, &())?)]; reader.read_exact(&mut data)?; Ok(Self(data)) } } +/// EncodedFormat is a simple wrapper around a DataFormat to encode header data +/// with the data +#[derive(Debug, Clone)] +pub struct EncodedFormat(pub F); + +impl PartialEq for EncodedFormat { + fn eq(&self, other: &Self) -> bool { + self.0 == other.0 + } +} + +impl Eq for EncodedFormat {} + +impl DataFormat for EncodedFormat { + type Header = (); + const LATEST_HEADER: Self::Header = (); + + fn write_data(&self, writer: &mut W) -> Result { + Ok(self.write_header(writer)? + self.write_data(writer)?) + } + + fn read_data(reader: &mut R, _header: &Self::Header) -> Result { + let header = F::read_header(reader)?; + Ok(Self(F::read_data(reader, &header)?)) + } +} + impl DataFormat for [T; N] { type Header = T::Header; const LATEST_HEADER: Self::Header = T::LATEST_HEADER; @@ -162,7 +187,7 @@ impl_map!(IndexMap); #[cfg(test)] #[rustfmt::skip] mod test { - use crate::format::DataFormat; + use crate::format::{BytesFormat, DataFormat}; macro_rules! 
case { ($name:ident, $ty:ty, $a:expr, $b:expr) => { @@ -213,4 +238,12 @@ mod test { 3, 0, 4, 0 ]); + + // binary data test + case!(test_binary_data, BytesFormat, BytesFormat(vec![1, 2, 3]), [ + 1, 3, + 1, + 2, + 3 + ]); } diff --git a/crates/common/src/format/mod.rs b/crates/common/src/format/mod.rs index 7078fcd1..ed079221 100644 --- a/crates/common/src/format/mod.rs +++ b/crates/common/src/format/mod.rs @@ -14,7 +14,7 @@ mod impl_strings; mod impl_tuples; mod packed_int; -pub use impl_collections::BinaryData; +pub use impl_collections::{BytesFormat, EncodedFormat}; pub use packed_int::*; use thiserror::Error; @@ -109,6 +109,14 @@ pub trait DataFormat: Sized { self.write_data(&mut buf)?; Ok(buf) } + + /// Convert the data to a bytevec and include the header + fn to_byte_vec_headered(&self) -> Result, DataWriteError> { + let mut buf = Vec::new(); + self.write_header(&mut buf)?; + self.write_data(&mut buf)?; + Ok(buf) + } } pub trait DataFormatWriter { From 7ec0f1f3cbf2062281e94f07668c4d8669876094 Mon Sep 17 00:00:00 2001 From: Meshiest Date: Mon, 25 Nov 2024 03:07:29 -0500 Subject: [PATCH 18/68] fix(agent): fix various reconcile errors --- crates/agent/src/api.rs | 60 ++++++++++++------- crates/agent/src/net.rs | 6 +- crates/agent/src/reconcile/agent.rs | 86 +++++++++++++++------------ crates/agent/src/reconcile/files.rs | 76 +++++++++++++++++++---- crates/agent/src/reconcile/process.rs | 2 +- crates/agent/src/reconcile/storage.rs | 36 +++++++---- crates/agent/src/rpc/control.rs | 4 +- crates/agent/src/server.rs | 4 +- crates/agent/src/transfers.rs | 3 + crates/common/src/rpc/error.rs | 2 + 10 files changed, 186 insertions(+), 93 deletions(-) diff --git a/crates/agent/src/api.rs b/crates/agent/src/api.rs index 0193cdd3..0e42e8e2 100644 --- a/crates/agent/src/api.rs +++ b/crates/agent/src/api.rs @@ -117,7 +117,7 @@ pub async fn check_binary( // this also checks for sha256 differences, along with last modified time // against the target - if !should_download_file( + if !get_file_issues( &client, &source_url, path, @@ -126,6 +126,7 @@ pub async fn check_binary( false, ) .await + .map(|e| e.is_none()) .unwrap_or(true) { // check permissions and ensure 0o755 @@ -176,49 +177,62 @@ pub async fn check_binary( Ok(()) } -pub async fn should_download_file( +#[derive(Debug)] +pub enum BadFileReason { + /// File is missing + NotFound, + /// File size mismatch + Size, + /// SHA256 mismatch + Sha256, + /// A new version is available based on modified header + Stale, +} + +pub async fn get_file_issues( client: &reqwest::Client, - loc: &str, - path: &Path, + src: &str, + dst: &Path, size: Option, sha256: Option<&str>, offline: bool, -) -> Result { - if !path.exists() { - return Ok(true); +) -> Result, ReconcileError2> { + if !dst.try_exists().unwrap_or(false) { + return Ok(Some(BadFileReason::NotFound)); } - let meta = tokio::fs::metadata(&path) + let meta = tokio::fs::metadata(&dst) .await - .map_err(|e| ReconcileError2::FileStatError(path.to_path_buf(), e.to_string()))?; + .map_err(|e| ReconcileError2::FileStatError(dst.to_path_buf(), e.to_string()))?; let local_content_length = meta.len(); // if the binary entry is provided, check if the file size and sha256 match // file size is incorrect if size.is_some_and(|s| s != local_content_length) { - return Ok(true); + return Ok(Some(BadFileReason::Size)); } // if sha256 is present, only download if the sha256 is different if let Some(sha256) = sha256 { - return Ok(sha256_file(&path.to_path_buf()) - .map_err(|e| 
ReconcileError2::FileReadError(path.to_path_buf(), e.to_string()))? - != sha256.to_ascii_lowercase()); + let bad_sha256 = sha256_file(&dst.to_path_buf()) + .map_err(|e| ReconcileError2::FileReadError(dst.to_path_buf(), e.to_string()))? + != sha256.to_ascii_lowercase(); + return Ok(bad_sha256.then_some(BadFileReason::Sha256)); } // if we're offline, don't download if offline { - return Ok(false); + return Ok(None); } // check last modified let res = client - .head(loc) + .head(src) .send() .await .map_err(|e| ReconcileError2::HttpError { method: String::from("HEAD"), - url: loc.to_owned(), + url: src.to_owned(), error: e.to_string(), })?; @@ -228,7 +242,7 @@ pub async fn should_download_file( // parse as a string .and_then(|e| e.to_str().ok()) else { - return Ok(true); + return Ok(Some(BadFileReason::Stale)); }; let Some(remote_content_length) = res @@ -237,16 +251,18 @@ pub async fn should_download_file( // parse the header as a u64 .and_then(|e| e.to_str().ok().and_then(|s| s.parse::().ok())) else { - return Ok(true); + return Ok(Some(BadFileReason::Size)); }; let remote_last_modified = httpdate::parse_http_date(last_modified_header); let local_last_modified = meta .modified() - .map_err(|e| ReconcileError2::FileStatError(path.to_path_buf(), e.to_string()))?; + .map_err(|e| ReconcileError2::FileStatError(dst.to_path_buf(), e.to_string()))?; - Ok(remote_last_modified + let is_stale = remote_last_modified .map(|res| res > local_last_modified) - .unwrap_or(true) - || remote_content_length != local_content_length) + .unwrap_or(true); + Ok(is_stale + .then_some(BadFileReason::Stale) + .or_else(|| (remote_content_length != local_content_length).then_some(BadFileReason::Size))) } diff --git a/crates/agent/src/net.rs b/crates/agent/src/net.rs index fb931d7a..8c18e196 100644 --- a/crates/agent/src/net.rs +++ b/crates/agent/src/net.rs @@ -12,7 +12,7 @@ pub fn get_internal_addrs() -> Result> { // loopback addresses can be used when the networks are calculated // to be the same, but they are not useful for peer to peer comms if ip.is_loopback() { - info!("skipping loopback iface {name}: {ip:?}"); + info!("Skipping loopback iface {name}: {ip:?}"); return None; } @@ -21,11 +21,11 @@ pub fn get_internal_addrs() -> Result> { // these addrs are about as useful as their v4 counterpart if let IpAddr::V6(v6) = ip { if (v6.segments()[0] & 0xffc0) == 0xfe80 { - info!("skipping link-local iface {name}: {ip:?}"); + info!("Skipping link-local iface {name}: {ip:?}"); return None; } } - info!("using iface {name}: {ip:?}"); + info!("Using iface {name}: {ip:?}"); Some(ip) }) .collect()) diff --git a/crates/agent/src/reconcile/agent.rs b/crates/agent/src/reconcile/agent.rs index 33a49878..f66fa0ec 100644 --- a/crates/agent/src/reconcile/agent.rs +++ b/crates/agent/src/reconcile/agent.rs @@ -19,6 +19,7 @@ use tokio::{ select, sync::{mpsc::Receiver, Mutex}, task::AbortHandle, + time::sleep_until, }; use tracing::{error, info, trace, warn}; @@ -26,7 +27,7 @@ use super::{ command::NodeCommand, process::ProcessContext, storage::{BinaryReconciler, GenesisReconciler, LedgerModifyResult, StorageVersionReconciler}, - DirectoryReconciler, Reconcile, ReconcileStatus, + Reconcile, ReconcileStatus, }; use crate::{ db::Database, @@ -137,17 +138,19 @@ impl AgentStateReconciler { // The first reconcile is scheduled for 5 seconds after startup. // Connecting to the controlplane will likely trigger a reconcile sooner. 
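The loop that follows implements a movable deadline: the task sleeps until next_reconcile_at, but any message on the reconcile-request channel can pull that deadline earlier, never later. A minimal, self-contained sketch of the same pattern, assuming only tokio's select!, sleep_until, and an mpsc receiver; the function name wait_for_next_run is illustrative and not part of this patch:

    use std::time::Instant;
    use tokio::{select, sync::mpsc::Receiver, time::sleep_until};

    // Wait until `deadline`, letting messages on `requests` move it earlier.
    async fn wait_for_next_run(requests: &mut Receiver<Instant>, mut deadline: Instant) {
        loop {
            select! {
                // A request may only move the deadline up, never push it back.
                Some(at) = requests.recv() => deadline = deadline.min(at),
                _ = sleep_until(deadline.into()) => break,
            }
        }
    }
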
let mut next_reconcile_at = Instant::now() + Duration::from_secs(5); - let mut wait = Box::pin(tokio::time::sleep_until(next_reconcile_at.into())); loop { - // Await for the next reconcile, allowing for it to be moved up sooner - select! { - // Replace the next_reconcile_at with the soonest reconcile time - Some(new_reconcile_at) = reconcile_requests.recv() => { - next_reconcile_at = next_reconcile_at.min(new_reconcile_at); - wait = Box::pin(tokio::time::sleep_until(next_reconcile_at.into())); - }, - _ = &mut wait => {} + loop { + // Await for the next reconcile, allowing for it to be moved up sooner + select! { + // Replace the next_reconcile_at with the soonest reconcile time + Some(new_reconcile_at) = reconcile_requests.recv() => { + next_reconcile_at = next_reconcile_at.min(new_reconcile_at); + }, + _ = sleep_until(next_reconcile_at.into()) => { + break + } + } } // Drain the reconcile request queue @@ -160,16 +163,17 @@ impl AgentStateReconciler { // This prevents the agent state from changing during reconciliation self.agent_state = self.state.agent_state.read().await.deref().clone(); - trace!("reconciling agent state..."); + trace!("Reconciling agent state..."); match self.reconcile().await { Ok(status) => { if status.inner.is_some() { - trace!("reconcile completed"); + trace!("Reconcile completed"); } if !status.conditions.is_empty() { - trace!("reconcile conditions: {:?}", status.conditions); + trace!("Reconcile conditions: {:?}", status.conditions); } if let Some(requeue_after) = status.requeue_after { + trace!("Requeueing after {requeue_after:?}"); next_reconcile_at = Instant::now() + requeue_after; } } @@ -193,9 +197,9 @@ macro_rules! reconcile { reconcile!($id, $e, res => {}) }; ($id:ident, $e:expr, $v:ident => $rest:expr) => { - let $v = $e.reconcile().await?; if $v.is_requeue() { + trace!("Requeue needed for {} ({:?}) {:?}", stringify!($id), $v.scopes, $v.conditions); return Ok($v.add_scope(concat!(stringify!($id), "/requeue"))); } $rest @@ -269,20 +273,39 @@ impl Reconcile<(), ReconcileError2> for AgentStateReconciler { return Ok(ReconcileStatus::default().add_scope("agent_state/offline")); } + let node_arc = Arc::new(*node.clone()); + // Reconcile behavior while the node is running... if let Some(process) = self.context.process.as_ref() { // If the process has exited, clear the process context if !process.is_running() { - info!("node process has exited..."); + info!("Node process has exited..."); self.context.process = None; } else { + // Accumulate all the fields that are used to derive the command that starts + // the node. + let command = NodeCommand::new( + Arc::clone(&self.state), + node_arc, + *env_id, + Arc::clone(&env_info), + ) + .await?; + + // If the command has changed, restart the process + if process.command != command { + info!("Node command has changed, restarting process..."); + self.context.shutdown_pending = true; + return Ok(ReconcileStatus::empty() + .add_scope("agent_state/command_changed") + .requeue_after(Duration::ZERO)); + } + // Prevent other reconcilers from running while the node is running return Ok(ReconcileStatus::default().add_scope("agent_state/running")); } } - let node_arc = Arc::new(*node.clone()); - // Initialize the transfers context with the current status if self.context.transfers.is_none() { // TODO: write this to the db @@ -305,13 +328,10 @@ impl Reconcile<(), ReconcileError2> for AgentStateReconciler { // Ensure the storage version is correct, deleting the storage path // the version changes. 
reconcile!( - storage, + storage_version, StorageVersionReconciler(&storage_path, env_info.storage.version) ); - // Create the storage path if it does not exist - reconcile!(dir, DirectoryReconciler(&storage_path)); - // Resolve the genesis block reconcile!( genesis, @@ -357,27 +377,19 @@ impl Reconcile<(), ReconcileError2> for AgentStateReconciler { } ); - // Accumulate all the fields that are used to derive the command that starts - // the node. - // This will be used to determine if the command has changed at all. - let command = NodeCommand::new( - Arc::clone(&self.state), - node_arc, - *env_id, - Arc::clone(&env_info), - ) - .await?; - - if self.context.process.as_ref().map(|p| &p.command) != Some(&command) { - // TODO: OK to restart the node -- command has changed - } - - // TODO: spawn the command, manage its state, check that it's up // TODO: if possible, use the NodeCommand as configuration for a node service to // allow running the node outside of the agent if self.context.process.is_none() { info!("Starting node process"); + let command = NodeCommand::new( + Arc::clone(&self.state), + node_arc, + *env_id, + Arc::clone(&env_info), + ) + .await?; + let process = ProcessContext::new(command)?; self.context.process = Some(process); } diff --git a/crates/agent/src/reconcile/files.rs b/crates/agent/src/reconcile/files.rs index b17db0f5..51911a11 100644 --- a/crates/agent/src/reconcile/files.rs +++ b/crates/agent/src/reconcile/files.rs @@ -13,12 +13,12 @@ use snops_common::{ rpc::error::ReconcileError2, state::{NetworkId, StorageId, TransferId, TransferStatusUpdate}, }; -use tracing::error; +use tracing::{error, trace, warn}; use url::Url; use super::{Reconcile, ReconcileCondition, ReconcileStatus}; use crate::{ - api::{download_file, should_download_file}, + api::{download_file, get_file_issues}, state::GlobalState, transfers, }; @@ -131,7 +131,7 @@ impl Reconcile for FileReconciler { let tx_id = self.tx_id.unwrap(); // transfer is pending - match self.state.transfers.entry(tx_id) { + let is_complete = match self.state.transfers.entry(tx_id) { dashmap::Entry::Occupied(occupied_entry) => { let entry = occupied_entry.get(); @@ -167,11 +167,12 @@ impl Reconcile for FileReconciler { } // entry is complete + true } - dashmap::Entry::Vacant(_) => {} - } + dashmap::Entry::Vacant(_) => false, + }; - let is_file_ready = !should_download_file( + let file_problems = get_file_issues( &client, self.src.as_str(), self.dst.as_path(), @@ -181,15 +182,56 @@ impl Reconcile for FileReconciler { ) .await?; + // There is an issue with the file being complete and not existing + if is_complete && !self.dst.exists() { + // Clear the download + self.tx_id = None; + warn!( + "File is complete but does not exist: {} (Problem: {file_problems:?})", + self.dst.display() + ); + + return Ok(ReconcileStatus::empty() + .add_condition(ReconcileCondition::MissingFile( + self.dst.display().to_string(), + )) + .requeue_after(Duration::from_secs(1))); + } + + if is_complete && file_problems.is_some() { + warn!( + "Complete file has {file_problems:?} problems: {}", + self.dst.display() + ); + + // if the file is complete, but there are issues, requeue + if self.dst.exists() { + // delete the file + tokio::fs::remove_file(&self.dst).await.map_err(|e| { + ReconcileError2::DeleteFileError(self.dst.clone(), e.to_string()) + })?; + } + + // Clear the download + self.tx_id = None; + + return Ok(ReconcileStatus::empty() + .add_condition(ReconcileCondition::MissingFile(self.src.to_string())) + 
.requeue_after(Duration::from_secs(1))); + } + // Everything is good. Ensure file permissions - if is_file_ready { + if file_problems.is_none() { self.check_and_set_mode()?; + trace!("File reconcile complete: {}", self.dst.display()); return Ok(ReconcileStatus::with(true)); } // file does not exist and cannot be downloaded right now if !self.dst.exists() && self.offline { - return Ok(ReconcileStatus::with(false)); + return Ok( + ReconcileStatus::with(false).add_condition(ReconcileCondition::PendingConnection) + ); } let src = self.src.clone(); @@ -197,11 +239,13 @@ impl Reconcile for FileReconciler { let transfer_tx = self.state.transfer_tx.clone(); // download the file - let handle = - tokio::spawn( - async move { download_file(tx_id, &client, src, &dst, transfer_tx).await }, - ) - .abort_handle(); + let handle = tokio::spawn(async move { + download_file(tx_id, &client, src, &dst, transfer_tx) + .await + // Dropping the File from download_file should close the handle + .map(|res| res.is_some()) + }) + .abort_handle(); // update the transfer with the handle (so it can be canceled if necessary) if let Err(e) = self @@ -212,6 +256,12 @@ impl Reconcile for FileReconciler { error!("failed to send transfer handle: {e}"); } + trace!( + "Started download of {} to {} via tx_id {tx_id}", + self.src, + self.dst.display() + ); + // transfer is pending - requeue after 1 second with the pending condition Ok(ReconcileStatus::empty() .add_condition(ReconcileCondition::PendingTransfer( diff --git a/crates/agent/src/reconcile/process.rs b/crates/agent/src/reconcile/process.rs index ad9fe358..e22e5559 100644 --- a/crates/agent/src/reconcile/process.rs +++ b/crates/agent/src/reconcile/process.rs @@ -120,7 +120,7 @@ pub struct EndProcessReconciler<'a>(pub &'a mut ProcessContext); impl<'a> Reconcile<(), ReconcileError2> for EndProcessReconciler<'a> { async fn reconcile(&mut self) -> Result, ReconcileError2> { - if !self.0.is_running() { + if self.0.child.try_wait().is_ok_and(|status| status.is_some()) { return Ok(ReconcileStatus::default()); } diff --git a/crates/agent/src/reconcile/storage.rs b/crates/agent/src/reconcile/storage.rs index cde89fc6..810bf7fa 100644 --- a/crates/agent/src/reconcile/storage.rs +++ b/crates/agent/src/reconcile/storage.rs @@ -16,7 +16,7 @@ use snops_common::{ state::{HeightRequest, InternedId, TransferId}, }; use tokio::{process::Command, sync::Mutex, task::AbortHandle}; -use tracing::{error, trace}; +use tracing::{error, info, trace}; use url::Url; use super::{ @@ -82,9 +82,8 @@ impl<'a> Reconcile<(), ReconcileError2> for BinaryReconciler<'a> { .with_binary(target_binary) .with_tx_id(transfer.as_ref().map(|(tx, _)| *tx)); let file_res = file_rec.reconcile().await?; - if let Some(tx_id) = file_rec.tx_id { - **transfer = Some((tx_id, target_binary.clone())); - } + + **transfer = file_rec.tx_id.map(|tx_id| (tx_id, target_binary.clone())); // Transfer is pending or a failure occurred if file_res.is_requeue() { @@ -140,7 +139,7 @@ impl<'a> Reconcile<(), ReconcileError2> for GenesisReconciler<'a> { .map(|ok| ok.elapsed().as_secs() < 300) .unwrap_or(false); - if env_info.storage.native_genesis || !genesis_file_ok { + if env_info.storage.native_genesis || genesis_file_ok { return Ok(ReconcileStatus::default()); } @@ -156,9 +155,7 @@ impl<'a> Reconcile<(), ReconcileError2> for GenesisReconciler<'a> { .with_tx_id(**transfer); let file_res = file_rec.reconcile().await?; - if let Some(tx_id) = file_rec.tx_id { - **transfer = Some(tx_id); - } + **transfer = file_rec.tx_id; if 
file_res.is_requeue() { return Ok(file_res.emptied().add_scope("file/requeue")); @@ -363,7 +360,7 @@ impl<'a> Reconcile<(), ReconcileError2> for LedgerReconciler<'a> { // TODO: only call this after unpacking the ledger // create the ledger path if it doesn't exist - DirectoryReconciler(&ledger_path.join(".aleo")) + DirectoryReconciler(&ledger_path.join(".aleo/storage")) .reconcile() .await?; @@ -489,12 +486,25 @@ impl<'a> Reconcile<(), ReconcileError2> for StorageVersionReconciler<'a> { }; // wipe old storage when the version changes - Ok(if version_file_data != Some(*version) && path.exists() { + if version_file_data != Some(*version) && path.exists() { + info!("Removing storage directory for version mismatch: {version_file_data:?} != {version:?}"); let _ = tokio::fs::remove_dir_all(&path).await; - ReconcileStatus::default() } else { // return an empty status if the version is the same - ReconcileStatus::empty() - }) + return Ok(ReconcileStatus::empty()); + }; + + DirectoryReconciler(path).reconcile().await?; + + if !version_file.exists() { + tokio::fs::write(&version_file, version.to_string()) + .await + .map_err(|e| { + error!("failed to write storage version: {e}"); + ReconcileError2::CreateDirectory(version_file.to_path_buf(), e.to_string()) + })?; + } + + Ok(ReconcileStatus::default()) } } diff --git a/crates/agent/src/rpc/control.rs b/crates/agent/src/rpc/control.rs index 7cc5ce82..760872eb 100644 --- a/crates/agent/src/rpc/control.rs +++ b/crates/agent/src/rpc/control.rs @@ -90,7 +90,7 @@ impl AgentService for AgentRpcServer { error!("failed to send transfer statuses: {err}"); } - info!("queing reconcilation on handshake..."); + info!("Received control-plane handshake"); // Queue a reconcile immediately as we have received new state. // The reconciler will decide if anything has actually changed @@ -104,7 +104,7 @@ impl AgentService for AgentRpcServer { _: context::Context, target: AgentState, ) -> Result<(), ReconcileError> { - info!("queing reconcilation..."); + info!("Received reconcile request..."); self.state.update_agent_state(target).await; Ok(()) } diff --git a/crates/agent/src/server.rs b/crates/agent/src/server.rs index 1fb18b65..9700aad8 100644 --- a/crates/agent/src/server.rs +++ b/crates/agent/src/server.rs @@ -43,7 +43,7 @@ async fn node_ws_handler(ws: WebSocketUpgrade, State(state): State) -> async fn handle_socket(mut socket: WebSocket, state: AppState) { let mut node_client = state.node_client.write().await; if node_client.is_some() { - warn!("a new node RPC connection tried to establish when one was already established"); + warn!("A new node RPC connection tried to establish when one was already established"); let _ = socket.send(Message::Close(None)).await; return; } @@ -56,7 +56,7 @@ async fn handle_socket(mut socket: WebSocket, state: AppState) { let client = NodeServiceClient::new(tarpc::client::Config::default(), client_transport).spawn(); // store the client in state - tracing::info!("node client connected"); + tracing::info!("Node client connected"); *node_client = Some(client); drop(node_client); diff --git a/crates/agent/src/transfers.rs b/crates/agent/src/transfers.rs index ef2ef7fa..77d35cc1 100644 --- a/crates/agent/src/transfers.rs +++ b/crates/agent/src/transfers.rs @@ -104,6 +104,9 @@ pub fn start_monitor(client: ClientLock) -> (TransferTx, Arc { let transfer = ent.get_mut(); transfer.handle = Some(handle); + + // prevent broadcasting the handle to the control plane + continue; }, _ => continue, diff --git a/crates/common/src/rpc/error.rs 
b/crates/common/src/rpc/error.rs index d83234bd..5a1dd9ae 100644 --- a/crates/common/src/rpc/error.rs +++ b/crates/common/src/rpc/error.rs @@ -167,6 +167,8 @@ pub enum ReconcileError2 { MissingLocalPrivateKey, #[error("failed to create directory {0}: {1}")] CreateDirectory(PathBuf, String), + #[error("failed to delete file {0}: {1}")] + DeleteFileError(PathBuf, String), #[error("failed to get metadata for {0}: {1}")] FileStatError(PathBuf, String), #[error("failed to read file {0}: {1}")] From 96ddc5d4ed66a4d516627c5cbeb1a22dc93d1c78 Mon Sep 17 00:00:00 2001 From: Meshiest Date: Mon, 25 Nov 2024 22:21:41 -0500 Subject: [PATCH 19/68] feat(agent,control): reconcile can now update node configuration --- crates/agent/src/cli.rs | 8 +- crates/agent/src/db.rs | 11 +- crates/agent/src/main.rs | 4 +- crates/agent/src/reconcile/address.rs | 91 ++++ crates/agent/src/reconcile/agent.rs | 584 ++++++++------------- crates/agent/src/reconcile/mod.rs | 2 + crates/agent/src/reconcile/process.rs | 20 +- crates/agent/src/reconcile/state.rs | 73 +++ crates/agent/src/reconcile/storage.rs | 68 ++- crates/agent/src/rpc/control.rs | 10 +- crates/agent/src/server.rs | 7 +- crates/agent/src/state.rs | 6 +- crates/aot/src/runner/mod.rs | 109 ++-- crates/aot/src/runner/rpc/mod.rs | 182 +++---- crates/common/src/rpc/control/agent.rs | 7 +- crates/common/src/state/agent_state.rs | 10 + crates/common/src/state/agent_status.rs | 6 +- crates/controlplane/src/env/mod.rs | 20 +- crates/controlplane/src/server/mod.rs | 3 + crates/controlplane/src/state/reconcile.rs | 7 +- crates/controlplane/src/state/rpc.rs | 7 +- 21 files changed, 656 insertions(+), 579 deletions(-) create mode 100644 crates/agent/src/reconcile/address.rs create mode 100644 crates/agent/src/reconcile/state.rs diff --git a/crates/agent/src/cli.rs b/crates/agent/src/cli.rs index 7ee34a5c..7ee990a5 100644 --- a/crates/agent/src/cli.rs +++ b/crates/agent/src/cli.rs @@ -129,13 +129,13 @@ impl Cli { if fs::metadata(file).is_ok() { query.push_str("&local_pk=true"); } else { - warn!("private-key-file flag ignored as the file was not found: {file:?}") + warn!("Private-key-file flag ignored as the file was not found: {file:?}") } } // add &labels= if id is present if let Some(labels) = &self.labels { - info!("using labels: {:?}", labels); + info!("Using labels: {:?}", labels); query.push_str(&format!( "&labels={}", labels @@ -182,9 +182,9 @@ impl Cli { let external_addr = self.external; if let Some(addr) = external_addr { - info!("using external addr: {}", addr); + info!("Using external addr: {}", addr); } else { - info!("skipping external addr"); + info!("Skipping external addr"); } (internal_addrs, external_addr) diff --git a/crates/agent/src/db.rs b/crates/agent/src/db.rs index 01354580..2ea92945 100644 --- a/crates/agent/src/db.rs +++ b/crates/agent/src/db.rs @@ -5,7 +5,6 @@ use std::{ sync::{Arc, Mutex}, }; -use bytes::Buf; use indexmap::IndexMap; use snops_common::{ api::EnvInfo, @@ -14,12 +13,12 @@ use snops_common::{ tree::{DbRecords, DbTree}, Database as DatabaseTrait, }, - format::{self, read_dataformat, DataFormat, DataReadError, DataWriteError, PackedUint}, + format::{DataFormat, DataReadError, DataWriteError, PackedUint}, state::{AgentId, AgentState, EnvId, HeightRequest}, }; use url::Url; -use crate::reconcile::agent::EnvState; +use crate::reconcile::state::EnvState; #[derive(Debug, Clone, Copy, Hash, PartialEq, Eq)] #[repr(u8)] @@ -161,11 +160,7 @@ impl Database { } pub fn env_state(&self) -> Result, DatabaseError> { - Ok(self - .documents - 
.restore(&AgentDbString::EnvState)? - .map(|format::BytesFormat(bytes)| read_dataformat(&mut bytes.reader())) - .transpose()?) + self.documents.restore(&AgentDbString::EnvState) } pub fn set_env_state(&self, state: Option<&EnvState>) -> Result<(), DatabaseError> { diff --git a/crates/agent/src/main.rs b/crates/agent/src/main.rs index fd7742ed..21d20576 100644 --- a/crates/agent/src/main.rs +++ b/crates/agent/src/main.rs @@ -140,7 +140,9 @@ async fn main() { loop { let req = client::new_ws_request(&ws_uri, state2.db.jwt()); client::ws_connection(req, Arc::clone(&state2)).await; - info!("Attempting to reconnect..."); + // Remove the control client + state2.client.write().await.take(); + info!("Attempting to reconnect to the control plane..."); tokio::time::sleep(Duration::from_secs(5)).await; } }); diff --git a/crates/agent/src/reconcile/address.rs b/crates/agent/src/reconcile/address.rs new file mode 100644 index 00000000..f663af04 --- /dev/null +++ b/crates/agent/src/reconcile/address.rs @@ -0,0 +1,91 @@ +use std::{collections::HashSet, sync::Arc}; + +use snops_common::{ + rpc::error::ReconcileError2, + state::{AgentId, AgentPeer, NodeState}, +}; +use tarpc::context; +use tracing::{error, warn}; + +use super::{Reconcile, ReconcileStatus}; +use crate::state::GlobalState; + +/// Given a node state, resolve the addresses of the agent based peers and +/// validators. Non-agent based peers have their addresses within the state +/// already. +pub struct AddressResolveReconciler { + pub state: Arc, + pub node: Arc, +} + +impl Reconcile<(), ReconcileError2> for AddressResolveReconciler { + async fn reconcile(&mut self) -> Result, ReconcileError2> { + let AddressResolveReconciler { state, node } = self; + + // Find agents that do not have cached addresses + let unresolved_addrs: HashSet = { + let resolved_addrs = state.resolved_addrs.read().await; + node.peers + .iter() + .chain(node.validators.iter()) + .filter_map(|p| { + if let AgentPeer::Internal(id, _) = p { + (!resolved_addrs.contains_key(id)).then_some(*id) + } else { + None + } + }) + .collect() + }; + + // All addrs have been resolved. + // TODO: May need to mark some of these as stale at some point. + if unresolved_addrs.is_empty() { + return Ok(ReconcileStatus::default()); + } + + let Some(client) = state.client.read().await.clone() else { + warn!("Agent state contains {} addresses that need to be resolved, but client is not connected", unresolved_addrs.len()); + + // Client is offline so new addrs cannot be requested + return Ok(ReconcileStatus::default()); + }; + + // Fetch all unresolved addresses and update the cache + tracing::trace!( + "Need to resolve addrs: {}", + unresolved_addrs + .iter() + .map(|id| id.to_string()) + .collect::>() + .join(",") + ); + + // Resolve the addresses + // TODO: turn this into a background process so the reconcile operation can run + // instantly + let new_addrs = client + .resolve_addrs(context::current(), unresolved_addrs) + .await + .map_err(|e| ReconcileError2::RpcError(e.to_string()))? 
+ .map_err(ReconcileError2::AddressResolve)?; + + tracing::trace!( + "Resolved new addrs: {}", + new_addrs + .iter() + .map(|(id, addr)| format!("{}: {}", id, addr)) + .collect::>() + .join(", ") + ); + + // Extend the cache with the new addresses + let mut lock = state.resolved_addrs.write().await; + lock.extend(new_addrs); + if let Err(e) = state.db.set_resolved_addrs(Some(&lock)) { + error!("failed to save resolved addrs to db: {e}"); + } + + Ok(ReconcileStatus::default()) + } +} diff --git a/crates/agent/src/reconcile/agent.rs b/crates/agent/src/reconcile/agent.rs index f66fa0ec..4c4e7cc6 100644 --- a/crates/agent/src/reconcile/agent.rs +++ b/crates/agent/src/reconcile/agent.rs @@ -1,37 +1,34 @@ use std::{ - collections::HashSet, ops::Deref, sync::Arc, time::{Duration, Instant}, }; use snops_common::{ - api::EnvInfo, binaries::BinaryEntry, - format::{DataFormat, DataHeaderOf}, rpc::error::ReconcileError2, - state::{ - AgentId, AgentPeer, AgentState, HeightRequest, NetworkId, NodeState, StorageId, TransferId, - }, + state::{AgentState, HeightRequest, TransferId}, }; -use tarpc::context; use tokio::{ select, sync::{mpsc::Receiver, Mutex}, task::AbortHandle, time::sleep_until, }; -use tracing::{error, info, trace, warn}; +use tracing::{error, info, trace}; use super::{ command::NodeCommand, process::ProcessContext, + state::EnvState, storage::{BinaryReconciler, GenesisReconciler, LedgerModifyResult, StorageVersionReconciler}, Reconcile, ReconcileStatus, }; use crate::{ db::Database, - reconcile::{process::EndProcessReconciler, storage::LedgerReconciler}, + reconcile::{ + address::AddressResolveReconciler, process::EndProcessReconciler, storage::LedgerReconciler, + }, state::GlobalState, }; @@ -43,40 +40,23 @@ pub struct AgentStateReconciler { pub context: AgentStateReconcilerContext, } -pub struct EnvState { - network_id: NetworkId, - storage_id: StorageId, - storage_version: u16, -} - -impl From<&EnvInfo> for EnvState { - fn from(info: &EnvInfo) -> Self { - Self { - network_id: info.network, - storage_id: info.storage.id, - storage_version: info.storage.version, - } - } -} - -impl Default for EnvState { - fn default() -> Self { - Self { - network_id: NetworkId::Mainnet, - storage_id: StorageId::default(), - storage_version: 0, - } - } -} - #[derive(Default)] -struct TransfersContext { +pub struct AgentStateReconcilerContext { /// Persisted values that determine if the storage has changed - env_state: EnvState, - + pub env_state: Option, /// The last ledger height that was successfully configured - ledger_last_height: Option<(usize, HeightRequest)>, + pub ledger_last_height: Option<(usize, HeightRequest)>, + // TODO: allow transfers to be interrupted. potentially allow them to be resumed by using the + // file range feature. + /// Information about active transfers + transfers: Option, + /// Information about the node process + pub process: Option, + pub shutdown_pending: bool, +} +#[derive(Default)] +struct TransfersContext { /// Metadata about an active binary transfer binary_transfer: Option<(TransferId, BinaryEntry)>, /// Time the binary was marked as OK @@ -96,17 +76,6 @@ struct TransfersContext { ledger_modify_handle: Option<(AbortHandle, Arc>>)>, } -#[derive(Default)] -pub struct AgentStateReconcilerContext { - // TODO: allow transfers to be interrupted. potentially allow them to be resumed by using the - // file range feature. 
- /// Information about active transfers - transfers: Option, - /// Information about the node process - pub process: Option, - pub shutdown_pending: bool, -} - impl AgentStateReconcilerContext { pub fn hydrate(db: &Database) -> Self { let ledger_last_height = db @@ -119,18 +88,30 @@ impl AgentStateReconcilerContext { .unwrap_or_default(); Self { - transfers: (ledger_last_height.is_some() || env_state.is_some()).then(|| { - TransfersContext { - env_state: env_state.unwrap_or_default(), - ledger_last_height, - ..Default::default() - } - }), + env_state, + ledger_last_height, ..Default::default() } } } +/// Run a reconciler and return early if a requeue is needed. A condition is +/// added to the scope when a requeue is needed to provide more context when +/// monitoring the agent. +macro_rules! reconcile { + ($id:ident, $e:expr) => { + reconcile!($id, $e, res => {}) + }; + ($id:ident, $e:expr, $v:ident => $rest:expr) => { + let $v = $e.reconcile().await?; + if $v.is_requeue() { + trace!("Requeue needed for {} ({:?}) {:?}", stringify!($id), $v.scopes, $v.conditions); + return Ok($v.add_scope(concat!(stringify!($id), "/requeue"))); + } + $rest + }; +} + impl AgentStateReconciler { pub async fn loop_forever(&mut self, mut reconcile_requests: Receiver) { let mut err_backoff = 0; @@ -139,9 +120,10 @@ impl AgentStateReconciler { // Connecting to the controlplane will likely trigger a reconcile sooner. let mut next_reconcile_at = Instant::now() + Duration::from_secs(5); + // Repeated reconcile loop loop { + // Await for the next reconcile, allowing for it to be moved up sooner loop { - // Await for the next reconcile, allowing for it to be moved up sooner select! { // Replace the next_reconcile_at with the soonest reconcile time Some(new_reconcile_at) = reconcile_requests.recv() => { @@ -187,296 +169,212 @@ impl AgentStateReconciler { // TODO: announce reconcile status to the server, throttled } } -} -/// Run a reconciler and return early if a requeue is needed. A condition is -/// added to the scope when a requeue is needed to provide more context when -/// monitoring the agent. -macro_rules! 
reconcile { - ($id:ident, $e:expr) => { - reconcile!($id, $e, res => {}) - }; - ($id:ident, $e:expr, $v:ident => $rest:expr) => { - let $v = $e.reconcile().await?; - if $v.is_requeue() { - trace!("Requeue needed for {} ({:?}) {:?}", stringify!($id), $v.scopes, $v.conditions); - return Ok($v.add_scope(concat!(stringify!($id), "/requeue"))); - } - $rest - }; -} + pub async fn reconcile_inventory(&mut self) -> Result, ReconcileError2> { + // TODO: cleanup other things -impl Reconcile<(), ReconcileError2> for AgentStateReconciler { - async fn reconcile(&mut self) -> Result, ReconcileError2> { - match self.agent_state.as_ref() { - AgentState::Inventory => { - // TODO: cleanup other things - - // End the process if it is running - if let Some(process) = self.context.process.as_mut() { - reconcile!(end_process, EndProcessReconciler(process), res => { - // If the process has exited, clear the process context - if res.inner.is_some() { - self.context.process = None; - } - }); + // End the process if it is running + if let Some(process) = self.context.process.as_mut() { + reconcile!(end_process, EndProcessReconciler(process), res => { + // If the process has exited, clear the process context + if res.inner.is_some() { + self.context.process = None; } + }); + } - if let Some(_transfers) = self.context.transfers.as_mut() { - if let Err(e) = self.state.db.set_env_state(None) { - error!("failed to clear env state from db: {e}"); - } - if let Err(e) = self.state.db.set_last_height(None) { - error!("failed to clear last height from db: {e}"); - } + if let Some(_transfers) = self.context.transfers.as_mut() { + if let Err(e) = self.state.db.set_env_state(None) { + error!("failed to clear env state from db: {e}"); + } + if let Err(e) = self.state.db.set_last_height(None) { + error!("failed to clear last height from db: {e}"); + } - // TODO: interrupt/kill off pending downloads + // TODO: interrupt/kill off pending downloads - // Destroy the old transfers context - self.context.transfers = None; - } + // Destroy the old transfers context + self.context.transfers = None; + } - return Ok(ReconcileStatus::default().add_scope("agent_state/inventory")); + Ok(ReconcileStatus::default().add_scope("agent_state/inventory")) + } +} + +impl Reconcile<(), ReconcileError2> for AgentStateReconciler { + async fn reconcile(&mut self) -> Result, ReconcileError2> { + let (env_id, node) = match self.agent_state.as_ref() { + AgentState::Inventory => { + return self.reconcile_inventory().await; } - AgentState::Node(env_id, node) => { - let env_info = self.state.get_env_info(*env_id).await?; - - // Check if the storage version, storage id, or network id has changed - let storage_has_changed = self - .context - .transfers - .as_ref() - .map(|t| t.env_state.changed(&env_info)) - .unwrap_or(true); - - // If the node should be torn down, or the storage has changed, we need to - // gracefully shut down the node. 
- let shutdown_pending = !node.online || storage_has_changed; - - // TODO: check if addrs have changed, then update the command - - if let (true, Some(process)) = ( - shutdown_pending || self.context.shutdown_pending, - self.context.process.as_mut(), - ) { - self.context.shutdown_pending = true; - reconcile!(end_process, EndProcessReconciler(process), res => { - // If the process has exited, clear the process context - if res.inner.is_some() { - self.context.process = None; - } - }); - } + AgentState::Node(env_id, node) => (env_id, node), + }; - // node is offline, no need to reconcile - if !node.online { - return Ok(ReconcileStatus::default().add_scope("agent_state/offline")); + let env_info = self.state.get_env_info(*env_id).await?; + + // Check if the storage version, storage id, or network id has changed + let storage_has_changed = self + .context + .env_state + .as_ref() + .map(|e| e.changed(&env_info)) + .unwrap_or(true); + + // Check if the ledger height is not resolved + let height_has_changed = + self.context.ledger_last_height != Some(node.height) && !node.height.1.is_top(); + + // If the node should be torn down, or the storage has changed, we need to + // gracefully shut down the node. + let shutdown_pending = !node.online || storage_has_changed || height_has_changed; + + if let (true, Some(process)) = ( + shutdown_pending || self.context.shutdown_pending, + self.context.process.as_mut(), + ) { + self.context.shutdown_pending = true; + reconcile!(end_process, EndProcessReconciler(process), res => { + // If the process has exited, clear the process context + if res.inner.is_some() { + self.context.process = None; + self.context.shutdown_pending = false; } + }); + } - let node_arc = Arc::new(*node.clone()); - - // Reconcile behavior while the node is running... - if let Some(process) = self.context.process.as_ref() { - // If the process has exited, clear the process context - if !process.is_running() { - info!("Node process has exited..."); - self.context.process = None; - } else { - // Accumulate all the fields that are used to derive the command that starts - // the node. - let command = NodeCommand::new( - Arc::clone(&self.state), - node_arc, - *env_id, - Arc::clone(&env_info), - ) - .await?; - - // If the command has changed, restart the process - if process.command != command { - info!("Node command has changed, restarting process..."); - self.context.shutdown_pending = true; - return Ok(ReconcileStatus::empty() - .add_scope("agent_state/command_changed") - .requeue_after(Duration::ZERO)); - } - - // Prevent other reconcilers from running while the node is running - return Ok(ReconcileStatus::default().add_scope("agent_state/running")); - } - } + // node is offline, no need to reconcile + if !node.online { + return Ok(ReconcileStatus::default().add_scope("agent_state/offline")); + } - // Initialize the transfers context with the current status - if self.context.transfers.is_none() { - // TODO: write this to the db - let env_state = EnvState::from(env_info.as_ref()); - if let Err(e) = self.state.db.set_env_state(Some(&env_state)) { - error!("failed to save env state to db: {e}"); - } - self.context.transfers = Some(TransfersContext { - env_state, - ..Default::default() - }); - } - let transfers = self.context.transfers.as_mut().unwrap(); - - let storage_path = self - .state - .cli - .storage_path(env_info.network, env_info.storage.id); - - // Ensure the storage version is correct, deleting the storage path - // the version changes. 
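Each reconcile! invocation below drives a sub-reconciler that reports through a ReconcileStatus: an empty status carrying a requeue delay means "not finished, retry later", and the macro propagates that as an early return with an extra "<name>/requeue" scope. A minimal sketch of that contract, mirroring the impls in this patch; the PendingExample type is illustrative only, and the Reconcile, ReconcileStatus, ReconcileCondition, and ReconcileError2 items are assumed to be in scope as they are in this module:

    use std::time::Duration;

    struct PendingExample;

    impl Reconcile<(), ReconcileError2> for PendingExample {
        async fn reconcile(&mut self) -> Result<ReconcileStatus<()>, ReconcileError2> {
            // Not ready yet: report why, and ask to be retried in a second.
            Ok(ReconcileStatus::empty()
                .add_condition(ReconcileCondition::PendingConnection)
                .requeue_after(Duration::from_secs(1)))
        }
    }
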
- reconcile!( - storage_version, - StorageVersionReconciler(&storage_path, env_info.storage.version) - ); - - // Resolve the genesis block - reconcile!( - genesis, - GenesisReconciler { - state: Arc::clone(&self.state), - env_info: Arc::clone(&env_info), - transfer: &mut transfers.genesis_transfer, - ok_at: &mut transfers.genesis_ok_at, - } - ); - - // Resolve the node's binary - reconcile!( - binary, - BinaryReconciler { - state: Arc::clone(&self.state), - env_info: Arc::clone(&env_info), - node_binary: node.binary, - transfer: &mut transfers.binary_transfer, - ok_at: &mut transfers.binary_ok_at, - } - ); - - reconcile!( - ledger, - LedgerReconciler { - state: Arc::clone(&self.state), - env_info: Arc::clone(&env_info), - modify_handle: &mut transfers.ledger_modify_handle, - target_height: node.height, - last_height: &mut transfers.ledger_last_height, - pending_height: &mut transfers.ledger_pending_height, - } - ); - - // Resolve the addresses of the peers and validators - // TODO: Set an expiry for resolved addresses - reconcile!( - address_resolve, - AddressResolveReconciler { - node: Arc::clone(&node_arc), - state: Arc::clone(&self.state), - } - ); - - // TODO: if possible, use the NodeCommand as configuration for a node service to - // allow running the node outside of the agent - - if self.context.process.is_none() { - info!("Starting node process"); - let command = NodeCommand::new( - Arc::clone(&self.state), - node_arc, - *env_id, - Arc::clone(&env_info), - ) - .await?; - - let process = ProcessContext::new(command)?; - self.context.process = Some(process); + let node_arc = Arc::new(*node.clone()); + + // Reconcile behavior while the node is running... + if let Some(process) = self.context.process.as_mut() { + // If the process has exited, clear the process context + if !process.is_running() { + info!("Node process has exited..."); + self.context.process = None; + } else { + // Accumulate all the fields that are used to derive the command that starts + // the node. + let command = NodeCommand::new( + Arc::clone(&self.state), + node_arc, + *env_id, + Arc::clone(&env_info), + ) + .await?; + + // If the command has changed, restart the process + if process.command != command { + info!("Node command has changed, restarting process..."); + self.context.shutdown_pending = true; + return Ok(ReconcileStatus::empty() + .add_scope("agent_state/command_changed") + .requeue_after(Duration::ZERO)); } + + // Prevent other reconcilers from running while the node is running + return Ok(ReconcileStatus::default().add_scope("agent_state/running")); } } - Ok(ReconcileStatus::empty()) - } -} - -/// Given a node state, resolve the addresses of the agent based peers and -/// validators. Non-agent based peers have their addresses within the state -/// already. 
-struct AddressResolveReconciler { - state: Arc, - node: Arc, -} - -impl Reconcile<(), ReconcileError2> for AddressResolveReconciler { - async fn reconcile(&mut self) -> Result, ReconcileError2> { - let AddressResolveReconciler { state, node } = self; - - // Find agents that do not have cached addresses - let unresolved_addrs: HashSet = { - let resolved_addrs = state.resolved_addrs.read().await; - node.peers - .iter() - .chain(node.validators.iter()) - .filter_map(|p| { - if let AgentPeer::Internal(id, _) = p { - (!resolved_addrs.contains_key(id)).then_some(*id) - } else { - None - } - }) - .collect() - }; + let storage_path = self + .state + .cli + .storage_path(env_info.network, env_info.storage.id); + + // Ensure the storage version is correct, deleting the storage path + // the version changes. + reconcile!( + storage_version, + StorageVersionReconciler(&storage_path, env_info.storage.version), + res => { + if res.inner.is_some() { + trace!("Transfers context cleared due to storage version change"); + self.context.transfers = None; + } + } + ); - // All addrs have been resolved. - // TODO: May need to mark some of these as stale at some point. - if unresolved_addrs.is_empty() { - return Ok(ReconcileStatus::default()); + // Initialize the transfers context with the current status + // This happens after the StorageVersionReconciler as storage_version within + // env_state will be guaranteed to match the remote env after it succeeds. + if self.context.transfers.is_none() { + let env_state = EnvState::from(env_info.as_ref()); + if let Err(e) = self.state.db.set_env_state(Some(&env_state)) { + error!("failed to save env state to db: {e}"); + } + self.context.env_state = Some(env_state); + self.context.transfers = Some(Default::default()); } + let transfers = self.context.transfers.as_mut().unwrap(); + + // Resolve the genesis block + reconcile!( + genesis, + GenesisReconciler { + state: Arc::clone(&self.state), + env_info: Arc::clone(&env_info), + transfer: &mut transfers.genesis_transfer, + ok_at: &mut transfers.genesis_ok_at, + } + ); - let Some(client) = state.client.read().await.clone() else { - warn!("Agent state contains {} addresses that need to be resolved, but client is not connected", unresolved_addrs.len()); - - // Client is offline so new addrs cannot be requested - return Ok(ReconcileStatus::default()); - }; + // Resolve the node's binary + reconcile!( + binary, + BinaryReconciler { + state: Arc::clone(&self.state), + env_info: Arc::clone(&env_info), + node_binary: node.binary, + transfer: &mut transfers.binary_transfer, + ok_at: &mut transfers.binary_ok_at, + } + ); - // Fetch all unresolved addresses and update the cache - tracing::debug!( - "need to resolve addrs: {}", - unresolved_addrs - .iter() - .map(|id| id.to_string()) - .collect::>() - .join(",") + reconcile!( + ledger, + LedgerReconciler { + state: Arc::clone(&self.state), + env_info: Arc::clone(&env_info), + modify_handle: &mut transfers.ledger_modify_handle, + target_height: node.height, + last_height: &mut self.context.ledger_last_height, + pending_height: &mut transfers.ledger_pending_height, + } ); - // Resolve the addresses - // TODO: turn this into a background process so the reconcile operation can run - // instantly - let new_addrs = client - .resolve_addrs(context::current(), unresolved_addrs) - .await - .map_err(|e| ReconcileError2::RpcError(e.to_string()))? 
- .map_err(ReconcileError2::AddressResolve)?; - - tracing::debug!( - "resolved new addrs: {}", - new_addrs - .iter() - .map(|(id, addr)| format!("{}: {}", id, addr)) - .collect::>() - .join(", ") + // Resolve the addresses of the peers and validators + // TODO: Set an expiry for resolved addresses + reconcile!( + address_resolve, + AddressResolveReconciler { + node: Arc::clone(&node_arc), + state: Arc::clone(&self.state), + } ); - // Extend the cache with the new addresses - let mut lock = state.resolved_addrs.write().await; - lock.extend(new_addrs); - if let Err(e) = state.db.set_resolved_addrs(Some(&lock)) { - error!("failed to save resolved addrs to db: {e}"); + // TODO: if possible, use the NodeCommand as configuration for a node service to + // allow running the node outside of the agent + + if self.context.process.is_none() { + info!("Starting node process"); + let command = NodeCommand::new( + Arc::clone(&self.state), + node_arc, + *env_id, + Arc::clone(&env_info), + ) + .await?; + + let process = ProcessContext::new(command)?; + self.context.process = Some(process); + return Ok(ReconcileStatus::default().add_scope("agent_state/starting")); } - Ok(ReconcileStatus::default()) + Ok(ReconcileStatus::empty()) } } @@ -487,45 +385,3 @@ impl Reconcile<(), ReconcileError2> for AddressResolveReconciler { // https://ledger.aleo.network/mainnet/snapshot/latest.txt // https://ledger.aleo.network/testnet/snapshot/latest.txt // https://ledger.aleo.network/canarynet/snapshot/latest.txt - -impl EnvState { - pub fn changed(&self, env_info: &EnvInfo) -> bool { - env_info.storage.version != self.storage_version - || env_info.storage.id != self.storage_id - || env_info.network != self.network_id - } -} - -impl DataFormat for EnvState { - type Header = (u8, DataHeaderOf); - - const LATEST_HEADER: Self::Header = (1u8, NetworkId::LATEST_HEADER); - - fn write_data( - &self, - writer: &mut W, - ) -> Result { - Ok(self.network_id.write_data(writer)? - + self.storage_id.write_data(writer)? - + self.storage_version.write_data(writer)?) - } - - fn read_data( - reader: &mut R, - header: &Self::Header, - ) -> Result { - if header.0 != Self::LATEST_HEADER.0 { - return Err(snops_common::format::DataReadError::unsupported( - "EnvIdentifier", - Self::LATEST_HEADER.0, - header.0, - )); - } - - Ok(Self { - network_id: NetworkId::read_data(reader, &header.1)?, - storage_id: StorageId::read_data(reader, &())?, - storage_version: u16::read_data(reader, &())?, - }) - } -} diff --git a/crates/agent/src/reconcile/mod.rs b/crates/agent/src/reconcile/mod.rs index 3ac7f989..a7a7fa19 100644 --- a/crates/agent/src/reconcile/mod.rs +++ b/crates/agent/src/reconcile/mod.rs @@ -6,7 +6,9 @@ pub mod agent; pub mod command; mod files; pub use files::*; +pub mod address; pub mod process; +pub mod state; pub mod storage; use snops_common::state::TransferId; diff --git a/crates/agent/src/reconcile/process.rs b/crates/agent/src/reconcile/process.rs index e22e5559..fd799b27 100644 --- a/crates/agent/src/reconcile/process.rs +++ b/crates/agent/src/reconcile/process.rs @@ -42,8 +42,10 @@ impl ProcessContext { } /// Returns true when the child process has not exited - pub fn is_running(&self) -> bool { - self.child.id().is_some() + pub fn is_running(&mut self) -> bool { + // This code is mutable because try_wait modifies the Child. Without + // mutability, the current running status would never be updated. 
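// With tokio's Child, `id()` only becomes None after the child has been reaped by a
// completed `wait()`, so the old `self.child.id().is_some()` check could keep reporting
// a node as running after it had already exited on its own. `try_wait()` polls for an
// exit status without blocking: Ok(None) means the process is still alive, while
// Ok(Some(status)) means it has exited, which is what the expression below relies on.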
+ self.child.try_wait().is_ok_and(|status| status.is_none()) } /// A helper function to gracefully shutdown the node process without @@ -57,21 +59,21 @@ impl ProcessContext { select! { _ = tokio::time::sleep(NODE_GRACEFUL_SHUTDOWN_TIMEOUT) => { - info!("sending SIGKILL to node process"); + info!("Sending SIGKILL to node process"); self.send_sigkill(); }, _ = tokio::signal::ctrl_c() => { - info!("received SIGINT, sending SIGKILL to node process"); + info!("Received SIGINT, sending SIGKILL to node process"); self.send_sigkill(); }, _ = self.child.wait() => { - info!("node process has exited gracefully"); + info!("Node process has exited gracefully"); return; } } let _ = self.child.wait().await; - info!("node process has exited"); + info!("Node process has exited"); } /// Send a SIGINT to the child process @@ -120,13 +122,13 @@ pub struct EndProcessReconciler<'a>(pub &'a mut ProcessContext); impl<'a> Reconcile<(), ReconcileError2> for EndProcessReconciler<'a> { async fn reconcile(&mut self) -> Result, ReconcileError2> { - if self.0.child.try_wait().is_ok_and(|status| status.is_some()) { + if !self.0.is_running() { return Ok(ReconcileStatus::default()); } let Some(sigint_at) = self.0.sigint_at else { if self.0.send_sigint() { - info!("sent SIGINT to node process"); + info!("Sent SIGINT to node process"); } return Ok(ReconcileStatus::empty() .add_condition(ReconcileCondition::PendingShutdown) @@ -137,7 +139,7 @@ impl<'a> Reconcile<(), ReconcileError2> for EndProcessReconciler<'a> { && self.0.sigkill_at.is_none() && self.0.send_sigkill() { - info!("sent SIGKILL to node process"); + info!("Sent SIGKILL to node process"); } Ok(ReconcileStatus::empty() diff --git a/crates/agent/src/reconcile/state.rs b/crates/agent/src/reconcile/state.rs new file mode 100644 index 00000000..26e080d3 --- /dev/null +++ b/crates/agent/src/reconcile/state.rs @@ -0,0 +1,73 @@ +use snops_common::{ + api::EnvInfo, + format::{DataFormat, DataHeaderOf}, + state::{NetworkId, StorageId}, +}; + +pub struct EnvState { + network_id: NetworkId, + storage_id: StorageId, + storage_version: u16, +} + +impl EnvState { + pub fn changed(&self, env_info: &EnvInfo) -> bool { + env_info.storage.version != self.storage_version + || env_info.storage.id != self.storage_id + || env_info.network != self.network_id + } +} + +impl From<&EnvInfo> for EnvState { + fn from(info: &EnvInfo) -> Self { + Self { + network_id: info.network, + storage_id: info.storage.id, + storage_version: info.storage.version, + } + } +} + +impl Default for EnvState { + fn default() -> Self { + Self { + network_id: NetworkId::Mainnet, + storage_id: StorageId::default(), + storage_version: 0, + } + } +} + +impl DataFormat for EnvState { + type Header = (u8, DataHeaderOf); + + const LATEST_HEADER: Self::Header = (1u8, NetworkId::LATEST_HEADER); + + fn write_data( + &self, + writer: &mut W, + ) -> Result { + Ok(self.network_id.write_data(writer)? + + self.storage_id.write_data(writer)? + + self.storage_version.write_data(writer)?) 
+ } + + fn read_data( + reader: &mut R, + header: &Self::Header, + ) -> Result { + if header.0 != Self::LATEST_HEADER.0 { + return Err(snops_common::format::DataReadError::unsupported( + "EnvIdentifier", + Self::LATEST_HEADER.0, + header.0, + )); + } + + Ok(Self { + network_id: NetworkId::read_data(reader, &header.1)?, + storage_id: StorageId::read_data(reader, &())?, + storage_version: u16::read_data(reader, &())?, + }) + } +} diff --git a/crates/agent/src/reconcile/storage.rs b/crates/agent/src/reconcile/storage.rs index 810bf7fa..69f9f64e 100644 --- a/crates/agent/src/reconcile/storage.rs +++ b/crates/agent/src/reconcile/storage.rs @@ -59,13 +59,20 @@ impl<'a> Reconcile<(), ReconcileError2> for BinaryReconciler<'a> { .as_ref() .map(|(_, b)| b != target_binary) .unwrap_or(true); - let binary_is_ok = ok_at.is_some(); + + let dst = state.cli.path.join(SNARKOS_FILE); + + // The binary does not exist and is marked as OK... + if ok_at.is_some() && !dst.exists() { + **ok_at = None; + } // If the binary has not changed and has not expired, we can skip the binary // reconciler - if !binary_has_changed && binary_is_ok { + if !binary_has_changed && ok_at.is_some() { return Ok(ReconcileStatus::default()); } + **ok_at = None; let src = match &target_binary.source { BinarySource::Url(url) => url.clone(), @@ -75,7 +82,6 @@ impl<'a> Reconcile<(), ReconcileError2> for BinaryReconciler<'a> { .map_err(|e| ReconcileError2::UrlParseError(url, e.to_string()))? } }; - let dst = state.cli.path.join(SNARKOS_FILE); let mut file_rec = FileReconciler::new(Arc::clone(state), src, dst) .with_offline(target_binary.is_api_file() && !state.is_ws_online()) @@ -134,6 +140,13 @@ impl<'a> Reconcile<(), ReconcileError2> for GenesisReconciler<'a> { .cli .storage_path(env_info.network, env_info.storage.id); + let genesis_file = storage_path.join(SNARKOS_GENESIS_FILE); + + // If the genesis file doesn't exist, it's not okay... 
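// `ok_at` acts as a freshness marker for an artifact on disk: it records when the file
// was last verified, is cleared whenever the expected file has gone missing so the next
// pass re-downloads it, and for the genesis block below it also expires after five
// minutes. A small self-contained sketch of that check, with `is_fresh` and `max_age`
// as illustrative names only:

use std::{
    path::Path,
    time::{Duration, Instant},
};

// Fresh only when the file is still on disk and was verified within `max_age`.
fn is_fresh(ok_at: Option<Instant>, file: &Path, max_age: Duration) -> bool {
    file.exists() && ok_at.is_some_and(|checked| checked.elapsed() < max_age)
}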
+ if !genesis_file.exists() && ok_at.is_some() { + **ok_at = None; + } + // Genesis block file has been checked within 5 minutes let genesis_file_ok = ok_at .map(|ok| ok.elapsed().as_secs() < 300) @@ -142,6 +155,7 @@ impl<'a> Reconcile<(), ReconcileError2> for GenesisReconciler<'a> { if env_info.storage.native_genesis || genesis_file_ok { return Ok(ReconcileStatus::default()); } + **ok_at = None; let genesis_url = get_genesis_route(&state.endpoint, env_info.network, env_info.storage.id); let mut file_rec = FileReconciler::new( @@ -149,7 +163,7 @@ impl<'a> Reconcile<(), ReconcileError2> for GenesisReconciler<'a> { genesis_url.parse::().map_err(|e| { ReconcileError2::UrlParseError(genesis_url.to_string(), e.to_string()) })?, - storage_path.join(SNARKOS_GENESIS_FILE), + genesis_file, ) .with_offline(!self.state.is_ws_online()) .with_tx_id(**transfer); @@ -340,23 +354,15 @@ impl<'a> Reconcile<(), ReconcileError2> for LedgerReconciler<'a> { // a persisted env with non-top target heights as a request to delete // the ledger if self.last_height.is_none() { - // The default last height is the top when persisting - // and 0 when not persisting (clean ledger) - *self.last_height = Some(( - 0, - if is_persist { - HeightRequest::Top - } else { - HeightRequest::Absolute(0) - }, - )); + // The default last height is top + *self.last_height = Some((0, HeightRequest::Top)); // delete ledger because no last_height indicates a fresh env if !is_persist { let _ = tokio::fs::remove_dir_all(&ledger_path).await; } } - let last_height = self.last_height.as_mut().unwrap(); + let last_height = self.last_height.unwrap(); // TODO: only call this after unpacking the ledger // create the ledger path if it doesn't exist @@ -367,13 +373,13 @@ impl<'a> Reconcile<(), ReconcileError2> for LedgerReconciler<'a> { // If there is no pending height, check if there should be a pending height if self.pending_height.is_none() { // target height has been realized - if *last_height == target_height { + if last_height == target_height { return Ok(ReconcileStatus::default()); } // If the target height is the top, we can skip the ledger reconciler if target_height.1.is_top() { - *last_height = target_height; + *self.last_height = Some(target_height); if let Err(e) = self.state.db.set_last_height(Some(target_height)) { error!("failed to save last height to db: {e}"); } @@ -385,14 +391,16 @@ impl<'a> Reconcile<(), ReconcileError2> for LedgerReconciler<'a> { // If the target height is 0, we can delete the ledger if target_height.1.reset() { let _ = tokio::fs::remove_dir_all(&ledger_path).await; - *last_height = target_height; + *self.last_height = Some(target_height); if let Err(e) = self.state.db.set_last_height(Some(target_height)) { error!("failed to save last height to db: {e}"); } // Ledger operation is complete... 
immediately requeue because the ledger was // wiped - return Ok(ReconcileStatus::default().requeue_after(Duration::ZERO)); + return Ok(ReconcileStatus::default() + .add_scope("ledger/wipe") + .requeue_after(Duration::ZERO)); } // Target height is guaranteed to be different, not top, and not 0, which means @@ -443,7 +451,7 @@ impl<'a> Reconcile<(), ReconcileError2> for LedgerReconciler<'a> { match handle { // If the ledger was modified successfully, update the last height Ok(true) => { - *last_height = pending; + *self.last_height = Some(pending); if let Err(e) = self.state.db.set_last_height(Some(pending)) { error!("failed to save last height to db: {e}"); } @@ -485,14 +493,16 @@ impl<'a> Reconcile<(), ReconcileError2> for StorageVersionReconciler<'a> { .ok() }; - // wipe old storage when the version changes - if version_file_data != Some(*version) && path.exists() { - info!("Removing storage directory for version mismatch: {version_file_data:?} != {version:?}"); - let _ = tokio::fs::remove_dir_all(&path).await; - } else { - // return an empty status if the version is the same - return Ok(ReconcileStatus::empty()); - }; + if path.exists() { + // wipe old storage when the version changes + if version_file_data != Some(*version) { + info!("Removing storage directory for version mismatch: local {version_file_data:?} != remote {version:?}"); + let _ = tokio::fs::remove_dir_all(&path).await; + } else { + // return an empty status if the version is the same + return Ok(ReconcileStatus::default()); + }; + } DirectoryReconciler(path).reconcile().await?; @@ -505,6 +515,6 @@ impl<'a> Reconcile<(), ReconcileError2> for StorageVersionReconciler<'a> { })?; } - Ok(ReconcileStatus::default()) + Ok(ReconcileStatus::empty()) } } diff --git a/crates/agent/src/rpc/control.rs b/crates/agent/src/rpc/control.rs index 760872eb..d51f8907 100644 --- a/crates/agent/src/rpc/control.rs +++ b/crates/agent/src/rpc/control.rs @@ -4,6 +4,7 @@ use std::{net::IpAddr, path::PathBuf}; use snops_common::{ aot_cmds::AotCmd, + api::EnvInfo, binaries::{BinaryEntry, BinarySource}, define_rpc_mux, prelude::snarkos_status::SnarkOSLiteBlock, @@ -94,18 +95,21 @@ impl AgentService for AgentRpcServer { // Queue a reconcile immediately as we have received new state. // The reconciler will decide if anything has actually changed - self.state.update_agent_state(handshake.state).await; + self.state + .update_agent_state(handshake.state, handshake.env_info) + .await; Ok(()) } - async fn reconcile( + async fn set_agent_state( self, _: context::Context, target: AgentState, + env_info: Option<(EnvId, EnvInfo)>, ) -> Result<(), ReconcileError> { info!("Received reconcile request..."); - self.state.update_agent_state(target).await; + self.state.update_agent_state(target, env_info).await; Ok(()) } diff --git a/crates/agent/src/server.rs b/crates/agent/src/server.rs index 9700aad8..0dc8eac9 100644 --- a/crates/agent/src/server.rs +++ b/crates/agent/src/server.rs @@ -28,7 +28,10 @@ pub async fn start(listener: tokio::net::TcpListener, state: AppState) -> Result let app = Router::new() .route("/node", get(node_ws_handler)) .with_state(Arc::clone(&state)); - info!("axum router listening on: {}", listener.local_addr()?); + info!( + "Starting internal node RPC server on: {}", + listener.local_addr()? 
+ ); axum::serve(listener, app).await?; @@ -56,7 +59,7 @@ async fn handle_socket(mut socket: WebSocket, state: AppState) { let client = NodeServiceClient::new(tarpc::client::Config::default(), client_transport).spawn(); // store the client in state - tracing::info!("Node client connected"); + tracing::info!("Connection established with the node"); *node_client = Some(client); drop(node_client); diff --git a/crates/agent/src/state.rs b/crates/agent/src/state.rs index 7169c530..dbb239f9 100644 --- a/crates/agent/src/state.rs +++ b/crates/agent/src/state.rs @@ -19,7 +19,7 @@ use tracing::error; use crate::{cli::Cli, db::Database, log::ReloadHandler, metrics::Metrics, transfers::TransferTx}; -pub const NODE_GRACEFUL_SHUTDOWN_TIMEOUT: Duration = Duration::from_secs(10); +pub const NODE_GRACEFUL_SHUTDOWN_TIMEOUT: Duration = Duration::from_secs(30); pub type AppState = Arc; pub type ClientLock = Arc>>; @@ -136,7 +136,9 @@ impl GlobalState { self.node_client.read().await.clone() } - pub async fn update_agent_state(&self, state: AgentState) { + pub async fn update_agent_state(&self, state: AgentState, env_info: Option<(EnvId, EnvInfo)>) { + self.set_env_info(env_info.map(|(id, e)| (id, Arc::new(e)))) + .await; if let Err(e) = self.db.set_agent_state(&state) { error!("failed to save agent state to db: {e}"); } diff --git a/crates/aot/src/runner/mod.rs b/crates/aot/src/runner/mod.rs index 60e95914..3af5c6a9 100644 --- a/crates/aot/src/runner/mod.rs +++ b/crates/aot/src/runner/mod.rs @@ -5,7 +5,7 @@ use std::{ }; use aleo_std::StorageMode; -use anyhow::Result; +use anyhow::{anyhow, Result}; use clap::Args; use rpc::RpcClient; use snarkos_node::Node; @@ -116,13 +116,21 @@ impl Runner { let bft_ip = SocketAddr::new(bind_addr, self.bft); let metrics_ip = SocketAddr::new(bind_addr, self.metrics); - let account = Account::try_from(self.key.try_get()?)?; - - let genesis = if let Some(path) = self.genesis.as_ref() { - Block::read_le(std::fs::File::open(path)?)? - } else { - Block::read_le(N::genesis_bytes())? - }; + let account = Account::try_from( + self.key + .try_get() + .map_err(|e| e.context("obtain private key"))?, + )?; + + let genesis = + if let Some(path) = self.genesis.as_ref() { + Block::read_le(std::fs::File::open(path).map_err(|e| { + anyhow!(e).context(format!("open genesis file {}", path.display())) + })?) + .map_err(|e| anyhow!(e).context("parse genesis block from file"))? + } else { + Block::read_le(N::genesis_bytes())? + }; // conditionally create a checkpoint manager based on the presence // of a retention policy @@ -169,50 +177,47 @@ impl Runner { let shutdown = Arc::new(AtomicBool::new(false)); let _node = match self.node_type { - NodeType::Validator => { - Node::new_validator( - node_ip, - Some(bft_ip), - Some(rest_ip), - self.rest_rps, - account, - &self.peers, - &self.validators, - genesis, - None, - storage_mode.clone(), - false, - false, - shutdown, - ) - .await? - } - NodeType::Prover => { - Node::new_prover( - node_ip, - account, - &self.peers, - genesis, - storage_mode.clone(), - shutdown, - ) - .await? - } - NodeType::Client => { - Node::new_client( - node_ip, - Some(rest_ip), - self.rest_rps, - account, - &self.peers, - genesis, - None, - storage_mode.clone(), - false, - shutdown, - ) - .await? 
- } + NodeType::Validator => Node::new_validator( + node_ip, + Some(bft_ip), + Some(rest_ip), + self.rest_rps, + account, + &self.peers, + &self.validators, + genesis, + None, + storage_mode.clone(), + false, + false, + shutdown, + ) + .await + .map_err(|e| e.context("create validator"))?, + NodeType::Prover => Node::new_prover( + node_ip, + account, + &self.peers, + genesis, + storage_mode.clone(), + shutdown, + ) + .await + .map_err(|e| e.context("create prover"))?, + NodeType::Client => Node::new_client( + node_ip, + Some(rest_ip), + self.rest_rps, + account, + &self.peers, + genesis, + None, + storage_mode.clone(), + false, + shutdown, + ) + .await + .map_err(|e| e.context("create client"))?, }; // only monitor block updates if we have a checkpoint manager or agent status diff --git a/crates/aot/src/runner/rpc/mod.rs b/crates/aot/src/runner/rpc/mod.rs index e43936a6..50a84ccf 100644 --- a/crates/aot/src/runner/rpc/mod.rs +++ b/crates/aot/src/runner/rpc/mod.rs @@ -81,115 +81,119 @@ impl RpcClient { // ws connection loop tokio::spawn(async move { loop { - 'connection: { - let (mut ws_stream, _) = match connect_async(ws_req.to_owned()).await { - Ok(r) => r, - Err(e) => { - error!("An error occurred establishing the connection: {e}"); - break 'connection; + let (mut ws_stream, _) = match connect_async(ws_req.to_owned()).await { + Ok(r) => r, + Err(e) => { + error!("An error occurred establishing the connection: {e}"); + tokio::time::sleep(Duration::from_secs(1)).await; + continue; + } + }; + + let mut interval = tokio::time::interval(Duration::from_secs(PING_INTERVAL_SEC)); + let mut num_pings: u32 = 0; + + 'event: loop { + select! { + // ping + _ = interval.tick() => { + let mut payload = Vec::from(PING_HEADER); + payload.extend_from_slice(&num_pings.to_le_bytes()); + payload.extend_from_slice(&start_time.elapsed().as_micros().to_le_bytes()); + + let send = ws_stream.send(tungstenite::Message::Ping(payload)); + if tokio::time::timeout(Duration::from_secs(10), send).await.is_err() { + error!("The connection to the control plane was interrupted while sending ping"); + break 'event; + } } - }; - - let mut interval = - tokio::time::interval(Duration::from_secs(PING_INTERVAL_SEC)); - let mut num_pings: u32 = 0; - - 'event: loop { - select! 
{ - // ping - _ = interval.tick() => { - let mut payload = Vec::from(PING_HEADER); - payload.extend_from_slice(&num_pings.to_le_bytes()); - payload.extend_from_slice(&start_time.elapsed().as_micros().to_le_bytes()); - - let send = ws_stream.send(tungstenite::Message::Ping(payload)); - if tokio::time::timeout(Duration::from_secs(10), send).await.is_err() { - error!("The connection to the control plane was interrupted while sending ping"); - break 'event; - } + + // handle outgoing responses + msg = server_response_out.recv() => { + let Some(msg) = msg else { + error!("internal RPC channel closed"); + break 'event; + }; + let bin = bincode::serialize(&MuxedMessageOutgoing::Child(msg)).expect("failed to serialize response"); + let send = ws_stream.send(tungstenite::Message::Binary(bin)); + if tokio::time::timeout(Duration::from_secs(10), send).await.is_err() { + error!("The connection to the agent was interrupted while sending node message"); + break 'event; } + } - // handle outgoing responses - msg = server_response_out.recv() => { - let msg = msg.expect("internal RPC channel closed"); - let bin = bincode::serialize(&MuxedMessageOutgoing::Child(msg)).expect("failed to serialize response"); - let send = ws_stream.send(tungstenite::Message::Binary(bin)); - if tokio::time::timeout(Duration::from_secs(10), send).await.is_err() { - error!("The connection to the agent was interrupted while sending node message"); - break 'event; - } + // handle outgoing requests + msg = client_request_out.recv() => { + let Some(msg) = msg else { + error!("internal RPC channel closed"); + break 'event; + }; + let bin = bincode::serialize(&MuxedMessageOutgoing::Parent(msg)).expect("failed to serialize request"); + let send = ws_stream.send(tungstenite::Message::Binary(bin)); + if tokio::time::timeout(Duration::from_secs(10), send).await.is_err() { + error!("The connection to the agent was interrupted while sending node message"); + break 'event; } + } - // handle outgoing requests - msg = client_request_out.recv() => { - let msg = msg.expect("internal RPC channel closed"); - let bin = bincode::serialize(&MuxedMessageOutgoing::Parent(msg)).expect("failed to serialize request"); - let send = ws_stream.send(tungstenite::Message::Binary(bin)); - if tokio::time::timeout(Duration::from_secs(10), send).await.is_err() { - error!("The connection to the agent was interrupted while sending node message"); - break 'event; + // handle incoming messages + msg = ws_stream.next() => match msg { + Some(Ok(tungstenite::Message::Close(frame))) => { + match frame { + Some(frame) => info!("The agent closed the connection: {frame}"), + None => info!("The agent closed the connection"), } + break 'event; } - // handle incoming messages - msg = ws_stream.next() => match msg { - Some(Ok(tungstenite::Message::Close(frame))) => { - match frame { - Some(frame) => info!("The agent closed the connection: {frame}"), - None => info!("The agent closed the connection"), - } - break 'event; + Some(Ok(tungstenite::Message::Pong(payload))) => { + let mut payload = payload.as_slice(); + + // check the header + if !payload.starts_with(PING_HEADER) { + warn!("Received a pong payload with an invalid header prefix"); + continue; } - Some(Ok(tungstenite::Message::Pong(payload))) => { - let mut payload = payload.as_slice(); + payload = &payload[PING_HEADER.len()..]; + if payload.len() != PING_LENGTH { + warn!("Received a pong payload with an invalid length {}, expected {PING_LENGTH}", payload.len()); + continue; + } - // check the header - if 
!payload.starts_with(PING_HEADER) { - warn!("Received a pong payload with an invalid header prefix"); - continue; - } + let (left, right) = payload.split_at(size_of::()); + let ping_index = u32::from_le_bytes(left.try_into().unwrap()); + let _uptime_start = u128::from_le_bytes(right.try_into().unwrap()); - payload = &payload[PING_HEADER.len()..]; - if payload.len() != PING_LENGTH { - warn!("Received a pong payload with an invalid length {}, expected {PING_LENGTH}", payload.len()); - continue; - } + if ping_index != num_pings { + warn!("Received a pong payload with an invalid index {ping_index}, expected {num_pings}"); + continue; + } - let (left, right) = payload.split_at(size_of::()); - let ping_index = u32::from_le_bytes(left.try_into().unwrap()); - let _uptime_start = u128::from_le_bytes(right.try_into().unwrap()); + num_pings += 1; + } - if ping_index != num_pings { - warn!("Received a pong payload with an invalid index {ping_index}, expected {num_pings}"); + Some(Ok(tungstenite::Message::Binary(bin))) => { + let msg = match bincode::deserialize(&bin) { + Ok(msg) => msg, + Err(e) => { + error!("failed to deserialize a message from the agent: {e}"); continue; } + }; - num_pings += 1; - } - - Some(Ok(tungstenite::Message::Binary(bin))) => { - let msg = match bincode::deserialize(&bin) { - Ok(msg) => msg, - Err(e) => { - error!("failed to deserialize a message from the agent: {e}"); - continue; - } - }; - - match msg { - MuxedMessageIncoming::Child(msg) => server_request_in.send(msg).expect("internal RPC channel closed"), - MuxedMessageIncoming::Parent(msg) => client_response_in.send(msg).expect("internal RPC channel closed"), - } - } - - None | Some(Err(_)) => { - error!("The connection to the agent was interrupted"); - break 'event; + match msg { + MuxedMessageIncoming::Child(msg) => server_request_in.send(msg).expect("internal RPC channel closed"), + MuxedMessageIncoming::Parent(msg) => client_response_in.send(msg).expect("internal RPC channel closed"), } + } - Some(Ok(o)) => println!("{o:#?}"), + None | Some(Err(_)) => { + error!("The connection to the agent was interrupted"); + break 'event; } + + Some(Ok(o)) => println!("{o:#?}"), } } } diff --git a/crates/common/src/rpc/control/agent.rs b/crates/common/src/rpc/control/agent.rs index 34403458..dd600766 100644 --- a/crates/common/src/rpc/control/agent.rs +++ b/crates/common/src/rpc/control/agent.rs @@ -2,6 +2,7 @@ use std::net::IpAddr; use serde::{Deserialize, Serialize}; +use crate::api::EnvInfo; use crate::rpc::error::*; use crate::state::snarkos_status::SnarkOSLiteBlock; use crate::{ @@ -14,6 +15,7 @@ pub struct Handshake { pub jwt: Option, pub loki: Option, pub state: AgentState, + pub env_info: Option<(EnvId, EnvInfo)>, } /// The RPC service that agents implement as a server. @@ -28,7 +30,10 @@ pub trait AgentService { /// Control plane instructs the agent to reconcile towards a particular /// state. 
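// The `reconcile` RPC is renamed to `set_agent_state` below and gains an optional
// `(EnvId, EnvInfo)` argument; a matching `env_info` field is added to `Handshake`
// above. The control plane resolves this pair via `AgentState::map_env_id` when it
// connects to or reconciles an agent (see the controlplane server and state/reconcile
// hunks later in this same patch), so the agent receives environment and storage
// metadata together with the desired state.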
- async fn reconcile(to: AgentState) -> Result<(), ReconcileError>; + async fn set_agent_state( + to: AgentState, + env_info: Option<(EnvId, EnvInfo)>, + ) -> Result<(), ReconcileError>; /// Broadcast a transaction locally async fn broadcast_tx(tx: String) -> Result<(), AgentError>; diff --git a/crates/common/src/state/agent_state.rs b/crates/common/src/state/agent_state.rs index 861f7e11..986add39 100644 --- a/crates/common/src/state/agent_state.rs +++ b/crates/common/src/state/agent_state.rs @@ -20,6 +20,16 @@ impl AgentState { Self::Node(id, state) => Self::Node(id, Box::new(f(*state))), } } + + pub fn map_env_id(&self, f: F) -> Option + where + F: Fn(EnvId) -> Option, + { + match self { + Self::Inventory => None, + Self::Node(id, _) => f(*id), + } + } } impl DataFormat for AgentState { diff --git a/crates/common/src/state/agent_status.rs b/crates/common/src/state/agent_status.rs index b066b3fa..02f3d6ca 100644 --- a/crates/common/src/state/agent_status.rs +++ b/crates/common/src/state/agent_status.rs @@ -103,9 +103,6 @@ pub enum TransferStatusUpdate { time: DateTime, // The transfer's abort handle, if any. }, - // Client only - specifies a handle to abort the transfer task - #[serde(skip)] - Handle(AbortHandle), /// The transfer has made progress. Progress { /// The current number of bytes transferred. @@ -118,6 +115,9 @@ pub enum TransferStatusUpdate { }, /// The transfer has been cleaned up. Cleanup, + // Client only - specifies a handle to abort the transfer task + #[serde(skip)] + Handle(AbortHandle), } #[derive(Debug, Default, Clone, Serialize, Deserialize)] diff --git a/crates/controlplane/src/env/mod.rs b/crates/controlplane/src/env/mod.rs index 0f9775c1..0828880e 100644 --- a/crates/controlplane/src/env/mod.rs +++ b/crates/controlplane/src/env/mod.rs @@ -178,6 +178,7 @@ impl Environment { // maps of states and peers that are new to this environment let mut incoming_states = IndexMap::default(); + let mut updated_states = IndexMap::::default(); let mut incoming_peers = BiMap::default(); // set of resolved keys that will be present (new and old) @@ -209,9 +210,16 @@ impl Environment { // where the agent state is the same, insert the new state // otherwise keep the old state + // replace the key with a new one + let mut node = doc_node.to_owned(); + if let Some(key) = node.key.as_mut() { + *key = key.with_index(i); + } + // Skip delegating nodes that are already present in the node map if node_peers.contains_left(&node_key) { - info!("{env_id}: skipping node {node_key} - already configured"); + info!("{env_id}: updating node {node_key}"); + updated_states.insert(node_key, EnvNodeState::Internal(node)); continue; } @@ -219,14 +227,7 @@ impl Environment { Entry::Occupied(ent) => { Err(PrepareError::DuplicateNodeKey(ent.key().clone()))? 
} - Entry::Vacant(ent) => { - // replace the key with a new one - let mut node = doc_node.to_owned(); - if let Some(key) = node.key.as_mut() { - *key = key.with_index(i); - } - ent.insert(EnvNodeState::Internal(node)) - } + Entry::Vacant(ent) => ent.insert(EnvNodeState::Internal(node)), }; } } @@ -339,6 +340,7 @@ impl Environment { node_peers.extend(incoming_peers.into_iter()); node_states.extend(incoming_states.into_iter()); + node_states.extend(updated_states.into_iter()); } _ => warn!("ignored unimplemented document type"), diff --git a/crates/controlplane/src/server/mod.rs b/crates/controlplane/src/server/mod.rs index 5138b738..6f467483 100644 --- a/crates/controlplane/src/server/mod.rs +++ b/crates/controlplane/src/server/mod.rs @@ -180,6 +180,9 @@ async fn handle_socket( // attach the current known agent state to the handshake agent.state().clone_into(&mut handshake.state); + handshake.env_info = handshake + .state + .map_env_id(|id| state.get_env(id).map(|env| (id, env.info(&state)))); // mark the agent as connected, update the flags as well agent.mark_connected(client, query.flags); diff --git a/crates/controlplane/src/state/reconcile.rs b/crates/controlplane/src/state/reconcile.rs index 0549af6e..20620104 100644 --- a/crates/controlplane/src/state/reconcile.rs +++ b/crates/controlplane/src/state/reconcile.rs @@ -35,7 +35,12 @@ impl GlobalState { // if the client is present, queue a reconcile if let Some(client) = client { - handles.push(tokio::spawn(async move { client.reconcile(target).await })); + let env_info = target + .map_env_id(|env_id| self.get_env(env_id).map(|env| (env_id, env.info(self)))); + + handles.push(tokio::spawn(async move { + client.set_agent_state(target, env_info).await + })); // otherwise just change the agent state so it'll inventory on // reconnect diff --git a/crates/controlplane/src/state/rpc.rs b/crates/controlplane/src/state/rpc.rs index 6123d9b2..8c20a013 100644 --- a/crates/controlplane/src/state/rpc.rs +++ b/crates/controlplane/src/state/rpc.rs @@ -2,6 +2,7 @@ use std::{fmt::Display, time::Duration}; use serde::de::DeserializeOwned; use snops_common::{ + api::EnvInfo, rpc::{ control::agent::AgentServiceClient, error::{ReconcileError, SnarkosRequestError}, @@ -16,14 +17,16 @@ use crate::error::StateError; pub struct AgentClient(pub(crate) AgentServiceClient); impl AgentClient { - pub async fn reconcile( + pub async fn set_agent_state( &self, to: AgentState, + env_info: Option<(EnvId, EnvInfo)>, ) -> Result, RpcError> { let mut ctx = context::current(); ctx.deadline += Duration::from_secs(300); + self.0 - .reconcile(ctx, to.clone()) + .set_agent_state(ctx, to.clone(), env_info) .await .map(|res| res.map(|_| to)) } From 67e77398c9edef555820fc2ba4a51973d2ec45dd Mon Sep 17 00:00:00 2001 From: Meshiest Date: Mon, 25 Nov 2024 23:34:17 -0500 Subject: [PATCH 20/68] refactor(controlplane): remove prometheus caching, parallelize httpsd api --- crates/controlplane/src/env/mod.rs | 5 - crates/controlplane/src/server/prometheus.rs | 142 ++++++------------- crates/controlplane/src/state/global.rs | 6 +- crates/controlplane/src/state/reconcile.rs | 2 - 4 files changed, 46 insertions(+), 109 deletions(-) diff --git a/crates/controlplane/src/env/mod.rs b/crates/controlplane/src/env/mod.rs index 0828880e..2ba83d35 100644 --- a/crates/controlplane/src/env/mod.rs +++ b/crates/controlplane/src/env/mod.rs @@ -117,8 +117,6 @@ impl Environment { documents: Vec, state: Arc, ) -> Result { - state.prom_httpsd.lock().await.set_dirty(); - let prev_env = state.get_env(env_id); 
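// This patch (20/68) drops the cached, dirty-flagged httpsd response and instead
// rebuilds the Prometheus service-discovery list on every request, using rayon's
// `par_bridge` to map agents to static configs in parallel. A toy sketch of that
// pipeline shape; `discover` and `to_target` are illustrative stand-ins, not the
// snops types:

use rayon::iter::{ParallelBridge, ParallelIterator};

fn discover<'a, I>(
    agents: I,
    to_target: impl Fn(&str) -> Option<String> + Sync + Send,
) -> Vec<String>
where
    I: Iterator<Item = &'a str> + Send,
{
    agents
        .par_bridge()
        // Agents that cannot be scraped (e.g. no reachable address) are skipped.
        .filter_map(|agent| to_target(agent))
        .collect()
}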
let mut storage_doc = None; @@ -453,9 +451,6 @@ impl Environment { info!("[env {id}] unloaded storage {}", storage.id); } - trace!("[env {id}] marking prom as dirty"); - state.prom_httpsd.lock().await.set_dirty(); - trace!("[env {id}] inventorying agents..."); if let Err(e) = state diff --git a/crates/controlplane/src/server/prometheus.rs b/crates/controlplane/src/server/prometheus.rs index 8ba53962..d1a5ed1c 100644 --- a/crates/controlplane/src/server/prometheus.rs +++ b/crates/controlplane/src/server/prometheus.rs @@ -1,12 +1,12 @@ -use std::{collections::HashMap, fmt::Write}; +use std::collections::HashMap; use axum::{extract::State, response::IntoResponse, routing::get, Json, Router}; +use rayon::iter::{ParallelBridge, ParallelIterator}; use serde::Serialize; use snops_common::state::AgentState; -use tracing::debug; use super::AppState; -use crate::{cli::PrometheusLocation, env::EnvPeer}; +use crate::cli::PrometheusLocation; pub(super) fn routes() -> Router { Router::new().route("/httpsd", get(get_httpsd)) } @@ -17,102 +17,48 @@ pub struct StaticConfig { pub labels: HashMap, } -/// Caching container for the Prometheus HTTP service discovery response. Marked -/// 'dirty' when environment agents are reallocated. -#[derive(Debug, Clone, Default)] -pub enum HttpsdResponse { - #[default] - Dirty, - Clean(Vec), -} - -impl HttpsdResponse { - pub fn set_dirty(&mut self) { - *self = Self::Dirty; - } -} - async fn get_httpsd(State(state): State) -> impl IntoResponse { - let mut prom_httpsd = state.prom_httpsd.lock().await; - - let static_configs = match &*prom_httpsd { - // use the cached response - HttpsdResponse::Clean(static_configs) => static_configs.to_owned(), - - // recompute the response and save it - HttpsdResponse::Dirty => { - debug!("httpsd response is dirty, regenerating..."); - let mut static_configs = vec![]; - - for agent in state.pool.iter() { - let Some(mut agent_addr) = - (match (state.cli.prometheus_location, agent.has_label_str("local")) { - // agent is external: serve its external IP - (_, false) => agent - .addrs() - .and_then(|addrs| addrs.external.as_ref()) - .map(ToString::to_string), - - // prometheus and agent are local: use internal IP - (PrometheusLocation::Internal, true) => agent - .addrs() - .and_then(|addrs| addrs.internal.first()) - .map(ToString::to_string), - - // prometheus in docker but agent is local: use host.docker.internal - (PrometheusLocation::Docker, true) => { - Some(String::from("host.docker.internal")) - } - - // prometheus is external but agent is local: agent might not be forwarded; - // TODO - (PrometheusLocation::External, true) => continue, - }) - else { - continue; - }; - - match agent.state() { - AgentState::Node(env_id, _) => { - // get the environment this agent belongs to - let Some(env) = state.get_env(*env_id) else { - continue; - }; - - // get the node key that corresponds to this agent - let Some(node_key) = - env.node_peers.get_by_right(&EnvPeer::Internal(agent.id())) - else { - continue; - }; - - agent_addr - .write_fmt(format_args!(":{}", agent.metrics_port())) - .unwrap(); - - static_configs.push(StaticConfig { - targets: [agent_addr], - labels: [ - ("env_id".into(), env_id.to_string()), - ("node_key".into(), node_key.to_string()), - ] - .into_iter() - .collect(), - }); - } - - _ => { - // future-proofing; this comment also disables the - // clippy lint - } - } - } - - *prom_httpsd = HttpsdResponse::Clean(static_configs.to_owned()); - - static_configs - } - }; + let static_configs = state + .pool + .iter() + .par_bridge() + 
.filter_map(|agent| { + let agent_addr = (match (state.cli.prometheus_location, agent.has_label_str("local")) { + // agent is external: serve its external IP + (_, false) => agent + .addrs() + .and_then(|addrs| addrs.external.as_ref()) + .map(ToString::to_string), + + // prometheus and agent are local: use internal IP + (PrometheusLocation::Internal, true) => agent + .addrs() + .and_then(|addrs| addrs.internal.first()) + .map(ToString::to_string), + + // prometheus in docker but agent is local: use host.docker.internal + (PrometheusLocation::Docker, true) => Some(String::from("host.docker.internal")), + + // prometheus is external but agent is local: agent might not be forwarded; + // TODO + (PrometheusLocation::External, true) => return None, + })?; + + let AgentState::Node(env_id, node) = agent.state() else { + return None; + }; + + Some(StaticConfig { + targets: [format!("{agent_addr}:{}", agent.metrics_port())], + labels: [ + ("env_id".into(), env_id.to_string()), + ("node_key".into(), node.node_key.to_string()), + ] + .into_iter() + .collect(), + }) + }) + .collect::>(); Json(static_configs) } diff --git a/crates/controlplane/src/state/global.rs b/crates/controlplane/src/state/global.rs index 5061fb43..3594dd33 100644 --- a/crates/controlplane/src/state/global.rs +++ b/crates/controlplane/src/state/global.rs @@ -13,7 +13,7 @@ use snops_common::{ }, util::OpaqueDebug, }; -use tokio::sync::{Mutex, Semaphore}; +use tokio::sync::Semaphore; use tracing::info; use super::{ @@ -26,7 +26,7 @@ use crate::{ env::{cache::NetworkCache, error::EnvRequestError, Environment, PortType}, error::StateError, schema::storage::{LoadedStorage, STORAGE_DIR}, - server::{error::StartError, prometheus::HttpsdResponse}, + server::error::StartError, ReloadHandler, }; @@ -45,7 +45,6 @@ pub struct GlobalState { pub envs: EnvMap, pub env_network_cache: OpaqueDebug>, - pub prom_httpsd: Mutex, pub prometheus: OpaqueDebug>, pub log_level_handler: ReloadHandler, @@ -95,7 +94,6 @@ impl GlobalState { pool, storage, envs: EnvMap::default(), - prom_httpsd: Default::default(), prometheus: OpaqueDebug(prometheus), db: OpaqueDebug(db), env_network_cache: Default::default(), diff --git a/crates/controlplane/src/state/reconcile.rs b/crates/controlplane/src/state/reconcile.rs index 20620104..44ae9f81 100644 --- a/crates/controlplane/src/state/reconcile.rs +++ b/crates/controlplane/src/state/reconcile.rs @@ -92,8 +92,6 @@ impl GlobalState { num_reconciliations ); - self.prom_httpsd.lock().await.set_dirty(); - if success == num_reconciliations { Ok(()) } else { From 79939c071d99b7bc0a70f00fb22c5a406d0df380 Mon Sep 17 00:00:00 2001 From: Meshiest Date: Mon, 25 Nov 2024 23:35:13 -0500 Subject: [PATCH 21/68] chore(controlplane): fix prom types --- crates/controlplane/src/server/prometheus.rs | 6 +++--- 1 file changed, 3 insertions(+), 3 deletions(-) diff --git a/crates/controlplane/src/server/prometheus.rs b/crates/controlplane/src/server/prometheus.rs index d1a5ed1c..0bbf990c 100644 --- a/crates/controlplane/src/server/prometheus.rs +++ b/crates/controlplane/src/server/prometheus.rs @@ -14,7 +14,7 @@ pub(super) fn routes() -> Router { #[derive(Debug, Clone, Serialize)] pub struct StaticConfig { pub targets: [String; 1], - pub labels: HashMap, + pub labels: HashMap<&'static str, String>, } async fn get_httpsd(State(state): State) -> impl IntoResponse { @@ -51,8 +51,8 @@ async fn get_httpsd(State(state): State) -> impl IntoResponse { Some(StaticConfig { targets: [format!("{agent_addr}:{}", agent.metrics_port())], labels: [ - 
("env_id".into(), env_id.to_string()), - ("node_key".into(), node.node_key.to_string()), + ("env_id", env_id.to_string()), + ("node_key", node.node_key.to_string()), ] .into_iter() .collect(), From 7ee48ad2473705ec9da478f7ba7b880e10b144eb Mon Sep 17 00:00:00 2001 From: Meshiest Date: Wed, 27 Nov 2024 00:34:18 -0500 Subject: [PATCH 22/68] refactor(agent,controlplane): add address updating, convert old reconcile into set_agent_state, add EnvInfo lite for agents, remove dead code --- crates/agent/src/api.rs | 12 +- crates/agent/src/db.rs | 9 +- crates/agent/src/reconcile/address.rs | 10 +- crates/agent/src/reconcile/agent.rs | 29 ++-- crates/agent/src/reconcile/command.rs | 10 +- crates/agent/src/reconcile/files.rs | 24 +-- crates/agent/src/reconcile/process.rs | 10 +- crates/agent/src/reconcile/state.rs | 8 +- crates/agent/src/reconcile/storage.rs | 52 +++--- crates/agent/src/rpc/control.rs | 74 ++++----- crates/agent/src/state.rs | 25 +-- crates/common/src/api.rs | 82 +++++++++ crates/common/src/rpc/control/agent.rs | 13 +- crates/common/src/rpc/control/mod.rs | 4 +- crates/common/src/rpc/error.rs | 30 +--- crates/common/src/state/agent_state.rs | 7 + crates/common/src/state/node_state.rs | 18 -- crates/controlplane/src/env/error.rs | 7 - crates/controlplane/src/env/mod.rs | 155 +++++++++++++++--- crates/controlplane/src/env/reconcile.rs | 28 +--- crates/controlplane/src/error.rs | 3 - .../controlplane/src/server/actions/config.rs | 14 +- .../controlplane/src/server/actions/power.rs | 25 +-- crates/controlplane/src/server/error.rs | 5 - crates/controlplane/src/server/mod.rs | 60 ++++--- crates/controlplane/src/server/rpc.rs | 45 +++-- crates/controlplane/src/state/agent.rs | 15 +- crates/controlplane/src/state/error.rs | 10 -- crates/controlplane/src/state/mod.rs | 1 - crates/controlplane/src/state/reconcile.rs | 92 +++++------ crates/controlplane/src/state/rpc.rs | 24 +-- 31 files changed, 482 insertions(+), 419 deletions(-) delete mode 100644 crates/controlplane/src/state/error.rs diff --git a/crates/agent/src/api.rs b/crates/agent/src/api.rs index 0e42e8e2..ab6cfcc1 100644 --- a/crates/agent/src/api.rs +++ b/crates/agent/src/api.rs @@ -12,7 +12,7 @@ use reqwest::IntoUrl; use sha2::{Digest, Sha256}; use snops_common::{ binaries::{BinaryEntry, BinarySource}, - rpc::error::ReconcileError2, + rpc::error::ReconcileError, state::{TransferId, TransferStatusUpdate}, util::sha256_file, }; @@ -196,14 +196,14 @@ pub async fn get_file_issues( size: Option, sha256: Option<&str>, offline: bool, -) -> Result, ReconcileError2> { +) -> Result, ReconcileError> { if !dst.try_exists().unwrap_or(false) { return Ok(Some(BadFileReason::NotFound)); } let meta = tokio::fs::metadata(&dst) .await - .map_err(|e| ReconcileError2::FileStatError(dst.to_path_buf(), e.to_string()))?; + .map_err(|e| ReconcileError::FileStatError(dst.to_path_buf(), e.to_string()))?; let local_content_length = meta.len(); // if the binary entry is provided, check if the file size and sha256 match @@ -215,7 +215,7 @@ pub async fn get_file_issues( // if sha256 is present, only download if the sha256 is different if let Some(sha256) = sha256 { let bad_sha256 = sha256_file(&dst.to_path_buf()) - .map_err(|e| ReconcileError2::FileReadError(dst.to_path_buf(), e.to_string()))? + .map_err(|e| ReconcileError::FileReadError(dst.to_path_buf(), e.to_string()))? 
!= sha256.to_ascii_lowercase(); return Ok(bad_sha256.then_some(BadFileReason::Sha256)); } @@ -230,7 +230,7 @@ pub async fn get_file_issues( .head(src) .send() .await - .map_err(|e| ReconcileError2::HttpError { + .map_err(|e| ReconcileError::HttpError { method: String::from("HEAD"), url: src.to_owned(), error: e.to_string(), @@ -257,7 +257,7 @@ pub async fn get_file_issues( let remote_last_modified = httpdate::parse_http_date(last_modified_header); let local_last_modified = meta .modified() - .map_err(|e| ReconcileError2::FileStatError(dst.to_path_buf(), e.to_string()))?; + .map_err(|e| ReconcileError::FileStatError(dst.to_path_buf(), e.to_string()))?; let is_stale = remote_last_modified .map(|res| res > local_last_modified) diff --git a/crates/agent/src/db.rs b/crates/agent/src/db.rs index 2ea92945..c432d483 100644 --- a/crates/agent/src/db.rs +++ b/crates/agent/src/db.rs @@ -7,7 +7,7 @@ use std::{ use indexmap::IndexMap; use snops_common::{ - api::EnvInfo, + api::AgentEnvInfo, db::{ error::DatabaseError, tree::{DbRecords, DbTree}, @@ -122,13 +122,16 @@ impl Database { .and_then(|url| url.parse::().ok()) } - pub fn env_info(&self) -> Result)>, DatabaseError> { + pub fn env_info(&self) -> Result)>, DatabaseError> { self.documents .restore(&AgentDbString::EnvInfo) .map_err(DatabaseError::from) } - pub fn set_env_info(&self, info: Option<(EnvId, Arc)>) -> Result<(), DatabaseError> { + pub fn set_env_info( + &self, + info: Option<(EnvId, Arc)>, + ) -> Result<(), DatabaseError> { self.documents .save_option(&AgentDbString::EnvInfo, info.as_ref()) } diff --git a/crates/agent/src/reconcile/address.rs b/crates/agent/src/reconcile/address.rs index f663af04..42f2d5e6 100644 --- a/crates/agent/src/reconcile/address.rs +++ b/crates/agent/src/reconcile/address.rs @@ -1,7 +1,7 @@ use std::{collections::HashSet, sync::Arc}; use snops_common::{ - rpc::error::ReconcileError2, + rpc::error::ReconcileError, state::{AgentId, AgentPeer, NodeState}, }; use tarpc::context; @@ -18,8 +18,8 @@ pub struct AddressResolveReconciler { pub node: Arc, } -impl Reconcile<(), ReconcileError2> for AddressResolveReconciler { - async fn reconcile(&mut self) -> Result, ReconcileError2> { +impl Reconcile<(), ReconcileError> for AddressResolveReconciler { + async fn reconcile(&mut self) -> Result, ReconcileError> { let AddressResolveReconciler { state, node } = self; // Find agents that do not have cached addresses @@ -67,8 +67,8 @@ impl Reconcile<(), ReconcileError2> for AddressResolveReconciler { let new_addrs = client .resolve_addrs(context::current(), unresolved_addrs) .await - .map_err(|e| ReconcileError2::RpcError(e.to_string()))? - .map_err(ReconcileError2::AddressResolve)?; + .map_err(|e| ReconcileError::RpcError(e.to_string()))? 
+ .map_err(ReconcileError::AddressResolve)?; tracing::trace!( "Resolved new addrs: {}", diff --git a/crates/agent/src/reconcile/agent.rs b/crates/agent/src/reconcile/agent.rs index 4c4e7cc6..0e9a3e87 100644 --- a/crates/agent/src/reconcile/agent.rs +++ b/crates/agent/src/reconcile/agent.rs @@ -6,7 +6,7 @@ use std::{ use snops_common::{ binaries::BinaryEntry, - rpc::error::ReconcileError2, + rpc::error::ReconcileError, state::{AgentState, HeightRequest, TransferId}, }; use tokio::{ @@ -170,7 +170,7 @@ impl AgentStateReconciler { } } - pub async fn reconcile_inventory(&mut self) -> Result, ReconcileError2> { + pub async fn reconcile_inventory(&mut self) -> Result, ReconcileError> { // TODO: cleanup other things // End the process if it is running @@ -201,8 +201,8 @@ impl AgentStateReconciler { } } -impl Reconcile<(), ReconcileError2> for AgentStateReconciler { - async fn reconcile(&mut self) -> Result, ReconcileError2> { +impl Reconcile<(), ReconcileError> for AgentStateReconciler { + async fn reconcile(&mut self) -> Result, ReconcileError> { let (env_id, node) = match self.agent_state.as_ref() { AgentState::Inventory => { return self.reconcile_inventory().await; @@ -249,6 +249,17 @@ impl Reconcile<(), ReconcileError2> for AgentStateReconciler { let node_arc = Arc::new(*node.clone()); + // Resolve the addresses of the peers and validators + // This is run before the process is started, as the agent can sometimes have + // new addresses that need to be resolved. + reconcile!( + address_resolve, + AddressResolveReconciler { + node: Arc::clone(&node_arc), + state: Arc::clone(&self.state), + } + ); + // Reconcile behavior while the node is running... if let Some(process) = self.context.process.as_mut() { // If the process has exited, clear the process context @@ -346,16 +357,6 @@ impl Reconcile<(), ReconcileError2> for AgentStateReconciler { } ); - // Resolve the addresses of the peers and validators - // TODO: Set an expiry for resolved addresses - reconcile!( - address_resolve, - AddressResolveReconciler { - node: Arc::clone(&node_arc), - state: Arc::clone(&self.state), - } - ); - // TODO: if possible, use the NodeCommand as configuration for a node service to // allow running the node outside of the agent diff --git a/crates/agent/src/reconcile/command.rs b/crates/agent/src/reconcile/command.rs index 94bd95b5..7fe7a971 100644 --- a/crates/agent/src/reconcile/command.rs +++ b/crates/agent/src/reconcile/command.rs @@ -3,11 +3,11 @@ use std::{net::IpAddr, ops::Deref, path::PathBuf, process::Stdio, sync::Arc}; use indexmap::IndexMap; use snops_checkpoint::RetentionPolicy; use snops_common::{ - api::EnvInfo, + api::AgentEnvInfo, constant::{ LEDGER_BASE_DIR, LEDGER_PERSIST_DIR, SNARKOS_FILE, SNARKOS_GENESIS_FILE, SNARKOS_LOG_FILE, }, - rpc::error::ReconcileError2, + rpc::error::ReconcileError, state::{EnvId, KeyState, NetworkId, NodeKey, NodeState, PortConfig}, }; use tokio::process::Command; @@ -60,8 +60,8 @@ impl NodeCommand { state: Arc, node: Arc, env_id: EnvId, - env_info: Arc, - ) -> Result { + env_info: Arc, + ) -> Result { let storage_path = state .cli .storage_path(env_info.network, env_info.storage.id); @@ -101,7 +101,7 @@ impl NodeCommand { .cli .private_key_file .clone() - .ok_or(ReconcileError2::MissingLocalPrivateKey)?, + .ok_or(ReconcileError::MissingLocalPrivateKey)?, ) } else { None diff --git a/crates/agent/src/reconcile/files.rs b/crates/agent/src/reconcile/files.rs index 51911a11..9255d1f8 100644 --- a/crates/agent/src/reconcile/files.rs +++ b/crates/agent/src/reconcile/files.rs 
@@ -7,10 +7,10 @@ use std::{ use chrono::{TimeDelta, Utc}; use snops_common::{ - api::EnvInfo, + api::AgentEnvInfo, binaries::{BinaryEntry, BinarySource}, constant::SNARKOS_GENESIS_FILE, - rpc::error::ReconcileError2, + rpc::error::ReconcileError, state::{NetworkId, StorageId, TransferId, TransferStatusUpdate}, }; use tracing::{error, trace, warn}; @@ -23,7 +23,7 @@ use crate::{ transfers, }; -pub fn default_binary(info: &EnvInfo) -> BinaryEntry { +pub fn default_binary(info: &AgentEnvInfo) -> BinaryEntry { BinaryEntry { source: BinarySource::Path(PathBuf::from(format!( "/content/storage/{}/{}/binaries/default", @@ -40,11 +40,11 @@ pub fn get_genesis_route(endpoint: &str, network: NetworkId, storage_id: Storage /// This reconciler creates a directory if it does not exist pub struct DirectoryReconciler<'a>(pub &'a Path); -impl<'a> Reconcile<(), ReconcileError2> for DirectoryReconciler<'a> { - async fn reconcile(&mut self) -> Result, ReconcileError2> { +impl<'a> Reconcile<(), ReconcileError> for DirectoryReconciler<'a> { + async fn reconcile(&mut self) -> Result, ReconcileError> { std::fs::create_dir_all(self.0) .map(ReconcileStatus::with) - .map_err(|e| ReconcileError2::CreateDirectory(self.0.to_path_buf(), e.to_string())) + .map_err(|e| ReconcileError::CreateDirectory(self.0.to_path_buf(), e.to_string())) } } @@ -96,7 +96,7 @@ impl FileReconciler { self } - pub fn check_and_set_mode(&self) -> Result<(), ReconcileError2> { + pub fn check_and_set_mode(&self) -> Result<(), ReconcileError> { // ensure the file has the correct permissions let Some(check_perms) = self.permissions else { return Ok(()); @@ -105,13 +105,13 @@ impl FileReconciler { let perms = self .dst .metadata() - .map_err(|e| ReconcileError2::FileStatError(self.dst.clone(), e.to_string()))? + .map_err(|e| ReconcileError::FileStatError(self.dst.clone(), e.to_string()))? 
.permissions(); if perms.mode() != check_perms { std::fs::set_permissions(&self.dst, std::fs::Permissions::from_mode(check_perms)) .map_err(|e| { - ReconcileError2::FilePermissionError(self.dst.clone(), e.to_string()) + ReconcileError::FilePermissionError(self.dst.clone(), e.to_string()) })?; } @@ -119,8 +119,8 @@ impl FileReconciler { } } -impl Reconcile for FileReconciler { - async fn reconcile(&mut self) -> Result, ReconcileError2> { +impl Reconcile for FileReconciler { + async fn reconcile(&mut self) -> Result, ReconcileError> { let client = reqwest::Client::new(); // Create a transfer id if one is not provided @@ -208,7 +208,7 @@ impl Reconcile for FileReconciler { if self.dst.exists() { // delete the file tokio::fs::remove_file(&self.dst).await.map_err(|e| { - ReconcileError2::DeleteFileError(self.dst.clone(), e.to_string()) + ReconcileError::DeleteFileError(self.dst.clone(), e.to_string()) })?; } diff --git a/crates/agent/src/reconcile/process.rs b/crates/agent/src/reconcile/process.rs index fd799b27..d5f28cb4 100644 --- a/crates/agent/src/reconcile/process.rs +++ b/crates/agent/src/reconcile/process.rs @@ -1,6 +1,6 @@ use std::time::{Duration, Instant}; -use snops_common::rpc::error::ReconcileError2; +use snops_common::rpc::error::ReconcileError; use tokio::{process::Child, select}; use tracing::{error, info}; @@ -24,7 +24,7 @@ pub struct ProcessContext { } impl ProcessContext { - pub fn new(command: NodeCommand) -> Result { + pub fn new(command: NodeCommand) -> Result { command .build() .spawn() @@ -37,7 +37,7 @@ impl ProcessContext { }) .map_err(|e| { error!("failed to start node process: {e:?}"); - ReconcileError2::SpawnError(e.to_string()) + ReconcileError::SpawnError(e.to_string()) }) } @@ -120,8 +120,8 @@ impl ProcessContext { /// before sending a SIGKILL (if the childi process has not exited), pub struct EndProcessReconciler<'a>(pub &'a mut ProcessContext); -impl<'a> Reconcile<(), ReconcileError2> for EndProcessReconciler<'a> { - async fn reconcile(&mut self) -> Result, ReconcileError2> { +impl<'a> Reconcile<(), ReconcileError> for EndProcessReconciler<'a> { + async fn reconcile(&mut self) -> Result, ReconcileError> { if !self.0.is_running() { return Ok(ReconcileStatus::default()); } diff --git a/crates/agent/src/reconcile/state.rs b/crates/agent/src/reconcile/state.rs index 26e080d3..cc2a69e0 100644 --- a/crates/agent/src/reconcile/state.rs +++ b/crates/agent/src/reconcile/state.rs @@ -1,5 +1,5 @@ use snops_common::{ - api::EnvInfo, + api::AgentEnvInfo, format::{DataFormat, DataHeaderOf}, state::{NetworkId, StorageId}, }; @@ -11,15 +11,15 @@ pub struct EnvState { } impl EnvState { - pub fn changed(&self, env_info: &EnvInfo) -> bool { + pub fn changed(&self, env_info: &AgentEnvInfo) -> bool { env_info.storage.version != self.storage_version || env_info.storage.id != self.storage_id || env_info.network != self.network_id } } -impl From<&EnvInfo> for EnvState { - fn from(info: &EnvInfo) -> Self { +impl From<&AgentEnvInfo> for EnvState { + fn from(info: &AgentEnvInfo) -> Self { Self { network_id: info.network, storage_id: info.storage.id, diff --git a/crates/agent/src/reconcile/storage.rs b/crates/agent/src/reconcile/storage.rs index 69f9f64e..eddfd2f8 100644 --- a/crates/agent/src/reconcile/storage.rs +++ b/crates/agent/src/reconcile/storage.rs @@ -7,12 +7,12 @@ use std::{ use lazysort::SortedBy; use snops_checkpoint::CheckpointManager; use snops_common::{ - api::EnvInfo, + api::AgentEnvInfo, binaries::{BinaryEntry, BinarySource}, constant::{ LEDGER_BASE_DIR, 
LEDGER_PERSIST_DIR, SNARKOS_FILE, SNARKOS_GENESIS_FILE, VERSION_FILE, }, - rpc::error::ReconcileError2, + rpc::error::ReconcileError, state::{HeightRequest, InternedId, TransferId}, }; use tokio::{process::Command, sync::Mutex, task::AbortHandle}; @@ -28,7 +28,7 @@ use crate::state::GlobalState; /// Download a specific binary file needed to run the node pub struct BinaryReconciler<'a> { pub state: Arc, - pub env_info: Arc, + pub env_info: Arc, pub node_binary: Option, /// Metadata about an active binary transfer pub transfer: &'a mut Option<(TransferId, BinaryEntry)>, @@ -36,8 +36,8 @@ pub struct BinaryReconciler<'a> { pub ok_at: &'a mut Option, } -impl<'a> Reconcile<(), ReconcileError2> for BinaryReconciler<'a> { - async fn reconcile(&mut self) -> Result, ReconcileError2> { +impl<'a> Reconcile<(), ReconcileError> for BinaryReconciler<'a> { + async fn reconcile(&mut self) -> Result, ReconcileError> { let BinaryReconciler { state, env_info, @@ -79,7 +79,7 @@ impl<'a> Reconcile<(), ReconcileError2> for BinaryReconciler<'a> { BinarySource::Path(path) => { let url = format!("{}{}", &state.endpoint, path.display()); url.parse::() - .map_err(|e| ReconcileError2::UrlParseError(url, e.to_string()))? + .map_err(|e| ReconcileError::UrlParseError(url, e.to_string()))? } }; @@ -120,15 +120,15 @@ impl<'a> Reconcile<(), ReconcileError2> for BinaryReconciler<'a> { /// Download the genesis block needed to run the node pub struct GenesisReconciler<'a> { pub state: Arc, - pub env_info: Arc, + pub env_info: Arc, /// Metadata about an active genesis transfer pub transfer: &'a mut Option, /// Time the genesis was marked as OK pub ok_at: &'a mut Option, } -impl<'a> Reconcile<(), ReconcileError2> for GenesisReconciler<'a> { - async fn reconcile(&mut self) -> Result, ReconcileError2> { +impl<'a> Reconcile<(), ReconcileError> for GenesisReconciler<'a> { + async fn reconcile(&mut self) -> Result, ReconcileError> { let GenesisReconciler { state, env_info, @@ -161,7 +161,7 @@ impl<'a> Reconcile<(), ReconcileError2> for GenesisReconciler<'a> { let mut file_rec = FileReconciler::new( Arc::clone(&self.state), genesis_url.parse::().map_err(|e| { - ReconcileError2::UrlParseError(genesis_url.to_string(), e.to_string()) + ReconcileError::UrlParseError(genesis_url.to_string(), e.to_string()) })?, genesis_file, ) @@ -198,11 +198,11 @@ impl<'a> Reconcile<(), ReconcileError2> for GenesisReconciler<'a> { } } -pub type LedgerModifyResult = Result; +pub type LedgerModifyResult = Result; pub struct LedgerReconciler<'a> { pub state: Arc, - pub env_info: Arc, + pub env_info: Arc, pub target_height: (usize, HeightRequest), pub last_height: &'a mut Option<(usize, HeightRequest)>, pub pending_height: &'a mut Option<(usize, HeightRequest)>, @@ -230,7 +230,7 @@ impl<'a> LedgerReconciler<'a> { /// Find the checkpoint to apply to the ledger /// Guaranteed error when target height is not the top, 0, or unlimited span - pub fn find_checkpoint(&self) -> Result { + pub fn find_checkpoint(&self) -> Result { let (untar_base, ledger_dir) = self.untar_paths(); let ledger_path = untar_base.join(ledger_dir); @@ -246,13 +246,11 @@ impl<'a> LedgerReconciler<'a> { trace!("loading checkpoints from {untar_base:?}..."); CheckpointManager::load(ledger_path.clone(), policy).map_err(|e| { error!("failed to load checkpoints: {e}"); - ReconcileError2::CheckpointLoadError(e.to_string()) + ReconcileError::CheckpointLoadError(e.to_string()) }) }) .transpose()? 
- .ok_or(ReconcileError2::MissingRetentionPolicy( - self.target_height.1, - ))?; + .ok_or(ReconcileError::MissingRetentionPolicy(self.target_height.1))?; // Determine which checkpoint to use by the next available height/time match self.target_height.1 { @@ -269,9 +267,7 @@ impl<'a> LedgerReconciler<'a> { // top cannot be a target height _ => None, } - .ok_or(ReconcileError2::NoAvailableCheckpoints( - self.target_height.1, - )) + .ok_or(ReconcileError::NoAvailableCheckpoints(self.target_height.1)) .cloned() } @@ -316,7 +312,7 @@ impl<'a> LedgerReconciler<'a> { .spawn() .map_err(|e| { error!("failed to spawn checkpoint apply process: {e}"); - mutex.replace(Err(ReconcileError2::CheckpointApplyError(String::from( + mutex.replace(Err(ReconcileError::CheckpointApplyError(String::from( "spawn checkpoint apply process", )))); })? @@ -324,7 +320,7 @@ impl<'a> LedgerReconciler<'a> { .await .map_err(|e| { error!("failed to await checkpoint apply process: {e}"); - mutex.replace(Err(ReconcileError2::CheckpointApplyError(String::from( + mutex.replace(Err(ReconcileError::CheckpointApplyError(String::from( "await checkpoint apply process", )))); })?; @@ -339,8 +335,8 @@ impl<'a> LedgerReconciler<'a> { } } -impl<'a> Reconcile<(), ReconcileError2> for LedgerReconciler<'a> { - async fn reconcile(&mut self) -> Result, ReconcileError2> { +impl<'a> Reconcile<(), ReconcileError> for LedgerReconciler<'a> { + async fn reconcile(&mut self) -> Result, ReconcileError> { let env_info = self.env_info.clone(); let target_height = self.target_height; @@ -477,8 +473,8 @@ impl<'a> Reconcile<(), ReconcileError2> for LedgerReconciler<'a> { pub struct StorageVersionReconciler<'a>(pub &'a Path, pub u16); -impl<'a> Reconcile<(), ReconcileError2> for StorageVersionReconciler<'a> { - async fn reconcile(&mut self) -> Result, ReconcileError2> { +impl<'a> Reconcile<(), ReconcileError> for StorageVersionReconciler<'a> { + async fn reconcile(&mut self) -> Result, ReconcileError> { let StorageVersionReconciler(path, version) = self; let version_file = path.join(VERSION_FILE); @@ -488,7 +484,7 @@ impl<'a> Reconcile<(), ReconcileError2> for StorageVersionReconciler<'a> { } else { tokio::fs::read_to_string(&version_file) .await - .map_err(|e| ReconcileError2::FileReadError(version_file.clone(), e.to_string()))? + .map_err(|e| ReconcileError::FileReadError(version_file.clone(), e.to_string()))? 
.parse() .ok() }; @@ -511,7 +507,7 @@ impl<'a> Reconcile<(), ReconcileError2> for StorageVersionReconciler<'a> { .await .map_err(|e| { error!("failed to write storage version: {e}"); - ReconcileError2::CreateDirectory(version_file.to_path_buf(), e.to_string()) + ReconcileError::CreateDirectory(version_file.to_path_buf(), e.to_string()) })?; } diff --git a/crates/agent/src/rpc/control.rs b/crates/agent/src/rpc/control.rs index d51f8907..e5cc6d52 100644 --- a/crates/agent/src/rpc/control.rs +++ b/crates/agent/src/rpc/control.rs @@ -4,7 +4,6 @@ use std::{net::IpAddr, path::PathBuf}; use snops_common::{ aot_cmds::AotCmd, - api::EnvInfo, binaries::{BinaryEntry, BinarySource}, define_rpc_mux, prelude::snarkos_status::SnarkOSLiteBlock, @@ -16,11 +15,11 @@ use snops_common::{ }, ControlServiceClient, ControlServiceRequest, ControlServiceResponse, }, - error::{AgentError, ReconcileError, SnarkosRequestError}, + error::{AgentError, SnarkosRequestError}, }, - state::{AgentState, EnvId, InternedId, NetworkId, PortConfig}, + state::{AgentId, AgentState, EnvId, InternedId, NetworkId, PortConfig}, }; -use tarpc::context; +use tarpc::context::Context; use tracing::{error, info, trace}; use crate::{api, log::make_env_filter, metrics::MetricComputer, state::AppState}; @@ -38,22 +37,17 @@ pub struct AgentRpcServer { } impl AgentService for AgentRpcServer { - async fn kill(self, _: context::Context) { + async fn kill(self, _: Context) { info!("Kill RPC invoked..."); self.state.shutdown().await; } - async fn handshake( - self, - context: context::Context, - handshake: Handshake, - ) -> Result<(), ReconcileError> { + async fn handshake(self, context: Context, handshake: Handshake) { if let Some(token) = handshake.jwt { // cache the JWT in the state JWT mutex - self.state - .db - .set_jwt(Some(token)) - .map_err(|_| ReconcileError::Database)?; + if let Err(e) = self.state.db.set_jwt(Some(token)) { + error!("failed to save JWT to db: {e}"); + } } // store loki server URL @@ -95,25 +89,23 @@ impl AgentService for AgentRpcServer { // Queue a reconcile immediately as we have received new state. 
// The reconciler will decide if anything has actually changed - self.state - .update_agent_state(handshake.state, handshake.env_info) - .await; + self.state.update_agent_state(handshake.state).await; + } - Ok(()) + async fn set_agent_state(self, _: Context, target: AgentState) { + info!("Received new agent state, queuing reconcile..."); + self.state.update_agent_state(target).await; } - async fn set_agent_state( - self, - _: context::Context, - target: AgentState, - env_info: Option<(EnvId, EnvInfo)>, - ) -> Result<(), ReconcileError> { - info!("Received reconcile request..."); - self.state.update_agent_state(target, env_info).await; - Ok(()) + async fn clear_peer_addr(self, _: Context, agent_id: AgentId) { + self.state + .resolved_addrs + .write() + .await + .swap_remove(&agent_id); } - async fn get_addrs(self, _: context::Context) -> (PortConfig, Option, Vec) { + async fn get_addrs(self, _: Context) -> (PortConfig, Option, Vec) { ( self.state.cli.ports, self.state.external_addr, @@ -121,11 +113,7 @@ impl AgentService for AgentRpcServer { ) } - async fn snarkos_get( - self, - _: context::Context, - route: String, - ) -> Result { + async fn snarkos_get(self, _: Context, route: String) -> Result { self.state .get_node_client() .await @@ -169,7 +157,7 @@ impl AgentService for AgentRpcServer { .map_err(|err| SnarkosRequestError::JsonSerializeError(err.to_string())) } - async fn broadcast_tx(self, _: context::Context, tx: String) -> Result<(), AgentError> { + async fn broadcast_tx(self, _: Context, tx: String) -> Result<(), AgentError> { self.state .get_node_client() .await @@ -218,7 +206,7 @@ impl AgentService for AgentRpcServer { } } - async fn get_metric(self, _: context::Context, metric: AgentMetric) -> f64 { + async fn get_metric(self, _: Context, metric: AgentMetric) -> f64 { let metrics = self.state.metrics.read().await; match metric { @@ -228,7 +216,7 @@ impl AgentService for AgentRpcServer { async fn execute_authorization( self, - _: context::Context, + _: Context, env_id: EnvId, network: NetworkId, query: String, @@ -301,7 +289,7 @@ impl AgentService for AgentRpcServer { } } - async fn set_log_level(self, _: context::Context, level: String) -> Result<(), AgentError> { + async fn set_log_level(self, _: Context, level: String) -> Result<(), AgentError> { tracing::debug!("setting log level to {level}"); let level: tracing_subscriber::filter::LevelFilter = level .parse() @@ -314,11 +302,7 @@ impl AgentService for AgentRpcServer { Ok(()) } - async fn set_aot_log_level( - self, - ctx: context::Context, - verbosity: u8, - ) -> Result<(), AgentError> { + async fn set_aot_log_level(self, ctx: Context, verbosity: u8) -> Result<(), AgentError> { tracing::debug!("agent setting aot log verbosity to {verbosity:?}"); self.state .get_node_client() @@ -331,7 +315,7 @@ impl AgentService for AgentRpcServer { async fn get_snarkos_block_lite( self, - ctx: context::Context, + ctx: Context, block_hash: String, ) -> Result, AgentError> { self.state @@ -345,7 +329,7 @@ impl AgentService for AgentRpcServer { async fn find_transaction( self, - context: context::Context, + context: Context, tx_id: String, ) -> Result, AgentError> { self.state @@ -357,7 +341,7 @@ impl AgentService for AgentRpcServer { .map_err(|_| AgentError::FailedToMakeRequest)? 
} - async fn get_status(self, ctx: context::Context) -> Result { + async fn get_status(self, ctx: Context) -> Result { Ok(AgentStatus { aot_online: self .state diff --git a/crates/agent/src/state.rs b/crates/agent/src/state.rs index dbb239f9..57ba6197 100644 --- a/crates/agent/src/state.rs +++ b/crates/agent/src/state.rs @@ -8,8 +8,8 @@ use dashmap::DashMap; use indexmap::IndexMap; use reqwest::Url; use snops_common::{ - api::EnvInfo, - rpc::{agent::node::NodeServiceClient, control::ControlServiceClient, error::ReconcileError2}, + api::AgentEnvInfo, + rpc::{agent::node::NodeServiceClient, control::ControlServiceClient, error::ReconcileError}, state::{AgentId, AgentPeer, AgentState, EnvId, TransferId, TransferStatus}, util::OpaqueDebug, }; @@ -42,7 +42,7 @@ pub struct GlobalState { /// A sender for emitting the next time to reconcile the agent. /// Helpful for scheduling the next reconciliation. pub queue_reconcile_tx: Sender, - pub env_info: RwLock)>>, + pub env_info: RwLock)>>, // Map of agent IDs to their resolved addresses. pub resolved_addrs: RwLock>, pub metrics: RwLock, @@ -83,14 +83,14 @@ impl GlobalState { .is_ok() } - pub async fn set_env_info(&self, info: Option<(EnvId, Arc)>) { + pub async fn set_env_info(&self, info: Option<(EnvId, Arc)>) { if let Err(e) = self.db.set_env_info(info.clone()) { error!("failed to save env info to db: {e}"); } *self.env_info.write().await = info; } - pub async fn get_env_info(&self, env_id: EnvId) -> Result, ReconcileError2> { + pub async fn get_env_info(&self, env_id: EnvId) -> Result, ReconcileError> { match self.env_info.read().await.as_ref() { Some((id, info)) if *id == env_id => return Ok(info.clone()), _ => {} @@ -101,13 +101,13 @@ impl GlobalState { .read() .await .clone() - .ok_or(ReconcileError2::Offline)?; + .ok_or(ReconcileError::Offline)?; let info = client .get_env_info(context::current(), env_id) .await - .map_err(|e| ReconcileError2::RpcError(e.to_string()))? - .ok_or(ReconcileError2::MissingEnv(env_id))?; + .map_err(|e| ReconcileError::RpcError(e.to_string()))? 
+ .ok_or(ReconcileError::MissingEnv(env_id))?; let env_info = (env_id, Arc::new(info)); if let Err(e) = self.db.set_env_info(Some(env_info.clone())) { @@ -136,9 +136,12 @@ impl GlobalState { self.node_client.read().await.clone() } - pub async fn update_agent_state(&self, state: AgentState, env_info: Option<(EnvId, EnvInfo)>) { - self.set_env_info(env_info.map(|(id, e)| (id, Arc::new(e)))) - .await; + pub async fn update_agent_state(&self, state: AgentState) { + if state.env() != self.env_info.read().await.as_ref().map(|(id, _)| *id) { + error!("attempted to set agent state with different env"); + return; + } + if let Err(e) = self.db.set_agent_state(&state) { error!("failed to save agent state to db: {e}"); } diff --git a/crates/common/src/api.rs b/crates/common/src/api.rs index f11eb69d..c8a33618 100644 --- a/crates/common/src/api.rs +++ b/crates/common/src/api.rs @@ -24,6 +24,13 @@ pub struct EnvInfo { pub block: Option, } +/// Lighter-weight version of EnvInfo for the agent +#[derive(Debug, Serialize, Deserialize, Clone)] +pub struct AgentEnvInfo { + pub network: NetworkId, + pub storage: StorageInfo, +} + #[derive(Debug, Serialize, Deserialize, Clone)] pub struct StorageInfo { /// String id of this storage @@ -124,6 +131,81 @@ impl DataFormat for EnvInfo { } } +#[derive(Debug, Clone)] +pub struct AgentEnvInfoHeader { + pub version: u8, + pub network: DataHeaderOf, + pub storage: DataHeaderOf, +} + +impl DataFormat for AgentEnvInfoHeader { + type Header = (u8, DataHeaderOf>); + const LATEST_HEADER: Self::Header = (1, DataHeaderOf::::LATEST_HEADER); + + fn write_data( + &self, + writer: &mut W, + ) -> Result { + let mut written = self.version.write_data(writer)?; + written += self.network.write_data(writer)?; + written += self.storage.write_data(writer)?; + Ok(written) + } + + fn read_data( + reader: &mut R, + header: &Self::Header, + ) -> Result { + if header.0 != Self::LATEST_HEADER.0 { + return Err(crate::format::DataReadError::unsupported( + "EnvInfoHeader", + Self::LATEST_HEADER.0, + header.0, + )); + } + Ok(Self { + version: u8::read_data(reader, &())?, + network: DataHeaderOf::::read_data(reader, &())?, + storage: DataHeaderOf::::read_data(reader, &header.1)?, + }) + } +} + +impl DataFormat for AgentEnvInfo { + type Header = AgentEnvInfoHeader; + const LATEST_HEADER: Self::Header = AgentEnvInfoHeader { + version: 1, + network: NetworkId::LATEST_HEADER, + storage: StorageInfo::LATEST_HEADER, + }; + + fn write_data( + &self, + writer: &mut W, + ) -> Result { + let mut written = self.network.write_data(writer)?; + written += self.storage.write_data(writer)?; + Ok(written) + } + + fn read_data( + reader: &mut R, + header: &Self::Header, + ) -> Result { + if header.version != 1 { + return Err(crate::format::DataReadError::unsupported( + "EnvInfo", + 1, + header.version, + )); + } + Ok(Self { + network: NetworkId::read_data(reader, &header.network)?, + storage: StorageInfo::read_data(reader, &header.storage)?, + }) + } +} + #[derive(Debug, Clone)] pub struct StorageInfoHeader { pub version: u8, diff --git a/crates/common/src/rpc/control/agent.rs b/crates/common/src/rpc/control/agent.rs index dd600766..719f1b77 100644 --- a/crates/common/src/rpc/control/agent.rs +++ b/crates/common/src/rpc/control/agent.rs @@ -2,9 +2,9 @@ use std::net::IpAddr; use serde::{Deserialize, Serialize}; -use crate::api::EnvInfo; use crate::rpc::error::*; use crate::state::snarkos_status::SnarkOSLiteBlock; +use crate::state::AgentId; use crate::{ prelude::EnvId, state::{AgentState, NetworkId, PortConfig}, @@ 
-15,25 +15,24 @@ pub struct Handshake { pub jwt: Option, pub loki: Option, pub state: AgentState, - pub env_info: Option<(EnvId, EnvInfo)>, } /// The RPC service that agents implement as a server. #[tarpc::service] pub trait AgentService { /// Handshake with some initial connection details. - async fn handshake(handshake: Handshake) -> Result<(), ReconcileError>; + async fn handshake(handshake: Handshake); /// Control plane asks the agent for its external network address, along /// with local addrs. async fn get_addrs() -> (PortConfig, Option, Vec); + /// An agent is instructed to clear the address of a peer. + async fn clear_peer_addr(agent_id: AgentId); + /// Control plane instructs the agent to reconcile towards a particular /// state. - async fn set_agent_state( - to: AgentState, - env_info: Option<(EnvId, EnvInfo)>, - ) -> Result<(), ReconcileError>; + async fn set_agent_state(to: AgentState); /// Broadcast a transaction locally async fn broadcast_tx(tx: String) -> Result<(), AgentError>; diff --git a/crates/common/src/rpc/control/mod.rs b/crates/common/src/rpc/control/mod.rs index a99360a7..1dd16c3d 100644 --- a/crates/common/src/rpc/control/mod.rs +++ b/crates/common/src/rpc/control/mod.rs @@ -7,7 +7,7 @@ use std::{ use super::error::ResolveError; use crate::{ - api::EnvInfo, + api::AgentEnvInfo, state::{AgentId, EnvId, NodeStatus, TransferStatus, TransferStatusUpdate}, }; @@ -21,7 +21,7 @@ pub trait ControlService { ) -> Result, ResolveError>; /// Get the environment info for the given environment. - async fn get_env_info(env_id: EnvId) -> Option; + async fn get_env_info(env_id: EnvId) -> Option; /// Emit an agent transfer status update. async fn post_transfer_status(id: u32, status: TransferStatusUpdate); diff --git a/crates/common/src/rpc/error.rs b/crates/common/src/rpc/error.rs index 5a1dd9ae..beb87aac 100644 --- a/crates/common/src/rpc/error.rs +++ b/crates/common/src/rpc/error.rs @@ -123,36 +123,8 @@ pub enum ResolveError { AgentHasNoAddresses, } -#[derive(Debug, Error, Serialize, Deserialize, AsRefStr)] -pub enum ReconcileError { - #[error("aborted by a more recent reconcilation request")] - Aborted, - #[error("failed setup storage: {0}")] - StorageSetupError(String), - #[error("failed to download {0} from the control plane")] - StorageAcquireError(String), - #[error("failed to get the binary from the control plane: {0}")] - BinaryAcquireError(String), - #[error("failed to find a checkpoint for the requested height/span")] - CheckpointAcquireError, - #[error("failed to apply checkpoint: {0}")] - CheckpointApplyError(String), - #[error("failed to resolve addresses of stated peers")] - ResolveAddrError(ResolveError), - #[error("a rention policy is required to rewind the ledger")] - MissingRetentionPolicy, - #[error("failed to load checkpoints for storage")] - CheckpointLoadError, - #[error("agent did not provide a local private key")] - NoLocalPrivateKey, - #[error("generic database error")] - Database, - #[error("unknown error")] - Unknown, -} - #[derive(Debug, Clone, Error, Serialize, Deserialize, AsRefStr)] -pub enum ReconcileError2 { +pub enum ReconcileError { #[error("node is not connected to the controlplane")] Offline, #[error("env {0} not found")] diff --git a/crates/common/src/state/agent_state.rs b/crates/common/src/state/agent_state.rs index 986add39..cc88d6f5 100644 --- a/crates/common/src/state/agent_state.rs +++ b/crates/common/src/state/agent_state.rs @@ -21,6 +21,13 @@ impl AgentState { } } + pub fn env(&self) -> Option { + match self { + Self::Inventory => 
None, + Self::Node(id, _) => Some(*id), + } + } + pub fn map_env_id(&self, f: F) -> Option where F: Fn(EnvId) -> Option, diff --git a/crates/common/src/state/node_state.rs b/crates/common/src/state/node_state.rs index 81d2a10b..424e280f 100644 --- a/crates/common/src/state/node_state.rs +++ b/crates/common/src/state/node_state.rs @@ -176,24 +176,6 @@ pub enum AgentPeer { External(SocketAddr), } -impl AgentPeer { - /// Get the port from the peer - pub fn port(&self) -> u16 { - match self { - Self::Internal(_, port) => *port, - Self::External(addr) => addr.port(), - } - } - - /// Return a new peer with the given port. - pub fn with_port(&self, port: u16) -> Self { - match self { - Self::Internal(ip, _) => Self::Internal(*ip, port), - Self::External(addr) => Self::External(SocketAddr::new(addr.ip(), port)), - } - } -} - impl DataFormat for KeyState { type Header = u8; const LATEST_HEADER: Self::Header = 1; diff --git a/crates/controlplane/src/env/error.rs b/crates/controlplane/src/env/error.rs index fdc9ee84..a97dfa98 100644 --- a/crates/controlplane/src/env/error.rs +++ b/crates/controlplane/src/env/error.rs @@ -13,7 +13,6 @@ use tokio::task::JoinError; use crate::{ cannon::error::{AuthorizeError, CannonError}, schema::error::{SchemaError, StorageError}, - state::error::BatchReconcileError, }; #[derive(Debug, Error, AsRefStr)] @@ -63,8 +62,6 @@ pub enum ExecutionError { Cannon(#[from] CannonError), #[error(transparent)] Join(#[from] JoinError), - #[error(transparent)] - Reconcile(#[from] BatchReconcileError), #[error("env `{0}` timeline `{1}` not found")] TimelineNotFound(EnvId, TimelineId), #[error("env timeline is already being executed")] @@ -79,7 +76,6 @@ pub enum ExecutionError { impl_into_status_code!(ExecutionError, |value| match value { Cannon(e) => e.into(), - Reconcile(e) => e.into(), Storage(e) => e.into(), _ => StatusCode::INTERNAL_SERVER_ERROR, }); @@ -190,8 +186,6 @@ impl_into_status_code!(CleanupError, |_| StatusCode::NOT_FOUND); #[derive(Debug, Error, AsRefStr)] pub enum ReconcileError { - #[error(transparent)] - Batch(#[from] BatchReconcileError), #[error("env `{0}` not found")] EnvNotFound(EnvId), #[error("expected internal agent peer for node with key {key}")] @@ -199,7 +193,6 @@ pub enum ReconcileError { } impl_into_status_code!(ReconcileError, |value| match value { - Batch(e) => e.into(), EnvNotFound(_) | ExpectedInternalAgentPeer { .. 
} => StatusCode::NOT_FOUND, }); diff --git a/crates/controlplane/src/env/mod.rs b/crates/controlplane/src/env/mod.rs index 2ba83d35..a368ec08 100644 --- a/crates/controlplane/src/env/mod.rs +++ b/crates/controlplane/src/env/mod.rs @@ -6,10 +6,11 @@ use std::{ use bimap::BiMap; use dashmap::DashMap; +use futures_util::future::join_all; use indexmap::{map::Entry, IndexMap, IndexSet}; use serde::{Deserialize, Serialize}; use snops_common::{ - api::EnvInfo, + api::{AgentEnvInfo, EnvInfo}, node_targets::NodeTargets, state::{ AgentId, AgentPeer, AgentState, CannonId, EnvId, NetworkId, NodeKey, NodeState, TxPipeId, @@ -400,20 +401,17 @@ impl Environment { agents_to_inventory.len() ); // reconcile agents that are freed up from the delta between environments - if let Err(e) = state - .reconcile_agents( + state + .update_agent_states( agents_to_inventory .into_iter() - .map(|id| (id, state.get_client(id), AgentState::Inventory)), + .map(|id| (id, AgentState::Inventory)), ) - .await - { - error!("an error occurred while attempting to inventory newly freed agents: {e}"); - } + .await; } // reconcile the nodes - initial_reconcile(env_id, &state, prev_env.is_none()).await?; + initial_reconcile(env_id, &state).await?; Ok(env_id) } @@ -453,8 +451,8 @@ impl Environment { trace!("[env {id}] inventorying agents..."); - if let Err(e) = state - .reconcile_agents( + state + .update_agent_states( env.node_peers .right_values() // find all agents associated with the env @@ -462,16 +460,13 @@ impl Environment { EnvPeer::Internal(id) => Some(*id), _ => None, }) - .map(|id| (id, state.get_client(id), AgentState::Inventory)) + .map(|id| (id, AgentState::Inventory)) // this collect is necessary because the iter sent to reconcile_agents // must be owned by this thread. Without this, the iter would hold a reference // to the env.node_peers.right_values(), which is NOT Send .collect::>(), ) - .await - { - error!("an error occurred while attempting to inventory newly freed agents: {e}"); - } + .await; Ok(()) } @@ -555,6 +550,102 @@ impl Environment { }) } + fn nodes_with_peer<'a>( + &'a self, + key: &'a NodeKey, + ) -> impl Iterator> { + self.node_states.iter().filter(move |s| { + // Only internal nodes can be agents + let EnvNodeState::Internal(node) = s.value() else { + return false; + }; + + // Ignore self-reference + if s.key() == key { + return false; + } + + // Only agents that reference the node are relevant + node.peers.matches(key) || node.validators.matches(key) + }) + } + + pub async fn update_peer_addr( + &self, + state: &GlobalState, + agent_id: AgentId, + is_port_change: bool, + is_ip_change: bool, + ) { + let Some(key) = self.get_node_key_by_agent(agent_id) else { + return; + }; + let pending_reconciles = self + .nodes_with_peer(key) + .filter_map(|ent| { + let EnvNodeState::Internal(env_node) = ent.value() else { + return None; + }; + + // Lookup agent and get current state + let agent_id = self.get_agent_by_key(ent.key())?; + + // If the port didn't change, we're not updating the agents' states + if !is_port_change { + return Some((agent_id, None)); + } + + let agent = state.pool.get(&agent_id)?; + + let AgentState::Node(env_id, node_state) = agent.state() else { + return None; + }; + + // Determine if the node's peers and validators have changed + let (peers, validators) = self.resolve_node_peers(&state.pool, agent_id, env_node); + if peers == node_state.peers && validators == node_state.validators { + return None; + } + + // Update the node's peers and validators + let mut new_state = 
node_state.clone(); + new_state.peers = peers; + new_state.validators = validators; + + Some((agent_id, Some(AgentState::Node(*env_id, new_state)))) + }) + .collect::>(); + + // Call the clear peer addr RPC for all agents that reference the node + if is_ip_change { + join_all(pending_reconciles.iter().filter_map(|(id, _)| { + let client = state.get_client(*id)?; + + Some(tokio::spawn(async move { + client.clear_peer_addr(agent_id).await + })) + })) + .await; + } + + // Update the agent states if there's a port change + if is_port_change { + state + .update_agent_states( + pending_reconciles + .into_iter() + .filter_map(|(id, state)| state.map(|s| (id, s))), + ) + .await; + + // Otherwise do a normal reconcile + } else { + state + .queue_many_reconciles(pending_reconciles.into_iter().map(|(id, _)| id)) + .await; + } + } + pub fn get_cannon(&self, id: CannonId) -> Option> { self.cannons.get(&id).cloned() } @@ -567,6 +658,13 @@ impl Environment { } } + pub fn agent_info(&self) -> AgentEnvInfo { + AgentEnvInfo { + network: self.network, + storage: self.storage.info(), + } + } + /// Resolve node's agent configuration given the context of the environment. pub fn resolve_node_state( &self, @@ -585,23 +683,34 @@ impl Environment { .map(|key| self.storage.lookup_keysource_pk(key)) .unwrap_or_default(); + (node_state.peers, node_state.validators) = self.resolve_node_peers(&state.pool, id, node); + + node_state + } + + pub fn resolve_node_peers( + &self, + pool: &DashMap, + id: AgentId, + node: &Node, + ) -> (Vec, Vec) { // a filter to exclude the current node from the list of peers let not_me = |agent: &AgentPeer| !matches!(agent, AgentPeer::Internal(candidate_id, _) if *candidate_id == id); // resolve the peers and validators from node targets - node_state.peers = self - .matching_nodes(&node.peers, &state.pool, PortType::Node) + let mut peers: Vec<_> = self + .matching_nodes(&node.peers, pool, PortType::Node) .filter(not_me) .collect(); - node_state.peers.sort(); + peers.sort(); - node_state.validators = self - .matching_nodes(&node.validators, &state.pool, PortType::Bft) + let mut validators: Vec<_> = self + .matching_nodes(&node.validators, pool, PortType::Bft) .filter(not_me) .collect(); - node_state.validators.sort(); + validators.sort(); - node_state + (peers, validators) } } diff --git a/crates/controlplane/src/env/reconcile.rs b/crates/controlplane/src/env/reconcile.rs index d5cb2132..9cbbdc83 100644 --- a/crates/controlplane/src/env/reconcile.rs +++ b/crates/controlplane/src/env/reconcile.rs @@ -1,15 +1,10 @@ use snops_common::state::{AgentState, EnvId}; -use tracing::error; use super::{error::*, EnvNodeState}; -use crate::{env::Environment, state::GlobalState}; +use crate::state::GlobalState; /// Reconcile all associated nodes with their initial state. 
-pub async fn initial_reconcile( - env_id: EnvId, - state: &GlobalState, - is_new_env: bool, -) -> Result<(), EnvError> { +pub async fn initial_reconcile(env_id: EnvId, state: &GlobalState) -> Result<(), EnvError> { let mut pending_reconciliations = vec![]; { let env = state @@ -50,23 +45,10 @@ pub async fn initial_reconcile( let agent_state = AgentState::Node(env_id, Box::new(node_state)); - pending_reconciliations.push((id, state.get_client(id), agent_state)); + pending_reconciliations.push((id, agent_state)); } } - if let Err(e) = state.reconcile_agents(pending_reconciliations).await { - // if this is a patch to an existing environment, avoid inventorying the agents - if !is_new_env { - return Err(ReconcileError::Batch(e).into()); - } - - error!("an error occurred on initial reconciliation, inventorying all agents: {e}"); - if let Err(e) = Environment::cleanup(env_id, state).await { - error!("an error occurred inventorying agents: {e}"); - } - - Err(ReconcileError::Batch(e).into()) - } else { - Ok(()) - } + state.update_agent_states(pending_reconciliations).await; + Ok(()) } diff --git a/crates/controlplane/src/error.rs b/crates/controlplane/src/error.rs index 17dd6bb3..89ad73f2 100644 --- a/crates/controlplane/src/error.rs +++ b/crates/controlplane/src/error.rs @@ -20,8 +20,6 @@ pub enum StateError { Agent(#[from] snops_common::prelude::error::AgentError), #[error("source agent has no addr id: `{0}`")] NoAddress(AgentId), - #[error(transparent)] - Reconcile(#[from] snops_common::prelude::error::ReconcileError), #[error("{0}")] Rpc(#[from] tarpc::client::RpcError), #[error("source agent not found id: `{0}`")] @@ -32,7 +30,6 @@ impl_into_status_code!(StateError); impl_into_type_str!(StateError, |value| match value { Agent(e) => format!("{}.{}", value.as_ref(), e.as_ref()), - Reconcile(e) => format!("{}.{}", value.as_ref(), e.as_ref()), _ => value.as_ref().to_string(), }); diff --git a/crates/controlplane/src/server/actions/config.rs b/crates/controlplane/src/server/actions/config.rs index 777b714c..fcb267fd 100644 --- a/crates/controlplane/src/server/actions/config.rs +++ b/crates/controlplane/src/server/actions/config.rs @@ -27,7 +27,7 @@ pub async fn config( #[allow(unused_variables)] match pending.entry($agent.id()) { Entry::Occupied(mut ent) => { - match ent.get_mut().2 { + match ent.get_mut().1 { AgentState::Inventory => (), AgentState::Node(_, ref mut n) => { $({ @@ -40,7 +40,6 @@ pub async fn config( Entry::Vacant(ent) => { ent.insert(( $agent.id(), - $agent.client_owned(), $agent.state().clone().map_node(|mut n| { $({ let $key = &mut n.$key; @@ -119,13 +118,6 @@ pub async fn config( let pending = pending.into_values().collect::>(); let node_map = pending_reconcile_node_map(pending.iter()); - let res = state - .reconcile_agents(pending) - .await - .map_err(ServerError::from); - - match res { - Ok(_) => Json(node_map).into_response(), - e => e.into_response(), - } + state.update_agent_states(pending).await; + Json(node_map).into_response() } diff --git a/crates/controlplane/src/server/actions/power.rs b/crates/controlplane/src/server/actions/power.rs index 2c32fd7b..f4eb5d2f 100644 --- a/crates/controlplane/src/server/actions/power.rs +++ b/crates/controlplane/src/server/actions/power.rs @@ -5,10 +5,7 @@ use axum::{ use snops_common::action_models::WithTargets; use super::Env; -use crate::{ - server::error::ServerError, - state::{pending_reconcile_node_map, Agent}, -}; +use crate::state::{pending_reconcile_node_map, Agent}; pub async fn online( Env { env, state, .. 
}: Env, @@ -29,15 +26,9 @@ pub async fn online( let node_map = pending_reconcile_node_map(pending.iter()); - let res = state - .reconcile_agents(pending) - .await - .map_err(ServerError::from); + state.update_agent_states(pending).await; - match res { - Ok(_) => Json(node_map).into_response(), - e => e.into_response(), - } + Json(node_map).into_response() } pub async fn offline( @@ -59,15 +50,9 @@ pub async fn offline( let node_map = pending_reconcile_node_map(pending.iter()); - let res = state - .reconcile_agents(pending) - .await - .map_err(ServerError::from); + state.update_agent_states(pending).await; - match res { - Ok(_) => Json(node_map).into_response(), - e => e.into_response(), - } + Json(node_map).into_response() } pub async fn reboot(env: Env, json: Json) -> Response { diff --git a/crates/controlplane/src/server/error.rs b/crates/controlplane/src/server/error.rs index b27a1c08..d4699ce4 100644 --- a/crates/controlplane/src/server/error.rs +++ b/crates/controlplane/src/server/error.rs @@ -12,13 +12,10 @@ use crate::{ env::error::{EnvError, EnvRequestError, ExecutionError}, error::DeserializeError, schema::error::{SchemaError, StorageError}, - state::error::BatchReconcileError, }; #[derive(Debug, Error, strum_macros::AsRefStr)] pub enum ServerError { - #[error(transparent)] - BatchReconcile(#[from] BatchReconcileError), #[error("Content resource `{0}` not found")] ContentNotFound(String), #[error(transparent)] @@ -50,7 +47,6 @@ pub enum ServerError { } impl_into_status_code!(ServerError, |value| match value { - BatchReconcile(e) => e.into(), ContentNotFound(_) => axum::http::StatusCode::NOT_FOUND, Cannon(e) => e.into(), Deserialize(e) => e.into(), @@ -68,7 +64,6 @@ impl_into_status_code!(ServerError, |value| match value { }); impl_into_type_str!(ServerError, |value| match value { - BatchReconcile(e) => format!("{}.{e}", value.as_ref()), Cannon(e) => format!("{}.{}", value.as_ref(), String::from(e)), Env(e) => format!("{}.{}", value.as_ref(), String::from(e)), Execute(e) => format!("{}.{}", value.as_ref(), String::from(e)), diff --git a/crates/controlplane/src/server/mod.rs b/crates/controlplane/src/server/mod.rs index 6f467483..6388637e 100644 --- a/crates/controlplane/src/server/mod.rs +++ b/crates/controlplane/src/server/mod.rs @@ -180,9 +180,6 @@ async fn handle_socket( // attach the current known agent state to the handshake agent.state().clone_into(&mut handshake.state); - handshake.env_info = handshake - .state - .map_env_id(|id| state.get_env(id).map(|env| (id, env.info(&state)))); // mark the agent as connected, update the flags as well agent.mark_connected(client, query.flags); @@ -205,10 +202,7 @@ async fn handle_socket( let mut ctx = tarpc::context::current(); ctx.deadline += Duration::from_secs(300); match client.handshake(ctx, handshake).await { - Ok(Ok(())) => (), - Ok(Err(e)) => { - error!("failed to perform agent {id} handshake reconciliation: {e}") - } + Ok(()) => (), Err(e) => error!("failed to perform agent {id} handshake: {e}"), } }); @@ -246,8 +240,7 @@ async fn handle_socket( let mut ctx = tarpc::context::current(); ctx.deadline += Duration::from_secs(300); match client.handshake(ctx, handshake).await { - Ok(Ok(())) => (), - Ok(Err(e)) => error!("failed to perform agent {id} handshake reconciliation: {e}"), + Ok(()) => (), Err(e) => error!("failed to perform agent {id} handshake: {e}"), } }); @@ -269,21 +262,42 @@ async fn handle_socket( // fetch the agent's network addresses on connect/reconnect let state2 = Arc::clone(&state); tokio::spawn(async move { - if 
let Ok((ports, external, internal)) = client.get_addrs(tarpc::context::current()).await { - if let Some(mut agent) = state2.pool.get_mut(&id) { - info!( - "agent {id} [{}], labels: {:?}, addrs: {external:?} {internal:?} @ {ports}, local pk: {}", - agent.modes(), - agent.str_labels(), - if agent.has_local_pk() { "yes" } else { "no" }, - ); - agent.set_ports(ports); - agent.set_addrs(external, internal); - if let Err(e) = state2.db.agents.save(&id, &agent) { - error!("failed to save agent {id} to the database: {e}"); - } - } + let Ok((ports, external, internal)) = client.get_addrs(tarpc::context::current()).await + else { + return; + }; + let Some(mut agent) = state2.pool.get_mut(&id) else { + return; + }; + + info!( + "agent {id} [{}], labels: {:?}, addrs: {external:?} {internal:?} @ {ports}, local pk: {}", + agent.modes(), + agent.str_labels(), + if agent.has_local_pk() { "yes" } else { "no" }, + ); + + let is_port_change = agent.set_ports(ports); + let is_ip_change = agent.set_addrs(external, internal); + + if let Err(e) = state2.db.agents.save(&id, &agent) { + error!("failed to save agent {id} to the database: {e}"); + } + + if !is_ip_change && !is_port_change { + return; } + let Some(env_id) = agent.env() else { + return; + }; + drop(agent); + let Some(env) = state2.get_env(env_id) else { + return; + }; + + info!("Agent {id} updated its network addresses... Submitting changes to associated peers"); + env.update_peer_addr(&state2, id, is_port_change, is_ip_change) + .await; }); // set up the server, for incoming RPC requests diff --git a/crates/controlplane/src/server/rpc.rs b/crates/controlplane/src/server/rpc.rs index 8ca05d97..ec86b715 100644 --- a/crates/controlplane/src/server/rpc.rs +++ b/crates/controlplane/src/server/rpc.rs @@ -5,7 +5,7 @@ use std::{ use chrono::Utc; use snops_common::{ - api::EnvInfo, + api::AgentEnvInfo, define_rpc_mux, rpc::{ control::{ @@ -55,8 +55,8 @@ impl ControlService for ControlRpcServer { resolve_addrs(&addr_map, self.agent, &peers).map_err(|_| ResolveError::SourceAgentNotFound) } - async fn get_env_info(self, _: context::Context, env_id: EnvId) -> Option { - Some(self.state.get_env(env_id)?.info(&self.state)) + async fn get_env_info(self, _: context::Context, env_id: EnvId) -> Option { + Some(self.state.get_env(env_id)?.agent_info()) } async fn post_transfer_status( @@ -203,6 +203,22 @@ impl ControlService for ControlRpcServer { } } +pub fn resolve_one_addr(src_addrs: &AgentAddrs, target_addrs: &AgentAddrs) -> Option { + match ( + src_addrs.external, + target_addrs.external, + target_addrs.internal.first(), + ) { + // if peers have the same external address, use the first internal address + (Some(src_ext), Some(peer_ext), Some(peer_int)) if src_ext == peer_ext => Some(*peer_int), + // if both peers have only internal addresses, use the internal address + (None, None, Some(peer_int)) => Some(*peer_int), + // otherwise use the external address + (_, Some(peer_ext), _) => Some(peer_ext), + _ => None, + } +} + /// Given a map of addresses, resolve the addresses of a set of peers relative /// to a source agent. fn resolve_addrs( @@ -214,10 +230,6 @@ fn resolve_addrs( .get(&src) .ok_or_else(|| StateError::SourceAgentNotFound(src))?; - let all_internal = addr_map - .values() - .all(|AgentAddrs { external, .. 
}| external.is_none()); - Ok(peers .iter() .filter_map(|id| { @@ -226,24 +238,7 @@ fn resolve_addrs( return None; } - // if the agent has no addresses, skip it - let addrs = addr_map.get(id)?; - - // if there are no external addresses in the entire addr map, - // use the first internal address - if all_internal { - return addrs.internal.first().copied().map(|addr| (*id, addr)); - } - - match (src_addrs.external, addrs.external, addrs.internal.first()) { - // if peers have the same external address, use the first internal address - (Some(src_ext), Some(peer_ext), Some(peer_int)) if src_ext == peer_ext => { - Some((*id, *peer_int)) - } - // otherwise use the external address - (_, Some(peer_ext), _) => Some((*id, peer_ext)), - _ => None, - } + Some((*id, resolve_one_addr(src_addrs, addr_map.get(id)?)?)) }) .collect()) } diff --git a/crates/controlplane/src/state/agent.rs b/crates/controlplane/src/state/agent.rs index 57c854a3..c76ae0a3 100644 --- a/crates/controlplane/src/state/agent.rs +++ b/crates/controlplane/src/state/agent.rs @@ -235,8 +235,10 @@ impl Agent { } /// Set the ports of the agent. This does **not** trigger a reconcile - pub fn set_ports(&mut self, ports: PortConfig) { + pub fn set_ports(&mut self, ports: PortConfig) -> bool { + let changed = self.ports.as_ref() != Some(&ports); self.ports = Some(ports); + changed } // Gets the bft port of the agent. Assumes the agent is ready, returns 0 if not. @@ -278,8 +280,11 @@ impl Agent { /// Set the external and internal addresses of the agent. This does **not** /// trigger a reconcile - pub fn set_addrs(&mut self, external: Option, internal: Vec) { - self.addrs = Some(AgentAddrs { external, internal }); + pub fn set_addrs(&mut self, external: Option, internal: Vec) -> bool { + let addrs = AgentAddrs { external, internal }; + let changed = self.addrs.as_ref() != Some(&addrs); + self.addrs = Some(addrs); + changed } pub fn map_to_reconcile(&self, f: F) -> PendingAgentReconcile @@ -288,7 +293,6 @@ impl Agent { { ( self.id(), - self.client_owned(), match &self.state { AgentState::Node(id, state) => AgentState::Node(*id, Box::new(f(*state.clone()))), s => s.clone(), @@ -302,7 +306,6 @@ impl Agent { { Some(( self.id(), - self.client_owned(), match &self.state { AgentState::Node(id, state) => AgentState::Node(*id, Box::new(f(*state.clone())?)), _ => return None, @@ -318,7 +321,7 @@ pub enum AgentConnection { } /// This is the representation of a public addr or a list of internal addrs. 
-#[derive(Debug, Clone, Serialize, Deserialize)] +#[derive(Debug, Clone, Serialize, Deserialize, PartialEq)] pub struct AgentAddrs { pub external: Option, pub internal: Vec, diff --git a/crates/controlplane/src/state/error.rs b/crates/controlplane/src/state/error.rs deleted file mode 100644 index 3728ed08..00000000 --- a/crates/controlplane/src/state/error.rs +++ /dev/null @@ -1,10 +0,0 @@ -use snops_common::impl_into_status_code; -use thiserror::Error; - -#[derive(Debug, Error)] -#[error("batch reconciliation failed with `{failures}` failed reconciliations")] -pub struct BatchReconcileError { - pub failures: usize, -} - -impl_into_status_code!(BatchReconcileError); diff --git a/crates/controlplane/src/state/mod.rs b/crates/controlplane/src/state/mod.rs index 97e5bd32..3773fd45 100644 --- a/crates/controlplane/src/state/mod.rs +++ b/crates/controlplane/src/state/mod.rs @@ -5,7 +5,6 @@ use snops_common::state::{AgentId, EnvId, NetworkId, StorageId}; mod agent; mod agent_flags; -pub mod error; pub mod external_peers; mod global; mod reconcile; diff --git a/crates/controlplane/src/state/reconcile.rs b/crates/controlplane/src/state/reconcile.rs index 44ae9f81..ca6b659f 100644 --- a/crates/controlplane/src/state/reconcile.rs +++ b/crates/controlplane/src/state/reconcile.rs @@ -4,17 +4,17 @@ use futures_util::future::join_all; use snops_common::state::{AgentId, AgentState, NodeKey}; use tracing::{error, info}; -use super::{error::BatchReconcileError, AgentClient, GlobalState}; +use super::GlobalState; /// The tuple to pass into `reconcile_agents`. -pub type PendingAgentReconcile = (AgentId, Option, AgentState); +pub type PendingAgentReconcile = (AgentId, AgentState); /// Get a node map (key => agent ID) from an agent reconciliation iterator. pub fn pending_reconcile_node_map<'a>( pending: impl Iterator, ) -> HashMap { pending - .map(|(id, _, state)| match state { + .map(|(id, state)| match state { AgentState::Node(_, node) => (node.node_key.clone(), *id), _ => unreachable!(), }) @@ -23,28 +23,12 @@ pub fn pending_reconcile_node_map<'a>( impl GlobalState { /// Reconcile a bunch of agents at once. 
- pub async fn reconcile_agents( - &self, - iter: impl IntoIterator, - ) -> Result<(), BatchReconcileError> { - let mut handles = vec![]; + pub async fn update_agent_states(&self, iter: impl IntoIterator) { let mut agent_ids = vec![]; - for (id, client, target) in iter { - agent_ids.push(id); - - // if the client is present, queue a reconcile - if let Some(client) = client { - let env_info = target - .map_env_id(|env_id| self.get_env(env_id).map(|env| (env_id, env.info(self)))); - - handles.push(tokio::spawn(async move { - client.set_agent_state(target, env_info).await - })); - - // otherwise just change the agent state so it'll inventory on - // reconnect - } else if let Some(mut agent) = self.pool.get_mut(&id) { + for (id, target) in iter { + if let Some(mut agent) = self.pool.get_mut(&id) { + agent_ids.push(id); agent.set_state(target); if let Err(e) = self.db.agents.save(&id, &agent) { error!("failed to save agent {id} to the database: {e}"); @@ -52,52 +36,58 @@ impl GlobalState { } } + self.queue_many_reconciles(agent_ids).await; + } + + pub async fn queue_many_reconciles( + &self, + iter: impl IntoIterator, + ) -> (usize, usize) { + let mut handles = vec![]; + let mut agent_ids = vec![]; + + for id in iter { + let agent = self.pool.get(&id); + let Some(agent) = agent else { + continue; + }; + let Some(client) = agent.client_owned() else { + continue; + }; + + agent_ids.push(id); + let target = agent.state.clone(); + + handles.push(tokio::spawn( + async move { client.set_agent_state(target).await }, + )); + } + if handles.is_empty() { - return Ok(()); + return (0, 0); } let num_reconciliations = handles.len(); - info!("beginning reconciliation..."); + info!("Queuing reconciliation..."); let reconciliations = join_all(handles).await; - info!("reconciliation complete, updating agent states..."); let mut success = 0; for (agent_id, result) in agent_ids.into_iter().zip(reconciliations) { - let Some(mut agent) = self.pool.get_mut(&agent_id) else { - continue; - }; - match result { - Ok(Ok(Ok(agent_state))) => { - agent.set_state(agent_state); - if let Err(e) = self.db.agents.save(&agent_id, &agent) { - error!("failed to save agent {agent_id} to the database: {e}"); - } - + Ok(Ok(())) => { success += 1; } - Ok(Ok(Err(e))) => error!( - "agent {} experienced a reconcilation error: {e}", - agent.id(), - ), - - Ok(Err(e)) => error!("agent {} experienced a rpc error: {e}", agent.id(),), - Err(e) => error!("agent {} experienced a join error: {e}", agent.id(),), + Ok(Err(e)) => error!("agent {agent_id} experienced a rpc error: {e}"), + Err(e) => error!("join error during agent {agent_id} reconcile: {e}"), } } info!( - "reconciliation result: {success}/{} nodes reconciled", + "reconciliation result: {success}/{} nodes connected", num_reconciliations ); - if success == num_reconciliations { - Ok(()) - } else { - Err(BatchReconcileError { - failures: num_reconciliations - success, - }) - } + (success, num_reconciliations) } } diff --git a/crates/controlplane/src/state/rpc.rs b/crates/controlplane/src/state/rpc.rs index 8c20a013..e384bbd2 100644 --- a/crates/controlplane/src/state/rpc.rs +++ b/crates/controlplane/src/state/rpc.rs @@ -2,12 +2,8 @@ use std::{fmt::Display, time::Duration}; use serde::de::DeserializeOwned; use snops_common::{ - api::EnvInfo, - rpc::{ - control::agent::AgentServiceClient, - error::{ReconcileError, SnarkosRequestError}, - }, - state::{snarkos_status::SnarkOSLiteBlock, AgentState, EnvId, NetworkId}, + rpc::{control::agent::AgentServiceClient, error::SnarkosRequestError}, + 
state::{snarkos_status::SnarkOSLiteBlock, AgentId, AgentState, EnvId, NetworkId}, }; use tarpc::{client::RpcError, context}; @@ -17,18 +13,12 @@ use crate::error::StateError; pub struct AgentClient(pub(crate) AgentServiceClient); impl AgentClient { - pub async fn set_agent_state( - &self, - to: AgentState, - env_info: Option<(EnvId, EnvInfo)>, - ) -> Result, RpcError> { - let mut ctx = context::current(); - ctx.deadline += Duration::from_secs(300); + pub async fn set_agent_state(&self, to: AgentState) -> Result<(), RpcError> { + self.0.set_agent_state(context::current(), to).await + } - self.0 - .set_agent_state(ctx, to.clone(), env_info) - .await - .map(|res| res.map(|_| to)) + pub async fn clear_peer_addr(&self, peer: AgentId) -> Result<(), RpcError> { + self.0.clear_peer_addr(context::current(), peer).await } pub async fn snarkos_get( From d51b6adb8c9520273eee4a64f596642eb541a98a Mon Sep 17 00:00:00 2001 From: Meshiest Date: Wed, 27 Nov 2024 20:10:30 -0500 Subject: [PATCH 23/68] chore(agent,controlplane): error, logging, and mild code cleanup --- crates/agent/src/client.rs | 41 +- crates/agent/src/main.rs | 3 +- crates/agent/src/metrics/mod.rs | 2 +- crates/agent/src/reconcile/agent.rs | 3 +- crates/agent/src/rpc/control.rs | 45 ++- crates/agent/src/server.rs | 34 +- crates/agent/src/state.rs | 83 +++- crates/controlplane/src/env/mod.rs | 18 +- crates/controlplane/src/schema/storage/mod.rs | 10 +- .../controlplane/src/server/actions/power.rs | 10 +- crates/controlplane/src/server/mod.rs | 347 +---------------- crates/controlplane/src/server/websocket.rs | 362 ++++++++++++++++++ crates/controlplane/src/state/reconcile.rs | 13 +- 13 files changed, 555 insertions(+), 416 deletions(-) create mode 100644 crates/controlplane/src/server/websocket.rs diff --git a/crates/agent/src/client.rs b/crates/agent/src/client.rs index d66493af..c6edf855 100644 --- a/crates/agent/src/client.rs +++ b/crates/agent/src/client.rs @@ -105,8 +105,18 @@ pub async fn ws_connection(ws_req: Request, state: Arc) { // handle outgoing responses msg = server_response_out.recv() => { - let msg = msg.expect("internal RPC channel closed"); - let bin = bincode::serialize(&control::MuxedMessageOutgoing::Child(msg)).expect("failed to serialize response"); + let Some(msg) = msg else { + error!("internal agent RPC channel closed"); + break; + }; + let bin = match bincode::serialize(&control::MuxedMessageOutgoing::Child(msg)) { + Ok(bin) => bin, + Err(e) => { + error!("failed to serialize response: {e}"); + continue; + } + }; + let send = stream.send(tungstenite::Message::Binary(bin)); if tokio::time::timeout(Duration::from_secs(10), send).await.is_err() { error!("The connection to the control plane was interrupted while sending agent message"); @@ -116,8 +126,17 @@ pub async fn ws_connection(ws_req: Request, state: Arc) { // handle outgoing requests msg = client_request_out.recv() => { - let msg = msg.expect("internal RPC channel closed"); - let bin = bincode::serialize(&control::MuxedMessageOutgoing::Parent(msg)).expect("failed to serialize request"); + let Some(msg) = msg else { + error!("internal agent RPC channel closed"); + break; + }; + let bin = match bincode::serialize(&control::MuxedMessageOutgoing::Parent(msg)) { + Ok(bin) => bin, + Err(e) => { + error!("failed to serialize request: {e}"); + continue; + } + }; let send = stream.send(tungstenite::Message::Binary(bin)); if tokio::time::timeout(Duration::from_secs(10), send).await.is_err() { error!("The connection to the control plane was interrupted while sending 
control message"); @@ -174,8 +193,18 @@ pub async fn ws_connection(ws_req: Request, state: Arc) { }; match msg { - control::MuxedMessageIncoming::Child(msg) => server_request_in.send(msg).expect("internal RPC channel closed"), - control::MuxedMessageIncoming::Parent(msg) => client_response_in.send(msg).expect("internal RPC channel closed"), + control::MuxedMessageIncoming::Child(msg) => { + if let Err(e) = server_request_in.send(msg) { + error!("internal agent RPC channel closed: {e}"); + break; + } + }, + control::MuxedMessageIncoming::Parent(msg) => { + if let Err(e) = client_response_in.send(msg) { + error!("internal agent RPC channel closed: {e}"); + break; + } + } } } diff --git a/crates/agent/src/main.rs b/crates/agent/src/main.rs index 21d20576..cc4ce543 100644 --- a/crates/agent/src/main.rs +++ b/crates/agent/src/main.rs @@ -12,7 +12,6 @@ mod transfers; use std::{ net::Ipv4Addr, - ops::Deref, sync::{Arc, Mutex}, time::{Duration, Instant}, }; @@ -151,7 +150,7 @@ async fn main() { // The context is mutated while reconciling to keep track of things // like downloads, ledger manipulations, node command, and more. let mut root = AgentStateReconciler { - agent_state: Arc::clone(state.agent_state.read().await.deref()), + agent_state: state.get_agent_state().await, state: Arc::clone(&state), // Recover context from previous state context: AgentStateReconcilerContext::hydrate(&state.db), diff --git a/crates/agent/src/metrics/mod.rs b/crates/agent/src/metrics/mod.rs index d2d2b1e7..8a90c3b2 100644 --- a/crates/agent/src/metrics/mod.rs +++ b/crates/agent/src/metrics/mod.rs @@ -27,7 +27,7 @@ pub fn init(state: Arc) { loop { interval.tick().await; - if !state.is_node_online().await { + if !state.is_node_online() { continue; } diff --git a/crates/agent/src/reconcile/agent.rs b/crates/agent/src/reconcile/agent.rs index 0e9a3e87..b2d6a9fe 100644 --- a/crates/agent/src/reconcile/agent.rs +++ b/crates/agent/src/reconcile/agent.rs @@ -1,5 +1,4 @@ use std::{ - ops::Deref, sync::Arc, time::{Duration, Instant}, }; @@ -143,7 +142,7 @@ impl AgentStateReconciler { // Update the reconciler with the latest agent state // This prevents the agent state from changing during reconciliation - self.agent_state = self.state.agent_state.read().await.deref().clone(); + self.agent_state = self.state.get_agent_state().await; trace!("Reconciling agent state..."); match self.reconcile().await { diff --git a/crates/agent/src/rpc/control.rs b/crates/agent/src/rpc/control.rs index e5cc6d52..283bed54 100644 --- a/crates/agent/src/rpc/control.rs +++ b/crates/agent/src/rpc/control.rs @@ -87,6 +87,9 @@ impl AgentService for AgentRpcServer { info!("Received control-plane handshake"); + // Re-fetch peer addresses to ensure no addresses changed while offline + self.state.re_fetch_peer_addrs().await; + // Queue a reconcile immediately as we have received new state. 
// The reconciler will decide if anything has actually changed self.state.update_agent_state(handshake.state).await; @@ -119,15 +122,12 @@ impl AgentService for AgentRpcServer { .await .ok_or(SnarkosRequestError::OfflineNode)?; - let env_id = - if let AgentState::Node(env_id, state) = self.state.agent_state.read().await.as_ref() { - if !state.online { - return Err(SnarkosRequestError::OfflineNode); - } - *env_id - } else { - return Err(SnarkosRequestError::InvalidState); - }; + let env_id = self + .state + .get_agent_state() + .await + .env() + .ok_or(SnarkosRequestError::InvalidState)?; let network = self .state @@ -163,12 +163,12 @@ impl AgentService for AgentRpcServer { .await .ok_or(AgentError::NodeClientNotReady)?; - let env_id = - if let AgentState::Node(env_id, _) = self.state.agent_state.read().await.as_ref() { - *env_id - } else { - return Err(AgentError::InvalidState); - }; + let env_id = self + .state + .get_agent_state() + .await + .env() + .ok_or(AgentError::InvalidState)?; let network = self .state @@ -342,15 +342,14 @@ impl AgentService for AgentRpcServer { } async fn get_status(self, ctx: Context) -> Result { + let aot_online = if let Some(c) = self.state.get_node_client().await { + c.status(ctx).await.is_ok() + } else { + false + }; + Ok(AgentStatus { - aot_online: self - .state - .get_node_client() - .await - .ok_or(AgentError::NodeClientNotSet)? - .status(ctx) - .await - .is_ok(), + aot_online, version: self.version.to_string(), }) } diff --git a/crates/agent/src/server.rs b/crates/agent/src/server.rs index 0dc8eac9..d3aaa815 100644 --- a/crates/agent/src/server.rs +++ b/crates/agent/src/server.rs @@ -94,8 +94,18 @@ async fn handle_socket(mut socket: WebSocket, state: AppState) { }; match msg { - MuxedMessageIncoming::Parent(msg) => server_request_in.send(msg).expect("internal RPC channel closed"), - MuxedMessageIncoming::Child(msg) => client_response_in.send(msg).expect("internal RPC channel closed"), + MuxedMessageIncoming::Parent(msg) => { + if let Err(e) = server_request_in.send(msg) { + error!("internal node RPC channel closed: {e}"); + break; + } + }, + MuxedMessageIncoming::Child(msg) => { + if let Err(e) = client_response_in.send(msg) { + error!("internal node RPC channel closed: {e}"); + break; + } + } } } _ => (), @@ -104,8 +114,14 @@ async fn handle_socket(mut socket: WebSocket, state: AppState) { // handle outgoing requests msg = client_request_out.recv() => { - let msg = msg.expect("internal RPC channel closed"); - let bin = bincode::serialize(&MuxedMessageOutgoing::Child(msg)).expect("failed to serialize request"); + let Some(msg) = msg else { error!("internal node RPC channel closed"); break; }; + let bin = match bincode::serialize(&MuxedMessageOutgoing::Child(msg)) { + Ok(bin) => bin, + Err(e) => { + error!("failed to serialize a request to node: {e}"); + continue; + } + }; if socket.send(Message::Binary(bin)).await.is_err() { break; } @@ -113,8 +129,14 @@ async fn handle_socket(mut socket: WebSocket, state: AppState) { // handle outgoing response msg = server_response_out.recv() => { - let msg = msg.expect("internal RPC channel closed"); - let bin = bincode::serialize(&MuxedMessageOutgoing::Parent(msg)).expect("failed to serialize response"); + let Some(msg) = msg else { error!("internal node RPC channel closed"); break; }; + let bin = match bincode::serialize(&MuxedMessageOutgoing::Parent(msg)) { + Ok(bin) => bin, + Err(e) => { + error!("failed to serialize a response to node: {e}"); + continue; + } + }; if 
socket.send(Message::Binary(bin)).await.is_err() { break; } diff --git a/crates/agent/src/state.rs b/crates/agent/src/state.rs index 57ba6197..f19b1555 100644 --- a/crates/agent/src/state.rs +++ b/crates/agent/src/state.rs @@ -1,4 +1,5 @@ use std::{ + collections::HashSet, net::IpAddr, sync::{Arc, Mutex}, time::{Duration, Instant}, @@ -15,7 +16,7 @@ use snops_common::{ }; use tarpc::context; use tokio::sync::{mpsc::Sender, oneshot, RwLock}; -use tracing::error; +use tracing::{error, info}; use crate::{cli::Cli, db::Database, log::ReloadHandler, metrics::Metrics, transfers::TransferTx}; @@ -61,6 +62,14 @@ impl GlobalState { self.client.try_read().is_ok_and(|c| c.is_some()) } + pub async fn get_ws_client(&self) -> Option { + self.client.read().await.clone() + } + + pub async fn get_agent_state(&self) -> Arc { + self.agent_state.read().await.clone() + } + // Resolve the addresses of the given agents. // Locks resolve_addrs pub async fn agentpeers_to_cli(&self, peers: &[AgentPeer]) -> Vec { @@ -115,6 +124,12 @@ impl GlobalState { } *self.env_info.write().await = Some(env_info.clone()); + // clear the resolved addrs cache when the env info changes + self.resolved_addrs.write().await.clear(); + if let Err(e) = self.db.set_resolved_addrs(None) { + error!("failed to save resolved addrs to db: {e}"); + } + Ok(env_info.1) } @@ -128,8 +143,8 @@ impl GlobalState { } } - pub async fn is_node_online(&self) -> bool { - self.node_client.read().await.is_some() + pub fn is_node_online(&self) -> bool { + self.node_client.try_read().is_ok_and(|c| c.is_some()) } pub async fn get_node_client(&self) -> Option { @@ -137,11 +152,6 @@ impl GlobalState { } pub async fn update_agent_state(&self, state: AgentState) { - if state.env() != self.env_info.read().await.as_ref().map(|(id, _)| *id) { - error!("attempted to set agent state with different env"); - return; - } - if let Err(e) = self.db.set_agent_state(&state) { error!("failed to save agent state to db: {e}"); } @@ -151,4 +161,61 @@ impl GlobalState { // Queue a reconcile to apply the new state self.queue_reconcile(Duration::ZERO).await; } + + pub async fn re_fetch_peer_addrs(&self) { + let agent_state = self.get_agent_state().await; + let AgentState::Node(_, node) = agent_state.as_ref() else { + return; + }; + + let Some(client) = self.get_ws_client().await else { + return; + }; + + let peer_ids = node + .peers + .iter() + .chain(node.validators.iter()) + .filter_map(|p| { + if let snops_common::state::AgentPeer::Internal(id, _) = p { + Some(*id) + } else { + None + } + }) + .collect::>(); + + if peer_ids.is_empty() { + return; + } + + let new_addrs = match client.resolve_addrs(context::current(), peer_ids).await { + Ok(Ok(new_addrs)) => new_addrs, + Ok(Err(e)) => { + error!("Control plane failed to resolve addresses: {e}"); + return; + } + Err(e) => { + error!("RPC failed to resolve addresses: {e}"); + return; + } + }; + + // Extend the cache with the updated addrs + let mut lock = self.resolved_addrs.write().await; + let has_new_addr = new_addrs + .iter() + .any(|(id, addr)| lock.get(id) != Some(addr)); + + if !has_new_addr { + return; + } + + info!("Resolved updated addrs from handshake"); + + lock.extend(new_addrs); + if let Err(e) = self.db.set_resolved_addrs(Some(&lock)) { + error!("failed to save resolved addrs to db: {e}"); + } + } } diff --git a/crates/controlplane/src/env/mod.rs b/crates/controlplane/src/env/mod.rs index a368ec08..8bc967ae 100644 --- a/crates/controlplane/src/env/mod.rs +++ b/crates/controlplane/src/env/mod.rs @@ -418,38 +418,38 @@ 
impl Environment { pub async fn cleanup(id: EnvId, state: &GlobalState) -> Result<(), EnvError> { // clear the env state - info!("[env {id}] deleting persistence..."); + info!("{id}: Deleting persistence..."); let env = state.remove_env(id).ok_or(CleanupError::EnvNotFound(id))?; if let Err(e) = state.db.envs.delete(&id) { - error!("[env {id}] failed to delete env persistence: {e}"); + error!("{id}: Failed to delete env persistence: {e}"); } // TODO: write all of these values to a file before deleting them // cleanup cannon transaction trackers if let Err(e) = state.db.tx_attempts.delete_with_prefix(&id) { - error!("[env {id}] failed to delete env tx_attempts persistence: {e}"); + error!("{id}: Failed to delete env tx_attempts persistence: {e}"); } if let Err(e) = state.db.tx_auths.delete_with_prefix(&id) { - error!("[env {id}] failed to delete env tx_auths persistence: {e}"); + error!("{id}: Failed to delete env tx_auths persistence: {e}"); } if let Err(e) = state.db.tx_blobs.delete_with_prefix(&id) { - error!("[env {id}] failed to delete env tx_blobs persistence: {e}"); + error!("{id}: Failed to delete env tx_blobs persistence: {e}"); } if let Err(e) = state.db.tx_index.delete_with_prefix(&id) { - error!("[env {id}] failed to delete env tx_index persistence: {e}"); + error!("{id}: Failed to delete env tx_index persistence: {e}"); } if let Err(e) = state.db.tx_status.delete_with_prefix(&id) { - error!("[env {id}] failed to delete env tx_status persistence: {e}"); + error!("{id}: Failed to delete env tx_status persistence: {e}"); } if let Some(storage) = state.try_unload_storage(env.network, env.storage.id) { - info!("[env {id}] unloaded storage {}", storage.id); + info!("{id}: Unloaded storage {}", storage.id); } - trace!("[env {id}] inventorying agents..."); + trace!("{id}: Inventorying agents..."); state .update_agent_states( diff --git a/crates/controlplane/src/schema/storage/mod.rs b/crates/controlplane/src/schema/storage/mod.rs index 0e7d4424..ed345169 100644 --- a/crates/controlplane/src/schema/storage/mod.rs +++ b/crates/controlplane/src/schema/storage/mod.rs @@ -174,19 +174,19 @@ impl Document { // warn if an existing block/ledger already exists if exists { - warn!("the specified storage ID {id} already exists"); + warn!("The specified storage ID {id} already exists"); } let old_version = get_version_from_path(&version_file).await?; info!( - "storage {id} has version {old_version:?}. incoming version is {}", + "Storage {id} has version {old_version:?}. 
incoming version is {}", self.regen ); // wipe old storage when the version changes if old_version != Some(self.regen) && exists { - info!("storage {id} version changed, removing old storage"); + info!("Storage {id} version changed, removing old storage"); tokio::fs::remove_dir_all(&base) .await .map_err(|e| StorageError::RemoveStorage(version_file.clone(), e))?; @@ -212,7 +212,7 @@ impl Document { *p = canon } } - info!("resolved binary {id}: {entry}"); + info!("Resolved binary {id}: {entry}"); binaries.insert(id, entry); } @@ -232,7 +232,7 @@ impl Document { // generate the block and ledger if we have generation params if let (Some(generation), false) = (self.generate.as_ref(), exists) { - tracing::debug!("generating storage for {id}"); + tracing::debug!("Generating storage for {id}"); // generate the genesis block using the aot cli let output = base.join(SNARKOS_GENESIS_FILE); diff --git a/crates/controlplane/src/server/actions/power.rs b/crates/controlplane/src/server/actions/power.rs index f4eb5d2f..83acb42c 100644 --- a/crates/controlplane/src/server/actions/power.rs +++ b/crates/controlplane/src/server/actions/power.rs @@ -5,7 +5,7 @@ use axum::{ use snops_common::action_models::WithTargets; use super::Env; -use crate::state::{pending_reconcile_node_map, Agent}; +use crate::state::pending_reconcile_node_map; pub async fn online( Env { env, state, .. }: Env, @@ -14,8 +14,7 @@ pub async fn online( let pending = env .matching_agents(&nodes, &state.pool) .filter_map(|a| { - let agent: &Agent = a.value(); - agent.filter_map_to_reconcile(|mut s| { + a.value().filter_map_to_reconcile(|mut s| { (!s.online).then(|| { s.online = true; s @@ -38,8 +37,7 @@ pub async fn offline( let pending = env .matching_agents(&nodes, &state.pool) .filter_map(|a| { - let agent: &Agent = a.value(); - agent.filter_map_to_reconcile(|mut s| { + a.value().filter_map_to_reconcile(|mut s| { s.online.then(|| { s.online = false; s @@ -58,6 +56,8 @@ pub async fn offline( pub async fn reboot(env: Env, json: Json) -> Response { let offline_res = offline(env.clone(), json.clone()).await; + // TODO: wait for nodes to reconcile offline + if !offline_res.status().is_success() { offline_res } else { diff --git a/crates/controlplane/src/server/mod.rs b/crates/controlplane/src/server/mod.rs index 6388637e..2ee47179 100644 --- a/crates/controlplane/src/server/mod.rs +++ b/crates/controlplane/src/server/mod.rs @@ -1,41 +1,11 @@ -use std::{net::SocketAddr, sync::Arc, time::Duration}; +use std::{net::SocketAddr, sync::Arc}; -use ::jwt::VerifyWithKey; -use axum::{ - extract::{ - ws::{Message, WebSocket}, - Query, State, WebSocketUpgrade, - }, - http::HeaderMap, - middleware, - response::{IntoResponse, Response}, - routing::get, - Extension, Router, -}; -use futures_util::stream::StreamExt; -use http::StatusCode; -use serde::Deserialize; -use snops_common::{ - constant::HEADER_AGENT_KEY, - prelude::*, - rpc::control::{ - agent::{AgentServiceClient, Handshake}, - ControlService, - }, -}; -use tarpc::server::Channel; -use tokio::select; -use tracing::{error, info, warn}; +use axum::{middleware, routing::get, Extension, Router}; -use self::{ - error::StartError, - jwt::{Claims, JWT_SECRET}, - rpc::ControlRpcServer, -}; +use self::error::StartError; use crate::{ logging::{log_request, req_stamp}, - server::rpc::{MuxedMessageIncoming, MuxedMessageOutgoing}, - state::{Agent, AgentFlags, AppState, GlobalState}, + state::GlobalState, }; pub mod actions; @@ -46,10 +16,11 @@ pub mod jwt; pub mod models; pub mod prometheus; mod rpc; +mod 
websocket; pub async fn start(state: Arc, socket_addr: SocketAddr) -> Result<(), StartError> { let app = Router::new() - .route("/agent", get(agent_ws_handler)) + .route("/agent", get(websocket::agent_ws_handler)) .nest("/api/v1", api::routes()) .nest("/prometheus", prometheus::routes()) .nest("/content", content::init_routes(&state).await) @@ -68,309 +39,3 @@ pub async fn start(state: Arc, socket_addr: SocketAddr) -> Result<( Ok(()) } - -#[derive(Debug, Deserialize)] -struct AgentWsQuery { - id: Option, - #[serde(flatten)] - flags: AgentFlags, -} - -async fn agent_ws_handler( - ws: WebSocketUpgrade, - headers: HeaderMap, - State(state): State, - Query(query): Query, -) -> Response { - match (&state.agent_key, headers.get(HEADER_AGENT_KEY)) { - // assert key equals passed header - (Some(key), Some(header)) if key == header.to_str().unwrap_or_default() => (), - - // forbid if key is incorrect - (Some(_), _) => { - warn!("an agent has attempted to connect with a mismatching agent key"); - return StatusCode::UNAUTHORIZED.into_response(); - } - - // allow if no key is present - _ => (), - } - - ws.on_upgrade(|socket| handle_socket(socket, headers, state, query)) - .into_response() -} - -async fn handle_socket( - mut socket: WebSocket, - headers: HeaderMap, - state: AppState, - query: AgentWsQuery, -) { - let claims = headers - .get("Authorization") - .and_then(|auth| -> Option { - let auth = auth.to_str().ok()?; - if !auth.starts_with("Bearer ") { - return None; - } - - let token = &auth[7..]; - - // get claims out of the specified JWT - token.verify_with_key(&*JWT_SECRET).ok() - }) - .filter(|claims| { - // ensure the id is correct - if let Some(id) = query.id { - if claims.id != id { - warn!("connecting agent specified an id different than the claim"); - return false; - } - } - - true - }); - - // TODO: the client should provide us with some information about itself (num - // cpus, etc.) 
before we categorize it and add it as an agent to the agent pool - - // set up the RPC channels - let (client_response_in, client_transport, mut client_request_out) = RpcTransport::new(); - let (server_request_in, server_transport, mut server_response_out) = RpcTransport::new(); - - // set up the client, facing the agent server - let client = - AgentServiceClient::new(tarpc::client::Config::default(), client_transport).spawn(); - - let id: AgentId = 'insertion: { - let client = client.clone(); - let mut handshake = Handshake { - loki: state.cli.loki.as_ref().map(|u| u.to_string()), - ..Default::default() - }; - - // attempt to reconnect if claims were passed - 'reconnect: { - if let Some(claims) = claims { - let Some(mut agent) = state.pool.get_mut(&claims.id) else { - warn!("connecting agent is trying to identify as an unrecognized agent"); - break 'reconnect; - }; - - let id = agent.id(); - if agent.is_connected() { - warn!( - "connecting agent is trying to identify as an already-connected agent {id}" - ); - break 'reconnect; - } - - // compare the stored nonce with the JWT's nonce - if agent.claims().nonce != claims.nonce { - warn!("connecting agent {id} is trying to identify with an invalid nonce"); - break 'reconnect; - } - - if let AgentState::Node(env, _) = agent.state() { - if !state.envs.contains_key(env) { - info!("setting agent {id} to Inventory state due to missing env {env}"); - agent.set_state(AgentState::Inventory); - } - } - - // attach the current known agent state to the handshake - agent.state().clone_into(&mut handshake.state); - - // mark the agent as connected, update the flags as well - agent.mark_connected(client, query.flags); - - info!("agent {id} reconnected"); - if let Err(e) = state.db.agents.save(&id, &agent) { - error!("failed to save agent {id} to the database: {e}"); - } - - // handshake with client - // note: this may cause a reconciliation, so this *may* be non-instant - // unwrap safety: this agent was just `mark_connected` with a valid client - let client = agent.rpc().cloned().unwrap(); - - // drop agent ref to allow for mutable borrow in handshake requests - drop(agent); - - tokio::spawn(async move { - // we do this in a separate task because we don't want to hold up pool insertion - let mut ctx = tarpc::context::current(); - ctx.deadline += Duration::from_secs(300); - match client.handshake(ctx, handshake).await { - Ok(()) => (), - Err(e) => error!("failed to perform agent {id} handshake: {e}"), - } - }); - - break 'insertion id; - } - } - - // otherwise, we need to create an agent and give it a new JWT - // TODO: remove unnamed agents - let id = query.id.unwrap_or_else(AgentId::rand); - - // check if an agent with this id is already online - if state - .pool - .get(&id) - .map(|a| a.is_connected()) - .unwrap_or_default() - { - warn!("an agent is trying to identify as an already-connected agent {id}"); - let _ = socket.send(Message::Close(None)).await; - return; - } - - // create a new agent - let agent = Agent::new(client.to_owned(), id, query.flags); - - // sign the jwt - let signed_jwt = agent.sign_jwt(); - handshake.jwt = Some(signed_jwt); - - // handshake with the client - tokio::spawn(async move { - // we do this in a separate task because we don't want to hold up pool insertion - let mut ctx = tarpc::context::current(); - ctx.deadline += Duration::from_secs(300); - match client.handshake(ctx, handshake).await { - Ok(()) => (), - Err(e) => error!("failed to perform agent {id} handshake: {e}"), - } - }); - - // insert a new agent into the pool - 
if let Err(e) = state.db.agents.save(&id, &agent) { - error!("failed to save agent {id} to the database: {e}"); - } - state.pool.insert(id, agent); - - info!( - "agent {id} connected; pool is now {} nodes", - state.pool.len() - ); - - id - }; - - // fetch the agent's network addresses on connect/reconnect - let state2 = Arc::clone(&state); - tokio::spawn(async move { - let Ok((ports, external, internal)) = client.get_addrs(tarpc::context::current()).await - else { - return; - }; - let Some(mut agent) = state2.pool.get_mut(&id) else { - return; - }; - - info!( - "agent {id} [{}], labels: {:?}, addrs: {external:?} {internal:?} @ {ports}, local pk: {}", - agent.modes(), - agent.str_labels(), - if agent.has_local_pk() { "yes" } else { "no" }, - ); - - let is_port_change = agent.set_ports(ports); - let is_ip_change = agent.set_addrs(external, internal); - - if let Err(e) = state2.db.agents.save(&id, &agent) { - error!("failed to save agent {id} to the database: {e}"); - } - - if !is_ip_change && !is_port_change { - return; - } - let Some(env_id) = agent.env() else { - return; - }; - drop(agent); - let Some(env) = state2.get_env(env_id) else { - return; - }; - - info!("Agent {id} updated its network addresses... Submitting changes to associated peers"); - env.update_peer_addr(&state2, id, is_port_change, is_ip_change) - .await; - }); - - // set up the server, for incoming RPC requests - let server = tarpc::server::BaseChannel::with_defaults(server_transport); - let server_handle = tokio::spawn( - server - .execute( - ControlRpcServer { - state: state.to_owned(), - agent: id, - } - .serve(), - ) - .for_each(|r| async move { - tokio::spawn(r); - }), - ); - - loop { - select! { - // handle incoming messages - msg = socket.recv() => { - match msg { - Some(Err(_)) | None => break, - Some(Ok(Message::Binary(bin))) => { - let msg = match bincode::deserialize(&bin) { - Ok(msg) => msg, - Err(e) => { - error!("failed to deserialize a message from agent {id}: {e}"); - continue; - } - }; - - match msg { - MuxedMessageIncoming::Parent(msg) => server_request_in.send(msg).expect("internal RPC channel closed"), - MuxedMessageIncoming::Child(msg) => client_response_in.send(msg).expect("internal RPC channel closed"), - } - } - _ => (), - } - } - - // handle outgoing requests - msg = client_request_out.recv() => { - let msg = msg.expect("internal RPC channel closed"); - let bin = bincode::serialize(&MuxedMessageOutgoing::Child(msg)).expect("failed to serialize request"); - if socket.send(Message::Binary(bin)).await.is_err() { - break; - } - } - - // handle outgoing responses - msg = server_response_out.recv() => { - let msg = msg.expect("internal RPC channel closed"); - let bin = bincode::serialize(&MuxedMessageOutgoing::Parent(msg)).expect("failed to serialize response"); - if socket.send(Message::Binary(bin)).await.is_err() { - break; - } - } - } - } - - // abort the RPC server handle - server_handle.abort(); - - // remove the client from the agent in the agent pool - { - // TODO: remove agent after 10 minutes of inactivity - - if let Some(mut agent) = state.pool.get_mut(&id) { - agent.mark_disconnected(); - } - - info!("agent {id} disconnected"); - } -} diff --git a/crates/controlplane/src/server/websocket.rs b/crates/controlplane/src/server/websocket.rs new file mode 100644 index 00000000..16268b8c --- /dev/null +++ b/crates/controlplane/src/server/websocket.rs @@ -0,0 +1,362 @@ +use std::{sync::Arc, time::Duration}; + +use ::jwt::VerifyWithKey; +use axum::{ + extract::{ + ws::{Message, WebSocket}, + Query, 
State, WebSocketUpgrade, + }, + http::HeaderMap, + response::{IntoResponse, Response}, +}; +use futures_util::stream::StreamExt; +use http::StatusCode; +use serde::Deserialize; +use snops_common::{ + constant::HEADER_AGENT_KEY, + prelude::*, + rpc::control::{ + agent::{AgentServiceClient, Handshake}, + ControlService, + }, +}; +use tarpc::server::Channel; +use tokio::select; +use tracing::{error, info, warn}; + +use super::{jwt::Claims, rpc::ControlRpcServer}; +use crate::{ + server::{ + jwt::JWT_SECRET, + rpc::{MuxedMessageIncoming, MuxedMessageOutgoing}, + }, + state::{Agent, AgentFlags, AppState}, +}; + +#[derive(Debug, Deserialize)] +pub struct AgentWsQuery { + pub id: Option, + #[serde(flatten)] + pub flags: AgentFlags, +} + +pub async fn agent_ws_handler( + ws: WebSocketUpgrade, + headers: HeaderMap, + State(state): State, + Query(query): Query, +) -> Response { + match (&state.agent_key, headers.get(HEADER_AGENT_KEY)) { + // assert key equals passed header + (Some(key), Some(header)) if key == header.to_str().unwrap_or_default() => (), + + // forbid if key is incorrect + (Some(_), _) => { + warn!("An agent has attempted to connect with a mismatching agent key"); + return StatusCode::UNAUTHORIZED.into_response(); + } + + // allow if no key is present + _ => (), + } + + ws.on_upgrade(|socket| handle_socket(socket, headers, state, query)) + .into_response() +} + +async fn handle_socket( + mut socket: WebSocket, + headers: HeaderMap, + state: AppState, + query: AgentWsQuery, +) { + let claims = headers + .get("Authorization") + .and_then(|auth| -> Option { + let auth = auth.to_str().ok()?; + if !auth.starts_with("Bearer ") { + return None; + } + + let token = &auth[7..]; + + // get claims out of the specified JWT + token.verify_with_key(&*JWT_SECRET).ok() + }) + .filter(|claims| { + // ensure the id is correct + if let Some(id) = query.id { + if claims.id != id { + warn!("connecting agent specified an id different than the claim"); + return false; + } + } + + true + }); + + // TODO: the client should provide us with some information about itself (num + // cpus, etc.) 
before we categorize it and add it as an agent to the agent pool + + // set up the RPC channels + let (client_response_in, client_transport, mut client_request_out) = RpcTransport::new(); + let (server_request_in, server_transport, mut server_response_out) = RpcTransport::new(); + + // set up the client, facing the agent server + let client = + AgentServiceClient::new(tarpc::client::Config::default(), client_transport).spawn(); + + let id: AgentId = 'insertion: { + let client = client.clone(); + let mut handshake = Handshake { + loki: state.cli.loki.as_ref().map(|u| u.to_string()), + ..Default::default() + }; + + // attempt to reconnect if claims were passed + 'reconnect: { + if let Some(claims) = claims { + let Some(mut agent) = state.pool.get_mut(&claims.id) else { + warn!("Connecting agent is trying to identify as an unrecognized agent"); + break 'reconnect; + }; + + let id = agent.id(); + if agent.is_connected() { + warn!( + "Connecting agent is trying to identify as an already-connected agent {id}" + ); + break 'reconnect; + } + + // compare the stored nonce with the JWT's nonce + if agent.claims().nonce != claims.nonce { + warn!("Connecting agent {id} is trying to identify with an invalid nonce"); + break 'reconnect; + } + + match agent.env() { + Some(env) if !state.envs.contains_key(&env) => { + info!("setting agent {id} to Inventory state due to missing env {env}"); + agent.set_state(AgentState::Inventory); + } + _ => {} + } + + // attach the current known agent state to the handshake + agent.state().clone_into(&mut handshake.state); + + // mark the agent as connected, update the flags as well + agent.mark_connected(client.clone(), query.flags); + + info!("Agent {id} reconnected"); + if let Err(e) = state.db.agents.save(&id, &agent) { + error!("failed to save agent {id} to the database: {e}"); + } + + // drop agent ref to allow for mutable borrow in handshake requests + drop(agent); + + tokio::spawn(async move { + // we do this in a separate task because we don't want to hold up pool insertion + let mut ctx = tarpc::context::current(); + ctx.deadline += Duration::from_secs(300); + match client.handshake(ctx, handshake).await { + Ok(()) => (), + Err(e) => error!("failed to perform agent {id} handshake: {e}"), + } + }); + + break 'insertion id; + } + } + + // otherwise, we need to create an agent and give it a new JWT + // TODO: remove unnamed agents + let id = query.id.unwrap_or_else(AgentId::rand); + + // check if an agent with this id is already online + if state + .pool + .get(&id) + .map(|a| a.is_connected()) + .unwrap_or_default() + { + warn!("An agent is trying to identify as an already-connected agent {id}"); + let _ = socket.send(Message::Close(None)).await; + return; + } + + // create a new agent + let agent = Agent::new(client.to_owned(), id, query.flags); + + // sign the jwt + let signed_jwt = agent.sign_jwt(); + handshake.jwt = Some(signed_jwt); + + // handshake with the client + tokio::spawn(async move { + // we do this in a separate task because we don't want to hold up pool insertion + let mut ctx = tarpc::context::current(); + ctx.deadline += Duration::from_secs(300); + match client.handshake(ctx, handshake).await { + Ok(()) => (), + Err(e) => error!("failed to perform agent {id} handshake: {e}"), + } + }); + + // insert a new agent into the pool + if let Err(e) = state.db.agents.save(&id, &agent) { + error!("failed to save agent {id} to the database: {e}"); + } + state.pool.insert(id, agent); + + info!( + "Agent {id} connected; pool is now {} nodes", + 
state.pool.len() + ); + + id + }; + + // fetch the agent's network addresses on connect/reconnect + let state2 = Arc::clone(&state); + tokio::spawn(async move { + let Ok((ports, external, internal)) = client.get_addrs(tarpc::context::current()).await + else { + return; + }; + let Some(mut agent) = state2.pool.get_mut(&id) else { + return; + }; + + info!( + "Agent {id} [{}], labels: {:?}, addrs: {external:?} {internal:?} @ {ports}, local pk: {}", + agent.modes(), + agent.str_labels(), + if agent.has_local_pk() { "yes" } else { "no" }, + ); + + let is_port_change = agent.set_ports(ports); + let is_ip_change = agent.set_addrs(external, internal); + + if let Err(e) = state2.db.agents.save(&id, &agent) { + error!("failed to save agent {id} to the database: {e}"); + } + + if !is_ip_change && !is_port_change { + return; + } + let Some(env_id) = agent.env() else { + return; + }; + drop(agent); + let Some(env) = state2.get_env(env_id) else { + return; + }; + + info!("Agent {id} updated its network addresses... Submitting changes to associated peers"); + env.update_peer_addr(&state2, id, is_port_change, is_ip_change) + .await; + }); + + // set up the server, for incoming RPC requests + let server = tarpc::server::BaseChannel::with_defaults(server_transport); + let server_handle = tokio::spawn( + server + .execute( + ControlRpcServer { + state: state.to_owned(), + agent: id, + } + .serve(), + ) + .for_each(|r| async move { + tokio::spawn(r); + }), + ); + + loop { + select! { + // handle incoming messages + msg = socket.recv() => { + match msg { + Some(Err(_)) | None => break, + Some(Ok(Message::Binary(bin))) => { + let msg = match bincode::deserialize(&bin) { + Ok(msg) => msg, + Err(e) => { + error!("failed to deserialize a message from agent {id}: {e}"); + break; + } + }; + + match msg { + MuxedMessageIncoming::Parent(msg) => { + if let Err(e) = server_request_in.send(msg) { + error!("internal RPC channel closed: {e}"); + break; + } + }, + MuxedMessageIncoming::Child(msg) => { + if let Err(e) = client_response_in.send(msg) { + error!("internal RPC channel closed: {e}"); + break; + } + } + } + } + _ => (), + } + } + + // handle outgoing requests + msg = client_request_out.recv() => { + let Some(msg) = msg else { + error!("Agent {id} internal RPC channel closed"); + break; + }; + let bin = match bincode::serialize(&MuxedMessageOutgoing::Child(msg)) { + Ok(bin) => bin, + Err(e) => { + error!("Agent {id} failed to serialize request: {e}"); + break; + } + }; + if let Err(e) = socket.send(Message::Binary(bin)).await { + error!("Agent {id} failed to send request to agent {id}: {e}"); + break; + } + } + + // handle outgoing responses + msg = server_response_out.recv() => { + let Some(msg) = msg else { + error!("Agent {id} internal RPC channel closed"); + break; + }; + let bin = match bincode::serialize(&MuxedMessageOutgoing::Parent(msg)) { + Ok(bin) => bin, + Err(e) => { + error!("Agent {id} failed to serialize response: {e}"); + break; + } + }; + if let Err(e) = socket.send(Message::Binary(bin)).await { + error!("Agent {id} failed to send response to agent {id}: {e}"); + break; + } + } + } + } + + // abort the RPC server handle + server_handle.abort(); + + // remove the client from the agent in the agent pool + if let Some(mut agent) = state.pool.get_mut(&id) { + agent.mark_disconnected(); + } + + info!("Agent {id} disconnected"); +} diff --git a/crates/controlplane/src/state/reconcile.rs b/crates/controlplane/src/state/reconcile.rs index ca6b659f..a12dda11 100644 --- 
a/crates/controlplane/src/state/reconcile.rs +++ b/crates/controlplane/src/state/reconcile.rs @@ -67,9 +67,9 @@ impl GlobalState { return (0, 0); } - let num_reconciliations = handles.len(); + let num_reqs = handles.len(); - info!("Queuing reconciliation..."); + info!("Requesting reconcile from {num_reqs} agents..."); let reconciliations = join_all(handles).await; let mut success = 0; @@ -79,15 +79,12 @@ impl GlobalState { success += 1; } Ok(Err(e)) => error!("agent {agent_id} experienced a rpc error: {e}"), - Err(e) => error!("join error during agent {agent_id} reconcile: {e}"), + Err(e) => error!("join error during agent {agent_id} reconcile request: {e}"), } } - info!( - "reconciliation result: {success}/{} nodes connected", - num_reconciliations - ); + info!("Requested {success}/{num_reqs} agents"); - (success, num_reconciliations) + (success, num_reqs) } } From 0b5a4ea63944ef794937e9fe866f40038f6a46a4 Mon Sep 17 00:00:00 2001 From: Meshiest Date: Wed, 27 Nov 2024 20:12:16 -0500 Subject: [PATCH 24/68] feat(snops): emit reconcile status to control plane, move reconcile status structs to common --- crates/agent/src/reconcile/agent.rs | 24 +++++- crates/agent/src/reconcile/command.rs | 7 +- crates/agent/src/reconcile/files.rs | 8 +- crates/agent/src/reconcile/mod.rs | 85 +------------------ crates/agent/src/reconcile/process.rs | 7 +- crates/agent/src/reconcile/storage.rs | 36 +++++--- crates/common/src/constant.rs | 2 + crates/common/src/rpc/control/mod.rs | 7 +- crates/common/src/state/agent_status.rs | 10 ++- crates/common/src/state/mod.rs | 2 + crates/common/src/state/reconcile.rs | 88 ++++++++++++++++++++ crates/controlplane/src/env/mod.rs | 56 +++++++++++-- crates/controlplane/src/env/reconcile.rs | 54 ------------ crates/controlplane/src/server/api.rs | 3 +- crates/controlplane/src/server/content.rs | 3 +- crates/controlplane/src/server/prometheus.rs | 3 +- crates/controlplane/src/server/rpc.rs | 18 +++- 17 files changed, 232 insertions(+), 181 deletions(-) create mode 100644 crates/common/src/state/reconcile.rs delete mode 100644 crates/controlplane/src/env/reconcile.rs diff --git a/crates/agent/src/reconcile/agent.rs b/crates/agent/src/reconcile/agent.rs index b2d6a9fe..95a8dc4c 100644 --- a/crates/agent/src/reconcile/agent.rs +++ b/crates/agent/src/reconcile/agent.rs @@ -6,7 +6,7 @@ use std::{ use snops_common::{ binaries::BinaryEntry, rpc::error::ReconcileError, - state::{AgentState, HeightRequest, TransferId}, + state::{AgentState, HeightRequest, ReconcileCondition, TransferId}, }; use tokio::{ select, @@ -145,9 +145,11 @@ impl AgentStateReconciler { self.agent_state = self.state.get_agent_state().await; trace!("Reconciling agent state..."); - match self.reconcile().await { + let res = self.reconcile().await; + match res { Ok(status) => { if status.inner.is_some() { + err_backoff = 0; trace!("Reconcile completed"); } if !status.conditions.is_empty() { @@ -183,9 +185,13 @@ impl AgentStateReconciler { } if let Some(_transfers) = self.context.transfers.as_mut() { + // Clear the env state + self.context.env_state = None; if let Err(e) = self.state.db.set_env_state(None) { error!("failed to clear env state from db: {e}"); } + // Clear the last height + self.context.ledger_last_height = None; if let Err(e) = self.state.db.set_last_height(None) { error!("failed to clear last height from db: {e}"); } @@ -286,7 +292,18 @@ impl Reconcile<(), ReconcileError> for AgentStateReconciler { } // Prevent other reconcilers from running while the node is running - return 
Ok(ReconcileStatus::default().add_scope("agent_state/running")); + if self.state.is_node_online() { + return Ok(ReconcileStatus::default().add_scope("agent_state/running")); + } else { + // If the node is not online, the process is still running, but the node + // has not connected to the controlplane. + // This can happen if the node is still syncing, or if the controlplane + // is not reachable. + return Ok(ReconcileStatus::empty() + .requeue_after(Duration::from_secs(1)) + .add_condition(ReconcileCondition::PendingStartup) + .add_scope("agent_state/starting")); + } } } @@ -318,6 +335,7 @@ impl Reconcile<(), ReconcileError> for AgentStateReconciler { } self.context.env_state = Some(env_state); self.context.transfers = Some(Default::default()); + trace!("Cleared transfers state..."); } let transfers = self.context.transfers.as_mut().unwrap(); diff --git a/crates/agent/src/reconcile/command.rs b/crates/agent/src/reconcile/command.rs index 7fe7a971..08a30020 100644 --- a/crates/agent/src/reconcile/command.rs +++ b/crates/agent/src/reconcile/command.rs @@ -5,7 +5,8 @@ use snops_checkpoint::RetentionPolicy; use snops_common::{ api::AgentEnvInfo, constant::{ - LEDGER_BASE_DIR, LEDGER_PERSIST_DIR, SNARKOS_FILE, SNARKOS_GENESIS_FILE, SNARKOS_LOG_FILE, + LEDGER_BASE_DIR, LEDGER_PERSIST_DIR, NODE_DATA_DIR, SNARKOS_FILE, SNARKOS_GENESIS_FILE, + SNARKOS_LOG_FILE, }, rpc::error::ReconcileError, state::{EnvId, KeyState, NetworkId, NodeKey, NodeState, PortConfig}, @@ -69,7 +70,9 @@ impl NodeCommand { let ledger_path = if env_info.storage.persist { storage_path.join(LEDGER_PERSIST_DIR) } else { - state.cli.path.join(LEDGER_BASE_DIR) + let mut dir = state.cli.path.join(NODE_DATA_DIR); + dir.push(LEDGER_BASE_DIR); + dir }; Ok(NodeCommand { diff --git a/crates/agent/src/reconcile/files.rs b/crates/agent/src/reconcile/files.rs index 9255d1f8..be3e8bba 100644 --- a/crates/agent/src/reconcile/files.rs +++ b/crates/agent/src/reconcile/files.rs @@ -11,12 +11,14 @@ use snops_common::{ binaries::{BinaryEntry, BinarySource}, constant::SNARKOS_GENESIS_FILE, rpc::error::ReconcileError, - state::{NetworkId, StorageId, TransferId, TransferStatusUpdate}, + state::{ + NetworkId, ReconcileCondition, ReconcileStatus, StorageId, TransferId, TransferStatusUpdate, + }, }; use tracing::{error, trace, warn}; use url::Url; -use super::{Reconcile, ReconcileCondition, ReconcileStatus}; +use super::Reconcile; use crate::{ api::{download_file, get_file_issues}, state::GlobalState, @@ -41,7 +43,7 @@ pub fn get_genesis_route(endpoint: &str, network: NetworkId, storage_id: Storage /// This reconciler creates a directory if it does not exist pub struct DirectoryReconciler<'a>(pub &'a Path); impl<'a> Reconcile<(), ReconcileError> for DirectoryReconciler<'a> { - async fn reconcile(&mut self) -> Result, ReconcileError> { + async fn reconcile(&mut self) -> Result, ReconcileError> { std::fs::create_dir_all(self.0) .map(ReconcileStatus::with) .map_err(|e| ReconcileError::CreateDirectory(self.0.to_path_buf(), e.to_string())) diff --git a/crates/agent/src/reconcile/mod.rs b/crates/agent/src/reconcile/mod.rs index a7a7fa19..2f39407c 100644 --- a/crates/agent/src/reconcile/mod.rs +++ b/crates/agent/src/reconcile/mod.rs @@ -1,96 +1,13 @@ -use std::{fmt::Display, time::Duration}; - -use indexmap::IndexSet; - pub mod agent; pub mod command; mod files; pub use files::*; +use snops_common::state::ReconcileStatus; pub mod address; pub mod process; pub mod state; pub mod storage; -use snops_common::state::TransferId; - -#[derive(Debug, Clone, 
PartialEq, Eq, PartialOrd, Ord, Hash)] -pub enum ReconcileCondition { - /// A file is being transferred. - PendingTransfer(String, TransferId), - /// A process is being spawned / confirmed. Could be starting the node or - /// manipulating the ledger - PendingProcess(String), - /// A tranfer was started and interrupted. - InterruptedTransfer(String, TransferId, String), - /// A modify operation was started and interrupted. - InterruptedModify(String), - /// A file is missing and cannot be downloaded at the moment. - MissingFile(String), - /// Waiting to reconnect to the controlplane - PendingConnection, - /// Waiting for the node to be shut down - PendingShutdown, -} pub trait Reconcile { async fn reconcile(&mut self) -> Result, E>; } - -pub struct ReconcileStatus { - pub scopes: Vec, - pub inner: Option, - pub requeue_after: Option, - pub conditions: IndexSet, -} - -impl Default for ReconcileStatus { - fn default() -> Self { - Self::new(Some(Default::default())) - } -} - -impl ReconcileStatus { - pub fn new(inner: Option) -> Self { - Self { - scopes: Vec::new(), - inner, - requeue_after: None, - conditions: IndexSet::new(), - } - } - - pub fn with(inner: T) -> Self { - Self::new(Some(inner)) - } - - pub fn empty() -> Self { - Self::new(None) - } - - pub fn is_requeue(&self) -> bool { - self.requeue_after.is_some() - } - - pub fn emptied(&self) -> ReconcileStatus { - ReconcileStatus { - inner: None, - scopes: self.scopes.clone(), - requeue_after: self.requeue_after, - conditions: self.conditions.clone(), - } - } - - pub fn requeue_after(mut self, duration: Duration) -> Self { - self.requeue_after = Some(duration); - self - } - - pub fn add_scope(mut self, scope: impl Display) -> Self { - self.scopes.push(scope.to_string()); - self - } - - pub fn add_condition(mut self, condition: ReconcileCondition) -> Self { - self.conditions.insert(condition); - self - } -} diff --git a/crates/agent/src/reconcile/process.rs b/crates/agent/src/reconcile/process.rs index d5f28cb4..656529e8 100644 --- a/crates/agent/src/reconcile/process.rs +++ b/crates/agent/src/reconcile/process.rs @@ -1,10 +1,13 @@ use std::time::{Duration, Instant}; -use snops_common::rpc::error::ReconcileError; +use snops_common::{ + rpc::error::ReconcileError, + state::{ReconcileCondition, ReconcileStatus}, +}; use tokio::{process::Child, select}; use tracing::{error, info}; -use super::{command::NodeCommand, Reconcile, ReconcileCondition, ReconcileStatus}; +use super::{command::NodeCommand, Reconcile}; use crate::state::NODE_GRACEFUL_SHUTDOWN_TIMEOUT; /// Information about the current process diff --git a/crates/agent/src/reconcile/storage.rs b/crates/agent/src/reconcile/storage.rs index eddfd2f8..e691204b 100644 --- a/crates/agent/src/reconcile/storage.rs +++ b/crates/agent/src/reconcile/storage.rs @@ -10,19 +10,17 @@ use snops_common::{ api::AgentEnvInfo, binaries::{BinaryEntry, BinarySource}, constant::{ - LEDGER_BASE_DIR, LEDGER_PERSIST_DIR, SNARKOS_FILE, SNARKOS_GENESIS_FILE, VERSION_FILE, + LEDGER_BASE_DIR, LEDGER_PERSIST_DIR, NODE_DATA_DIR, SNARKOS_FILE, SNARKOS_GENESIS_FILE, + VERSION_FILE, }, rpc::error::ReconcileError, - state::{HeightRequest, InternedId, TransferId}, + state::{HeightRequest, InternedId, ReconcileCondition, ReconcileStatus, TransferId}, }; use tokio::{process::Command, sync::Mutex, task::AbortHandle}; use tracing::{error, info, trace}; use url::Url; -use super::{ - default_binary, get_genesis_route, DirectoryReconciler, FileReconciler, Reconcile, - ReconcileCondition, ReconcileStatus, -}; +use 
super::{default_binary, get_genesis_route, DirectoryReconciler, FileReconciler, Reconcile}; use crate::state::GlobalState; /// Download a specific binary file needed to run the node @@ -219,7 +217,7 @@ impl<'a> LedgerReconciler<'a> { LEDGER_PERSIST_DIR, ) } else { - (self.state.cli.path.clone(), LEDGER_BASE_DIR) + (self.state.cli.path.join(NODE_DATA_DIR), LEDGER_BASE_DIR) } } @@ -408,10 +406,12 @@ impl<'a> Reconcile<(), ReconcileError> for LedgerReconciler<'a> { // Find the checkpoint for the reconciler's target height let checkpoint = self.find_checkpoint()?; + trace!("Applying checkpoint: {}", checkpoint.display()); // Start a task to modify the ledger with the checkpoint *self.modify_handle = Some(self.spawn_modify(checkpoint)); // Now that a task is running, set the pending height *self.pending_height = Some(target_height); + trace!("Pending ledger modification to height {}", target_height.1); return Ok(ReconcileStatus::empty() .add_condition(ReconcileCondition::PendingProcess(format!( @@ -420,9 +420,8 @@ impl<'a> Reconcile<(), ReconcileError> for LedgerReconciler<'a> { ))) .requeue_after(Duration::from_secs(5))); } - let pending = self.pending_height.unwrap(); - let Some(modify_handle) = self.modify_handle.as_mut() else { + let Some(modify_handle) = self.modify_handle.as_ref() else { // This should be an unreachable condition, but may not be unreachable // when more complex ledger operations are implemented error!("modify handle missing for pending height"); @@ -436,6 +435,7 @@ impl<'a> Reconcile<(), ReconcileError> for LedgerReconciler<'a> { // If the modify handle is locked, requeue until it's unlocked let Ok(Some(handle)) = modify_handle.1.try_lock().map(|r| r.clone()) else { + trace!("Waiting for modify handle to unlock..."); return Ok(ReconcileStatus::empty() .add_condition(ReconcileCondition::PendingProcess(format!( "ledger modification to height {}", @@ -444,9 +444,15 @@ impl<'a> Reconcile<(), ReconcileError> for LedgerReconciler<'a> { .requeue_after(Duration::from_secs(1))); }; + let pending = self.pending_height.unwrap(); + match handle { // If the ledger was modified successfully, update the last height Ok(true) => { + info!( + "Ledger modification to height {} succeeded", + target_height.1 + ); *self.last_height = Some(pending); if let Err(e) = self.state.db.set_last_height(Some(pending)) { error!("failed to save last height to db: {e}"); @@ -459,7 +465,13 @@ impl<'a> Reconcile<(), ReconcileError> for LedgerReconciler<'a> { // TODO: handle this failure } // Bubble an actual error up to the caller - Err(err) => return Err(err.clone()), + Err(err) => { + error!( + "ledger modification to height {} errored: {err}", + target_height.1 + ); + return Err(err.clone()); + } }; // Modification is complete. 
The last height is changed when the modification @@ -496,7 +508,7 @@ impl<'a> Reconcile<(), ReconcileError> for StorageVersionReconciler<'a> { let _ = tokio::fs::remove_dir_all(&path).await; } else { // return an empty status if the version is the same - return Ok(ReconcileStatus::default()); + return Ok(ReconcileStatus::empty()); }; } @@ -511,6 +523,6 @@ impl<'a> Reconcile<(), ReconcileError> for StorageVersionReconciler<'a> { })?; } - Ok(ReconcileStatus::empty()) + Ok(ReconcileStatus::default()) } } diff --git a/crates/common/src/constant.rs b/crates/common/src/constant.rs index deffaa84..6fb73394 100644 --- a/crates/common/src/constant.rs +++ b/crates/common/src/constant.rs @@ -17,3 +17,5 @@ pub const LEDGER_PERSIST_DIR: &str = "persist"; pub const LEDGER_STORAGE_FILE: &str = "ledger.tar.gz"; /// File containing a version counter for a ledger pub const VERSION_FILE: &str = "version"; +/// Directory name for the node's data. +pub const NODE_DATA_DIR: &str = "node"; diff --git a/crates/common/src/rpc/control/mod.rs b/crates/common/src/rpc/control/mod.rs index 1dd16c3d..38d76b4a 100644 --- a/crates/common/src/rpc/control/mod.rs +++ b/crates/common/src/rpc/control/mod.rs @@ -5,10 +5,10 @@ use std::{ net::IpAddr, }; -use super::error::ResolveError; +use super::error::{ReconcileError, ResolveError}; use crate::{ api::AgentEnvInfo, - state::{AgentId, EnvId, NodeStatus, TransferStatus, TransferStatusUpdate}, + state::{AgentId, EnvId, NodeStatus, ReconcileStatus, TransferStatus, TransferStatusUpdate}, }; pub const PING_HEADER: &[u8] = b"snops-agent"; @@ -40,4 +40,7 @@ pub trait ControlService { /// Emit an agent node status update. async fn post_node_status(update: NodeStatus); + + /// Emit an agent reconcile status update. + async fn post_reconcile_status(status: Result<ReconcileStatus<()>, ReconcileError>); } diff --git a/crates/common/src/state/agent_status.rs b/crates/common/src/state/agent_status.rs index 02f3d6ca..c9e3217c 100644 --- a/crates/common/src/state/agent_status.rs +++ b/crates/common/src/state/agent_status.rs @@ -1,10 +1,12 @@ +use std::time::Instant; + use chrono::{DateTime, Utc}; use indexmap::IndexMap; use serde::{Deserialize, Serialize}; use tokio::task::AbortHandle; -use super::snarkos_status::SnarkOSStatus; -use crate::format::DataFormat; +use super::{snarkos_status::SnarkOSStatus, ReconcileStatus}; +use crate::{format::DataFormat, rpc::error::ReconcileError}; #[derive(Debug, Default, Clone, PartialEq, Eq, Serialize, Deserialize)] pub enum NodeStatus { @@ -151,7 +153,7 @@ impl TransferStatus { } } -#[derive(Debug, Default, Clone, Serialize, Deserialize)] +#[derive(Debug, Default, Clone)] pub struct AgentStatus { /// Version of the agent binary pub agent_version: Option, @@ -165,6 +167,8 @@ pub struct AgentStatus { pub connected_time: Option<DateTime<Utc>>, /// A map of transfers in progress pub transfers: IndexMap, + /// Latest reconcile status of the agent + pub reconcile: Option<(Instant, Result<ReconcileStatus<()>, ReconcileError>)>, } impl DataFormat for LatestBlockInfo { diff --git a/crates/common/src/state/mod.rs b/crates/common/src/state/mod.rs index a2198e6b..b39fe99a 100644 --- a/crates/common/src/state/mod.rs +++ b/crates/common/src/state/mod.rs @@ -11,6 +11,7 @@ mod node_key; mod node_state; mod node_type; mod port_config; +mod reconcile; pub mod snarkos_status; pub mod strings; @@ -24,6 +25,7 @@ pub use node_key::*; pub use node_state::*; pub use node_type::*; pub use port_config::*; +pub use reconcile::*; lazy_static!
{ static ref NODE_KEY_REGEX: Regex = Regex::new( diff --git a/crates/common/src/state/reconcile.rs b/crates/common/src/state/reconcile.rs new file mode 100644 index 00000000..755f4641 --- /dev/null +++ b/crates/common/src/state/reconcile.rs @@ -0,0 +1,88 @@ +use std::{fmt::Display, time::Duration}; + +use indexmap::IndexSet; +use serde::{Deserialize, Serialize}; + +use super::TransferId; + +#[derive(Debug, Clone, PartialEq, Eq, PartialOrd, Ord, Hash, Serialize, Deserialize)] +pub enum ReconcileCondition { + /// A file is being transferred. + PendingTransfer(String, TransferId), + /// A process is being spawned / confirmed. Could be starting the node or + /// manipulating the ledger + PendingProcess(String), + /// A transfer was started and interrupted. + InterruptedTransfer(String, TransferId, String), + /// A modify operation was started and interrupted. + InterruptedModify(String), + /// A file is missing and cannot be downloaded at the moment. + MissingFile(String), + /// Waiting to reconnect to the controlplane + PendingConnection, + /// Waiting for the node to be shut down + PendingShutdown, + /// Waiting for the node to start up + PendingStartup, +} + +#[derive(Clone, Debug, Serialize, Deserialize)] +pub struct ReconcileStatus<T> { + pub scopes: Vec<String>, + pub inner: Option<T>, + pub requeue_after: Option<Duration>, + pub conditions: IndexSet<ReconcileCondition>, +} + +impl<T: Default> Default for ReconcileStatus<T> { + fn default() -> Self { + Self::new(Some(Default::default())) + } +} + +impl<T> ReconcileStatus<T> { + pub fn new(inner: Option<T>) -> Self { + Self { + scopes: Vec::new(), + inner, + requeue_after: None, + conditions: IndexSet::new(), + } + } + + pub fn with(inner: T) -> Self { + Self::new(Some(inner)) + } + + pub fn empty() -> Self { + Self::new(None) + } + + pub fn is_requeue(&self) -> bool { + self.requeue_after.is_some() + } + + pub fn emptied<U>(&self) -> ReconcileStatus<U> { + ReconcileStatus { + inner: None, + scopes: self.scopes.clone(), + requeue_after: self.requeue_after, + conditions: self.conditions.clone(), + } + } + + pub fn requeue_after(mut self, duration: Duration) -> Self { + self.requeue_after = Some(duration); + self + } + + pub fn add_scope(mut self, scope: impl Display) -> Self { + self.scopes.push(scope.to_string()); + self + } + + pub fn add_condition(mut self, condition: ReconcileCondition) -> Self { + self.conditions.insert(condition); + self + } +} diff --git a/crates/controlplane/src/env/mod.rs b/crates/controlplane/src/env/mod.rs index 8bc967ae..97f40e0e 100644 --- a/crates/controlplane/src/env/mod.rs +++ b/crates/controlplane/src/env/mod.rs @@ -38,11 +38,9 @@ use crate::{ state::{Agent, GlobalState}, }; +pub mod cache; pub mod error; -mod reconcile; pub mod set; -pub use reconcile::*; -pub mod cache; #[derive(Debug)] pub struct Environment { @@ -205,10 +203,6 @@ impl Environment { // nodes in flattened_nodes have replicas unset doc_node.replicas.take(); - // TODO: compare existing agent state with old node state - // where the agent state is the same, insert the new state - // otherwise keep the old state - // replace the key with a new one let mut node = doc_node.to_owned(); if let Some(key) = node.key.as_mut() { } // Skip delegating nodes that are already present in the node map + // Agents are able to determine what updates need to be applied + // based on their resolved node states.
if node_peers.contains_left(&node_key) { info!("{env_id}: updating node {node_key}"); updated_states.insert(node_key, EnvNodeState::Internal(node)); @@ -410,12 +406,54 @@ impl Environment { .await; } - // reconcile the nodes - initial_reconcile(env_id, &state).await?; + // Emit state changes to all agents within this environment + env.update_all_agents(&state).await?; Ok(env_id) } + async fn update_all_agents(&self, state: &GlobalState) -> Result<(), EnvError> { + let mut pending_changes = vec![]; + + for entry in self.node_states.iter() { + let key = entry.key(); + let node = entry.value(); + let EnvNodeState::Internal(node) = node else { + continue; + }; + let Some(agent_id) = self.get_agent_by_key(key) else { + continue; + }; + let Some(agent) = state.pool.get(&agent_id) else { + continue; + }; + + let mut next_state = self.resolve_node_state(state, agent_id, key, node); + + // determine if this reconcile will reset the agent's height (and potentially + // trigger a ledger wipe) + match agent.state() { + // new environment -> reset height + AgentState::Node(old_env, _) if *old_env != self.id => {} + // height request is the same -> keep the height + AgentState::Node(_, prev_state) if prev_state.height.1 == next_state.height.1 => { + next_state.height.0 = prev_state.height.0; + } + // otherwise, reset height + AgentState::Node(_, _) => {} + // moving from inventory -> reset height + AgentState::Inventory => {} + } + + let agent_state = AgentState::Node(self.id, Box::new(next_state)); + + pending_changes.push((agent_id, agent_state)); + } + + state.update_agent_states(pending_changes).await; + Ok(()) + } + pub async fn cleanup(id: EnvId, state: &GlobalState) -> Result<(), EnvError> { // clear the env state info!("{id}: Deleting persistence..."); diff --git a/crates/controlplane/src/env/reconcile.rs b/crates/controlplane/src/env/reconcile.rs deleted file mode 100644 index 9cbbdc83..00000000 --- a/crates/controlplane/src/env/reconcile.rs +++ /dev/null @@ -1,54 +0,0 @@ -use snops_common::state::{AgentState, EnvId}; - -use super::{error::*, EnvNodeState}; -use crate::state::GlobalState; - -/// Reconcile all associated nodes with their initial state. -pub async fn initial_reconcile(env_id: EnvId, state: &GlobalState) -> Result<(), EnvError> { - let mut pending_reconciliations = vec![]; - { - let env = state - .get_env(env_id) - .ok_or(ReconcileError::EnvNotFound(env_id))? 
- .clone(); - - for entry in env.node_states.iter() { - let key = entry.key(); - let node = entry.value(); - let EnvNodeState::Internal(node) = node else { - continue; - }; - - // get the internal agent ID from the node key - let id = env - .get_agent_by_key(key) - .ok_or_else(|| ReconcileError::ExpectedInternalAgentPeer { key: key.clone() })?; - - let mut node_state = env.resolve_node_state(state, id, key, node); - - // determine if this reconcile will reset the agent's height (and potentially - // trigger a ledger wipe) - if let Some(agent) = state.pool.get(&id) { - match agent.state() { - // new environment -> reset height - AgentState::Node(old_env, _) if *old_env != env_id => {} - // height request is the same -> keep the height - AgentState::Node(_, state) if state.height.1 == node_state.height.1 => { - node_state.height.0 = state.height.0; - } - // otherwise, reset height - AgentState::Node(_, _) => {} - // moving from inventory -> reset height - AgentState::Inventory => {} - } - } - - let agent_state = AgentState::Node(env_id, Box::new(node_state)); - - pending_reconciliations.push((id, agent_state)); - } - } - - state.update_agent_states(pending_reconciliations).await; - Ok(()) -} diff --git a/crates/controlplane/src/server/api.rs b/crates/controlplane/src/server/api.rs index af1c895e..4ded0384 100644 --- a/crates/controlplane/src/server/api.rs +++ b/crates/controlplane/src/server/api.rs @@ -19,10 +19,11 @@ use snops_common::{ }; use tarpc::context; -use super::{actions, error::ServerError, models::AgentStatusResponse, AppState}; +use super::{actions, error::ServerError, models::AgentStatusResponse}; use crate::{ cannon::{router::redirect_cannon_routes, source::QueryTarget}, make_env_filter, + state::AppState, }; use crate::{ env::{EnvPeer, Environment}, diff --git a/crates/controlplane/src/server/content.rs b/crates/controlplane/src/server/content.rs index 682bb066..a5f9b670 100644 --- a/crates/controlplane/src/server/content.rs +++ b/crates/controlplane/src/server/content.rs @@ -15,14 +15,13 @@ use snops_common::{ use tower::Service; use tower_http::services::ServeFile; -use super::AppState; use crate::{ schema::{ error::StorageError, storage::{DEFAULT_AGENT_BINARY, DEFAULT_AOT_BINARY}, }, server::error::ServerError, - state::GlobalState, + state::{AppState, GlobalState}, unwrap_or_not_found, }; diff --git a/crates/controlplane/src/server/prometheus.rs b/crates/controlplane/src/server/prometheus.rs index 0bbf990c..af76bb53 100644 --- a/crates/controlplane/src/server/prometheus.rs +++ b/crates/controlplane/src/server/prometheus.rs @@ -5,8 +5,7 @@ use rayon::iter::{ParallelBridge, ParallelIterator}; use serde::Serialize; use snops_common::state::AgentState; -use super::AppState; -use crate::cli::PrometheusLocation; +use crate::{cli::PrometheusLocation, state::AppState}; pub(super) fn routes() -> Router { Router::new().route("/httpsd", get(get_httpsd)) } diff --git a/crates/controlplane/src/server/rpc.rs b/crates/controlplane/src/server/rpc.rs index ec86b715..598539fd 100644 --- a/crates/controlplane/src/server/rpc.rs +++ b/crates/controlplane/src/server/rpc.rs @@ -1,12 +1,14 @@ use std::{ collections::{HashMap, HashSet}, net::IpAddr, + time::Instant, }; use chrono::Utc; use snops_common::{ api::AgentEnvInfo, define_rpc_mux, + prelude::{error::ReconcileError, ReconcileStatus}, rpc::{ control::{ agent::{AgentServiceRequest, AgentServiceResponse}, @@ -22,10 +24,9 @@ use snops_common::{ use tarpc::context; use tracing::warn; -use super::AppState; use crate::{ error::StateError, - 
state::{AddrMap, AgentAddrs}, + state::{AddrMap, AgentAddrs, AppState}, }; define_rpc_mux!(parent; @@ -201,6 +202,19 @@ impl ControlService for ControlRpcServer { agent.status.node_status = status; } + + async fn post_reconcile_status( + self, + _: context::Context, + status: Result, ReconcileError>, + ) { + let Some(mut agent) = self.state.pool.get_mut(&self.agent) else { + return; + }; + + // TODO: pipe these status updates to some event stream + agent.status.reconcile = Some((Instant::now(), status)); + } } pub fn resolve_one_addr(src_addrs: &AgentAddrs, target_addrs: &AgentAddrs) -> Option { From 21cca785eee45fcdb9e99942ca46fe1ac038d3c8 Mon Sep 17 00:00:00 2001 From: Meshiest Date: Wed, 27 Nov 2024 20:12:46 -0500 Subject: [PATCH 25/68] fix(aot): fix proposal cache preventing address flexibility for validators --- crates/aot/src/runner/mod.rs | 62 +++++++++++++++++++++++++----------- 1 file changed, 43 insertions(+), 19 deletions(-) diff --git a/crates/aot/src/runner/mod.rs b/crates/aot/src/runner/mod.rs index 3af5c6a9..f701cfa6 100644 --- a/crates/aot/src/runner/mod.rs +++ b/crates/aot/src/runner/mod.rs @@ -8,7 +8,10 @@ use aleo_std::StorageMode; use anyhow::{anyhow, Result}; use clap::Args; use rpc::RpcClient; -use snarkos_node::Node; +use snarkos_node::{ + bft::helpers::{proposal_cache_path, ProposalCache}, + Node, +}; use snarkvm::{ ledger::store::{ helpers::rocksdb::{BlockDB, CommitteeDB}, @@ -20,7 +23,7 @@ use snarkvm::{ use snops_checkpoint::{CheckpointManager, RetentionPolicy}; use snops_common::state::{snarkos_status::SnarkOSStatus, NodeType}; -use crate::{cli::ReloadHandler, Account, DbLedger, Key, Network}; +use crate::{cli::ReloadHandler, Account, Address, DbLedger, Key, Network}; mod metrics; mod rpc; @@ -177,23 +180,26 @@ impl Runner { let shutdown = Arc::new(AtomicBool::new(false)); let _node = match self.node_type { - NodeType::Validator => Node::new_validator( - node_ip, - Some(bft_ip), - Some(rest_ip), - self.rest_rps, - account, - &self.peers, - &self.validators, - genesis, - None, - storage_mode.clone(), - false, - false, - shutdown, - ) - .await - .map_err(|e| e.context("create validator"))?, + NodeType::Validator => { + Self::check_proposal_cache(account.address()); + Node::new_validator( + node_ip, + Some(bft_ip), + Some(rest_ip), + self.rest_rps, + account, + &self.peers, + &self.validators, + genesis, + None, + storage_mode.clone(), + false, + false, + shutdown, + ) + .await + .map_err(|e| e.context("create validator"))? + } NodeType::Prover => Node::new_prover( node_ip, account, @@ -272,6 +278,24 @@ impl Runner { Ok(()) } + /// Check the proposal cache for this address and remove it if it is + /// invalid. + fn check_proposal_cache(addr: Address) { + let proposal_cache_path = proposal_cache_path(N::ID, None); + if !proposal_cache_path.exists() { + return; + } + + let Err(e) = ProposalCache::::load(addr, None) else { + return; + }; + + tracing::error!("failed to load proposal cache: {e}"); + if let Err(e) = std::fs::remove_dir_all(&proposal_cache_path) { + tracing::error!("failed to remove proposal cache: {e}"); + } + } + /// Returns a runtime for the node. pub fn runtime() -> tokio::runtime::Runtime { // Retrieve the number of cores. From 266930f639e54fc123cbe0d646070ccf8fc73fbc Mon Sep 17 00:00:00 2001 From: Meshiest Date: Wed, 27 Nov 2024 21:46:11 -0500 Subject: [PATCH 26/68] feat(controlplane): event stream, subscribing, and filtering. 
WIP action fixes --- crates/agent/src/client.rs | 9 ++ crates/controlplane/src/events/filter.rs | 49 +++++++ crates/controlplane/src/events/filter_ops.rs | 137 ++++++++++++++++++ crates/controlplane/src/events/mod.rs | 15 ++ crates/controlplane/src/events/models.rs | 112 ++++++++++++++ crates/controlplane/src/events/stream.rs | 87 +++++++++++ crates/controlplane/src/main.rs | 1 + .../controlplane/src/server/actions/power.rs | 18 +++ crates/controlplane/src/server/rpc.rs | 27 +++- crates/controlplane/src/server/websocket.rs | 38 +++-- crates/controlplane/src/state/global.rs | 3 + 11 files changed, 481 insertions(+), 15 deletions(-) create mode 100644 crates/controlplane/src/events/filter.rs create mode 100644 crates/controlplane/src/events/filter_ops.rs create mode 100644 crates/controlplane/src/events/mod.rs create mode 100644 crates/controlplane/src/events/models.rs create mode 100644 crates/controlplane/src/events/stream.rs diff --git a/crates/agent/src/client.rs b/crates/agent/src/client.rs index c6edf855..3d455fc4 100644 --- a/crates/agent/src/client.rs +++ b/crates/agent/src/client.rs @@ -52,6 +52,15 @@ pub async fn ws_connection(ws_req: Request, state: Arc) { let (mut stream, _response) = match connect_async(ws_req).await { Ok(res) => res, Err(e) => { + match e { + // Ignore connection refused errors, we only care if something interesting is + // causing the connection to fail. + tungstenite::Error::Io(e) if e.kind() == std::io::ErrorKind::ConnectionRefused => { + return + } + _ => {} + } + error!("failed to connect to websocket: {e}"); return; } diff --git a/crates/controlplane/src/events/filter.rs b/crates/controlplane/src/events/filter.rs new file mode 100644 index 00000000..4a1019cf --- /dev/null +++ b/crates/controlplane/src/events/filter.rs @@ -0,0 +1,49 @@ +use super::{Event, EventFilter, EventKind, EventKindFilter}; + +impl EventKind { + pub fn filter(&self, filter: &EventKindFilter) -> bool { + matches!( + (self, filter), + (EventKind::AgentConnected, EventKindFilter::AgentConnected) + | ( + EventKind::AgentHandshakeComplete, + EventKindFilter::AgentHandshakeComplete + ) + | ( + EventKind::AgentDisconnected, + EventKindFilter::AgentDisconnected + ) + | ( + EventKind::ReconcileComplete, + EventKindFilter::ReconcileComplete + ) + | (EventKind::Reconcile(_), EventKindFilter::Reconcile) + | ( + EventKind::ReconcileError(_), + EventKindFilter::ReconcileError + ) + | (EventKind::NodeStatus(_), EventKindFilter::NodeStatus) + | (EventKind::Block(_), EventKindFilter::Block) + ) + } +} + +impl Event { + pub fn matches(&self, filter: &EventFilter) -> bool { + match filter { + EventFilter::Unfiltered => true, + EventFilter::AllOf(filters) => filters.iter().all(|f| self.matches(f)), + EventFilter::AnyOf(filters) => filters.iter().any(|f| self.matches(f)), + EventFilter::OneOf(filters) => filters.iter().filter(|f| self.matches(f)).count() == 1, + EventFilter::Not(f) => !self.matches(f), + EventFilter::AgentIs(agent) => self.agent == Some(*agent), + EventFilter::EnvIs(env) => self.env == Some(*env), + EventFilter::EventIs(kind) => self.kind.filter(kind), + EventFilter::NodeKeyIs(node_key) => self.node_key.as_ref() == Some(node_key), + EventFilter::NodeTargetIs(node_targets) => self + .node_key + .as_ref() + .is_some_and(|key| node_targets.matches(key)), + } + } +} diff --git a/crates/controlplane/src/events/filter_ops.rs b/crates/controlplane/src/events/filter_ops.rs new file mode 100644 index 00000000..001f8d08 --- /dev/null +++ b/crates/controlplane/src/events/filter_ops.rs @@ -0,0 
+1,137 @@ +use super::{EventFilter, EventKind, EventKindFilter}; + +impl std::ops::BitAnd for EventFilter { + type Output = Self; + + fn bitand(self, rhs: Self) -> Self::Output { + match (self, rhs) { + (EventFilter::Unfiltered, rhs) => rhs, + (lhs, EventFilter::Unfiltered) => lhs, + (EventFilter::AllOf(mut filters), EventFilter::AllOf(rhs_filters)) => { + filters.extend(rhs_filters); + EventFilter::AllOf(filters) + } + (EventFilter::AllOf(mut filters), rhs) => { + filters.push(rhs); + EventFilter::AllOf(filters) + } + (lhs, EventFilter::AllOf(mut rhs_filters)) => { + rhs_filters.push(lhs); + EventFilter::AllOf(rhs_filters) + } + (lhs, rhs) => EventFilter::AllOf(vec![lhs, rhs]), + } + } +} + +impl std::ops::BitOr for EventFilter { + type Output = Self; + + fn bitor(self, rhs: Self) -> Self::Output { + match (self, rhs) { + (EventFilter::Unfiltered, _) => EventFilter::Unfiltered, + (_, EventFilter::Unfiltered) => EventFilter::Unfiltered, + (EventFilter::AnyOf(mut filters), EventFilter::AnyOf(rhs_filters)) => { + filters.extend(rhs_filters); + EventFilter::AnyOf(filters) + } + (EventFilter::AnyOf(mut filters), rhs) => { + filters.push(rhs); + EventFilter::AnyOf(filters) + } + (lhs, EventFilter::AnyOf(mut rhs_filters)) => { + rhs_filters.push(lhs); + EventFilter::AnyOf(rhs_filters) + } + (lhs, rhs) => EventFilter::AnyOf(vec![lhs, rhs]), + } + } +} + +impl std::ops::BitXor for EventFilter { + type Output = Self; + + fn bitxor(self, rhs: Self) -> Self::Output { + match (self, rhs) { + (EventFilter::Unfiltered, rhs) => rhs, + (lhs, EventFilter::Unfiltered) => lhs, + (EventFilter::OneOf(mut filters), EventFilter::OneOf(rhs_filters)) => { + filters.extend(rhs_filters); + EventFilter::OneOf(filters) + } + (EventFilter::OneOf(mut filters), rhs) => { + filters.push(rhs); + EventFilter::OneOf(filters) + } + (lhs, EventFilter::OneOf(mut rhs_filters)) => { + rhs_filters.push(lhs); + EventFilter::OneOf(rhs_filters) + } + (lhs, rhs) => EventFilter::OneOf(vec![lhs, rhs]), + } + } +} + +impl std::ops::Not for EventFilter { + type Output = Self; + + fn not(self) -> Self::Output { + EventFilter::Not(Box::new(self)) + } +} + +impl std::ops::Not for EventKindFilter { + type Output = EventFilter; + + fn not(self) -> Self::Output { + !EventFilter::EventIs(self) + } +} + +impl std::ops::BitOr for EventKindFilter { + type Output = EventFilter; + + fn bitor(self, rhs: EventFilter) -> Self::Output { + EventFilter::EventIs(self) | rhs + } +} + +impl std::ops::BitAnd for EventKindFilter { + type Output = EventFilter; + + fn bitand(self, rhs: EventFilter) -> Self::Output { + EventFilter::EventIs(self) & rhs + } +} + +impl std::ops::BitOr for EventFilter { + type Output = EventFilter; + + fn bitor(self, rhs: EventKindFilter) -> Self::Output { + self | EventFilter::EventIs(rhs) + } +} + +impl std::ops::BitAnd for EventFilter { + type Output = EventFilter; + + fn bitand(self, rhs: EventKindFilter) -> Self::Output { + self & EventFilter::EventIs(rhs) + } +} + +impl std::ops::BitOr for EventKindFilter { + type Output = EventFilter; + + fn bitor(self, rhs: Self) -> Self::Output { + EventFilter::EventIs(self) | EventFilter::EventIs(rhs) + } +} + +impl std::ops::BitAnd for EventKindFilter { + type Output = EventFilter; + + fn bitand(self, rhs: Self) -> Self::Output { + EventFilter::EventIs(self) & EventFilter::EventIs(rhs) + } +} diff --git a/crates/controlplane/src/events/mod.rs b/crates/controlplane/src/events/mod.rs new file mode 100644 index 00000000..29a72365 --- /dev/null +++ b/crates/controlplane/src/events/mod.rs @@ 
-0,0 +1,15 @@ +mod models; +pub use models::*; +mod stream; +pub use stream::*; + +mod filter; +mod filter_ops; +pub use filter::*; + +pub mod prelude { + pub use super::filter::*; + pub use super::models::EventFilter::*; + pub use super::models::EventKindFilter::*; + pub use super::models::*; +} diff --git a/crates/controlplane/src/events/models.rs b/crates/controlplane/src/events/models.rs new file mode 100644 index 00000000..184e6c06 --- /dev/null +++ b/crates/controlplane/src/events/models.rs @@ -0,0 +1,112 @@ +use chrono::{DateTime, Utc}; +use serde::{Deserialize, Serialize}; +use snops_common::{ + node_targets::NodeTargets, + rpc::error::ReconcileError, + state::{AgentId, AgentState, EnvId, LatestBlockInfo, NodeKey, NodeStatus, ReconcileStatus}, +}; + +use crate::state::Agent; + +#[derive(Clone, Debug, Serialize, Deserialize)] +pub struct Event { + pub created_at: DateTime, + pub agent: Option, + pub node_key: Option, + pub env: Option, + pub kind: EventKind, +} + +#[derive(Clone, Debug, Serialize, Deserialize)] +#[serde(tag = "kind")] +pub enum EventKind { + /// An agent connects to the control plane + AgentConnected, + /// An agent completes a handshake with the control plane + AgentHandshakeComplete, + /// An agent disconnects from the control plane + AgentDisconnected, + /// An agent finishes a reconcile + ReconcileComplete, + /// An agent updates its reconcile status + Reconcile(ReconcileStatus<()>), + /// An error occurs during reconcile + ReconcileError(ReconcileError), + /// An agent emits a node status + NodeStatus(NodeStatus), + /// An agent emits a block update + Block(LatestBlockInfo), +} + +#[derive(Clone, Copy, Debug)] +pub enum EventKindFilter { + AgentConnected, + AgentHandshakeComplete, + AgentDisconnected, + ReconcileComplete, + Reconcile, + ReconcileError, + NodeStatus, + Block, +} + +pub enum EventFilter { + /// No filter + Unfiltered, + + /// Logical AND of filters + AllOf(Vec), + /// Logical OR of filters + AnyOf(Vec), + /// Logical XOR of filters + OneOf(Vec), + /// Logical NOT of filter + Not(Box), + + /// Filter by agent ID + AgentIs(AgentId), + /// Filter by environment ID + EnvIs(EnvId), + /// Filter by event kind + EventIs(EventKindFilter), + /// Filter by node key + NodeKeyIs(NodeKey), + /// Filter by node target + NodeTargetIs(NodeTargets), +} + +impl Event { + pub fn new(kind: EventKind) -> Self { + Self { + created_at: Utc::now(), + agent: None, + node_key: None, + env: None, + kind, + } + } + + pub fn replace_kind(&self, kind: EventKind) -> Self { + Self { + created_at: Utc::now(), + agent: self.agent, + node_key: self.node_key.clone(), + env: self.env, + kind, + } + } + + pub fn with_agent(mut self, agent: &Agent) -> Self { + self.agent = Some(agent.id); + if let AgentState::Node(env_id, node) = &agent.state { + self.node_key = Some(node.node_key.clone()); + self.env = Some(*env_id); + } + self + } + + pub fn with_env(mut self, env_id: EnvId) -> Self { + self.env = Some(env_id); + self + } +} diff --git a/crates/controlplane/src/events/stream.rs b/crates/controlplane/src/events/stream.rs new file mode 100644 index 00000000..0510669a --- /dev/null +++ b/crates/controlplane/src/events/stream.rs @@ -0,0 +1,87 @@ +use std::sync::Arc; + +use futures_util::Stream; +use tokio::sync::broadcast::{self, error::TryRecvError}; + +use super::{Event, EventFilter}; + +#[derive(Debug)] +pub struct Events { + tx: broadcast::Sender>, +} + +impl Events { + pub fn new() -> Self { + Self { + tx: broadcast::channel(1024).0, + } + } + + pub fn emit(&self, event: Event) { + 
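+        // Broadcast this event to every live subscriber. When nobody is
+        // subscribed, the event is dropped early (see the receiver_count check
+        // below), so emitting stays cheap while no one is listening.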
if self.tx.receiver_count() == 0 { + return; + } + // The only way this can fail is a receiver was dropped between the above check + // and this call... + let _ = self.tx.send(Arc::new(event)); + } + + pub fn subscribe(&self) -> EventSubscriber { + EventSubscriber { + rx: self.tx.subscribe(), + filter: EventFilter::Unfiltered, + } + } + + pub fn subscribe_on(&self, filter: EventFilter) -> EventSubscriber { + EventSubscriber { + rx: self.tx.subscribe(), + filter, + } + } +} + +impl Default for Events { + fn default() -> Self { + Self::new() + } +} + +pub struct EventSubscriber { + rx: broadcast::Receiver>, + filter: EventFilter, +} + +impl EventSubscriber { + pub async fn next(&mut self) -> Result, broadcast::error::RecvError> { + self.rx.recv().await + } +} + +impl Stream for EventSubscriber { + type Item = Arc; + + fn poll_next( + mut self: std::pin::Pin<&mut Self>, + _cx: &mut std::task::Context<'_>, + ) -> std::task::Poll> { + loop { + match self.rx.try_recv() { + Ok(event) if event.matches(&self.filter) => { + return std::task::Poll::Ready(Some(event)); + } + // skip events that don't match the filter + Ok(_) => continue, + Err(TryRecvError::Closed) => { + return std::task::Poll::Ready(None); + } + Err(TryRecvError::Empty) => { + return std::task::Poll::Pending; + } + Err(TryRecvError::Lagged(n)) => { + tracing::warn!("{n} events dropped by a subscriber"); + } + } + } + } +} diff --git a/crates/controlplane/src/main.rs b/crates/controlplane/src/main.rs index f3c99c03..7216d048 100644 --- a/crates/controlplane/src/main.rs +++ b/crates/controlplane/src/main.rs @@ -15,6 +15,7 @@ pub mod cli; pub mod db; pub mod env; pub mod error; +pub mod events; pub mod logging; pub mod persist; pub mod schema; diff --git a/crates/controlplane/src/server/actions/power.rs b/crates/controlplane/src/server/actions/power.rs index 83acb42c..bf6bdeaf 100644 --- a/crates/controlplane/src/server/actions/power.rs +++ b/crates/controlplane/src/server/actions/power.rs @@ -1,3 +1,5 @@ +use std::collections::HashSet; + use axum::{ response::{IntoResponse, Response}, Json, @@ -23,10 +25,26 @@ pub async fn online( }) .collect::>(); // TODO + let mut awaiting_agents = pending.iter().map(|a| a.0).collect::>(); + let node_map = pending_reconcile_node_map(pending.iter()); state.update_agent_states(pending).await; + use crate::events::prelude::*; + let mut subscriber = state + .events + .subscribe_on(NodeTargetIs(nodes) & EnvIs(env.id) & ReconcileComplete); + + while !awaiting_agents.is_empty() { + // TODO: expire after some time + if let Ok(event) = subscriber.next().await { + if let Some(agent) = event.agent { + awaiting_agents.remove(&agent); + } + } + } + Json(node_map).into_response() } diff --git a/crates/controlplane/src/server/rpc.rs b/crates/controlplane/src/server/rpc.rs index 598539fd..798300ac 100644 --- a/crates/controlplane/src/server/rpc.rs +++ b/crates/controlplane/src/server/rpc.rs @@ -26,6 +26,7 @@ use tracing::warn; use crate::{ error::StateError, + events::{Event, EventKind}, state::{AddrMap, AgentAddrs, AppState}, }; @@ -146,6 +147,10 @@ impl ControlService for ControlRpcServer { update_time: Utc::now(), }; + self.state + .events + .emit(Event::new(EventKind::Block(info.clone())).with_agent(&agent)); + agent.status.block_info = Some(info.clone()); let agent_id = agent.id(); let client = agent.client_owned().clone(); @@ -200,7 +205,10 @@ impl ControlService for ControlRpcServer { return; }; - agent.status.node_status = status; + agent.status.node_status = status.clone(); + self.state + .events + 
.emit(Event::new(EventKind::NodeStatus(status)).with_agent(&agent)); } async fn post_reconcile_status( @@ -212,8 +220,21 @@ impl ControlService for ControlRpcServer { return; }; - // TODO: pipe these status updates to some event stream - agent.status.reconcile = Some((Instant::now(), status)); + agent.status.reconcile = Some((Instant::now(), status.clone())); + + // Emit events for this reconcile + + let ev = Event::new(EventKind::ReconcileComplete).with_agent(&agent); + let is_complete = status.as_ref().is_ok_and(|e| e.inner.is_some()); + + self.state.events.emit(ev.replace_kind(match status { + Ok(res) => EventKind::Reconcile(res), + Err(err) => EventKind::ReconcileError(err), + })); + + if is_complete { + self.state.events.emit(ev); + } } } diff --git a/crates/controlplane/src/server/websocket.rs b/crates/controlplane/src/server/websocket.rs index 16268b8c..25bf9c38 100644 --- a/crates/controlplane/src/server/websocket.rs +++ b/crates/controlplane/src/server/websocket.rs @@ -26,6 +26,7 @@ use tracing::{error, info, warn}; use super::{jwt::Claims, rpc::ControlRpcServer}; use crate::{ + events::{Event, EventKind}, server::{ jwt::JWT_SECRET, rpc::{MuxedMessageIncoming, MuxedMessageOutgoing}, @@ -134,6 +135,9 @@ async fn handle_socket( warn!("Connecting agent {id} is trying to identify with an invalid nonce"); break 'reconnect; } + state + .events + .emit(Event::new(EventKind::AgentConnected).with_agent(&agent)); match agent.env() { Some(env) if !state.envs.contains_key(&env) => { @@ -244,20 +248,26 @@ async fn handle_socket( error!("failed to save agent {id} to the database: {e}"); } - if !is_ip_change && !is_port_change { - return; + let handshake_event = Event::new(EventKind::AgentHandshakeComplete).with_agent(&agent); + + 'peer_update: { + if !is_ip_change && !is_port_change { + break 'peer_update; + } + let Some(env_id) = agent.env() else { + break 'peer_update; + }; + drop(agent); + let Some(env) = state2.get_env(env_id) else { + break 'peer_update; + }; + + info!("Agent {id} updated its network addresses... Submitting changes to associated peers"); + env.update_peer_addr(&state2, id, is_port_change, is_ip_change) + .await; } - let Some(env_id) = agent.env() else { - return; - }; - drop(agent); - let Some(env) = state2.get_env(env_id) else { - return; - }; - info!("Agent {id} updated its network addresses... 
Submitting changes to associated peers"); - env.update_peer_addr(&state2, id, is_port_change, is_ip_change) - .await; + state2.events.emit(handshake_event); }); // set up the server, for incoming RPC requests @@ -356,6 +366,10 @@ async fn handle_socket( // remove the client from the agent in the agent pool if let Some(mut agent) = state.pool.get_mut(&id) { agent.mark_disconnected(); + + state + .events + .emit(Event::new(EventKind::AgentDisconnected).with_agent(&agent)); } info!("Agent {id} disconnected"); diff --git a/crates/controlplane/src/state/global.rs b/crates/controlplane/src/state/global.rs index 3594dd33..9338be7b 100644 --- a/crates/controlplane/src/state/global.rs +++ b/crates/controlplane/src/state/global.rs @@ -25,6 +25,7 @@ use crate::{ db::Database, env::{cache::NetworkCache, error::EnvRequestError, Environment, PortType}, error::StateError, + events::Events, schema::storage::{LoadedStorage, STORAGE_DIR}, server::error::StartError, ReloadHandler, @@ -44,6 +45,7 @@ pub struct GlobalState { pub storage: StorageMap, pub envs: EnvMap, pub env_network_cache: OpaqueDebug>, + pub events: Events, pub prometheus: OpaqueDebug>, @@ -94,6 +96,7 @@ impl GlobalState { pool, storage, envs: EnvMap::default(), + events: Default::default(), prometheus: OpaqueDebug(prometheus), db: OpaqueDebug(db), env_network_cache: Default::default(), From 606b9073093f75603ede7fc459466133190e055c Mon Sep 17 00:00:00 2001 From: Meshiest Date: Thu, 28 Nov 2024 03:52:43 -0500 Subject: [PATCH 27/68] fix(agent): fix not broadcasting reconcile status, fix some missing requeues --- crates/agent/src/reconcile/agent.rs | 18 ++++++++++++++++-- 1 file changed, 16 insertions(+), 2 deletions(-) diff --git a/crates/agent/src/reconcile/agent.rs b/crates/agent/src/reconcile/agent.rs index 95a8dc4c..2520ad38 100644 --- a/crates/agent/src/reconcile/agent.rs +++ b/crates/agent/src/reconcile/agent.rs @@ -8,6 +8,7 @@ use snops_common::{ rpc::error::ReconcileError, state::{AgentState, HeightRequest, ReconcileCondition, TransferId}, }; +use tarpc::context; use tokio::{ select, sync::{mpsc::Receiver, Mutex}, @@ -146,6 +147,15 @@ impl AgentStateReconciler { trace!("Reconciling agent state..."); let res = self.reconcile().await; + if let Some(client) = self.state.get_ws_client().await { + let res = res.clone(); + // TODO: throttle this broadcast + tokio::spawn(async move { + if let Err(e) = client.post_reconcile_status(context::current(), res).await { + error!("failed to post reconcile status: {e}"); + } + }); + } match res { Ok(status) => { if status.inner.is_some() { @@ -389,10 +399,14 @@ impl Reconcile<(), ReconcileError> for AgentStateReconciler { let process = ProcessContext::new(command)?; self.context.process = Some(process); - return Ok(ReconcileStatus::default().add_scope("agent_state/starting")); + return Ok(ReconcileStatus::empty() + .add_scope("agent_state/starting") + .requeue_after(Duration::from_secs(1))); } - Ok(ReconcileStatus::empty()) + Ok(ReconcileStatus::empty() + .add_scope("agent_state/edge_case") + .requeue_after(Duration::from_secs(1))) } } From 49d7b57c85e812e4e052a58dc8f75ec91f6eb19d Mon Sep 17 00:00:00 2001 From: Meshiest Date: Thu, 28 Nov 2024 03:56:31 -0500 Subject: [PATCH 28/68] feat(controlplane): event stream tests and quality of life --- crates/controlplane/src/events/filter.rs | 32 +---- crates/controlplane/src/events/filter_ops.rs | 26 +++- crates/controlplane/src/events/mod.rs | 9 +- crates/controlplane/src/events/models.rs | 59 ++++++++- crates/controlplane/src/events/stream.rs | 46 +++++-- 
crates/controlplane/src/events/test_filter.rs | 120 ++++++++++++++++++ .../src/events/test_filter_ops.rs | 79 ++++++++++++ crates/controlplane/src/events/test_stream.rs | 40 ++++++ crates/controlplane/src/server/rpc.rs | 8 +- crates/controlplane/src/server/websocket.rs | 8 +- 10 files changed, 371 insertions(+), 56 deletions(-) create mode 100644 crates/controlplane/src/events/test_filter.rs create mode 100644 crates/controlplane/src/events/test_filter_ops.rs create mode 100644 crates/controlplane/src/events/test_stream.rs diff --git a/crates/controlplane/src/events/filter.rs b/crates/controlplane/src/events/filter.rs index 4a1019cf..c6b9f6bf 100644 --- a/crates/controlplane/src/events/filter.rs +++ b/crates/controlplane/src/events/filter.rs @@ -1,32 +1,4 @@ -use super::{Event, EventFilter, EventKind, EventKindFilter}; - -impl EventKind { - pub fn filter(&self, filter: &EventKindFilter) -> bool { - matches!( - (self, filter), - (EventKind::AgentConnected, EventKindFilter::AgentConnected) - | ( - EventKind::AgentHandshakeComplete, - EventKindFilter::AgentHandshakeComplete - ) - | ( - EventKind::AgentDisconnected, - EventKindFilter::AgentDisconnected - ) - | ( - EventKind::ReconcileComplete, - EventKindFilter::ReconcileComplete - ) - | (EventKind::Reconcile(_), EventKindFilter::Reconcile) - | ( - EventKind::ReconcileError(_), - EventKindFilter::ReconcileError - ) - | (EventKind::NodeStatus(_), EventKindFilter::NodeStatus) - | (EventKind::Block(_), EventKindFilter::Block) - ) - } -} +use super::{Event, EventFilter}; impl Event { pub fn matches(&self, filter: &EventFilter) -> bool { @@ -38,7 +10,7 @@ impl Event { EventFilter::Not(f) => !self.matches(f), EventFilter::AgentIs(agent) => self.agent == Some(*agent), EventFilter::EnvIs(env) => self.env == Some(*env), - EventFilter::EventIs(kind) => self.kind.filter(kind), + EventFilter::EventIs(kind) => self.kind.filter() == *kind, EventFilter::NodeKeyIs(node_key) => self.node_key.as_ref() == Some(node_key), EventFilter::NodeTargetIs(node_targets) => self .node_key diff --git a/crates/controlplane/src/events/filter_ops.rs b/crates/controlplane/src/events/filter_ops.rs index 001f8d08..572c954d 100644 --- a/crates/controlplane/src/events/filter_ops.rs +++ b/crates/controlplane/src/events/filter_ops.rs @@ -1,4 +1,4 @@ -use super::{EventFilter, EventKind, EventKindFilter}; +use super::{EventFilter, EventKindFilter}; impl std::ops::BitAnd for EventFilter { type Output = Self; @@ -104,6 +104,14 @@ impl std::ops::BitAnd for EventKindFilter { } } +impl std::ops::BitXor for EventKindFilter { + type Output = EventFilter; + + fn bitxor(self, rhs: EventFilter) -> Self::Output { + EventFilter::EventIs(self) ^ rhs + } +} + impl std::ops::BitOr for EventFilter { type Output = EventFilter; @@ -120,6 +128,14 @@ impl std::ops::BitAnd for EventFilter { } } +impl std::ops::BitXor for EventFilter { + type Output = EventFilter; + + fn bitxor(self, rhs: EventKindFilter) -> Self::Output { + self ^ EventFilter::EventIs(rhs) + } +} + impl std::ops::BitOr for EventKindFilter { type Output = EventFilter; @@ -135,3 +151,11 @@ impl std::ops::BitAnd for EventKindFilter { EventFilter::EventIs(self) & EventFilter::EventIs(rhs) } } + +impl std::ops::BitXor for EventKindFilter { + type Output = EventFilter; + + fn bitxor(self, rhs: Self) -> Self::Output { + EventFilter::EventIs(self) ^ EventFilter::EventIs(rhs) + } +} diff --git a/crates/controlplane/src/events/mod.rs b/crates/controlplane/src/events/mod.rs index 29a72365..675d4556 100644 --- a/crates/controlplane/src/events/mod.rs 
+++ b/crates/controlplane/src/events/mod.rs @@ -5,11 +5,16 @@ pub use stream::*; mod filter; mod filter_ops; -pub use filter::*; pub mod prelude { - pub use super::filter::*; pub use super::models::EventFilter::*; pub use super::models::EventKindFilter::*; pub use super::models::*; } + +#[cfg(test)] +mod test_filter; +#[cfg(test)] +mod test_filter_ops; +#[cfg(test)] +mod test_stream; diff --git a/crates/controlplane/src/events/models.rs b/crates/controlplane/src/events/models.rs index 184e6c06..5c08eda7 100644 --- a/crates/controlplane/src/events/models.rs +++ b/crates/controlplane/src/events/models.rs @@ -38,7 +38,23 @@ pub enum EventKind { Block(LatestBlockInfo), } -#[derive(Clone, Copy, Debug)] +impl EventKind { + pub fn filter(&self) -> EventKindFilter { + match self { + EventKind::AgentConnected => EventKindFilter::AgentConnected, + EventKind::AgentHandshakeComplete => EventKindFilter::AgentHandshakeComplete, + EventKind::AgentDisconnected => EventKindFilter::AgentDisconnected, + EventKind::ReconcileComplete => EventKindFilter::ReconcileComplete, + EventKind::Reconcile(_) => EventKindFilter::Reconcile, + EventKind::ReconcileError(_) => EventKindFilter::ReconcileError, + EventKind::NodeStatus(_) => EventKindFilter::NodeStatus, + EventKind::Block(_) => EventKindFilter::Block, + } + } +} + +#[derive(Clone, Copy, Debug, PartialEq)] +#[repr(u8)] pub enum EventKindFilter { AgentConnected, AgentHandshakeComplete, @@ -50,6 +66,7 @@ pub enum EventKindFilter { Block, } +#[derive(Clone, Debug, PartialEq)] pub enum EventFilter { /// No filter Unfiltered, @@ -110,3 +127,43 @@ impl Event { self } } + +impl From for EventFilter { + fn from(kind: EventKindFilter) -> Self { + EventFilter::EventIs(kind) + } +} + +impl EventKind { + pub fn event(self) -> Event { + Event::new(self) + } + + pub fn with_agent(self, agent: &Agent) -> Event { + let mut event = Event::new(self); + event.agent = Some(agent.id); + if let AgentState::Node(env_id, node) = &agent.state { + event.node_key = Some(node.node_key.clone()); + event.env = Some(*env_id); + } + event + } + + pub fn with_agent_id(self, agent_id: AgentId) -> Event { + let mut event = Event::new(self); + event.agent = Some(agent_id); + event + } + + pub fn with_node_key(self, node_key: NodeKey) -> Event { + let mut event = Event::new(self); + event.node_key = Some(node_key); + event + } + + pub fn with_env_id(self, env_id: EnvId) -> Event { + let mut event = Event::new(self); + event.env = Some(env_id); + event + } +} diff --git a/crates/controlplane/src/events/stream.rs b/crates/controlplane/src/events/stream.rs index 0510669a..8e59b905 100644 --- a/crates/controlplane/src/events/stream.rs +++ b/crates/controlplane/src/events/stream.rs @@ -1,4 +1,4 @@ -use std::sync::Arc; +use std::{sync::Arc, task::Poll}; use futures_util::Stream; use tokio::sync::broadcast::{self, error::TryRecvError}; @@ -33,10 +33,10 @@ impl Events { } } - pub fn subscribe_on(&self, filter: EventFilter) -> EventSubscriber { + pub fn subscribe_on(&self, filter: impl Into) -> EventSubscriber { EventSubscriber { rx: self.tx.subscribe(), - filter, + filter: filter.into(), } } } @@ -54,7 +54,31 @@ pub struct EventSubscriber { impl EventSubscriber { pub async fn next(&mut self) -> Result, broadcast::error::RecvError> { - self.rx.recv().await + loop { + match self.rx.recv().await { + Ok(event) if event.matches(&self.filter) => break Ok(event), + // skip events that don't match the filter + Ok(_) => continue, + Err(e) => break Err(e), + } + } + } + + pub fn collect_many(&mut self) -> Vec> { + 
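// Drain every event currently buffered on the channel without awaiting,
+        // keeping only events that match this subscriber's filter. Lagged
+        // (dropped) events are logged and skipped rather than returned.
+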
let mut events = Vec::new(); + loop { + match self.rx.try_recv() { + Ok(event) if event.matches(&self.filter) => events.push(event), + // skip events that don't match the filter + Ok(_) => continue, + Err(TryRecvError::Closed) => break, + Err(TryRecvError::Empty) => break, + Err(TryRecvError::Lagged(n)) => { + tracing::warn!("{n} events dropped by a subscriber"); + } + } + } + events } } @@ -64,20 +88,14 @@ impl Stream for EventSubscriber { fn poll_next( mut self: std::pin::Pin<&mut Self>, _cx: &mut std::task::Context<'_>, - ) -> std::task::Poll> { + ) -> Poll> { loop { match self.rx.try_recv() { - Ok(event) if event.matches(&self.filter) => { - return std::task::Poll::Ready(Some(event)); - } + Ok(event) if event.matches(&self.filter) => break Poll::Ready(Some(event)), // skip events that don't match the filter Ok(_) => continue, - Err(TryRecvError::Closed) => { - return std::task::Poll::Ready(None); - } - Err(TryRecvError::Empty) => { - return std::task::Poll::Pending; - } + Err(TryRecvError::Closed) => break Poll::Ready(None), + Err(TryRecvError::Empty) => break Poll::Pending, Err(TryRecvError::Lagged(n)) => { tracing::warn!("{n} events dropped by a subscriber"); } diff --git a/crates/controlplane/src/events/test_filter.rs b/crates/controlplane/src/events/test_filter.rs new file mode 100644 index 00000000..d431186e --- /dev/null +++ b/crates/controlplane/src/events/test_filter.rs @@ -0,0 +1,120 @@ +use std::str::FromStr; + +use chrono::Utc; +use lazy_static::lazy_static; +use snops_common::node_targets::NodeTargets; +use snops_common::rpc::error::ReconcileError; +use snops_common::state::InternedId; +use snops_common::state::LatestBlockInfo; +use snops_common::state::NodeKey; +use snops_common::state::NodeStatus; +use snops_common::state::ReconcileStatus; + +use super::EventFilter::*; +use super::EventKind::*; +use super::EventKindFilter as EKF; +use crate::events::Event; + +lazy_static! 
{ + static ref A: InternedId = InternedId::from_str("a").unwrap(); + static ref B: InternedId = InternedId::from_str("b").unwrap(); + static ref C: InternedId = InternedId::from_str("c").unwrap(); + static ref D: InternedId = InternedId::from_str("d").unwrap(); +} + +#[test] +fn test_unfiltered() { + assert!(AgentConnected.event().matches(&Unfiltered)); + assert!(AgentHandshakeComplete.event().matches(&Unfiltered)); + assert!(AgentDisconnected.event().matches(&Unfiltered)); + assert!(ReconcileComplete.event().matches(&Unfiltered)); + assert!(Reconcile(ReconcileStatus::empty()) + .event() + .matches(&Unfiltered)); + assert!(ReconcileError(ReconcileError::Offline) + .event() + .matches(&Unfiltered)); + assert!(NodeStatus(NodeStatus::Unknown).event().matches(&Unfiltered)); + assert!(Block(LatestBlockInfo::default()) + .event() + .matches(&Unfiltered)); +} + +#[test] +fn test_all_of() { + assert!(AgentConnected + .event() + .matches(&AllOf(vec![EventIs(EKF::AgentConnected)]))); + + let e = Event { + created_at: Utc::now(), + agent: Some(*A), + node_key: Some(NodeKey::from_str("client/foo").unwrap()), + env: Some(*B), + kind: AgentConnected, + }; + + assert!(e.matches(&(EKF::AgentConnected & AgentIs(*A)))); + assert!(e.matches(&(EKF::AgentConnected & NodeKeyIs(NodeKey::from_str("client/foo").unwrap())))); + assert!(e.matches(&(EKF::AgentConnected & EnvIs(*B)))); + assert!(e.matches(&(AgentIs(*A) & NodeTargetIs(NodeTargets::ALL) & EnvIs(*B)))); + + assert!(!e.matches(&(EKF::AgentConnected & AgentIs(*B)))); + assert!( + !e.matches(&(EKF::AgentConnected & NodeKeyIs(NodeKey::from_str("client/bar").unwrap()))) + ); + assert!(!e.matches(&(EKF::AgentConnected & EnvIs(*A)))); + assert!(!e.matches(&(AgentIs(*B) & NodeTargetIs(NodeTargets::ALL) & EnvIs(*B)))); +} + +#[test] +fn test_any_of() { + assert!(AgentConnected + .event() + .matches(&AnyOf(vec![EventIs(EKF::AgentConnected)]))); + + let e = Event { + created_at: Utc::now(), + agent: Some(*A), + node_key: Some(NodeKey::from_str("client/foo").unwrap()), + env: Some(*B), + kind: AgentConnected, + }; + + assert!(e.matches(&(EKF::AgentConnected | AgentIs(*A)))); + assert!(e.matches(&(EKF::AgentConnected | NodeKeyIs(NodeKey::from_str("client/foo").unwrap())))); + assert!(e.matches(&(EKF::AgentConnected | EnvIs(*B)))); + assert!(e.matches(&(AgentIs(*A) | NodeTargetIs(NodeTargets::ALL) | EnvIs(*B)))); + + assert!(e.matches(&(EKF::AgentConnected | AgentIs(*B)))); + assert!(e.matches(&(EKF::AgentConnected | NodeKeyIs(NodeKey::from_str("client/bar").unwrap())))); + assert!(e.matches(&(EKF::AgentConnected | EnvIs(*A)))); + + assert!(e.matches(&(AgentIs(*B) | NodeTargetIs(NodeTargets::ALL) | EnvIs(*B)))); + + assert!(!e.matches(&(EKF::AgentDisconnected | AgentIs(*C)))); + assert!( + !e.matches(&(EKF::AgentDisconnected | NodeKeyIs(NodeKey::from_str("client/bar").unwrap()))) + ); +} + +#[test] +fn test_one_of() { + assert!(AgentConnected + .event() + .matches(&OneOf(vec![EventIs(EKF::AgentConnected)]))); + + let e = Event { + created_at: Utc::now(), + agent: Some(*A), + node_key: Some(NodeKey::from_str("client/foo").unwrap()), + env: Some(*B), + kind: AgentConnected, + }; + + assert!(e.matches(&(EKF::AgentConnected ^ AgentIs(*B)))); + assert!(e.matches(&(EKF::AgentConnected & (AgentIs(*A) ^ AgentIs(*B) ^ AgentIs(*C))))); + + assert!(!e.matches(&(EKF::AgentConnected ^ AgentIs(*A)))); + assert!(e.matches(&(!(EKF::AgentConnected ^ AgentIs(*A))))); +} diff --git a/crates/controlplane/src/events/test_filter_ops.rs b/crates/controlplane/src/events/test_filter_ops.rs 
new file mode 100644 index 00000000..179f148f --- /dev/null +++ b/crates/controlplane/src/events/test_filter_ops.rs @@ -0,0 +1,79 @@ +use std::str::FromStr; + +use lazy_static::lazy_static; +use snops_common::state::InternedId; + +use super::EventFilter::*; +use super::EventKindFilter::*; + +lazy_static! { + static ref A: InternedId = InternedId::from_str("a").unwrap(); + static ref B: InternedId = InternedId::from_str("b").unwrap(); + static ref C: InternedId = InternedId::from_str("c").unwrap(); + static ref D: InternedId = InternedId::from_str("d").unwrap(); +} + +#[test] +fn test_filter_bitand() { + assert_eq!(Unfiltered & Unfiltered, Unfiltered); + assert_eq!(Block & Unfiltered, EventIs(Block)); + assert_eq!( + Block & AgentIs(*A), + AllOf(vec![EventIs(Block), AgentIs(*A)]) + ); + assert_eq!( + AgentIs(*A) & AgentIs(*B), + AllOf(vec![AgentIs(*A), AgentIs(*B)]) + ); + assert_eq!( + AgentIs(*A) & AgentIs(*B) & AgentIs(*C), + AllOf(vec![AgentIs(*A), AgentIs(*B), AgentIs(*C)]) + ); +} + +#[test] +fn test_filter_bitor() { + assert_eq!(Unfiltered | Unfiltered, Unfiltered); + assert_eq!(Block | Unfiltered, Unfiltered); + assert_eq!( + Block | AgentIs(*A), + AnyOf(vec![EventIs(Block), AgentIs(*A)]) + ); + assert_eq!( + AgentIs(*A) | AgentIs(*B), + AnyOf(vec![AgentIs(*A), AgentIs(*B)]) + ); + assert_eq!( + AgentIs(*A) | AgentIs(*B) | AgentIs(*C), + AnyOf(vec![AgentIs(*A), AgentIs(*B), AgentIs(*C)]) + ); +} + +#[test] +fn test_filter_bitxor() { + assert_eq!(Unfiltered ^ Unfiltered, Unfiltered); + assert_eq!(Block ^ Unfiltered, EventIs(Block)); + assert_eq!( + Block ^ AgentIs(*A), + OneOf(vec![EventIs(Block), AgentIs(*A)]) + ); + assert_eq!( + AgentIs(*A) ^ AgentIs(*B), + OneOf(vec![AgentIs(*A), AgentIs(*B)]) + ); + assert_eq!( + AgentIs(*A) ^ AgentIs(*B) ^ AgentIs(*C), + OneOf(vec![AgentIs(*A), AgentIs(*B), AgentIs(*C)]) + ); +} + +#[test] +fn test_filter_not() { + assert_eq!(!Unfiltered, Not(Box::new(Unfiltered))); + assert_eq!(!Block, Not(Box::new(EventIs(Block)))); + assert_eq!(!AgentIs(*A), Not(Box::new(AgentIs(*A)))); + assert_eq!( + !AgentIs(*A) & AgentIs(*B), + AllOf(vec![Not(Box::new(AgentIs(*A))), AgentIs(*B)]) + ); +} diff --git a/crates/controlplane/src/events/test_stream.rs b/crates/controlplane/src/events/test_stream.rs new file mode 100644 index 00000000..b2ad17d5 --- /dev/null +++ b/crates/controlplane/src/events/test_stream.rs @@ -0,0 +1,40 @@ +use std::str::FromStr; + +use lazy_static::lazy_static; +use snops_common::state::InternedId; + +use super::EventFilter::*; +use super::EventKind as EK; +use super::EventKindFilter::*; +use super::Events; + +lazy_static! 
{ + static ref A: InternedId = InternedId::from_str("a").unwrap(); + static ref B: InternedId = InternedId::from_str("b").unwrap(); + static ref C: InternedId = InternedId::from_str("c").unwrap(); + static ref D: InternedId = InternedId::from_str("d").unwrap(); +} + +#[test] +fn test_stream_filtering() { + let events = Events::new(); + + let mut sub_all = events.subscribe(); + let mut sub_a = events.subscribe_on(AgentIs(*A)); + let mut sub_b = events.subscribe_on(AgentIs(*B)); + let mut sub_connected = events.subscribe_on(AgentConnected); + + assert_eq!(sub_all.collect_many().len(), 0); + assert_eq!(sub_a.collect_many().len(), 0); + assert_eq!(sub_b.collect_many().len(), 0); + assert_eq!(sub_connected.collect_many().len(), 0); + + events.emit(EK::AgentConnected.with_agent_id(*A)); + events.emit(EK::AgentDisconnected.with_agent_id(*A)); + events.emit(EK::Block(Default::default()).with_agent_id(*B)); + + assert_eq!(sub_all.collect_many().len(), 3); + assert_eq!(sub_a.collect_many().len(), 2); + assert_eq!(sub_b.collect_many().len(), 1); + assert_eq!(sub_connected.collect_many().len(), 1); +} diff --git a/crates/controlplane/src/server/rpc.rs b/crates/controlplane/src/server/rpc.rs index 798300ac..e107bd63 100644 --- a/crates/controlplane/src/server/rpc.rs +++ b/crates/controlplane/src/server/rpc.rs @@ -26,7 +26,7 @@ use tracing::warn; use crate::{ error::StateError, - events::{Event, EventKind}, + events::EventKind, state::{AddrMap, AgentAddrs, AppState}, }; @@ -149,7 +149,7 @@ impl ControlService for ControlRpcServer { self.state .events - .emit(Event::new(EventKind::Block(info.clone())).with_agent(&agent)); + .emit(EventKind::Block(info.clone()).with_agent(&agent)); agent.status.block_info = Some(info.clone()); let agent_id = agent.id(); @@ -208,7 +208,7 @@ impl ControlService for ControlRpcServer { agent.status.node_status = status.clone(); self.state .events - .emit(Event::new(EventKind::NodeStatus(status)).with_agent(&agent)); + .emit(EventKind::NodeStatus(status).with_agent(&agent)); } async fn post_reconcile_status( @@ -224,7 +224,7 @@ impl ControlService for ControlRpcServer { // Emit events for this reconcile - let ev = Event::new(EventKind::ReconcileComplete).with_agent(&agent); + let ev = EventKind::ReconcileComplete.with_agent(&agent); let is_complete = status.as_ref().is_ok_and(|e| e.inner.is_some()); self.state.events.emit(ev.replace_kind(match status { diff --git a/crates/controlplane/src/server/websocket.rs b/crates/controlplane/src/server/websocket.rs index 25bf9c38..69e9c6f8 100644 --- a/crates/controlplane/src/server/websocket.rs +++ b/crates/controlplane/src/server/websocket.rs @@ -26,7 +26,7 @@ use tracing::{error, info, warn}; use super::{jwt::Claims, rpc::ControlRpcServer}; use crate::{ - events::{Event, EventKind}, + events::EventKind, server::{ jwt::JWT_SECRET, rpc::{MuxedMessageIncoming, MuxedMessageOutgoing}, @@ -137,7 +137,7 @@ async fn handle_socket( } state .events - .emit(Event::new(EventKind::AgentConnected).with_agent(&agent)); + .emit(EventKind::AgentConnected.with_agent(&agent)); match agent.env() { Some(env) if !state.envs.contains_key(&env) => { @@ -248,7 +248,7 @@ async fn handle_socket( error!("failed to save agent {id} to the database: {e}"); } - let handshake_event = Event::new(EventKind::AgentHandshakeComplete).with_agent(&agent); + let handshake_event = EventKind::AgentHandshakeComplete.with_agent(&agent); 'peer_update: { if !is_ip_change && !is_port_change { @@ -369,7 +369,7 @@ async fn handle_socket( state .events - 
.emit(Event::new(EventKind::AgentDisconnected).with_agent(&agent)); + .emit(EventKind::AgentDisconnected.with_agent(&agent)); } info!("Agent {id} disconnected"); From 7a26c110dbfad73bd242cd2fac27748554670f08 Mon Sep 17 00:00:00 2001 From: Meshiest Date: Thu, 28 Nov 2024 03:56:51 -0500 Subject: [PATCH 29/68] fix(controlplane): online/offline/reboot actions properly wait for reconcile complete --- .../controlplane/src/server/actions/power.rs | 81 +++++++++++-------- 1 file changed, 49 insertions(+), 32 deletions(-) diff --git a/crates/controlplane/src/server/actions/power.rs b/crates/controlplane/src/server/actions/power.rs index bf6bdeaf..c5a61692 100644 --- a/crates/controlplane/src/server/actions/power.rs +++ b/crates/controlplane/src/server/actions/power.rs @@ -4,15 +4,57 @@ use axum::{ response::{IntoResponse, Response}, Json, }; -use snops_common::action_models::WithTargets; +use snops_common::{ + action_models::WithTargets, + node_targets::NodeTargets, + state::{AgentId, AgentState, EnvId}, +}; +use tracing::info; use super::Env; -use crate::state::pending_reconcile_node_map; +use crate::state::{pending_reconcile_node_map, GlobalState}; + +async fn wait_for_nodes( + state: &GlobalState, + env_id: EnvId, + nodes: NodeTargets, + pending: Vec<(AgentId, AgentState)>, +) -> Response { + let mut awaiting_agents = pending.iter().map(|a| a.0).collect::>(); + let node_map = pending_reconcile_node_map(pending.iter()); + + // create the subscriber before updating agent states in order to + // avoid missing any events + use crate::events::prelude::*; + let mut subscriber = state + .events + .subscribe_on(NodeTargetIs(nodes) & EnvIs(env_id) & ReconcileComplete); + + state.update_agent_states(pending).await; + + // wait at most 30 seconds for all agents to reconcile + let expires = tokio::time::Instant::now() + std::time::Duration::from_secs(30); + while !awaiting_agents.is_empty() { + tokio::select! { + _ = tokio::time::sleep_until(expires) => { + break; + } + Ok(event) = subscriber.next() => { + if let Some(agent) = event.agent { + awaiting_agents.remove(&agent); + } + } + } + } + + Json(node_map).into_response() +} pub async fn online( Env { env, state, .. }: Env, Json(WithTargets { nodes, .. }): Json, ) -> Response { + info!("env {} invoked online action for {nodes}", env.id); let pending = env .matching_agents(&nodes, &state.pool) .filter_map(|a| { @@ -23,35 +65,16 @@ pub async fn online( }) }) }) - .collect::>(); // TODO - - let mut awaiting_agents = pending.iter().map(|a| a.0).collect::>(); - - let node_map = pending_reconcile_node_map(pending.iter()); + .collect::>(); - state.update_agent_states(pending).await; - - use crate::events::prelude::*; - let mut subscriber = state - .events - .subscribe_on(NodeTargetIs(nodes) & EnvIs(env.id) & ReconcileComplete); - - while !awaiting_agents.is_empty() { - // TODO: expire after some time - if let Ok(event) = subscriber.next().await { - if let Some(agent) = event.agent { - awaiting_agents.remove(&agent); - } - } - } - - Json(node_map).into_response() + wait_for_nodes(&state, env.id, nodes, pending).await } pub async fn offline( Env { env, state, .. }: Env, Json(WithTargets { nodes, .. 
}): Json, ) -> Response { + info!("env {} invoked offline action for {nodes}", env.id); let pending = env .matching_agents(&nodes, &state.pool) .filter_map(|a| { @@ -62,20 +85,14 @@ pub async fn offline( }) }) }) - .collect::>(); // TODO - - let node_map = pending_reconcile_node_map(pending.iter()); - - state.update_agent_states(pending).await; + .collect::>(); - Json(node_map).into_response() + wait_for_nodes(&state, env.id, nodes, pending).await } pub async fn reboot(env: Env, json: Json) -> Response { let offline_res = offline(env.clone(), json.clone()).await; - // TODO: wait for nodes to reconcile offline - if !offline_res.status().is_success() { offline_res } else { From 83eced68d944bcf93235a88885bf6a1067f907f1 Mon Sep 17 00:00:00 2001 From: Meshiest Date: Fri, 29 Nov 2024 12:54:28 -0500 Subject: [PATCH 30/68] perf(snops): address requests use vec instead of hashset for agent id iteration --- crates/agent/src/reconcile/address.rs | 6 ++++-- crates/agent/src/state.rs | 5 ++++- crates/common/src/rpc/control/mod.rs | 9 ++------- crates/controlplane/src/server/rpc.rs | 14 +++++--------- crates/controlplane/src/state/global.rs | 11 ++++------- 5 files changed, 19 insertions(+), 26 deletions(-) diff --git a/crates/agent/src/reconcile/address.rs b/crates/agent/src/reconcile/address.rs index 42f2d5e6..9c73ceb6 100644 --- a/crates/agent/src/reconcile/address.rs +++ b/crates/agent/src/reconcile/address.rs @@ -23,7 +23,7 @@ impl Reconcile<(), ReconcileError> for AddressResolveReconciler { let AddressResolveReconciler { state, node } = self; // Find agents that do not have cached addresses - let unresolved_addrs: HashSet = { + let unresolved_addrs: Vec = { let resolved_addrs = state.resolved_addrs.read().await; node.peers .iter() @@ -35,11 +35,13 @@ impl Reconcile<(), ReconcileError> for AddressResolveReconciler { None } }) + // Ensure we only have unique agent ids (can use itertools down the line) + .collect::>() + .into_iter() .collect() }; // All addrs have been resolved. - // TODO: May need to mark some of these as stale at some point. if unresolved_addrs.is_empty() { return Ok(ReconcileStatus::default()); } diff --git a/crates/agent/src/state.rs b/crates/agent/src/state.rs index f19b1555..6e8a4660 100644 --- a/crates/agent/src/state.rs +++ b/crates/agent/src/state.rs @@ -183,7 +183,10 @@ impl GlobalState { None } }) - .collect::>(); + // Ensure we only have unique agent ids (can use itertools down the line) + .collect::>() + .into_iter() + .collect::>(); if peer_ids.is_empty() { return; diff --git a/crates/common/src/rpc/control/mod.rs b/crates/common/src/rpc/control/mod.rs index 38d76b4a..4e83ed02 100644 --- a/crates/common/src/rpc/control/mod.rs +++ b/crates/common/src/rpc/control/mod.rs @@ -1,9 +1,6 @@ pub mod agent; -use std::{ - collections::{HashMap, HashSet}, - net::IpAddr, -}; +use std::{collections::HashMap, net::IpAddr}; use super::error::{ReconcileError, ResolveError}; use crate::{ @@ -16,9 +13,7 @@ pub const PING_HEADER: &[u8] = b"snops-agent"; #[tarpc::service] pub trait ControlService { /// Resolve the addresses of the given agents. - async fn resolve_addrs( - peers: HashSet, - ) -> Result, ResolveError>; + async fn resolve_addrs(peers: Vec) -> Result, ResolveError>; /// Get the environment info for the given environment. 
async fn get_env_info(env_id: EnvId) -> Option; diff --git a/crates/controlplane/src/server/rpc.rs b/crates/controlplane/src/server/rpc.rs index e107bd63..90ca36a3 100644 --- a/crates/controlplane/src/server/rpc.rs +++ b/crates/controlplane/src/server/rpc.rs @@ -1,8 +1,4 @@ -use std::{ - collections::{HashMap, HashSet}, - net::IpAddr, - time::Instant, -}; +use std::{collections::HashMap, net::IpAddr, time::Instant}; use chrono::Utc; use snops_common::{ @@ -45,13 +41,13 @@ impl ControlService for ControlRpcServer { async fn resolve_addrs( self, _: context::Context, - mut peers: HashSet, + mut peers: Vec, ) -> Result, ResolveError> { - peers.insert(self.agent); + peers.push(self.agent); let addr_map = self .state - .get_addr_map(Some(&peers)) + .get_addr_map(&peers) .await .map_err(|_| ResolveError::AgentHasNoAddresses)?; resolve_addrs(&addr_map, self.agent, &peers).map_err(|_| ResolveError::SourceAgentNotFound) @@ -259,7 +255,7 @@ pub fn resolve_one_addr(src_addrs: &AgentAddrs, target_addrs: &AgentAddrs) -> Op fn resolve_addrs( addr_map: &AddrMap, src: AgentId, - peers: &HashSet, + peers: &[AgentId], ) -> Result, StateError> { let src_addrs = addr_map .get(&src) diff --git a/crates/controlplane/src/state/global.rs b/crates/controlplane/src/state/global.rs index 9338be7b..f213f373 100644 --- a/crates/controlplane/src/state/global.rs +++ b/crates/controlplane/src/state/global.rs @@ -1,4 +1,4 @@ -use std::{collections::HashSet, fmt::Display, net::SocketAddr, path::PathBuf, sync::Arc}; +use std::{fmt::Display, net::SocketAddr, path::PathBuf, sync::Arc}; use chrono::Utc; use dashmap::DashMap; @@ -162,13 +162,10 @@ impl GlobalState { /// Get a peer-to-addr mapping for a set of agents /// Locks pools for reading - pub async fn get_addr_map( - &self, - filter: Option<&HashSet>, - ) -> Result { - self.pool + pub async fn get_addr_map(&self, filter: &[AgentId]) -> Result { + filter .iter() - .filter(|agent| filter.is_none() || filter.is_some_and(|p| p.contains(&agent.id()))) + .filter_map(|id| self.pool.get(id)) .map(|agent| { let addrs = agent .addrs From da53e3494ef158846c4e2f92edc1833d39d76734 Mon Sep 17 00:00:00 2001 From: Meshiest Date: Fri, 29 Nov 2024 12:54:56 -0500 Subject: [PATCH 31/68] chore: remove complete todos --- crates/agent/src/api.rs | 2 -- crates/agent/src/client.rs | 4 +++- crates/agent/src/metrics/mod.rs | 1 - crates/agent/src/reconcile/agent.rs | 2 -- crates/agent/src/reconcile/storage.rs | 2 +- crates/agent/src/rpc/control.rs | 23 ++++++++----------- crates/aot/src/ledger/query.rs | 4 ++++ crates/aot/src/ledger/truncate.rs | 2 -- crates/controlplane/src/db.rs | 1 - crates/controlplane/src/persist/env.rs | 3 +-- crates/controlplane/src/schema/storage/mod.rs | 8 ++----- 11 files changed, 20 insertions(+), 32 deletions(-) diff --git a/crates/agent/src/api.rs b/crates/agent/src/api.rs index ab6cfcc1..b749f3f0 100644 --- a/crates/agent/src/api.rs +++ b/crates/agent/src/api.rs @@ -135,8 +135,6 @@ pub async fn check_binary( tokio::fs::set_permissions(path, std::fs::Permissions::from_mode(0o755)).await?; } - // TODO: check sha256 and size - return Ok(()); } info!("downloading binary update to {}: {binary}", path.display()); diff --git a/crates/agent/src/client.rs b/crates/agent/src/client.rs index 3d455fc4..103552cc 100644 --- a/crates/agent/src/client.rs +++ b/crates/agent/src/client.rs @@ -70,7 +70,9 @@ pub async fn ws_connection(ws_req: Request, state: Arc) { // Clear old info cache. 
we will get new info from the control plane state.set_env_info(None).await; - // TODO: fetch latest info from controlplane rather than clearing + if let Err(e) = state.db.set_env_info(None) { + error!("failed to clear old env info cache: {e}"); + } // create rpc channels let (client_response_in, client_transport, mut client_request_out) = RpcTransport::new(); diff --git a/crates/agent/src/metrics/mod.rs b/crates/agent/src/metrics/mod.rs index 8a90c3b2..2f9e9b2f 100644 --- a/crates/agent/src/metrics/mod.rs +++ b/crates/agent/src/metrics/mod.rs @@ -31,7 +31,6 @@ pub fn init(state: Arc) { continue; } - // TODO: maybe this should use bind_addr let metrics_text = 'metrics: { let response = match client.get(&route).send().await { Ok(response) => response, diff --git a/crates/agent/src/reconcile/agent.rs b/crates/agent/src/reconcile/agent.rs index 2520ad38..c86e0b21 100644 --- a/crates/agent/src/reconcile/agent.rs +++ b/crates/agent/src/reconcile/agent.rs @@ -176,8 +176,6 @@ impl AgentStateReconciler { next_reconcile_at = Instant::now() + Duration::from_secs(err_backoff); } } - - // TODO: announce reconcile status to the server, throttled } } diff --git a/crates/agent/src/reconcile/storage.rs b/crates/agent/src/reconcile/storage.rs index e691204b..ebaa74a1 100644 --- a/crates/agent/src/reconcile/storage.rs +++ b/crates/agent/src/reconcile/storage.rs @@ -462,7 +462,7 @@ impl<'a> Reconcile<(), ReconcileError> for LedgerReconciler<'a> { // moment... Ok(false) => { error!("ledger modification to height {} failed", target_height.1); - // TODO: handle this failure + // TODO: handle this failure.. maybe even by deleting the ledger } // Bubble an actual error up to the caller Err(err) => { diff --git a/crates/agent/src/rpc/control.rs b/crates/agent/src/rpc/control.rs index 283bed54..8999d305 100644 --- a/crates/agent/src/rpc/control.rs +++ b/crates/agent/src/rpc/control.rs @@ -1,10 +1,9 @@ //! Control plane-to-agent RPC. -use std::{net::IpAddr, path::PathBuf}; +use std::net::IpAddr; use snops_common::{ aot_cmds::AotCmd, - binaries::{BinaryEntry, BinarySource}, define_rpc_mux, prelude::snarkos_status::SnarkOSLiteBlock, rpc::{ @@ -22,7 +21,9 @@ use snops_common::{ use tarpc::context::Context; use tracing::{error, info, trace}; -use crate::{api, log::make_env_filter, metrics::MetricComputer, state::AppState}; +use crate::{ + api, log::make_env_filter, metrics::MetricComputer, reconcile::default_binary, state::AppState, +}; define_rpc_mux!(child; ControlServiceRequest => ControlServiceResponse; @@ -222,11 +223,12 @@ impl AgentService for AgentRpcServer { query: String, auth: String, ) -> Result { - info!("executing authorization..."); + info!("Executing authorization for {env_id}..."); // TODO: maybe in the env config store a branch label for the binary so it won't // be put in storage and won't overwrite itself + // TODO: compute agents wiping out env info when alternating environments let info = self .state .get_env_info(env_id) @@ -239,14 +241,7 @@ impl AgentService for AgentRpcServer { .path .join(format!("snarkos-aot-{env_id}-compute")); - let default_entry = BinaryEntry { - source: BinarySource::Path(PathBuf::from(format!( - "/content/storage/{}/{}/binaries/default", - info.network, info.storage.id, - ))), - sha256: None, - size: None, - }; + let default_entry = default_binary(&info); // download the snarkOS binary api::check_binary( @@ -261,7 +256,7 @@ impl AgentService for AgentRpcServer { &self.state.endpoint, &aot_bin, self.state.transfer_tx(), - ) // TODO: http(s)? 
+ ) .await .map_err(|e| { error!("failed obtain runner binary: {e}"); @@ -278,7 +273,7 @@ impl AgentService for AgentRpcServer { { Ok(exec) => { let elapsed = start.elapsed().as_millis(); - info!("authorization executed in {elapsed}ms"); + info!("Authorization executed in {elapsed}ms"); trace!("authorization output: {exec}"); Ok(exec) } diff --git a/crates/aot/src/ledger/query.rs b/crates/aot/src/ledger/query.rs index e38db79d..bdf9cc6c 100644 --- a/crates/aot/src/ledger/query.rs +++ b/crates/aot/src/ledger/query.rs @@ -91,6 +91,10 @@ impl LedgerQuery { &format!("/{network}/latest/stateRoot"), get(Self::latest_state_root), ) + .route( + &format!("/{network}/stateRoot/latest"), + get(Self::latest_state_root), + ) .route( &format!("/{network}/block/height/latest"), get(Self::latest_height), diff --git a/crates/aot/src/ledger/truncate.rs b/crates/aot/src/ledger/truncate.rs index a04cf735..e243568f 100644 --- a/crates/aot/src/ledger/truncate.rs +++ b/crates/aot/src/ledger/truncate.rs @@ -32,8 +32,6 @@ pub struct Replay { /// When checkpoint is enabled, checkpoints. #[arg(short, long, default_value_t = false)] checkpoint: bool, - // TODO: duration based truncation (blocks within a duration before now) - // TODO: timestamp based truncation (blocks after a certain date) } /// A command to truncate the ledger to a specific height. diff --git a/crates/controlplane/src/db.rs b/crates/controlplane/src/db.rs index 1ea705e4..870eccfa 100644 --- a/crates/controlplane/src/db.rs +++ b/crates/controlplane/src/db.rs @@ -40,7 +40,6 @@ pub struct Database { pub(crate) tx_index: DbTree, /// Number of attempts for the transaction's current state pub(crate) tx_attempts: DbTree, - // TODO: tx_attempts for tracking retries (of broadcast and execution) } impl DatabaseTrait for Database { diff --git a/crates/controlplane/src/persist/env.rs b/crates/controlplane/src/persist/env.rs index f48f8e2b..4dee32d8 100644 --- a/crates/controlplane/src/persist/env.rs +++ b/crates/controlplane/src/persist/env.rs @@ -34,7 +34,6 @@ pub struct PersistEnv { /// List of nodes and their states or external node info pub nodes: Vec<(NodeKey, PersistNode)>, /// Loaded cannon configs in this env - /// TODO: persist cannon pub cannons: Vec<(CannonId, TxSource, TxSink)>, } @@ -197,7 +196,7 @@ impl DataFormat for PersistEnv { type Header = PersistEnvFormatHeader; const LATEST_HEADER: Self::Header = PersistEnvFormatHeader { version: 1, - nodes: PersistNode::LATEST_HEADER, // TODO: use PersistNode::LATEST_HEADER + nodes: PersistNode::LATEST_HEADER, tx_source: TxSource::LATEST_HEADER, tx_sink: TxSink::LATEST_HEADER, network: NetworkId::LATEST_HEADER, diff --git a/crates/controlplane/src/schema/storage/mod.rs b/crates/controlplane/src/schema/storage/mod.rs index ed345169..94fbdcc9 100644 --- a/crates/controlplane/src/schema/storage/mod.rs +++ b/crates/controlplane/src/schema/storage/mod.rs @@ -74,7 +74,8 @@ pub struct StorageGeneration { pub transactions: Vec, } -// TODO: I don't know what this type should look like +// TODO: Convert this into a struct similar to the execute action, then use +// compute agents to assemble these on the fly #[derive(Deserialize, Debug, Clone, Serialize)] pub struct Transaction { pub file: PathBuf, @@ -87,7 +88,6 @@ pub struct Transaction { #[derive(Deserialize, Debug, Clone, Serialize)] #[serde(rename_all = "kebab-case")] pub struct GenesisGeneration { - // TODO: bonded balances mode, seed, genesis_key pub private_key: Option, pub seed: Option, pub additional_accounts: Option, @@ -151,10 +151,6 @@ impl Document { 
) -> Result, SchemaError> { let id = self.id; - // todo: maybe update the loaded storage in global state if the hash - // of the storage document is different I guess... - // that might interfere with running tests, so I don't know - // add the prepared storage to the storage map if state.storage.contains_key(&(network, id)) { From c7410d9fc25c94feac6f4a9b18e67b4183bb5041 Mon Sep 17 00:00:00 2001 From: Meshiest Date: Fri, 29 Nov 2024 14:23:02 -0500 Subject: [PATCH 32/68] refactor(controlplane): reduce repeated handshake code, fixing missing handshake event --- crates/controlplane/src/server/websocket.rs | 101 +++++++++----------- 1 file changed, 47 insertions(+), 54 deletions(-) diff --git a/crates/controlplane/src/server/websocket.rs b/crates/controlplane/src/server/websocket.rs index 69e9c6f8..f9fc394a 100644 --- a/crates/controlplane/src/server/websocket.rs +++ b/crates/controlplane/src/server/websocket.rs @@ -1,4 +1,4 @@ -use std::{sync::Arc, time::Duration}; +use std::sync::Arc; use ::jwt::VerifyWithKey; use axum::{ @@ -20,7 +20,7 @@ use snops_common::{ ControlService, }, }; -use tarpc::server::Channel; +use tarpc::{context, server::Channel}; use tokio::select; use tracing::{error, info, warn}; @@ -107,7 +107,7 @@ async fn handle_socket( let client = AgentServiceClient::new(tarpc::client::Config::default(), client_transport).spawn(); - let id: AgentId = 'insertion: { + let (id, handshake) = 'insertion: { let client = client.clone(); let mut handshake = Handshake { loki: state.cli.loki.as_ref().map(|u| u.to_string()), @@ -158,20 +158,7 @@ async fn handle_socket( error!("failed to save agent {id} to the database: {e}"); } - // drop agent ref to allow for mutable borrow in handshake requests - drop(agent); - - tokio::spawn(async move { - // we do this in a separate task because we don't want to hold up pool insertion - let mut ctx = tarpc::context::current(); - ctx.deadline += Duration::from_secs(300); - match client.handshake(ctx, handshake).await { - Ok(()) => (), - Err(e) => error!("failed to perform agent {id} handshake: {e}"), - } - }); - - break 'insertion id; + break 'insertion (id, handshake); } } @@ -198,17 +185,6 @@ async fn handle_socket( let signed_jwt = agent.sign_jwt(); handshake.jwt = Some(signed_jwt); - // handshake with the client - tokio::spawn(async move { - // we do this in a separate task because we don't want to hold up pool insertion - let mut ctx = tarpc::context::current(); - ctx.deadline += Duration::from_secs(300); - match client.handshake(ctx, handshake).await { - Ok(()) => (), - Err(e) => error!("failed to perform agent {id} handshake: {e}"), - } - }); - // insert a new agent into the pool if let Err(e) = state.db.agents.save(&id, &agent) { error!("failed to save agent {id} to the database: {e}"); @@ -220,14 +196,32 @@ async fn handle_socket( state.pool.len() ); - id + (id, handshake) }; - // fetch the agent's network addresses on connect/reconnect + // Handshake with the client in a separate task because we don't want to hold up + // pool insertion let state2 = Arc::clone(&state); + let client2 = client.clone(); tokio::spawn(async move { - let Ok((ports, external, internal)) = client.get_addrs(tarpc::context::current()).await - else { + let agent = state2.pool.get(&id)?; + let event = EventKind::AgentHandshakeComplete.with_agent(&agent); + + // Prevent readonly agent from being held over the handshake RPC + drop(agent); + + match client2.handshake(context::current(), handshake).await { + Ok(()) => state2.events.emit(event), + Err(e) => error!("failed to 
perform agent {id} handshake: {e}"), + } + + Some(()) + }); + + // Fetch the agent's network addresses on connect/reconnect + let state2 = Arc::clone(&state); + tokio::spawn(async move { + let Ok((ports, external, internal)) = client.get_addrs(context::current()).await else { return; }; let Some(mut agent) = state2.pool.get_mut(&id) else { @@ -248,26 +242,21 @@ async fn handle_socket( error!("failed to save agent {id} to the database: {e}"); } - let handshake_event = EventKind::AgentHandshakeComplete.with_agent(&agent); - - 'peer_update: { - if !is_ip_change && !is_port_change { - break 'peer_update; - } - let Some(env_id) = agent.env() else { - break 'peer_update; - }; - drop(agent); - let Some(env) = state2.get_env(env_id) else { - break 'peer_update; - }; - - info!("Agent {id} updated its network addresses... Submitting changes to associated peers"); - env.update_peer_addr(&state2, id, is_port_change, is_ip_change) - .await; + if !is_ip_change && !is_port_change { + return; } + let Some(env_id) = agent.env() else { return }; + + // Prevent mutable agent from being held over the network address update RPC + drop(agent); - state2.events.emit(handshake_event); + let Some(env) = state2.get_env(env_id) else { + return; + }; + + info!("Agent {id} updated its network addresses... Submitting changes to associated peers"); + env.update_peer_addr(&state2, id, is_port_change, is_ip_change) + .await; }); // set up the server, for incoming RPC requests @@ -291,12 +280,16 @@ async fn handle_socket( // handle incoming messages msg = socket.recv() => { match msg { - Some(Err(_)) | None => break, + Some(Err(e)) => { + error!("Agent {id} failed to receive a message: {e}"); + break; + } + None => break, Some(Ok(Message::Binary(bin))) => { let msg = match bincode::deserialize(&bin) { Ok(msg) => msg, Err(e) => { - error!("failed to deserialize a message from agent {id}: {e}"); + error!("Agent {id} failed to deserialize a message: {e}"); break; } }; @@ -304,13 +297,13 @@ async fn handle_socket( match msg { MuxedMessageIncoming::Parent(msg) => { if let Err(e) = server_request_in.send(msg) { - error!("internal RPC channel closed: {e}"); + error!("Agent {id} internal RPC channel closed: {e}"); break; } }, MuxedMessageIncoming::Child(msg) => { if let Err(e) = client_response_in.send(msg) { - error!("internal RPC channel closed: {e}"); + error!("Agent {id} internal RPC channel closed: {e}"); break; } } From 63d22cce307e05843078287ef938dffdebf05fdb Mon Sep 17 00:00:00 2001 From: Meshiest Date: Fri, 29 Nov 2024 15:32:22 -0500 Subject: [PATCH 33/68] feat(snops): enforce agent versions when connecting to controlplane, force reconnections --- Cargo.lock | 12 ++++--- Cargo.toml | 1 + crates/agent/Cargo.toml | 2 +- crates/agent/src/cli.rs | 3 ++ crates/agent/src/client.rs | 12 ++++--- crates/aot/Cargo.toml | 2 +- crates/common/Cargo.toml | 2 +- crates/controlplane/Cargo.toml | 3 +- crates/controlplane/src/agent_version.rs | 37 +++++++++++++++++++++ crates/controlplane/src/main.rs | 1 + crates/controlplane/src/server/websocket.rs | 15 +++++++-- 11 files changed, 76 insertions(+), 14 deletions(-) create mode 100644 crates/controlplane/src/agent_version.rs diff --git a/Cargo.lock b/Cargo.lock index 0fa4de92..ab657049 100644 --- a/Cargo.lock +++ b/Cargo.lock @@ -3106,6 +3106,9 @@ name = "semver" version = "1.0.23" source = "registry+https://github.com/rust-lang/crates.io-index" checksum = "61697e0a1c7e512e84a621326239844a24d8207b4669b41bc18b32ea5cbf988b" +dependencies = [ + "serde", +] [[package]] name = "serde" @@ -3309,7 
+3312,7 @@ dependencies = [ [[package]] name = "snarkos-aot" -version = "0.1.0" +version = "0.1.1" dependencies = [ "aleo-std", "anyhow", @@ -4530,7 +4533,7 @@ dependencies = [ [[package]] name = "snops" -version = "0.1.0" +version = "0.2.0" dependencies = [ "axum", "bimap", @@ -4552,6 +4555,7 @@ dependencies = [ "rand_chacha", "rayon", "reqwest 0.12.8", + "semver", "serde", "serde_json", "serde_yml", @@ -4574,7 +4578,7 @@ dependencies = [ [[package]] name = "snops-agent" -version = "0.1.0" +version = "0.2.0" dependencies = [ "anyhow", "axum", @@ -4639,7 +4643,7 @@ dependencies = [ [[package]] name = "snops-common" -version = "0.1.0" +version = "0.2.0" dependencies = [ "anyhow", "bincode", diff --git a/Cargo.toml b/Cargo.toml index 17527a73..910513c1 100644 --- a/Cargo.toml +++ b/Cargo.toml @@ -81,6 +81,7 @@ reqwest = { version = "0.12", default-features = false, features = [ ] } # Can't update this cause snarkos/vm rocksdb = { version = "0.21", default-features = false } +semver = { version = "1.0", features = ["serde"] } serde = { version = "1", default-features = false, features = [ "alloc", "derive", diff --git a/crates/agent/Cargo.toml b/crates/agent/Cargo.toml index 7a5f2a76..6f01822c 100644 --- a/crates/agent/Cargo.toml +++ b/crates/agent/Cargo.toml @@ -1,6 +1,6 @@ [package] name = "snops-agent" -version = "0.1.0" +version = "0.2.0" edition = "2021" license = "MIT" description = "A snarkops agent for communicating with snarkos nodes and the control plane" diff --git a/crates/agent/src/cli.rs b/crates/agent/src/cli.rs index 7ee990a5..9194a6be 100644 --- a/crates/agent/src/cli.rs +++ b/crates/agent/src/cli.rs @@ -121,6 +121,9 @@ impl Cli { let mut query = format!("/agent?mode={}", u8::from(self.modes)); + // Add agent version + query.push_str(&format!("&version={}", env!("CARGO_PKG_VERSION"))); + // add &id= query.push_str(&format!("&id={}", self.id)); diff --git a/crates/agent/src/client.rs b/crates/agent/src/client.rs index 103552cc..b7f75ead 100644 --- a/crates/agent/src/client.rs +++ b/crates/agent/src/client.rs @@ -4,7 +4,7 @@ use std::{ }; use futures::{SinkExt, StreamExt}; -use http::{HeaderValue, Uri}; +use http::{HeaderValue, StatusCode, Uri}; use snops_common::{ constant::{ENV_AGENT_KEY, HEADER_AGENT_KEY}, rpc::{ @@ -58,10 +58,14 @@ pub async fn ws_connection(ws_req: Request, state: Arc) { tungstenite::Error::Io(e) if e.kind() == std::io::ErrorKind::ConnectionRefused => { return } - _ => {} + // Shutdown the agent if the control plane requires an upgrade + tungstenite::Error::Http(e) if e.status() == StatusCode::UPGRADE_REQUIRED => { + error!("The control plane requires an agent upgrade. 
Shutting down..."); + state.shutdown().await; + return; + } + _ => error!("failed to connect to websocket: {e}"), } - - error!("failed to connect to websocket: {e}"); return; } }; diff --git a/crates/aot/Cargo.toml b/crates/aot/Cargo.toml index cb9d75a4..4cdc3d28 100644 --- a/crates/aot/Cargo.toml +++ b/crates/aot/Cargo.toml @@ -1,7 +1,7 @@ [package] edition = "2021" name = "snarkos-aot" -version = "0.1.0" +version = "0.1.1" license = "MIT" description = "Ahead of time utilities for SnarkVM, and a wrapper around the SnarkOS node for more options" diff --git a/crates/common/Cargo.toml b/crates/common/Cargo.toml index 8713d105..1563ef32 100644 --- a/crates/common/Cargo.toml +++ b/crates/common/Cargo.toml @@ -1,6 +1,6 @@ [package] name = "snops-common" -version = "0.1.0" +version = "0.2.0" edition = "2021" license = "MIT" description = "Common types and utilities for snops" diff --git a/crates/controlplane/Cargo.toml b/crates/controlplane/Cargo.toml index 99960d6e..2607cd75 100644 --- a/crates/controlplane/Cargo.toml +++ b/crates/controlplane/Cargo.toml @@ -1,6 +1,6 @@ [package] name = "snops" -version = "0.1.0" +version = "0.2.0" edition = "2021" license = "MIT" description = "The snarkops control plane responsible for managing environments and agents" @@ -39,6 +39,7 @@ rand.workspace = true rand_chacha.workspace = true rayon.workspace = true reqwest = { workspace = true, features = ["json", "stream"] } +semver.workspace = true serde.workspace = true serde_json.workspace = true serde_yaml.workspace = true diff --git a/crates/controlplane/src/agent_version.rs b/crates/controlplane/src/agent_version.rs new file mode 100644 index 00000000..ef9a86b7 --- /dev/null +++ b/crates/controlplane/src/agent_version.rs @@ -0,0 +1,37 @@ +use std::sync::OnceLock; + +use semver::{Comparator, Prerelease, Version, VersionReq}; + +/// A version requirement that matches the current controlplane version against +/// an agent version +fn cp_version() -> &'static VersionReq { + static CP_VERSION: OnceLock = OnceLock::new(); + + CP_VERSION.get_or_init(|| { + let version = Version::parse(env!("CARGO_PKG_VERSION")) + .expect("Failed to parse controlplane version"); + + VersionReq { + comparators: vec![ + Comparator { + op: semver::Op::GreaterEq, + major: version.major, + minor: Some(version.minor), + patch: Some(version.patch), + pre: Prerelease::EMPTY, + }, + Comparator { + op: semver::Op::Less, + major: version.major, + minor: Some(version.minor + 1), + patch: None, + pre: Prerelease::EMPTY, + }, + ], + } + }) +} + +pub fn agent_version_ok(agent_version: &Version) -> bool { + cp_version().matches(agent_version) +} diff --git a/crates/controlplane/src/main.rs b/crates/controlplane/src/main.rs index 7216d048..6f4a65ba 100644 --- a/crates/controlplane/src/main.rs +++ b/crates/controlplane/src/main.rs @@ -10,6 +10,7 @@ use tokio::select; use tracing::{error, info, level_filters::LevelFilter, trace}; use tracing_subscriber::{prelude::*, reload, EnvFilter}; +pub mod agent_version; pub mod cannon; pub mod cli; pub mod db; diff --git a/crates/controlplane/src/server/websocket.rs b/crates/controlplane/src/server/websocket.rs index f9fc394a..95b9b3a9 100644 --- a/crates/controlplane/src/server/websocket.rs +++ b/crates/controlplane/src/server/websocket.rs @@ -11,6 +11,7 @@ use axum::{ }; use futures_util::stream::StreamExt; use http::StatusCode; +use semver::Version; use serde::Deserialize; use snops_common::{ constant::HEADER_AGENT_KEY, @@ -26,6 +27,7 @@ use tracing::{error, info, warn}; use super::{jwt::Claims, 
rpc::ControlRpcServer}; use crate::{ + agent_version::agent_version_ok, events::EventKind, server::{ jwt::JWT_SECRET, @@ -37,6 +39,7 @@ use crate::{ #[derive(Debug, Deserialize)] pub struct AgentWsQuery { pub id: Option, + pub version: Option, #[serde(flatten)] pub flags: AgentFlags, } @@ -47,6 +50,11 @@ pub async fn agent_ws_handler( State(state): State, Query(query): Query, ) -> Response { + // Ensure agent version is compatible + if query.version.as_ref().is_none_or(|v| !agent_version_ok(v)) { + return StatusCode::UPGRADE_REQUIRED.into_response(); + }; + match (&state.agent_key, headers.get(HEADER_AGENT_KEY)) { // assert key equals passed header (Some(key), Some(header)) if key == header.to_str().unwrap_or_default() => (), @@ -71,6 +79,9 @@ async fn handle_socket( state: AppState, query: AgentWsQuery, ) { + // Safe because handle socket is only called if version is Some + let agent_version = query.version.unwrap(); + let claims = headers .get("Authorization") .and_then(|auth| -> Option { @@ -153,7 +164,7 @@ async fn handle_socket( // mark the agent as connected, update the flags as well agent.mark_connected(client.clone(), query.flags); - info!("Agent {id} reconnected"); + info!("Agent {id} reconnected with version {agent_version}"); if let Err(e) = state.db.agents.save(&id, &agent) { error!("failed to save agent {id} to the database: {e}"); } @@ -192,7 +203,7 @@ async fn handle_socket( state.pool.insert(id, agent); info!( - "Agent {id} connected; pool is now {} nodes", + "Agent {id} connected with version {agent_version}; pool is now {} nodes", state.pool.len() ); From ab607272bd92e9662ccf3c9a6497100ad7f07344 Mon Sep 17 00:00:00 2001 From: Meshiest Date: Fri, 29 Nov 2024 23:44:40 -0500 Subject: [PATCH 34/68] chore(controlplane): remove unused ledger file code --- crates/common/src/api.rs | 47 ++++++------ crates/common/src/constant.rs | 2 - crates/controlplane/src/persist/storage.rs | 23 ++---- .../controlplane/src/schema/storage/loaded.rs | 26 ++----- crates/controlplane/src/schema/storage/mod.rs | 72 ++----------------- crates/controlplane/src/server/api.rs | 8 --- crates/controlplane/src/server/content.rs | 28 +++----- 7 files changed, 44 insertions(+), 162 deletions(-) diff --git a/crates/common/src/api.rs b/crates/common/src/api.rs index c8a33618..0bee7bb4 100644 --- a/crates/common/src/api.rs +++ b/crates/common/src/api.rs @@ -37,8 +37,6 @@ pub struct StorageInfo { pub id: StorageId, /// The retention policy used for this storage pub retention_policy: Option, - /// The available checkpoints in this storage - pub checkpoints: Vec, /// Whether to persist the ledger pub persist: bool, /// Version identifier for this ledger @@ -250,7 +248,7 @@ impl DataFormat for StorageInfo { type Header = StorageInfoHeader; const LATEST_HEADER: Self::Header = StorageInfoHeader { - version: 1, + version: 2, retention_policy: RetentionPolicy::LATEST_HEADER, binaries: BinaryEntry::LATEST_HEADER, }; @@ -261,18 +259,18 @@ impl DataFormat for StorageInfo { ) -> Result { let mut written = self.id.write_data(writer)?; written += self.retention_policy.write_data(writer)?; - written += self - .checkpoints - .iter() - .map( - |CheckpointMeta { - height, - timestamp, - filename, - }| (*height, *timestamp, filename.to_owned()), - ) - .collect::>() - .write_data(writer)?; + // written += self + // .checkpoints + // .iter() + // .map( + // |CheckpointMeta { + // height, + // timestamp, + // filename, + // }| (*height, *timestamp, filename.to_owned()), + // ) + // .collect::>() + // .write_data(writer)?; written 
+= self.persist.write_data(writer)?; written += self.version.write_data(writer)?; written += self.native_genesis.write_data(writer)?; @@ -284,10 +282,10 @@ impl DataFormat for StorageInfo { reader: &mut R, header: &Self::Header, ) -> Result { - if header.version != 1 { + if header.version == 0 || header.version > Self::LATEST_HEADER.version { return Err(crate::format::DataReadError::unsupported( "StorageInfo", - 1, + Self::LATEST_HEADER.version, header.version, )); } @@ -295,14 +293,12 @@ impl DataFormat for StorageInfo { let id = StorageId::read_data(reader, &())?; let retention_policy = Option::::read_data(reader, &header.retention_policy)?; - let checkpoints = Vec::<(u32, i64, String)>::read_data(reader, &((), (), ()))? - .into_iter() - .map(|(height, timestamp, filename)| CheckpointMeta { - height, - timestamp, - filename, - }) - .collect(); + + // Omit checkpoints from a previous version + if header.version == 1 { + Vec::<(u32, i64, String)>::read_data(reader, &((), (), ()))?; + }; + let persist = bool::read_data(reader, &())?; let version = u16::read_data(reader, &())?; let native_genesis = bool::read_data(reader, &())?; @@ -311,7 +307,6 @@ impl DataFormat for StorageInfo { Ok(Self { id, retention_policy, - checkpoints, persist, version, native_genesis, diff --git a/crates/common/src/constant.rs b/crates/common/src/constant.rs index 6fb73394..4f50f0f6 100644 --- a/crates/common/src/constant.rs +++ b/crates/common/src/constant.rs @@ -13,8 +13,6 @@ pub const SNARKOS_GENESIS_FILE: &str = "genesis.block"; pub const LEDGER_BASE_DIR: &str = "ledger"; /// The directory name for persisted ledgers within the storage dir. pub const LEDGER_PERSIST_DIR: &str = "persist"; -/// Temporary storage archive file name. -pub const LEDGER_STORAGE_FILE: &str = "ledger.tar.gz"; /// File containing a version counter for a ledger pub const VERSION_FILE: &str = "version"; /// Directory name for the node's data. 
diff --git a/crates/controlplane/src/persist/storage.rs b/crates/controlplane/src/persist/storage.rs index 030657d9..a3df3f37 100644 --- a/crates/controlplane/src/persist/storage.rs +++ b/crates/controlplane/src/persist/storage.rs @@ -1,12 +1,11 @@ use indexmap::IndexMap; -use snops_checkpoint::{CheckpointManager, RetentionPolicy}; +use snops_checkpoint::RetentionPolicy; use snops_common::{ binaries::BinaryEntry, - constant::LEDGER_BASE_DIR, key_source::ACCOUNTS_KEY_ID, state::{InternedId, NetworkId, StorageId}, }; -use tracing::{info, warn}; +use tracing::warn; use super::prelude::*; use crate::{ @@ -85,7 +84,7 @@ impl From<&LoadedStorage> for PersistStorage { version: storage.version, persist: storage.persist, accounts: storage.accounts.keys().cloned().collect(), - retention_policy: storage.checkpoints.as_ref().map(|c| c.policy().clone()), + retention_policy: storage.retention_policy.clone(), native_genesis: storage.native_genesis, binaries: storage.binaries.clone(), } @@ -100,20 +99,6 @@ impl PersistStorage { storage_path.push(id.to_string()); let committee_file = storage_path.join("committee.json"); - let checkpoints = self - .retention_policy - .map(|policy| { - CheckpointManager::load(storage_path.join(LEDGER_BASE_DIR), policy) - .map_err(StorageError::CheckpointManager) - }) - .transpose()?; - - if let Some(checkpoints) = &checkpoints { - info!("storage {id} checkpoint manager loaded {checkpoints}"); - } else { - info!("storage {id} loaded without a checkpoint manager"); - } - let mut accounts = IndexMap::new(); // load accounts json @@ -142,7 +127,7 @@ impl PersistStorage { version: self.version, persist: self.persist, committee: read_to_addrs(pick_commitee_addr, &committee_file).await?, - checkpoints, + retention_policy: self.retention_policy, native_genesis: self.native_genesis, accounts, binaries: self.binaries, diff --git a/crates/controlplane/src/schema/storage/loaded.rs b/crates/controlplane/src/schema/storage/loaded.rs index 721e193e..8a2b3991 100644 --- a/crates/controlplane/src/schema/storage/loaded.rs +++ b/crates/controlplane/src/schema/storage/loaded.rs @@ -4,9 +4,9 @@ use futures_util::StreamExt; use indexmap::IndexMap; use rand::seq::IteratorRandom; use sha2::{Digest, Sha256}; -use snops_checkpoint::CheckpointManager; +use snops_checkpoint::RetentionPolicy; use snops_common::{ - api::{CheckpointMeta, StorageInfo}, + api::StorageInfo, binaries::{BinaryEntry, BinarySource}, key_source::KeySource, state::{InternedId, KeyState, NetworkId, StorageId}, @@ -33,7 +33,7 @@ pub struct LoadedStorage { /// other accounts files lookup pub accounts: IndexMap, /// storage of checkpoints - pub checkpoints: Option, + pub retention_policy: Option, /// whether agents using this storage should persist it pub persist: bool, /// whether to use the network's native genesis block @@ -146,23 +146,6 @@ impl LoadedStorage { } pub fn info(&self) -> StorageInfo { - let checkpoints = self - .checkpoints - .as_ref() - .map(|c| { - c.checkpoints() - .filter_map(|(c, path)| { - path.file_name() - .and_then(|s| s.to_str()) - .map(|filename| CheckpointMeta { - filename: filename.to_string(), - height: c.block_height, - timestamp: c.timestamp, - }) - }) - .collect() - }) - .unwrap_or_default(); let mut binaries: IndexMap<_, _> = self .binaries .iter() @@ -182,8 +165,7 @@ impl LoadedStorage { StorageInfo { id: self.id, version: self.version, - retention_policy: self.checkpoints.as_ref().map(|c| c.policy().clone()), - checkpoints, + retention_policy: self.retention_policy.clone(), persist: self.persist, 
native_genesis: self.native_genesis, binaries, diff --git a/crates/controlplane/src/schema/storage/mod.rs b/crates/controlplane/src/schema/storage/mod.rs index 94fbdcc9..eb06cdd0 100644 --- a/crates/controlplane/src/schema/storage/mod.rs +++ b/crates/controlplane/src/schema/storage/mod.rs @@ -1,17 +1,12 @@ -use std::{ - ops::Deref, - path::PathBuf, - process::{ExitStatus, Stdio}, - sync::Arc, -}; +use std::{ops::Deref, path::PathBuf, process::Stdio, sync::Arc}; use indexmap::IndexMap; use serde::{Deserialize, Serialize}; -use snops_checkpoint::{CheckpointManager, RetentionPolicy}; +use snops_checkpoint::RetentionPolicy; use snops_common::{ aot_cmds::error::CommandError, binaries::{BinaryEntry, BinarySource}, - constant::{LEDGER_BASE_DIR, LEDGER_STORAGE_FILE, SNARKOS_GENESIS_FILE, VERSION_FILE}, + constant::{SNARKOS_GENESIS_FILE, VERSION_FILE}, key_source::ACCOUNTS_KEY_ID, state::{InternedId, NetworkId, StorageId}, }; @@ -266,9 +261,7 @@ impl Document { .env("NETWORK", network.to_string()) .arg("genesis") .arg("--output") - .arg(&output) - .arg("--ledger") - .arg(base.join(LEDGER_BASE_DIR)); + .arg(&output); // conditional seed flag if let Some(seed) = genesis.seed { @@ -345,7 +338,7 @@ impl Document { .arg(balance.to_string()); } - info!("{command:?}"); + info!("Generating genesis for {id} with command: {command:?}"); let res = command .spawn() @@ -376,45 +369,6 @@ impl Document { } } - // tar the ledger so that it can be served to agents - // the genesis block is not compressed because it is already binary and might - // not be served independently - let ledger_exists = matches!( - tokio::fs::try_exists(base.join(LEDGER_BASE_DIR)).await, - Ok(true) - ); - let ledger_tar_exists = matches!( - tokio::fs::try_exists(base.join(LEDGER_STORAGE_FILE)).await, - Ok(true) - ); - - if ledger_exists && !ledger_tar_exists { - let mut child = Command::new("tar") - .current_dir(&base) - .arg("czf") - .arg(LEDGER_STORAGE_FILE) - .arg(LEDGER_BASE_DIR) - .kill_on_drop(true) - .spawn() - .map_err(|e| { - StorageError::Command(CommandError::action("spawning", "tar ledger", e), id) - })?; - - if !child - .wait() - .await - .as_ref() - .map(ExitStatus::success) - .unwrap_or(false) - { - error!("failed to compress ledger"); - } - - tokio::fs::try_exists(&base.join(LEDGER_STORAGE_FILE)) - .await - .map_err(|e| StorageError::FailedToTarLedger(id, e))?; - } - let mut accounts = IndexMap::new(); accounts.insert( *ACCOUNTS_KEY_ID, @@ -472,20 +426,6 @@ impl Document { .await .map_err(|e| StorageError::WriteVersion(version_file.clone(), e))?; - let checkpoints = self - .retention_policy - .map(|policy| { - CheckpointManager::load(base.join(LEDGER_BASE_DIR), policy) - .map_err(StorageError::CheckpointManager) - }) - .transpose()?; - - if let Some(checkpoints) = &checkpoints { - info!("storage {id} checkpoint manager loaded {checkpoints}"); - } else { - info!("storage {id} loaded without a checkpoint manager"); - } - let committee_file = base.join("committee.json"); // if the committee was specified in the generation params, use that @@ -527,7 +467,7 @@ impl Document { network, committee, accounts, - checkpoints, + retention_policy: self.retention_policy, persist: self.persist, native_genesis, binaries, diff --git a/crates/controlplane/src/server/api.rs b/crates/controlplane/src/server/api.rs index 4ded0384..8ba54d29 100644 --- a/crates/controlplane/src/server/api.rs +++ b/crates/controlplane/src/server/api.rs @@ -148,14 +148,6 @@ async fn set_log_level(Path(level): Path, state: State) -> Res status_ok() } 
-#[derive(Deserialize)] -#[serde(rename_all = "lowercase")] -enum StorageType { - Genesis, - Ledger, - Binary, -} - async fn get_env_info(Path(env_id): Path, state: State) -> Response { let env_id = unwrap_or_not_found!(id_or_none(&env_id)); let env = unwrap_or_not_found!(state.get_env(env_id)); diff --git a/crates/controlplane/src/server/content.rs b/crates/controlplane/src/server/content.rs index a5f9b670..c6066ee5 100644 --- a/crates/controlplane/src/server/content.rs +++ b/crates/controlplane/src/server/content.rs @@ -81,25 +81,20 @@ async fn serve_binary( ) -> Response { let storage = unwrap_or_not_found!(state.storage.get(&(network, storage_id))).clone(); - let (id, entry) = match storage.resolve_binary_entry(binary_id) { - Ok(res) => res, - Err(e) => return ServerError::from(e).into_response(), - }; - - respond_from_entry(id, entry, req).await + match storage.resolve_binary_entry(binary_id) { + Ok((id, entry)) => respond_from_entry(id, entry, req).await, + Err(e) => ServerError::from(e).into_response(), + } } /// Given a binary entry, respond with the binary or a redirect to the binary async fn respond_from_entry(id: InternedId, entry: &BinaryEntry, req: Request) -> Response { match &entry.source { BinarySource::Url(url) => Redirect::temporary(url.as_str()).into_response(), - BinarySource::Path(file) => { - if !file.exists() { - return ServerError::from(StorageError::BinaryFileMissing(id, file.clone())) - .into_response(); - } - ServeFile::new(file).call(req).await.into_response() + BinarySource::Path(file) if !file.exists() => { + ServerError::from(StorageError::BinaryFileMissing(id, file.clone())).into_response() } + BinarySource::Path(file) => ServeFile::new(file).call(req).await.into_response(), } } @@ -109,7 +104,6 @@ async fn serve_file( req: Request, ) -> Response { let storage = unwrap_or_not_found!(state.storage.get(&(network, storage_id))).clone(); - let file_path = storage.path(&state).join(&file); match file.as_str() { // ensure genesis is only served if native genesis is disabled @@ -118,16 +112,12 @@ async fn serve_file( return StatusCode::NOT_FOUND.into_response(); } } - // allow ledger.tar.gz to be served - "ledger.tar.gz" => {} - // allow checkpoints to be served - _ if file.ends_with(".checkpoint") => {} - // serve the version file - "version" => {} // otherwise, return a 404 _ => return StatusCode::NOT_FOUND.into_response(), } + let file_path = storage.path(&state).join(&file); + // ensure the file exists if !file_path.exists() { return StatusCode::NOT_FOUND.into_response(); From af94071f0f016c718fe502e2d5d8ede69be659d4 Mon Sep 17 00:00:00 2001 From: Meshiest Date: Sat, 30 Nov 2024 01:11:59 -0500 Subject: [PATCH 35/68] feat(agent): restart node if binary version changed --- Cargo.lock | 1 + crates/agent/src/client.rs | 6 - crates/agent/src/main.rs | 9 + crates/agent/src/reconcile/agent.rs | 221 +++++++++++++------- crates/agent/src/reconcile/command.rs | 2 +- crates/agent/src/reconcile/process.rs | 12 ++ crates/agent/src/reconcile/storage.rs | 21 +- crates/agent/src/rpc/control.rs | 10 +- crates/agent/src/state.rs | 13 +- crates/checkpoint/Cargo.toml | 1 + crates/checkpoint/src/manager.rs | 27 ++- crates/common/src/api.rs | 2 +- crates/common/src/rpc/control/agent.rs | 5 +- crates/common/src/state/reconcile.rs | 17 ++ crates/controlplane/src/env/mod.rs | 29 ++- crates/controlplane/src/server/websocket.rs | 5 + crates/controlplane/src/state/reconcile.rs | 22 +- crates/controlplane/src/state/rpc.rs | 12 +- 18 files changed, 291 insertions(+), 124 deletions(-) diff 
--git a/Cargo.lock b/Cargo.lock index ab657049..6521cd21 100644 --- a/Cargo.lock +++ b/Cargo.lock @@ -4620,6 +4620,7 @@ dependencies = [ "anyhow", "chrono", "glob", + "lazysort", "rayon", "serde", "snarkos-node", diff --git a/crates/agent/src/client.rs b/crates/agent/src/client.rs index b7f75ead..72975fe5 100644 --- a/crates/agent/src/client.rs +++ b/crates/agent/src/client.rs @@ -72,12 +72,6 @@ pub async fn ws_connection(ws_req: Request, state: Arc) { info!("Connection established with the control plane"); - // Clear old info cache. we will get new info from the control plane - state.set_env_info(None).await; - if let Err(e) = state.db.set_env_info(None) { - error!("failed to clear old env info cache: {e}"); - } - // create rpc channels let (client_response_in, client_transport, mut client_request_out) = RpcTransport::new(); let (server_request_in, server_transport, mut server_response_out) = RpcTransport::new(); diff --git a/crates/agent/src/main.rs b/crates/agent/src/main.rs index cc4ce543..439fca8e 100644 --- a/crates/agent/src/main.rs +++ b/crates/agent/src/main.rs @@ -194,3 +194,12 @@ impl Signals { futs.next().await; } } + +#[cfg(test)] +mod test { + #[test] + // CI is failing because the agent has no tests + fn test_nothing() { + assert_eq!(1, 1) + } +} diff --git a/crates/agent/src/reconcile/agent.rs b/crates/agent/src/reconcile/agent.rs index c86e0b21..476db2a4 100644 --- a/crates/agent/src/reconcile/agent.rs +++ b/crates/agent/src/reconcile/agent.rs @@ -4,9 +4,12 @@ use std::{ }; use snops_common::{ + api::AgentEnvInfo, binaries::BinaryEntry, rpc::error::ReconcileError, - state::{AgentState, HeightRequest, ReconcileCondition, TransferId}, + state::{ + AgentState, HeightRequest, NodeState, ReconcileCondition, ReconcileOptions, TransferId, + }, }; use tarpc::context; use tokio::{ @@ -27,7 +30,8 @@ use super::{ use crate::{ db::Database, reconcile::{ - address::AddressResolveReconciler, process::EndProcessReconciler, storage::LedgerReconciler, + address::AddressResolveReconciler, default_binary, process::EndProcessReconciler, + storage::LedgerReconciler, }, state::GlobalState, }; @@ -113,12 +117,16 @@ macro_rules! reconcile { } impl AgentStateReconciler { - pub async fn loop_forever(&mut self, mut reconcile_requests: Receiver) { + pub async fn loop_forever( + &mut self, + mut reconcile_requests: Receiver<(Instant, ReconcileOptions)>, + ) { let mut err_backoff = 0; // The first reconcile is scheduled for 5 seconds after startup. // Connecting to the controlplane will likely trigger a reconcile sooner. let mut next_reconcile_at = Instant::now() + Duration::from_secs(5); + let mut next_opts = ReconcileOptions::default(); // Repeated reconcile loop loop { @@ -126,8 +134,9 @@ impl AgentStateReconciler { loop { select! 
{ // Replace the next_reconcile_at with the soonest reconcile time - Some(new_reconcile_at) = reconcile_requests.recv() => { + Some((new_reconcile_at, opts)) = reconcile_requests.recv() => { next_reconcile_at = next_reconcile_at.min(new_reconcile_at); + next_opts = next_opts.union(opts); }, _ = sleep_until(next_reconcile_at.into()) => { break @@ -145,6 +154,16 @@ impl AgentStateReconciler { // This prevents the agent state from changing during reconciliation self.agent_state = self.state.get_agent_state().await; + // Clear the env info if refetch_info is set to force it to be fetched again + if next_opts.refetch_info { + self.state.set_env_info(None).await; + } + + // If the agent is forced to shutdown, set the shutdown_pending flag + if next_opts.force_shutdown && self.has_process() { + self.context.shutdown_pending = true; + } + trace!("Reconciling agent state..."); let res = self.reconcile().await; if let Some(client) = self.state.get_ws_client().await { @@ -212,6 +231,77 @@ impl AgentStateReconciler { Ok(ReconcileStatus::default().add_scope("agent_state/inventory")) } + + pub fn has_process(&self) -> bool { + self.context.process.is_some() + } + + pub fn is_shutdown_pending(&self, node: &NodeState, env_info: &AgentEnvInfo) -> bool { + // Ensure the process is running + if !self.has_process() { + return false; + } + + // Node was already marked for shutdown + if self.context.shutdown_pending { + return true; + } + + // Node is now configured to be offline + if !node.online { + info!("Node is marked offline"); + return true; + } + + // Check if the storage version, storage id, or network id has changed + if self + .context + .env_state + .as_ref() + .is_none_or(|e| e.changed(env_info)) + { + info!("Node storage version, storage id, or network id has changed"); + return true; + } + + // Check if the ledger height is not resolved + if self.context.ledger_last_height != Some(node.height) && !node.height.1.is_top() { + info!("Node ledger target height has changed"); + return true; + } + + let default_binary = default_binary(env_info); + let target_binary = env_info + .storage + .binaries + .get(&node.binary.unwrap_or_default()) + .unwrap_or(&default_binary); + + // Check if the binary this node is running is different from the one in storage + if self.context.process.as_ref().is_some_and(|p| { + target_binary + .sha256 + .as_ref() + .is_some_and(|sha256| !p.is_sha256_eq(sha256)) + }) { + info!("Node binary for the running process has changed"); + return true; + } + + // Check if the binary this node is running is different from the one in storage + if self + .context + .transfers + .as_ref() + .and_then(|t| t.binary_transfer.as_ref()) + .is_some_and(|(_, bin)| bin != target_binary) + { + info!("Node binary has changed"); + return true; + } + + false + } } impl Reconcile<(), ReconcileError> for AgentStateReconciler { @@ -225,27 +315,13 @@ impl Reconcile<(), ReconcileError> for AgentStateReconciler { let env_info = self.state.get_env_info(*env_id).await?; - // Check if the storage version, storage id, or network id has changed - let storage_has_changed = self - .context - .env_state - .as_ref() - .map(|e| e.changed(&env_info)) - .unwrap_or(true); - - // Check if the ledger height is not resolved - let height_has_changed = - self.context.ledger_last_height != Some(node.height) && !node.height.1.is_top(); - - // If the node should be torn down, or the storage has changed, we need to + // If the node should be torn down because a configuration changed, we need to // gracefully shut down the node. 
- let shutdown_pending = !node.online || storage_has_changed || height_has_changed; - - if let (true, Some(process)) = ( - shutdown_pending || self.context.shutdown_pending, - self.context.process.as_mut(), - ) { + if self.is_shutdown_pending(node, &env_info) { self.context.shutdown_pending = true; + // Unwrap safety - is_shutdown_pending ensures the process exists. + let process = self.context.process.as_mut().unwrap(); + reconcile!(end_process, EndProcessReconciler(process), res => { // If the process has exited, clear the process context if res.inner.is_some() { @@ -279,40 +355,44 @@ impl Reconcile<(), ReconcileError> for AgentStateReconciler { if !process.is_running() { info!("Node process has exited..."); self.context.process = None; - } else { - // Accumulate all the fields that are used to derive the command that starts - // the node. - let command = NodeCommand::new( - Arc::clone(&self.state), - node_arc, - *env_id, - Arc::clone(&env_info), - ) - .await?; - - // If the command has changed, restart the process - if process.command != command { - info!("Node command has changed, restarting process..."); - self.context.shutdown_pending = true; - return Ok(ReconcileStatus::empty() - .add_scope("agent_state/command_changed") - .requeue_after(Duration::ZERO)); - } - // Prevent other reconcilers from running while the node is running - if self.state.is_node_online() { - return Ok(ReconcileStatus::default().add_scope("agent_state/running")); - } else { - // If the node is not online, the process is still running, but the node - // has not connected to the controlplane. - // This can happen if the node is still syncing, or if the controlplane - // is not reachable. - return Ok(ReconcileStatus::empty() - .requeue_after(Duration::from_secs(1)) - .add_condition(ReconcileCondition::PendingStartup) - .add_scope("agent_state/starting")); - } + return Ok(ReconcileStatus::empty() + .requeue_after(Duration::ZERO) + .add_scope("agent_state/exited")); } + + // Accumulate all the fields that are used to derive the command that starts + // the node. + let command = NodeCommand::new( + Arc::clone(&self.state), + node_arc, + *env_id, + Arc::clone(&env_info), + ) + .await?; + + // If the command has changed, restart the process + if process.command != command { + info!("Node command has changed, restarting process..."); + self.context.shutdown_pending = true; + return Ok(ReconcileStatus::empty() + .add_scope("agent_state/command_changed") + .requeue_after(Duration::ZERO)); + } + + // Prevent other reconcilers from running while the node is running + if self.state.is_node_online() { + return Ok(ReconcileStatus::default().add_scope("agent_state/running")); + } + + // If the node is not online, the process is still running, but the node + // has not connected to the controlplane. + // This can happen if the node is still syncing, or if the controlplane + // is not reachable. 
+ return Ok(ReconcileStatus::empty() + .requeue_after(Duration::from_secs(1)) + .add_condition(ReconcileCondition::PendingStartup) + .add_scope("agent_state/starting")); } let storage_path = self @@ -385,25 +465,20 @@ impl Reconcile<(), ReconcileError> for AgentStateReconciler { // TODO: if possible, use the NodeCommand as configuration for a node service to // allow running the node outside of the agent - if self.context.process.is_none() { - info!("Starting node process"); - let command = NodeCommand::new( - Arc::clone(&self.state), - node_arc, - *env_id, - Arc::clone(&env_info), - ) - .await?; - - let process = ProcessContext::new(command)?; - self.context.process = Some(process); - return Ok(ReconcileStatus::empty() - .add_scope("agent_state/starting") - .requeue_after(Duration::from_secs(1))); - } - + info!("Starting node process"); + let command = NodeCommand::new( + Arc::clone(&self.state), + node_arc, + *env_id, + Arc::clone(&env_info), + ) + .await?; + + let process = ProcessContext::new(command)?; + self.context.process = Some(process); + self.context.shutdown_pending = false; Ok(ReconcileStatus::empty() - .add_scope("agent_state/edge_case") + .add_scope("agent_state/starting") .requeue_after(Duration::from_secs(1))) } } diff --git a/crates/agent/src/reconcile/command.rs b/crates/agent/src/reconcile/command.rs index 08a30020..e37553a2 100644 --- a/crates/agent/src/reconcile/command.rs +++ b/crates/agent/src/reconcile/command.rs @@ -19,7 +19,7 @@ use crate::state::GlobalState; #[derive(Debug, Clone, Eq, PartialEq)] pub struct NodeCommand { /// Path to the snarkos binary - command_path: PathBuf, + pub command_path: PathBuf, /// If true, do not print stdout quiet: bool, /// Environment ID (used in loki) diff --git a/crates/agent/src/reconcile/process.rs b/crates/agent/src/reconcile/process.rs index 656529e8..b75bd15d 100644 --- a/crates/agent/src/reconcile/process.rs +++ b/crates/agent/src/reconcile/process.rs @@ -3,6 +3,7 @@ use std::time::{Duration, Instant}; use snops_common::{ rpc::error::ReconcileError, state::{ReconcileCondition, ReconcileStatus}, + util::sha256_file, }; use tokio::{process::Child, select}; use tracing::{error, info}; @@ -24,10 +25,15 @@ pub struct ProcessContext { sigint_at: Option, /// Time a sigkill was sent to the child process sigkill_at: Option, + /// The sha256 hash of the running binary + binary_sha256: String, } impl ProcessContext { pub fn new(command: NodeCommand) -> Result { + let binary_sha256 = sha256_file(&command.command_path).map_err(|e| { + ReconcileError::FileReadError(command.command_path.clone(), e.to_string()) + })?; command .build() .spawn() @@ -37,6 +43,7 @@ impl ProcessContext { started_at: Instant::now(), sigint_at: None, sigkill_at: None, + binary_sha256, }) .map_err(|e| { error!("failed to start node process: {e:?}"); @@ -51,6 +58,11 @@ impl ProcessContext { self.child.try_wait().is_ok_and(|status| status.is_none()) } + /// Check if the running binary matches the provided sha256 hash + pub fn is_sha256_eq(&self, sha256: &str) -> bool { + self.binary_sha256 == sha256 + } + /// A helper function to gracefully shutdown the node process without /// a reconciler pub async fn graceful_shutdown(&mut self) { diff --git a/crates/agent/src/reconcile/storage.rs b/crates/agent/src/reconcile/storage.rs index ebaa74a1..5b3bbd8b 100644 --- a/crates/agent/src/reconcile/storage.rs +++ b/crates/agent/src/reconcile/storage.rs @@ -4,7 +4,6 @@ use std::{ time::{Duration, Instant}, }; -use lazysort::SortedBy; use snops_checkpoint::CheckpointManager; use 
snops_common::{ api::AgentEnvInfo, @@ -53,15 +52,12 @@ impl<'a> Reconcile<(), ReconcileError> for BinaryReconciler<'a> { .unwrap_or(&default_binary); // Check if the binary has changed - let binary_has_changed = transfer - .as_ref() - .map(|(_, b)| b != target_binary) - .unwrap_or(true); + let binary_has_changed = transfer.as_ref().is_none_or(|(_, b)| b != target_binary); let dst = state.cli.path.join(SNARKOS_FILE); // The binary does not exist and is marked as OK... - if ok_at.is_some() && !dst.exists() { + if ok_at.is_some() && (binary_has_changed || !dst.exists()) { **ok_at = None; } @@ -252,19 +248,12 @@ impl<'a> LedgerReconciler<'a> { // Determine which checkpoint to use by the next available height/time match self.target_height.1 { - HeightRequest::Absolute(height) => manager - .checkpoints() - .sorted_by(|(a, _), (b, _)| b.block_height.cmp(&a.block_height)) - .find_map(|(c, path)| (c.block_height <= height).then_some(path)), - HeightRequest::Checkpoint(span) => span.as_timestamp().and_then(|timestamp| { - manager - .checkpoints() - .sorted_by(|(a, _), (b, _)| b.timestamp.cmp(&a.timestamp)) - .find_map(|(c, path)| (c.timestamp <= timestamp).then_some(path)) - }), + HeightRequest::Absolute(height) => manager.nearest_with_height(height), + HeightRequest::Checkpoint(span) => manager.nearest_with_span(span), // top cannot be a target height _ => None, } + .map(|(_, path)| path) .ok_or(ReconcileError::NoAvailableCheckpoints(self.target_height.1)) .cloned() } diff --git a/crates/agent/src/rpc/control.rs b/crates/agent/src/rpc/control.rs index 8999d305..80eaadc8 100644 --- a/crates/agent/src/rpc/control.rs +++ b/crates/agent/src/rpc/control.rs @@ -16,7 +16,7 @@ use snops_common::{ }, error::{AgentError, SnarkosRequestError}, }, - state::{AgentId, AgentState, EnvId, InternedId, NetworkId, PortConfig}, + state::{AgentId, AgentState, EnvId, InternedId, NetworkId, PortConfig, ReconcileOptions}, }; use tarpc::context::Context; use tracing::{error, info, trace}; @@ -93,12 +93,14 @@ impl AgentService for AgentRpcServer { // Queue a reconcile immediately as we have received new state. // The reconciler will decide if anything has actually changed - self.state.update_agent_state(handshake.state).await; + self.state + .update_agent_state(handshake.state, handshake.reconcile_opts) + .await; } - async fn set_agent_state(self, _: Context, target: AgentState) { + async fn set_agent_state(self, _: Context, target: AgentState, opts: ReconcileOptions) { info!("Received new agent state, queuing reconcile..."); - self.state.update_agent_state(target).await; + self.state.update_agent_state(target, opts).await; } async fn clear_peer_addr(self, _: Context, agent_id: AgentId) { diff --git a/crates/agent/src/state.rs b/crates/agent/src/state.rs index 6e8a4660..8f6b0da4 100644 --- a/crates/agent/src/state.rs +++ b/crates/agent/src/state.rs @@ -11,7 +11,7 @@ use reqwest::Url; use snops_common::{ api::AgentEnvInfo, rpc::{agent::node::NodeServiceClient, control::ControlServiceClient, error::ReconcileError}, - state::{AgentId, AgentPeer, AgentState, EnvId, TransferId, TransferStatus}, + state::{AgentId, AgentPeer, AgentState, EnvId, ReconcileOptions, TransferId, TransferStatus}, util::OpaqueDebug, }; use tarpc::context; @@ -42,7 +42,7 @@ pub struct GlobalState { pub agent_state: RwLock>, /// A sender for emitting the next time to reconcile the agent. /// Helpful for scheduling the next reconciliation. 
- pub queue_reconcile_tx: Sender, + pub queue_reconcile_tx: Sender<(Instant, ReconcileOptions)>, pub env_info: RwLock)>>, // Map of agent IDs to their resolved addresses. pub resolved_addrs: RwLock>, @@ -86,9 +86,9 @@ impl GlobalState { .collect::>() } - pub async fn queue_reconcile(&self, duration: Duration) -> bool { + pub async fn queue_reconcile(&self, duration: Duration, opts: ReconcileOptions) -> bool { self.queue_reconcile_tx - .try_send(Instant::now() + duration) + .try_send((Instant::now() + duration, opts)) .is_ok() } @@ -99,6 +99,7 @@ impl GlobalState { *self.env_info.write().await = info; } + /// Fetch the environment info for the given env_id, caching the result. pub async fn get_env_info(&self, env_id: EnvId) -> Result, ReconcileError> { match self.env_info.read().await.as_ref() { Some((id, info)) if *id == env_id => return Ok(info.clone()), @@ -151,7 +152,7 @@ impl GlobalState { self.node_client.read().await.clone() } - pub async fn update_agent_state(&self, state: AgentState) { + pub async fn update_agent_state(&self, state: AgentState, opts: ReconcileOptions) { if let Err(e) = self.db.set_agent_state(&state) { error!("failed to save agent state to db: {e}"); } @@ -159,7 +160,7 @@ impl GlobalState { *self.agent_state.write().await = state; // Queue a reconcile to apply the new state - self.queue_reconcile(Duration::ZERO).await; + self.queue_reconcile(Duration::ZERO, opts).await; } pub async fn re_fetch_peer_addrs(&self) { diff --git a/crates/checkpoint/Cargo.toml b/crates/checkpoint/Cargo.toml index 244d2035..1526b4a3 100644 --- a/crates/checkpoint/Cargo.toml +++ b/crates/checkpoint/Cargo.toml @@ -15,6 +15,7 @@ aleo-std = { workspace = true, optional = true } anyhow = { workspace = true, optional = true } chrono.workspace = true glob.workspace = true +lazysort.workspace = true rayon.workspace = true serde = { workspace = true, optional = true } snarkvm = { workspace = true, optional = true } diff --git a/crates/checkpoint/src/manager.rs b/crates/checkpoint/src/manager.rs index 8492c232..d06d32c7 100644 --- a/crates/checkpoint/src/manager.rs +++ b/crates/checkpoint/src/manager.rs @@ -1,12 +1,15 @@ use std::{collections::BTreeMap, fs, path::PathBuf}; use chrono::{DateTime, TimeDelta, Utc}; +use lazysort::SortedBy; use rayon::iter::{IntoParallelIterator, ParallelIterator}; use tracing::{error, trace}; #[cfg(feature = "write")] use crate::errors::{ManagerCullError, ManagerInsertError, ManagerPollError}; -use crate::{errors::ManagerLoadError, path_from_height, CheckpointHeader, RetentionPolicy}; +use crate::{ + errors::ManagerLoadError, path_from_height, CheckpointHeader, RetentionPolicy, RetentionSpan, +}; #[derive(Debug, Clone)] pub struct CheckpointManager { @@ -215,6 +218,28 @@ impl CheckpointManager { pub fn checkpoints(&self) -> impl Iterator { self.checkpoints.values() } + + /// Find the nearest checkpoint with a height less than or equal to the + /// given height + pub fn nearest_with_height(&self, height: u32) -> Option<&(CheckpointHeader, PathBuf)> { + self.checkpoints() + .sorted_by(|(a, _), (b, _)| b.block_height.cmp(&a.block_height)) + .find(|(c, _)| (c.block_height <= height)) + } + + /// Find the nearest checkpoint with a timestamp less than or equal to the + /// given span + pub fn nearest_with_span(&self, span: RetentionSpan) -> Option<&(CheckpointHeader, PathBuf)> { + self.nearest_with_timestamp(span.as_timestamp()?) 
+ } + + /// Find the nearest checkpoint with a timestamp less than or equal to the + /// given timestamp + pub fn nearest_with_timestamp(&self, timestamp: i64) -> Option<&(CheckpointHeader, PathBuf)> { + self.checkpoints() + .sorted_by(|(a, _), (b, _)| b.timestamp.cmp(&a.timestamp)) + .find(|(c, _)| (c.timestamp <= timestamp)) + } } impl std::fmt::Display for CheckpointManager { diff --git a/crates/common/src/api.rs b/crates/common/src/api.rs index 0bee7bb4..8bdede66 100644 --- a/crates/common/src/api.rs +++ b/crates/common/src/api.rs @@ -31,7 +31,7 @@ pub struct AgentEnvInfo { pub storage: StorageInfo, } -#[derive(Debug, Serialize, Deserialize, Clone)] +#[derive(Debug, Serialize, Deserialize, Clone, PartialEq, Eq)] pub struct StorageInfo { /// String id of this storage pub id: StorageId, diff --git a/crates/common/src/rpc/control/agent.rs b/crates/common/src/rpc/control/agent.rs index 719f1b77..9c902b73 100644 --- a/crates/common/src/rpc/control/agent.rs +++ b/crates/common/src/rpc/control/agent.rs @@ -4,7 +4,7 @@ use serde::{Deserialize, Serialize}; use crate::rpc::error::*; use crate::state::snarkos_status::SnarkOSLiteBlock; -use crate::state::AgentId; +use crate::state::{AgentId, ReconcileOptions}; use crate::{ prelude::EnvId, state::{AgentState, NetworkId, PortConfig}, @@ -15,6 +15,7 @@ pub struct Handshake { pub jwt: Option, pub loki: Option, pub state: AgentState, + pub reconcile_opts: ReconcileOptions, } /// The RPC service that agents implement as a server. @@ -32,7 +33,7 @@ pub trait AgentService { /// Control plane instructs the agent to reconcile towards a particular /// state. - async fn set_agent_state(to: AgentState); + async fn set_agent_state(to: AgentState, opts: ReconcileOptions); /// Broadcast a transaction locally async fn broadcast_tx(tx: String) -> Result<(), AgentError>; diff --git a/crates/common/src/state/reconcile.rs b/crates/common/src/state/reconcile.rs index 755f4641..7582a52e 100644 --- a/crates/common/src/state/reconcile.rs +++ b/crates/common/src/state/reconcile.rs @@ -5,6 +5,23 @@ use serde::{Deserialize, Serialize}; use super::TransferId; +#[derive(Debug, Copy, Clone, Default, Eq, PartialEq, Serialize, Deserialize)] +pub struct ReconcileOptions { + /// When true, the reconciler will fetch the latest env info + pub refetch_info: bool, + /// When true, the reconciler will force the node to shut down + pub force_shutdown: bool, +} + +impl ReconcileOptions { + pub fn union(self, other: Self) -> Self { + Self { + refetch_info: self.refetch_info || other.refetch_info, + force_shutdown: self.force_shutdown || other.force_shutdown, + } + } +} + #[derive(Debug, Clone, PartialEq, Eq, PartialOrd, Ord, Hash, Serialize, Deserialize)] pub enum ReconcileCondition { /// A file is being transferred. 
diff --git a/crates/controlplane/src/env/mod.rs b/crates/controlplane/src/env/mod.rs index 97f40e0e..a75cd439 100644 --- a/crates/controlplane/src/env/mod.rs +++ b/crates/controlplane/src/env/mod.rs @@ -13,7 +13,8 @@ use snops_common::{ api::{AgentEnvInfo, EnvInfo}, node_targets::NodeTargets, state::{ - AgentId, AgentPeer, AgentState, CannonId, EnvId, NetworkId, NodeKey, NodeState, TxPipeId, + AgentId, AgentPeer, AgentState, CannonId, EnvId, NetworkId, NodeKey, NodeState, + ReconcileOptions, TxPipeId, }, }; use tokio::sync::Semaphore; @@ -375,6 +376,10 @@ impl Environment { .collect(), )?; + let storage_changed = prev_env + .as_ref() + .is_some_and(|prev| prev.storage.info() != storage.info()); + let env = Arc::new(Environment { id: env_id, storage, @@ -407,12 +412,23 @@ impl Environment { } // Emit state changes to all agents within this environment - env.update_all_agents(&state).await?; + env.update_all_agents( + &state, + ReconcileOptions { + refetch_info: storage_changed, + ..Default::default() + }, + ) + .await?; Ok(env_id) } - async fn update_all_agents(&self, state: &GlobalState) -> Result<(), EnvError> { + async fn update_all_agents( + &self, + state: &GlobalState, + opts: ReconcileOptions, + ) -> Result<(), EnvError> { let mut pending_changes = vec![]; for entry in self.node_states.iter() { @@ -450,7 +466,7 @@ impl Environment { pending_changes.push((agent_id, agent_state)); } - state.update_agent_states(pending_changes).await; + state.update_agent_states_opts(pending_changes, opts).await; Ok(()) } @@ -679,7 +695,10 @@ impl Environment { // Otherwise do a normal reconcile } else { state - .queue_many_reconciles(pending_reconciles.into_iter().map(|(id, _)| id)) + .queue_many_reconciles( + pending_reconciles.into_iter().map(|(id, _)| id), + Default::default(), + ) .await; } } diff --git a/crates/controlplane/src/server/websocket.rs b/crates/controlplane/src/server/websocket.rs index 95b9b3a9..007b71fe 100644 --- a/crates/controlplane/src/server/websocket.rs +++ b/crates/controlplane/src/server/websocket.rs @@ -122,6 +122,11 @@ async fn handle_socket( let client = client.clone(); let mut handshake = Handshake { loki: state.cli.loki.as_ref().map(|u| u.to_string()), + // Encourage the agent to refetch its info on connect + reconcile_opts: ReconcileOptions { + refetch_info: true, + ..Default::default() + }, ..Default::default() }; diff --git a/crates/controlplane/src/state/reconcile.rs b/crates/controlplane/src/state/reconcile.rs index a12dda11..103d1cd4 100644 --- a/crates/controlplane/src/state/reconcile.rs +++ b/crates/controlplane/src/state/reconcile.rs @@ -1,7 +1,7 @@ use std::collections::HashMap; use futures_util::future::join_all; -use snops_common::state::{AgentId, AgentState, NodeKey}; +use snops_common::state::{AgentId, AgentState, NodeKey, ReconcileOptions}; use tracing::{error, info}; use super::GlobalState; @@ -22,8 +22,17 @@ pub fn pending_reconcile_node_map<'a>( } impl GlobalState { - /// Reconcile a bunch of agents at once. pub async fn update_agent_states(&self, iter: impl IntoIterator) { + self.update_agent_states_opts(iter, Default::default()) + .await; + } + + /// Reconcile a bunch of agents at once. 
+ pub async fn update_agent_states_opts( + &self, + iter: impl IntoIterator, + opts: ReconcileOptions, + ) { let mut agent_ids = vec![]; for (id, target) in iter { @@ -36,12 +45,13 @@ impl GlobalState { } } - self.queue_many_reconciles(agent_ids).await; + self.queue_many_reconciles(agent_ids, opts).await; } pub async fn queue_many_reconciles( &self, iter: impl IntoIterator, + opts: ReconcileOptions, ) -> (usize, usize) { let mut handles = vec![]; let mut agent_ids = vec![]; @@ -58,9 +68,9 @@ impl GlobalState { agent_ids.push(id); let target = agent.state.clone(); - handles.push(tokio::spawn( - async move { client.set_agent_state(target).await }, - )); + handles.push(tokio::spawn(async move { + client.set_agent_state(target, opts).await + })); } if handles.is_empty() { diff --git a/crates/controlplane/src/state/rpc.rs b/crates/controlplane/src/state/rpc.rs index e384bbd2..f55f82ae 100644 --- a/crates/controlplane/src/state/rpc.rs +++ b/crates/controlplane/src/state/rpc.rs @@ -3,7 +3,9 @@ use std::{fmt::Display, time::Duration}; use serde::de::DeserializeOwned; use snops_common::{ rpc::{control::agent::AgentServiceClient, error::SnarkosRequestError}, - state::{snarkos_status::SnarkOSLiteBlock, AgentId, AgentState, EnvId, NetworkId}, + state::{ + snarkos_status::SnarkOSLiteBlock, AgentId, AgentState, EnvId, NetworkId, ReconcileOptions, + }, }; use tarpc::{client::RpcError, context}; @@ -13,8 +15,12 @@ use crate::error::StateError; pub struct AgentClient(pub(crate) AgentServiceClient); impl AgentClient { - pub async fn set_agent_state(&self, to: AgentState) -> Result<(), RpcError> { - self.0.set_agent_state(context::current(), to).await + pub async fn set_agent_state( + &self, + to: AgentState, + opts: ReconcileOptions, + ) -> Result<(), RpcError> { + self.0.set_agent_state(context::current(), to, opts).await } pub async fn clear_peer_addr(&self, peer: AgentId) -> Result<(), RpcError> { From 7e898f6d74b8ca645ac519757b00e1ef5b0739b3 Mon Sep 17 00:00:00 2001 From: Meshiest Date: Sat, 30 Nov 2024 20:49:18 -0500 Subject: [PATCH 36/68] fix(aot): fix proposal cache not being deleted --- crates/aot/src/runner/mod.rs | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/crates/aot/src/runner/mod.rs b/crates/aot/src/runner/mod.rs index f701cfa6..d2f43798 100644 --- a/crates/aot/src/runner/mod.rs +++ b/crates/aot/src/runner/mod.rs @@ -291,7 +291,7 @@ impl Runner { }; tracing::error!("failed to load proposal cache: {e}"); - if let Err(e) = std::fs::remove_dir_all(&proposal_cache_path) { + if let Err(e) = std::fs::remove_file(&proposal_cache_path) { tracing::error!("failed to remove proposal cache: {e}"); } } From 745ba8f282a13841f2ef81f5aa49eb1bccc742cf Mon Sep 17 00:00:00 2001 From: Meshiest Date: Sat, 30 Nov 2024 20:52:14 -0500 Subject: [PATCH 37/68] feat(snops): support "any" instead of wildcard for nodetargets --- crates/common/src/node_targets.rs | 5 ++++- 1 file changed, 4 insertions(+), 1 deletion(-) diff --git a/crates/common/src/node_targets.rs b/crates/common/src/node_targets.rs index 7f336378..7d6fe6b6 100644 --- a/crates/common/src/node_targets.rs +++ b/crates/common/src/node_targets.rs @@ -108,7 +108,7 @@ impl<'de> Deserialize<'de> for NodeTargets { lazy_static! 
{
        static ref NODE_TARGET_REGEX: Regex =
-            Regex::new(r"^(?P<ty>\*|client|validator|prover)\/(?P<id>[A-Za-z0-9\-*]+)(?:@(?P<ns>[A-Za-z0-9\-*]+))?$")
+            Regex::new(r"^(?P<ty>\*|any|client|validator|prover)\/(?P<id>[A-Za-z0-9\-*]+)(?:@(?P<ns>[A-Za-z0-9\-*]+))?$")
                 .unwrap();
 }

@@ -184,6 +184,7 @@ impl FromStr for NodeTarget {
         // match the type
         let ty = match &captures["ty"] {
             "*" => NodeTargetType::All,
+            "any" => NodeTargetType::All,
             "client" => NodeTargetType::One(NodeType::Client),
             "validator" => NodeTargetType::One(NodeType::Validator),
             "prover" => NodeTargetType::One(NodeType::Prover),
@@ -194,6 +195,7 @@
         let id = match &captures["id"] {
             // full wildcard
             "*" => NodeTargetId::All,
+            "any" => NodeTargetId::All,

             // partial wildcard
             id if id.contains('*') => NodeTargetId::WildcardPattern(WildMatch::new(id)),
@@ -206,6 +208,7 @@
         let ns = match captures.name("ns") {
             // full wildcard
             Some(id) if id.as_str() == "*" => NodeTargetNamespace::All,
+            Some(id) if id.as_str() == "any" => NodeTargetNamespace::All,

             // local; either explicitly stated, or empty
             Some(id) if id.as_str() == "local" => NodeTargetNamespace::Local,

From aedee1e50673459be7e9ef782ccd01ebc15c3998 Mon Sep 17 00:00:00 2001
From: Meshiest
Date: Sat, 30 Nov 2024 20:52:45 -0500
Subject: [PATCH 38/68] feat(controlplane): reboot action does not modify agent online status

---
 crates/agent/src/reconcile/agent.rs          |  2 +
 .../controlplane/src/server/actions/power.rs | 52 ++++++++++++++++---
 crates/controlplane/src/state/agent.rs       | 11 +++-
 3 files changed, 56 insertions(+), 9 deletions(-)

diff --git a/crates/agent/src/reconcile/agent.rs b/crates/agent/src/reconcile/agent.rs
index 476db2a4..a76583c7 100644
--- a/crates/agent/src/reconcile/agent.rs
+++ b/crates/agent/src/reconcile/agent.rs
@@ -164,6 +164,8 @@ impl AgentStateReconciler {
             self.context.shutdown_pending = true;
         }

+        next_opts = Default::default();
+
         trace!("Reconciling agent state...");
         let res = self.reconcile().await;

diff --git a/crates/controlplane/src/server/actions/power.rs b/crates/controlplane/src/server/actions/power.rs
index c5a61692..2dbd414b 100644
--- a/crates/controlplane/src/server/actions/power.rs
+++ b/crates/controlplane/src/server/actions/power.rs
@@ -1,4 +1,4 @@
-use std::collections::HashSet;
+use std::collections::{HashMap, HashSet};

 use axum::{
     response::{IntoResponse, Response},
@@ -7,7 +7,7 @@
 use snops_common::{
     action_models::WithTargets,
     node_targets::NodeTargets,
-    state::{AgentId, AgentState, EnvId},
+    state::{AgentId, AgentState, EnvId, ReconcileOptions},
 };
 use tracing::info;

@@ -90,12 +90,48 @@ pub async fn offline(
     wait_for_nodes(&state, env.id, nodes, pending).await
 }

-pub async fn reboot(env: Env, json: Json<WithTargets>) -> Response {
-    let offline_res = offline(env.clone(), json.clone()).await;
+pub async fn reboot(
+    Env { env, state, .. }: Env,
+    Json(WithTargets { nodes, .. }): Json<WithTargets>,
+) -> Response {
+    let node_map = env
+        .matching_agents(&nodes, &state.pool)
+        .filter_map(|a| a.node_key().map(|k| (k.clone(), a.id)))
+        .collect::>();
+
+    let mut awaiting_agents = node_map.values().copied().collect::>();
+
+    // create the subscriber before updating agent states in order to
+    // avoid missing any events
+    use crate::events::prelude::*;
+    let mut subscriber = state
+        .events
+        .subscribe_on(NodeTargetIs(nodes) & EnvIs(env.id) & ReconcileComplete);

-    if !offline_res.status().is_success() {
-        offline_res
-    } else {
-        online(env, json).await
+    state
+        .queue_many_reconciles(
+            awaiting_agents.iter().copied(),
+            ReconcileOptions {
+                force_shutdown: true,
+                ..Default::default()
+            },
+        )
+        .await;
+
+    // wait at most 30 seconds for all agents to reconcile
+    let expires = tokio::time::Instant::now() + std::time::Duration::from_secs(30);
+    while !awaiting_agents.is_empty() {
+        tokio::select! {
+            _ = tokio::time::sleep_until(expires) => {
+                break;
+            }
+            Ok(event) = subscriber.next() => {
+                if let Some(agent) = event.agent {
+                    awaiting_agents.remove(&agent);
+                }
+            }
+        }
     }
+
+    Json(node_map).into_response()
 }

diff --git a/crates/controlplane/src/state/agent.rs b/crates/controlplane/src/state/agent.rs
index c76ae0a3..bcb11b73 100644
--- a/crates/controlplane/src/state/agent.rs
+++ b/crates/controlplane/src/state/agent.rs
@@ -13,7 +13,9 @@ use serde::{Deserialize, Serialize};
 use snops_common::{
     lasso::Spur,
     rpc::control::agent::AgentServiceClient,
-    state::{AgentId, AgentModeOptions, AgentState, AgentStatus, EnvId, NodeState, PortConfig},
+    state::{
+        AgentId, AgentModeOptions, AgentState, AgentStatus, EnvId, NodeKey, NodeState, PortConfig,
+    },
     INTERN,
 };

@@ -177,6 +179,13 @@ impl Agent {
         }
     }

+    pub fn node_key(&self) -> Option<&NodeKey> {
+        match &self.state {
+            AgentState::Node(_, state) => Some(&state.node_key),
+            _ => None,
+        }
+    }
+
     /// The ID of this agent.
pub fn id(&self) -> AgentId { self.id From 6ad74a7d423751f568dfacdd85538fb7f2e037a5 Mon Sep 17 00:00:00 2001 From: Meshiest Date: Sat, 30 Nov 2024 20:59:36 -0500 Subject: [PATCH 39/68] fix(snops): ensure agent wipes non-persisted ledgers --- crates/agent/src/reconcile/agent.rs | 8 ++++++++ crates/common/src/state/reconcile.rs | 3 +++ crates/controlplane/src/env/mod.rs | 3 +++ 3 files changed, 14 insertions(+) diff --git a/crates/agent/src/reconcile/agent.rs b/crates/agent/src/reconcile/agent.rs index a76583c7..371583a0 100644 --- a/crates/agent/src/reconcile/agent.rs +++ b/crates/agent/src/reconcile/agent.rs @@ -164,6 +164,14 @@ impl AgentStateReconciler { self.context.shutdown_pending = true; } + // If the agent is forced to clear the last height, clear it + if next_opts.clear_last_height { + self.context.ledger_last_height = None; + if let Err(e) = self.state.db.set_last_height(None) { + error!("failed to clear last height from db: {e}"); + } + } + next_opts = Default::default(); trace!("Reconciling agent state..."); diff --git a/crates/common/src/state/reconcile.rs b/crates/common/src/state/reconcile.rs index 7582a52e..c57b367a 100644 --- a/crates/common/src/state/reconcile.rs +++ b/crates/common/src/state/reconcile.rs @@ -11,6 +11,8 @@ pub struct ReconcileOptions { pub refetch_info: bool, /// When true, the reconciler will force the node to shut down pub force_shutdown: bool, + /// When true, the reconciler will clear the last height + pub clear_last_height: bool, } impl ReconcileOptions { @@ -18,6 +20,7 @@ impl ReconcileOptions { Self { refetch_info: self.refetch_info || other.refetch_info, force_shutdown: self.force_shutdown || other.force_shutdown, + clear_last_height: self.clear_last_height || other.clear_last_height, } } } diff --git a/crates/controlplane/src/env/mod.rs b/crates/controlplane/src/env/mod.rs index a75cd439..ca0bc991 100644 --- a/crates/controlplane/src/env/mod.rs +++ b/crates/controlplane/src/env/mod.rs @@ -380,6 +380,8 @@ impl Environment { .as_ref() .is_some_and(|prev| prev.storage.info() != storage.info()); + let clear_last_height = prev_env.is_none() && !storage.persist; + let env = Arc::new(Environment { id: env_id, storage, @@ -416,6 +418,7 @@ impl Environment { &state, ReconcileOptions { refetch_info: storage_changed, + clear_last_height, ..Default::default() }, ) From 641fa78f4dae76fae0a8192014aef650678d399c Mon Sep 17 00:00:00 2001 From: Meshiest Date: Sat, 30 Nov 2024 21:07:24 -0500 Subject: [PATCH 40/68] fix(controlplane): allow controlplane to be higher patch version than agents --- crates/controlplane/src/agent_version.rs | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/crates/controlplane/src/agent_version.rs b/crates/controlplane/src/agent_version.rs index ef9a86b7..6b2331ca 100644 --- a/crates/controlplane/src/agent_version.rs +++ b/crates/controlplane/src/agent_version.rs @@ -17,7 +17,7 @@ fn cp_version() -> &'static VersionReq { op: semver::Op::GreaterEq, major: version.major, minor: Some(version.minor), - patch: Some(version.patch), + patch: Some(0), pre: Prerelease::EMPTY, }, Comparator { From 12e7aabcb1ffc93b0c3223be6465ff71a535832e Mon Sep 17 00:00:00 2001 From: Meshiest Date: Sat, 30 Nov 2024 21:11:06 -0500 Subject: [PATCH 41/68] fix(agent): use private key in env instead of cli args --- crates/agent/src/reconcile/command.rs | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/crates/agent/src/reconcile/command.rs b/crates/agent/src/reconcile/command.rs index e37553a2..9a5e376c 100644 --- 
a/crates/agent/src/reconcile/command.rs +++ b/crates/agent/src/reconcile/command.rs @@ -172,11 +172,11 @@ impl NodeCommand { .arg(self.ports.node.to_string()); if let Some(pk) = &self.private_key { - command.arg("--private-key").arg(pk); + command.env("PRIVATE_KEY", pk); } if let Some(pk_file) = &self.private_key_file { - command.arg("--private-key-file").arg(pk_file); + command.env("PRIVATE_KEY_FILE", pk_file); } // conditionally add retention policy From a74ce9933b214751d164add4bf067b14a4effafe Mon Sep 17 00:00:00 2001 From: Meshiest Date: Sat, 30 Nov 2024 22:48:10 -0500 Subject: [PATCH 42/68] fix(agent): fix compute agent not downloading binary, fix compute agent including 'Installation - 100.00% complete' message in transaction --- crates/agent/src/api.rs | 9 ++++----- crates/agent/src/rpc/control.rs | 12 ++++++++++-- 2 files changed, 14 insertions(+), 7 deletions(-) diff --git a/crates/agent/src/api.rs b/crates/agent/src/api.rs index b749f3f0..f91f95a9 100644 --- a/crates/agent/src/api.rs +++ b/crates/agent/src/api.rs @@ -117,7 +117,7 @@ pub async fn check_binary( // this also checks for sha256 differences, along with last modified time // against the target - if !get_file_issues( + let file_issues = get_file_issues( &client, &source_url, path, @@ -125,10 +125,9 @@ pub async fn check_binary( binary.sha256.as_deref(), false, ) - .await - .map(|e| e.is_none()) - .unwrap_or(true) - { + .await; + + if file_issues.is_ok_and(|issues| issues.is_none()) { // check permissions and ensure 0o755 let perms = path.metadata()?.permissions(); if perms.mode() != 0o755 { diff --git a/crates/agent/src/rpc/control.rs b/crates/agent/src/rpc/control.rs index 80eaadc8..285a7a1c 100644 --- a/crates/agent/src/rpc/control.rs +++ b/crates/agent/src/rpc/control.rs @@ -261,7 +261,7 @@ impl AgentService for AgentRpcServer { ) .await .map_err(|e| { - error!("failed obtain runner binary: {e}"); + error!("failed obtain compute binary: {e}"); AgentError::ProcessFailed })?; @@ -273,8 +273,16 @@ impl AgentService for AgentRpcServer { ) .await { - Ok(exec) => { + Ok(mut exec) => { let elapsed = start.elapsed().as_millis(); + + // Truncate the output to the first { + // because Aleo decided to print parameters.aleo.org download + // status to stdout... 
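
The trim applied just below keeps everything from the first '{' onward, so stray progress text written to stdout before the JSON payload never reaches the transaction output. A stand-alone sketch of the same idea (assuming the preamble itself never contains a '{'):

    /// Drop any human-readable preamble and keep the JSON object that follows.
    fn strip_preamble(mut output: String) -> String {
        if let Some(index) = output.find('{') {
            // split_off keeps the prefix in `output` and returns the tail,
            // so reassigning leaves only the part starting at the first '{'
            output = output.split_off(index);
        }
        output
    }

    fn main() {
        let raw = "Installation - 100.00% complete\n{\"type\":\"execute\"}".to_string();
        assert_eq!(strip_preamble(raw), "{\"type\":\"execute\"}");
    }
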
+ if let Some(index) = exec.find("{") { + exec = exec.split_off(index); + } + info!("Authorization executed in {elapsed}ms"); trace!("authorization output: {exec}"); Ok(exec) From 8ddc6ae09d41bb818ace38b82ad6ba9149e04105 Mon Sep 17 00:00:00 2001 From: Meshiest Date: Sat, 30 Nov 2024 23:09:31 -0500 Subject: [PATCH 43/68] refactor(events): namespace agent events --- crates/controlplane/src/events/models.rs | 105 +++++++++++------- crates/controlplane/src/events/test_filter.rs | 86 +++++++------- .../src/events/test_filter_ops.rs | 20 ++-- crates/controlplane/src/events/test_stream.rs | 12 +- .../controlplane/src/server/actions/power.rs | 4 +- crates/controlplane/src/server/rpc.rs | 12 +- crates/controlplane/src/server/websocket.rs | 10 +- 7 files changed, 133 insertions(+), 116 deletions(-) diff --git a/crates/controlplane/src/events/models.rs b/crates/controlplane/src/events/models.rs index 5c08eda7..caa5c950 100644 --- a/crates/controlplane/src/events/models.rs +++ b/crates/controlplane/src/events/models.rs @@ -18,14 +18,20 @@ pub struct Event { } #[derive(Clone, Debug, Serialize, Deserialize)] -#[serde(tag = "kind")] +#[serde(tag = "type")] pub enum EventKind { + Agent(AgentEvent), +} + +#[derive(Clone, Debug, Serialize, Deserialize)] +#[serde(tag = "kind")] +pub enum AgentEvent { /// An agent connects to the control plane - AgentConnected, + Connected, /// An agent completes a handshake with the control plane - AgentHandshakeComplete, + HandshakeComplete, /// An agent disconnects from the control plane - AgentDisconnected, + Disconnected, /// An agent finishes a reconcile ReconcileComplete, /// An agent updates its reconcile status @@ -35,22 +41,7 @@ pub enum EventKind { /// An agent emits a node status NodeStatus(NodeStatus), /// An agent emits a block update - Block(LatestBlockInfo), -} - -impl EventKind { - pub fn filter(&self) -> EventKindFilter { - match self { - EventKind::AgentConnected => EventKindFilter::AgentConnected, - EventKind::AgentHandshakeComplete => EventKindFilter::AgentHandshakeComplete, - EventKind::AgentDisconnected => EventKindFilter::AgentDisconnected, - EventKind::ReconcileComplete => EventKindFilter::ReconcileComplete, - EventKind::Reconcile(_) => EventKindFilter::Reconcile, - EventKind::ReconcileError(_) => EventKindFilter::ReconcileError, - EventKind::NodeStatus(_) => EventKindFilter::NodeStatus, - EventKind::Block(_) => EventKindFilter::Block, - } - } + BlockInfo(LatestBlockInfo), } #[derive(Clone, Copy, Debug, PartialEq)] @@ -59,11 +50,29 @@ pub enum EventKindFilter { AgentConnected, AgentHandshakeComplete, AgentDisconnected, - ReconcileComplete, - Reconcile, - ReconcileError, - NodeStatus, - Block, + AgentReconcileComplete, + AgentReconcile, + AgentReconcileError, + AgentNodeStatus, + AgentBlockInfo, +} + +impl EventKind { + pub fn filter(&self) -> EventKindFilter { + use AgentEvent::*; + use EventKind::*; + + match self { + Agent(Connected) => EventKindFilter::AgentConnected, + Agent(HandshakeComplete) => EventKindFilter::AgentHandshakeComplete, + Agent(Disconnected) => EventKindFilter::AgentDisconnected, + Agent(ReconcileComplete) => EventKindFilter::AgentReconcileComplete, + Agent(Reconcile(_)) => EventKindFilter::AgentReconcile, + Agent(ReconcileError(_)) => EventKindFilter::AgentReconcileError, + Agent(NodeStatus(_)) => EventKindFilter::AgentNodeStatus, + Agent(BlockInfo(_)) => EventKindFilter::AgentBlockInfo, + } + } } #[derive(Clone, Debug, PartialEq)] @@ -103,13 +112,13 @@ impl Event { } } - pub fn replace_kind(&self, kind: EventKind) -> Self { + 
pub fn replace_kind(&self, kind: impl Into) -> Self { Self { created_at: Utc::now(), agent: self.agent, node_key: self.node_key.clone(), env: self.env, - kind, + kind: kind.into().kind, } } @@ -134,13 +143,21 @@ impl From for EventFilter { } } -impl EventKind { - pub fn event(self) -> Event { - Event::new(self) +pub trait EventHelpers { + fn event(self) -> Event; + fn with_agent(self, agent: &Agent) -> Event; + fn with_agent_id(self, agent_id: AgentId) -> Event; + fn with_node_key(self, node_key: NodeKey) -> Event; + fn with_env_id(self, env_id: EnvId) -> Event; +} + +impl> EventHelpers for T { + fn event(self) -> Event { + self.into() } - pub fn with_agent(self, agent: &Agent) -> Event { - let mut event = Event::new(self); + fn with_agent(self, agent: &Agent) -> Event { + let mut event = self.into(); event.agent = Some(agent.id); if let AgentState::Node(env_id, node) = &agent.state { event.node_key = Some(node.node_key.clone()); @@ -149,21 +166,33 @@ impl EventKind { event } - pub fn with_agent_id(self, agent_id: AgentId) -> Event { - let mut event = Event::new(self); + fn with_agent_id(self, agent_id: AgentId) -> Event { + let mut event = self.into(); event.agent = Some(agent_id); event } - pub fn with_node_key(self, node_key: NodeKey) -> Event { - let mut event = Event::new(self); + fn with_node_key(self, node_key: NodeKey) -> Event { + let mut event = self.into(); event.node_key = Some(node_key); event } - pub fn with_env_id(self, env_id: EnvId) -> Event { - let mut event = Event::new(self); + fn with_env_id(self, env_id: EnvId) -> Event { + let mut event = self.into(); event.env = Some(env_id); event } } + +impl From for Event { + fn from(kind: EventKind) -> Self { + Self::new(kind) + } +} + +impl From for Event { + fn from(kind: AgentEvent) -> Self { + Self::new(EventKind::Agent(kind)) + } +} diff --git a/crates/controlplane/src/events/test_filter.rs b/crates/controlplane/src/events/test_filter.rs index d431186e..6bb21ac2 100644 --- a/crates/controlplane/src/events/test_filter.rs +++ b/crates/controlplane/src/events/test_filter.rs @@ -2,18 +2,14 @@ use std::str::FromStr; use chrono::Utc; use lazy_static::lazy_static; -use snops_common::node_targets::NodeTargets; -use snops_common::rpc::error::ReconcileError; -use snops_common::state::InternedId; -use snops_common::state::LatestBlockInfo; -use snops_common::state::NodeKey; -use snops_common::state::NodeStatus; -use snops_common::state::ReconcileStatus; - -use super::EventFilter::*; -use super::EventKind::*; -use super::EventKindFilter as EKF; -use crate::events::Event; +use snops_common::{ + node_targets::NodeTargets, + rpc::error::ReconcileError, + state::{InternedId, LatestBlockInfo, NodeKey, NodeStatus, ReconcileStatus}, +}; + +use super::{AgentEvent::*, EventFilter::*, EventKind::*, EventKindFilter::*}; +use crate::events::{Event, EventHelpers}; lazy_static! { static ref A: InternedId = InternedId::from_str("a").unwrap(); @@ -24,9 +20,9 @@ lazy_static! 
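
With the events namespaced, the serialized form carries two tags: the outer enum's "type" and the inner enum's "kind". A stand-alone sketch with trimmed-down stand-in enums (not the full definitions above) showing the resulting JSON shape:

    use serde::{Deserialize, Serialize};

    #[derive(Serialize, Deserialize)]
    #[serde(tag = "type")]
    enum EventKind {
        Agent(AgentEvent),
    }

    #[derive(Serialize, Deserialize)]
    #[serde(tag = "kind")]
    enum AgentEvent {
        Connected,
        Disconnected,
    }

    fn main() {
        let json = serde_json::to_string(&EventKind::Agent(AgentEvent::Connected)).unwrap();
        let value: serde_json::Value = serde_json::from_str(&json).unwrap();
        // Both tags land side by side in one flat object,
        // e.g. {"type":"Agent","kind":"Connected"}.
        assert_eq!(value["type"], "Agent");
        assert_eq!(value["kind"], "Connected");
    }
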
{ #[test] fn test_unfiltered() { - assert!(AgentConnected.event().matches(&Unfiltered)); - assert!(AgentHandshakeComplete.event().matches(&Unfiltered)); - assert!(AgentDisconnected.event().matches(&Unfiltered)); + assert!(Connected.event().matches(&Unfiltered)); + assert!(HandshakeComplete.event().matches(&Unfiltered)); + assert!(Disconnected.event().matches(&Unfiltered)); assert!(ReconcileComplete.event().matches(&Unfiltered)); assert!(Reconcile(ReconcileStatus::empty()) .event() @@ -35,86 +31,82 @@ fn test_unfiltered() { .event() .matches(&Unfiltered)); assert!(NodeStatus(NodeStatus::Unknown).event().matches(&Unfiltered)); - assert!(Block(LatestBlockInfo::default()) + assert!(BlockInfo(LatestBlockInfo::default()) .event() .matches(&Unfiltered)); } #[test] fn test_all_of() { - assert!(AgentConnected + assert!(Connected .event() - .matches(&AllOf(vec![EventIs(EKF::AgentConnected)]))); + .matches(&AllOf(vec![EventIs(AgentConnected)]))); let e = Event { created_at: Utc::now(), agent: Some(*A), node_key: Some(NodeKey::from_str("client/foo").unwrap()), env: Some(*B), - kind: AgentConnected, + kind: Agent(Connected), }; - assert!(e.matches(&(EKF::AgentConnected & AgentIs(*A)))); - assert!(e.matches(&(EKF::AgentConnected & NodeKeyIs(NodeKey::from_str("client/foo").unwrap())))); - assert!(e.matches(&(EKF::AgentConnected & EnvIs(*B)))); + assert!(e.matches(&(AgentConnected & AgentIs(*A)))); + assert!(e.matches(&(AgentConnected & NodeKeyIs(NodeKey::from_str("client/foo").unwrap())))); + assert!(e.matches(&(AgentConnected & EnvIs(*B)))); assert!(e.matches(&(AgentIs(*A) & NodeTargetIs(NodeTargets::ALL) & EnvIs(*B)))); - assert!(!e.matches(&(EKF::AgentConnected & AgentIs(*B)))); - assert!( - !e.matches(&(EKF::AgentConnected & NodeKeyIs(NodeKey::from_str("client/bar").unwrap()))) - ); - assert!(!e.matches(&(EKF::AgentConnected & EnvIs(*A)))); + assert!(!e.matches(&(AgentConnected & AgentIs(*B)))); + assert!(!e.matches(&(AgentConnected & NodeKeyIs(NodeKey::from_str("client/bar").unwrap())))); + assert!(!e.matches(&(AgentConnected & EnvIs(*A)))); assert!(!e.matches(&(AgentIs(*B) & NodeTargetIs(NodeTargets::ALL) & EnvIs(*B)))); } #[test] fn test_any_of() { - assert!(AgentConnected + assert!(Connected .event() - .matches(&AnyOf(vec![EventIs(EKF::AgentConnected)]))); + .matches(&AnyOf(vec![EventIs(AgentConnected)]))); let e = Event { created_at: Utc::now(), agent: Some(*A), node_key: Some(NodeKey::from_str("client/foo").unwrap()), env: Some(*B), - kind: AgentConnected, + kind: Agent(Connected), }; - assert!(e.matches(&(EKF::AgentConnected | AgentIs(*A)))); - assert!(e.matches(&(EKF::AgentConnected | NodeKeyIs(NodeKey::from_str("client/foo").unwrap())))); - assert!(e.matches(&(EKF::AgentConnected | EnvIs(*B)))); + assert!(e.matches(&(AgentConnected | AgentIs(*A)))); + assert!(e.matches(&(AgentConnected | NodeKeyIs(NodeKey::from_str("client/foo").unwrap())))); + assert!(e.matches(&(AgentConnected | EnvIs(*B)))); assert!(e.matches(&(AgentIs(*A) | NodeTargetIs(NodeTargets::ALL) | EnvIs(*B)))); - assert!(e.matches(&(EKF::AgentConnected | AgentIs(*B)))); - assert!(e.matches(&(EKF::AgentConnected | NodeKeyIs(NodeKey::from_str("client/bar").unwrap())))); - assert!(e.matches(&(EKF::AgentConnected | EnvIs(*A)))); + assert!(e.matches(&(AgentConnected | AgentIs(*B)))); + assert!(e.matches(&(AgentConnected | NodeKeyIs(NodeKey::from_str("client/bar").unwrap())))); + assert!(e.matches(&(AgentConnected | EnvIs(*A)))); assert!(e.matches(&(AgentIs(*B) | NodeTargetIs(NodeTargets::ALL) | EnvIs(*B)))); - 
assert!(!e.matches(&(EKF::AgentDisconnected | AgentIs(*C)))); - assert!( - !e.matches(&(EKF::AgentDisconnected | NodeKeyIs(NodeKey::from_str("client/bar").unwrap()))) - ); + assert!(!e.matches(&(AgentDisconnected | AgentIs(*C)))); + assert!(!e.matches(&(AgentDisconnected | NodeKeyIs(NodeKey::from_str("client/bar").unwrap())))); } #[test] fn test_one_of() { - assert!(AgentConnected + assert!(Connected .event() - .matches(&OneOf(vec![EventIs(EKF::AgentConnected)]))); + .matches(&OneOf(vec![EventIs(AgentConnected)]))); let e = Event { created_at: Utc::now(), agent: Some(*A), node_key: Some(NodeKey::from_str("client/foo").unwrap()), env: Some(*B), - kind: AgentConnected, + kind: Agent(Connected), }; - assert!(e.matches(&(EKF::AgentConnected ^ AgentIs(*B)))); - assert!(e.matches(&(EKF::AgentConnected & (AgentIs(*A) ^ AgentIs(*B) ^ AgentIs(*C))))); + assert!(e.matches(&(AgentConnected ^ AgentIs(*B)))); + assert!(e.matches(&(AgentConnected & (AgentIs(*A) ^ AgentIs(*B) ^ AgentIs(*C))))); - assert!(!e.matches(&(EKF::AgentConnected ^ AgentIs(*A)))); - assert!(e.matches(&(!(EKF::AgentConnected ^ AgentIs(*A))))); + assert!(!e.matches(&(AgentConnected ^ AgentIs(*A)))); + assert!(e.matches(&(!(AgentConnected ^ AgentIs(*A))))); } diff --git a/crates/controlplane/src/events/test_filter_ops.rs b/crates/controlplane/src/events/test_filter_ops.rs index 179f148f..2870be53 100644 --- a/crates/controlplane/src/events/test_filter_ops.rs +++ b/crates/controlplane/src/events/test_filter_ops.rs @@ -16,10 +16,10 @@ lazy_static! { #[test] fn test_filter_bitand() { assert_eq!(Unfiltered & Unfiltered, Unfiltered); - assert_eq!(Block & Unfiltered, EventIs(Block)); + assert_eq!(AgentBlockInfo & Unfiltered, EventIs(AgentBlockInfo)); assert_eq!( - Block & AgentIs(*A), - AllOf(vec![EventIs(Block), AgentIs(*A)]) + AgentBlockInfo & AgentIs(*A), + AllOf(vec![EventIs(AgentBlockInfo), AgentIs(*A)]) ); assert_eq!( AgentIs(*A) & AgentIs(*B), @@ -34,10 +34,10 @@ fn test_filter_bitand() { #[test] fn test_filter_bitor() { assert_eq!(Unfiltered | Unfiltered, Unfiltered); - assert_eq!(Block | Unfiltered, Unfiltered); + assert_eq!(AgentBlockInfo | Unfiltered, Unfiltered); assert_eq!( - Block | AgentIs(*A), - AnyOf(vec![EventIs(Block), AgentIs(*A)]) + AgentBlockInfo | AgentIs(*A), + AnyOf(vec![EventIs(AgentBlockInfo), AgentIs(*A)]) ); assert_eq!( AgentIs(*A) | AgentIs(*B), @@ -52,10 +52,10 @@ fn test_filter_bitor() { #[test] fn test_filter_bitxor() { assert_eq!(Unfiltered ^ Unfiltered, Unfiltered); - assert_eq!(Block ^ Unfiltered, EventIs(Block)); + assert_eq!(AgentBlockInfo ^ Unfiltered, EventIs(AgentBlockInfo)); assert_eq!( - Block ^ AgentIs(*A), - OneOf(vec![EventIs(Block), AgentIs(*A)]) + AgentBlockInfo ^ AgentIs(*A), + OneOf(vec![EventIs(AgentBlockInfo), AgentIs(*A)]) ); assert_eq!( AgentIs(*A) ^ AgentIs(*B), @@ -70,7 +70,7 @@ fn test_filter_bitxor() { #[test] fn test_filter_not() { assert_eq!(!Unfiltered, Not(Box::new(Unfiltered))); - assert_eq!(!Block, Not(Box::new(EventIs(Block)))); + assert_eq!(!AgentBlockInfo, Not(Box::new(EventIs(AgentBlockInfo)))); assert_eq!(!AgentIs(*A), Not(Box::new(AgentIs(*A)))); assert_eq!( !AgentIs(*A) & AgentIs(*B), diff --git a/crates/controlplane/src/events/test_stream.rs b/crates/controlplane/src/events/test_stream.rs index b2ad17d5..1d7ffc66 100644 --- a/crates/controlplane/src/events/test_stream.rs +++ b/crates/controlplane/src/events/test_stream.rs @@ -3,10 +3,8 @@ use std::str::FromStr; use lazy_static::lazy_static; use snops_common::state::InternedId; -use super::EventFilter::*; -use 
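
The operator tests above lean on overloads of &, |, ^ and ! that fold filters into AllOf, AnyOf, OneOf and Not nodes, with Unfiltered acting as the identity for & and the absorbing element for |. A trimmed-down sketch of how such a BitAnd overload can be written (illustrative variants only, not the project's definitions):

    use std::ops::BitAnd;

    #[derive(Debug, PartialEq)]
    enum Filter {
        Unfiltered,
        AgentIs(u64),
        AllOf(Vec<Filter>),
    }

    impl BitAnd for Filter {
        type Output = Filter;

        fn bitand(self, rhs: Filter) -> Filter {
            use Filter::*;
            match (self, rhs) {
                // Unfiltered is the identity element for `&`
                (Unfiltered, f) | (f, Unfiltered) => f,
                // merge into an existing AllOf instead of nesting
                (AllOf(mut lhs), AllOf(rhs)) => {
                    lhs.extend(rhs);
                    AllOf(lhs)
                }
                (AllOf(mut lhs), f) => {
                    lhs.push(f);
                    AllOf(lhs)
                }
                (f, AllOf(mut rhs)) => {
                    rhs.insert(0, f);
                    AllOf(rhs)
                }
                (a, b) => AllOf(vec![a, b]),
            }
        }
    }

    fn main() {
        assert_eq!(
            Filter::AgentIs(1) & Filter::AgentIs(2),
            Filter::AllOf(vec![Filter::AgentIs(1), Filter::AgentIs(2)])
        );
        assert_eq!(Filter::Unfiltered & Filter::AgentIs(1), Filter::AgentIs(1));
    }
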
super::EventKind as EK; -use super::EventKindFilter::*; -use super::Events; +use super::{AgentEvent::*, EventFilter::*, EventKindFilter::*, Events}; +use crate::events::EventHelpers; lazy_static! { static ref A: InternedId = InternedId::from_str("a").unwrap(); @@ -29,9 +27,9 @@ fn test_stream_filtering() { assert_eq!(sub_b.collect_many().len(), 0); assert_eq!(sub_connected.collect_many().len(), 0); - events.emit(EK::AgentConnected.with_agent_id(*A)); - events.emit(EK::AgentDisconnected.with_agent_id(*A)); - events.emit(EK::Block(Default::default()).with_agent_id(*B)); + events.emit(Connected.with_agent_id(*A)); + events.emit(Disconnected.with_agent_id(*A)); + events.emit(BlockInfo(Default::default()).with_agent_id(*B)); assert_eq!(sub_all.collect_many().len(), 3); assert_eq!(sub_a.collect_many().len(), 2); diff --git a/crates/controlplane/src/server/actions/power.rs b/crates/controlplane/src/server/actions/power.rs index 2dbd414b..8e9e3f32 100644 --- a/crates/controlplane/src/server/actions/power.rs +++ b/crates/controlplane/src/server/actions/power.rs @@ -28,7 +28,7 @@ async fn wait_for_nodes( use crate::events::prelude::*; let mut subscriber = state .events - .subscribe_on(NodeTargetIs(nodes) & EnvIs(env_id) & ReconcileComplete); + .subscribe_on(NodeTargetIs(nodes) & EnvIs(env_id) & AgentReconcileComplete); state.update_agent_states(pending).await; @@ -106,7 +106,7 @@ pub async fn reboot( use crate::events::prelude::*; let mut subscriber = state .events - .subscribe_on(NodeTargetIs(nodes) & EnvIs(env.id) & ReconcileComplete); + .subscribe_on(NodeTargetIs(nodes) & EnvIs(env.id) & AgentReconcileComplete); state .queue_many_reconciles( diff --git a/crates/controlplane/src/server/rpc.rs b/crates/controlplane/src/server/rpc.rs index 90ca36a3..a7f95a54 100644 --- a/crates/controlplane/src/server/rpc.rs +++ b/crates/controlplane/src/server/rpc.rs @@ -22,7 +22,7 @@ use tracing::warn; use crate::{ error::StateError, - events::EventKind, + events::{AgentEvent, EventHelpers}, state::{AddrMap, AgentAddrs, AppState}, }; @@ -145,7 +145,7 @@ impl ControlService for ControlRpcServer { self.state .events - .emit(EventKind::Block(info.clone()).with_agent(&agent)); + .emit(AgentEvent::BlockInfo(info.clone()).with_agent(&agent)); agent.status.block_info = Some(info.clone()); let agent_id = agent.id(); @@ -204,7 +204,7 @@ impl ControlService for ControlRpcServer { agent.status.node_status = status.clone(); self.state .events - .emit(EventKind::NodeStatus(status).with_agent(&agent)); + .emit(AgentEvent::NodeStatus(status).with_agent(&agent)); } async fn post_reconcile_status( @@ -220,12 +220,12 @@ impl ControlService for ControlRpcServer { // Emit events for this reconcile - let ev = EventKind::ReconcileComplete.with_agent(&agent); + let ev = AgentEvent::ReconcileComplete.with_agent(&agent); let is_complete = status.as_ref().is_ok_and(|e| e.inner.is_some()); self.state.events.emit(ev.replace_kind(match status { - Ok(res) => EventKind::Reconcile(res), - Err(err) => EventKind::ReconcileError(err), + Ok(res) => AgentEvent::Reconcile(res), + Err(err) => AgentEvent::ReconcileError(err), })); if is_complete { diff --git a/crates/controlplane/src/server/websocket.rs b/crates/controlplane/src/server/websocket.rs index 007b71fe..1ebe319e 100644 --- a/crates/controlplane/src/server/websocket.rs +++ b/crates/controlplane/src/server/websocket.rs @@ -28,7 +28,7 @@ use tracing::{error, info, warn}; use super::{jwt::Claims, rpc::ControlRpcServer}; use crate::{ agent_version::agent_version_ok, - events::EventKind, + 
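
The stream test above exercises the hub behavior these patches rely on: emit() fans an event out to every subscriber, and each subscriber keeps only what matches its filter. A minimal stand-alone sketch of that idea built on a tokio broadcast channel plus a per-subscriber forwarding task (stand-in Event type, a closure instead of the EventFilter DSL; not the project's implementation):

    use tokio::sync::{broadcast, mpsc};

    #[derive(Clone, Debug)]
    struct Event {
        agent: u64,
    }

    struct Events {
        tx: broadcast::Sender<Event>,
    }

    impl Events {
        fn new() -> Self {
            Self { tx: broadcast::channel(64).0 }
        }

        fn emit(&self, event: Event) {
            // ignore the case where nobody is subscribed
            let _ = self.tx.send(event);
        }

        /// Return a receiver that only yields events accepted by `filter`.
        fn subscribe_on(
            &self,
            filter: impl Fn(&Event) -> bool + Send + 'static,
        ) -> mpsc::UnboundedReceiver<Event> {
            let (out, rx) = mpsc::unbounded_channel();
            let mut sub = self.tx.subscribe();
            tokio::spawn(async move {
                while let Ok(event) = sub.recv().await {
                    if filter(&event) {
                        let _ = out.send(event);
                    }
                }
            });
            rx
        }
    }

    #[tokio::main]
    async fn main() {
        let events = Events::new();
        let mut only_agent_one = events.subscribe_on(|e| e.agent == 1);

        events.emit(Event { agent: 1 });
        events.emit(Event { agent: 2 });

        // only the first event passes this subscriber's filter
        assert_eq!(only_agent_one.recv().await.unwrap().agent, 1);
    }
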
events::{AgentEvent, EventHelpers}, server::{ jwt::JWT_SECRET, rpc::{MuxedMessageIncoming, MuxedMessageOutgoing}, @@ -151,9 +151,7 @@ async fn handle_socket( warn!("Connecting agent {id} is trying to identify with an invalid nonce"); break 'reconnect; } - state - .events - .emit(EventKind::AgentConnected.with_agent(&agent)); + state.events.emit(AgentEvent::Connected.with_agent(&agent)); match agent.env() { Some(env) if !state.envs.contains_key(&env) => { @@ -221,7 +219,7 @@ async fn handle_socket( let client2 = client.clone(); tokio::spawn(async move { let agent = state2.pool.get(&id)?; - let event = EventKind::AgentHandshakeComplete.with_agent(&agent); + let event = AgentEvent::HandshakeComplete.with_agent(&agent); // Prevent readonly agent from being held over the handshake RPC drop(agent); @@ -378,7 +376,7 @@ async fn handle_socket( state .events - .emit(EventKind::AgentDisconnected.with_agent(&agent)); + .emit(AgentEvent::Disconnected.with_agent(&agent)); } info!("Agent {id} disconnected"); From eb98876daf9afeaebddb5b5b194ea538d83cf66f Mon Sep 17 00:00:00 2001 From: Meshiest Date: Sat, 30 Nov 2024 23:36:37 -0500 Subject: [PATCH 44/68] refactor(cannon): use Arc for transaction ids --- crates/controlplane/src/cannon/context.rs | 22 +++--- crates/controlplane/src/cannon/error.rs | 6 +- crates/controlplane/src/cannon/mod.rs | 71 +++++++++++-------- crates/controlplane/src/cannon/router.rs | 4 +- crates/controlplane/src/cannon/source.rs | 2 +- crates/controlplane/src/cannon/status.rs | 3 +- crates/controlplane/src/db.rs | 4 +- .../controlplane/src/server/actions/deploy.rs | 2 +- .../src/server/actions/execute.rs | 10 +-- crates/controlplane/src/state/transactions.rs | 8 +-- 10 files changed, 71 insertions(+), 61 deletions(-) diff --git a/crates/controlplane/src/cannon/context.rs b/crates/controlplane/src/cannon/context.rs index 0949090e..18168e85 100644 --- a/crates/controlplane/src/cannon/context.rs +++ b/crates/controlplane/src/cannon/context.rs @@ -35,7 +35,7 @@ pub struct ExecutionContext { pub(crate) source: TxSource, pub(crate) sink: TxSink, pub(crate) fired_txs: Arc, - pub(crate) transactions: Arc>, + pub(crate) transactions: Arc, TransactionTracker>>, } impl ExecutionContext { @@ -147,7 +147,7 @@ impl ExecutionContext { } // write the transaction status to the store and update the transaction tracker - pub fn write_tx_status(&self, tx_id: &str, status: TransactionSendState) { + pub fn write_tx_status(&self, tx_id: &Arc, status: TransactionSendState) { let key = (self.env_id, self.id, tx_id.to_owned()); if let Some(mut tx) = self.transactions.get_mut(tx_id) { if let Err(e) = TransactionTracker::write_status(&self.state, &key, status) { @@ -160,10 +160,10 @@ impl ExecutionContext { } } - pub fn remove_tx_tracker(&self, tx_id: String) { + pub fn remove_tx_tracker(&self, tx_id: Arc) { let _ = self.transactions.remove(&tx_id); if let Err(e) = - TransactionTracker::delete(&self.state, &(self.env_id, self.id, tx_id.to_owned())) + TransactionTracker::delete(&self.state, &(self.env_id, self.id, tx_id.clone())) { error!( "cannon {}.{} failed to delete transaction {tx_id}: {e:?}", @@ -175,11 +175,11 @@ impl ExecutionContext { /// Execute an authorization on the source's compute target async fn execute_auth( &self, - tx_id: String, + tx_id: Arc, auth: Arc, query_path: &str, events: TransactionStatusSender, - ) -> Result<(), (String, CannonError)> { + ) -> Result<(), (Arc, CannonError)> { events.send(TransactionStatusEvent::ExecuteQueued); match self .source @@ -207,8 +207,8 @@ impl 
ExecutionContext { async fn fire_tx( &self, sink_pipe: Option>, - tx_id: String, - ) -> Result { + tx_id: Arc, + ) -> Result, CannonError> { let latest_height = self .state .get_env_block_info(self.env_id) @@ -216,7 +216,7 @@ impl ExecutionContext { // ensure transaction is being tracked let Some(tracker) = self.transactions.get(&tx_id).map(|v| v.value().clone()) else { - return Err(CannonError::TransactionLost(self.id, tx_id)); + return Err(CannonError::TransactionLost(self.id, tx_id.to_string())); }; // ensure transaction is ready to be broadcasted if !matches!( @@ -225,7 +225,7 @@ impl ExecutionContext { ) { return Err(CannonError::InvalidTransactionState( self.id, - tx_id, + tx_id.to_string(), format!( "expected unsent or broadcasted, got {}", tracker.status.label() @@ -235,7 +235,7 @@ impl ExecutionContext { // ensure transaction blob exists let Some(tx_blob) = tracker.transaction else { - return Err(CannonError::TransactionLost(self.id, tx_id)); + return Err(CannonError::TransactionLost(self.id, tx_id.to_string())); }; let tx_str = match serde_json::to_string(&tx_blob) { diff --git a/crates/controlplane/src/cannon/error.rs b/crates/controlplane/src/cannon/error.rs index b80fcc6f..53ecdace 100644 --- a/crates/controlplane/src/cannon/error.rs +++ b/crates/controlplane/src/cannon/error.rs @@ -1,4 +1,4 @@ -use std::path::PathBuf; +use std::{path::PathBuf, sync::Arc}; use axum::http::StatusCode; use serde::{ser::SerializeStruct, Serialize, Serializer}; @@ -170,12 +170,12 @@ pub enum CannonError { #[error("send `auth` error for cannon `{0}`: {1}")] SendAuthError( CannonId, - #[source] tokio::sync::mpsc::error::SendError<(String, TransactionStatusSender)>, + #[source] tokio::sync::mpsc::error::SendError<(Arc, TransactionStatusSender)>, ), #[error("send `tx` error for cannon `{0}`: {1}")] SendTxError( CannonId, - #[source] tokio::sync::mpsc::error::SendError, + #[source] tokio::sync::mpsc::error::SendError>, ), #[error(transparent)] DatabaseWriteError(#[from] DatabaseError), diff --git a/crates/controlplane/src/cannon/mod.rs b/crates/controlplane/src/cannon/mod.rs index 951bbc20..bfaf54be 100644 --- a/crates/controlplane/src/cannon/mod.rs +++ b/crates/controlplane/src/cannon/mod.rs @@ -101,19 +101,19 @@ pub struct CannonInstance { child: Option, /// channel to send transaction ids to the the task - pub(crate) tx_sender: UnboundedSender, + pub(crate) tx_sender: UnboundedSender>, /// channel to send authorizations (by transaction id) to the the task - pub(crate) auth_sender: UnboundedSender<(String, TransactionStatusSender)>, + pub(crate) auth_sender: UnboundedSender<(Arc, TransactionStatusSender)>, /// transaction ids that are currently being processed - pub(crate) transactions: Arc>, + pub(crate) transactions: Arc, TransactionTracker>>, pub(crate) received_txs: Arc, pub(crate) fired_txs: Arc, } pub struct CannonReceivers { - transactions: UnboundedReceiver, - authorizations: UnboundedReceiver<(String, TransactionStatusSender)>, + transactions: UnboundedReceiver>, + authorizations: UnboundedReceiver<(Arc, TransactionStatusSender)>, } pub type CannonInstanceMeta = (EnvId, NetworkId, StorageId, PathBuf); @@ -127,11 +127,10 @@ impl CannonInstance { txs: &AtomicU64, ) -> u64 { let index = txs.fetch_add(1, std::sync::atomic::Ordering::Relaxed); - if let Err(e) = state - .db - .tx_index - .save(&(env_id, cannon_id, String::new()), &PackedUint(index)) - { + if let Err(e) = state.db.tx_index.save( + &(env_id, cannon_id, Arc::new(String::new())), + &PackedUint(index), + ) { error!("cannon 
{env_id}.{cannon_id} failed to save received tx count: {e}"); } index @@ -142,22 +141,23 @@ impl CannonInstance { state: &GlobalState, env_id: EnvId, cannon_id: CannonId, - ) -> (DashMap, AtomicU64) { + ) -> (DashMap, TransactionTracker>, AtomicU64) { let transactions = DashMap::new(); // Restore the received transaction count (empty string key for tx_index) - let received_txs = match state - .db - .tx_index - .restore(&(env_id, cannon_id, String::new())) - { - Ok(Some(index)) => AtomicU64::new(index.0), - Ok(None) => AtomicU64::new(0), - Err(e) => { - error!("cannon {env_id}.{cannon_id} failed to parse received tx count: {e}"); - AtomicU64::new(0) - } - }; + let received_txs = + match state + .db + .tx_index + .restore(&(env_id, cannon_id, Arc::new(String::new()))) + { + Ok(Some(index)) => AtomicU64::new(index.0), + Ok(None) => AtomicU64::new(0), + Err(e) => { + error!("cannon {env_id}.{cannon_id} failed to parse received tx count: {e}"); + AtomicU64::new(0) + } + }; let statuses = match state.db.tx_status.read_with_prefix(&(env_id, cannon_id)) { Ok(statuses) => statuses, @@ -369,10 +369,10 @@ impl CannonInstance { /// to the desired sink pub fn proxy_broadcast( &self, - tx_id: String, + tx_id: Arc, body: serde_json::Value, ) -> Result<(), CannonError> { - let key = (self.env_id, self.id, tx_id.to_owned()); + let key = (self.env_id, self.id, Arc::clone(&tx_id)); // if the transaction is in the cache, it has already been broadcasted if let Some(cache) = self.global_state.env_network_cache.get(&self.env_id) { @@ -383,7 +383,10 @@ impl CannonInstance { self.env_id, self.id ); } - return Err(CannonError::TransactionAlreadyExists(self.id, tx_id)); + return Err(CannonError::TransactionAlreadyExists( + self.id, + tx_id.to_string(), + )); } } @@ -391,7 +394,10 @@ impl CannonInstance { let tracker = if let Some(mut tx) = self.transactions.get(&tx_id).as_deref().cloned() { // if we receive a transaction that is not executing, it is a duplicate if !matches!(tx.status, TransactionSendState::Executing(_)) { - return Err(CannonError::TransactionAlreadyExists(self.id, tx_id)); + return Err(CannonError::TransactionAlreadyExists( + self.id, + tx_id.to_string(), + )); } // clear attempts (as this was a successful execute) @@ -442,7 +448,7 @@ impl CannonInstance { &self, body: Authorization, events: TransactionStatusSender, - ) -> Result { + ) -> Result, CannonError> { let Some(storage) = self .global_state .get_env(self.env_id) @@ -484,16 +490,19 @@ impl CannonInstance { transaction: None, status: TransactionSendState::Authorized, }; + + let tx_id = Arc::new(tx_id); + // write the transaction to the store to prevent data loss tracker.write( &self.global_state, - &(self.env_id, self.id, tx_id.to_owned()), + &(self.env_id, self.id, Arc::clone(&tx_id)), )?; - self.transactions.insert(tx_id.to_owned(), tracker); + self.transactions.insert(Arc::clone(&tx_id), tracker); trace!("cannon {}.{} received auth {tx_id}", self.env_id, self.id); self.auth_sender - .send((tx_id.to_owned(), events)) + .send((Arc::clone(&tx_id), events)) .map_err(|e| CannonError::SendAuthError(self.id, e))?; Ok(tx_id) diff --git a/crates/controlplane/src/cannon/router.rs b/crates/controlplane/src/cannon/router.rs index 20c3576e..3e78a6ee 100644 --- a/crates/controlplane/src/cannon/router.rs +++ b/crates/controlplane/src/cannon/router.rs @@ -1,4 +1,4 @@ -use std::{str::FromStr, time::Duration}; +use std::{str::FromStr, sync::Arc, time::Duration}; use axum::{ extract::{Path, Query, State}, @@ -324,7 +324,7 @@ async fn transaction( return 
ServerError::BadRequest("body missing transaction ID".to_owned()).into_response(); }; - match cannon.proxy_broadcast(tx_id, body.take()) { + match cannon.proxy_broadcast(Arc::new(tx_id), body.take()) { Ok(_) => StatusCode::OK.into_response(), Err(e) => ServerError::from(e).into_response(), } diff --git a/crates/controlplane/src/cannon/source.rs b/crates/controlplane/src/cannon/source.rs index 8bf46d1e..252fa63d 100644 --- a/crates/controlplane/src/cannon/source.rs +++ b/crates/controlplane/src/cannon/source.rs @@ -153,7 +153,7 @@ impl ComputeTarget { &self, ctx: &ExecutionContext, query_path: &str, - tx_id: &str, + tx_id: &Arc, auth: &Authorization, events: &TransactionStatusSender, ) -> Result<(), CannonError> { diff --git a/crates/controlplane/src/cannon/status.rs b/crates/controlplane/src/cannon/status.rs index f319ed87..ff1b43d1 100644 --- a/crates/controlplane/src/cannon/status.rs +++ b/crates/controlplane/src/cannon/status.rs @@ -1,6 +1,7 @@ use std::sync::Arc; use chrono::{DateTime, Utc}; +use serde::{Deserialize, Serialize}; use snops_common::{format::DataFormat, state::AgentId}; use tokio::sync::mpsc::Sender; @@ -49,7 +50,7 @@ pub enum TransactionStatusEvent { /// Status of a transaction as presented internally for tracking and /// preventing data loss. -#[derive(Debug, Clone, Copy, Eq, PartialEq)] +#[derive(Debug, Clone, Copy, Eq, PartialEq, Serialize, Deserialize)] pub enum TransactionSendState { /// Authorization has been received. This step is skipped if a /// transaction is created/broadcasted directly. diff --git a/crates/controlplane/src/db.rs b/crates/controlplane/src/db.rs index 870eccfa..f18e8301 100644 --- a/crates/controlplane/src/db.rs +++ b/crates/controlplane/src/db.rs @@ -1,4 +1,4 @@ -use std::path::Path; +use std::{path::Path, sync::Arc}; use snops_common::{ aot_cmds::Authorization, @@ -13,7 +13,7 @@ use crate::{ state::Agent, }; -pub type TxEntry = (EnvId, CannonId, String); +pub type TxEntry = (EnvId, CannonId, Arc); pub struct Database { #[allow(unused)] diff --git a/crates/controlplane/src/server/actions/deploy.rs b/crates/controlplane/src/server/actions/deploy.rs index 20e5cd33..9ced5b61 100644 --- a/crates/controlplane/src/server/actions/deploy.rs +++ b/crates/controlplane/src/server/actions/deploy.rs @@ -65,7 +65,7 @@ pub async fn deploy_inner( env: &Environment, events: TransactionStatusSender, query: Option, -) -> Result { +) -> Result, ExecutionError> { let DeployAction { cannon: cannon_id, private_key, diff --git a/crates/controlplane/src/server/actions/execute.rs b/crates/controlplane/src/server/actions/execute.rs index dfdbc0e2..77fd0bef 100644 --- a/crates/controlplane/src/server/actions/execute.rs +++ b/crates/controlplane/src/server/actions/execute.rs @@ -27,7 +27,7 @@ use crate::{ }; pub async fn execute_status( - tx_id: String, + tx_id: Arc, mut rx: mpsc::Receiver, ) -> Result, ActionError> { use TransactionStatusEvent::*; @@ -39,15 +39,15 @@ pub async fn execute_status( loop { select! 
{ _ = &mut timeout => { - return Err(ActionError::ExecuteStatusTimeout { tx_id, agent_id, retries }); + return Err(ActionError::ExecuteStatusTimeout { tx_id: tx_id.to_string(), agent_id, retries }); }, Some(msg) = rx.recv() => { match msg { ExecuteAborted => { - return Err(ActionError::ExecuteStatusAborted { tx_id, retries}); + return Err(ActionError::ExecuteStatusAborted { tx_id: tx_id.to_string(), retries}); }, ExecuteFailed(msg) => { - return Err(ActionError::ExecuteStatusFailed { message: msg, tx_id, retries }); + return Err(ActionError::ExecuteStatusFailed { message: msg, tx_id: tx_id.to_string(), retries }); }, Executing(id) => { agent_id = Some(id.to_string()); @@ -113,7 +113,7 @@ pub async fn execute_inner( env: &Environment, events: TransactionStatusSender, query: Option, -) -> Result { +) -> Result, ExecutionError> { let ExecuteAction { cannon: cannon_id, private_key, diff --git a/crates/controlplane/src/state/transactions.rs b/crates/controlplane/src/state/transactions.rs index 18f0205f..748bb616 100644 --- a/crates/controlplane/src/state/transactions.rs +++ b/crates/controlplane/src/state/transactions.rs @@ -100,10 +100,10 @@ pub async fn tracking_task(state: Arc) { } struct PendingTransactions { - to_execute: Vec, - to_broadcast: Vec, - to_remove: Vec, - to_confirm: Vec<(String, Option)>, + to_execute: Vec>, + to_broadcast: Vec>, + to_remove: Vec>, + to_confirm: Vec<(Arc, Option)>, } /// Get a list of transactions that need to be executed, broadcasted, removed, From 9cf6e58e067ee01fee43e720f9d958d4b168dd38 Mon Sep 17 00:00:00 2001 From: Meshiest Date: Sat, 30 Nov 2024 23:55:45 -0500 Subject: [PATCH 45/68] feat(cannon): WIP migrate cannon events to the new event streams --- crates/controlplane/src/cannon/context.rs | 21 ++++- crates/controlplane/src/events/models.rs | 91 ++++++++++++++++--- crates/controlplane/src/events/test_filter.rs | 6 ++ crates/controlplane/src/server/rpc.rs | 27 ++++-- crates/controlplane/src/server/websocket.rs | 4 +- crates/controlplane/src/state/global.rs | 17 ++++ 6 files changed, 139 insertions(+), 27 deletions(-) diff --git a/crates/controlplane/src/cannon/context.rs b/crates/controlplane/src/cannon/context.rs index 18168e85..c07f5a61 100644 --- a/crates/controlplane/src/cannon/context.rs +++ b/crates/controlplane/src/cannon/context.rs @@ -21,7 +21,8 @@ use super::{ }; use crate::{ cannon::source::ComputeTarget, - state::{GlobalState, REST_CLIENT}, + events::{EventHelpers, TransactionAbortReason, TransactionEvent}, + state::{GetGlobalState, GlobalState, REST_CLIENT}, }; /// Information a transaction cannon needs for execution via spawned task @@ -98,6 +99,7 @@ impl ExecutionContext { let Some(tracker) = self.transactions.get(&tx_id) else { error!("cannon {env_id}.{cannon_id} missing transaction tracker for {tx_id}"); events.send(TransactionStatusEvent::ExecuteAborted); + TransactionEvent::ExecuteAborted(TransactionAbortReason::MissingTracker).with_cannon_ctx(&self, tx_id).emit(&self); continue; }; // ensure the transaction is in the correct state @@ -105,6 +107,7 @@ impl ExecutionContext { error!("cannon {env_id}.{cannon_id} unexpected status for {tx_id}: {:?}", tracker.status); // TODO: remove this auth and log it somewhere events.send(TransactionStatusEvent::ExecuteAborted); + TransactionEvent::ExecuteAborted(TransactionAbortReason::UnexpectedStatus(tracker.status)).with_cannon_ctx(&self, tx_id).emit(&self); continue; } // ensure the transaction has an authorization (more than likely unreachable) @@ -112,6 +115,7 @@ impl ExecutionContext { 
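
The refactor above moves transaction ids from String to Arc<String>: the same id ends up as a tracker-map key, a queue entry, and a channel message, and cloning an Arc only bumps a reference count instead of copying the string. A small illustration with stand-in collections (the id value is a placeholder):

    use std::{collections::HashMap, sync::Arc};

    fn main() {
        let tx_id: Arc<String> = Arc::new("at1exampletransactionid".to_string());

        // tracker map keyed by the shared id
        let mut trackers: HashMap<Arc<String>, &'static str> = HashMap::new();
        trackers.insert(Arc::clone(&tx_id), "authorized");

        // pending-broadcast queue holding another handle to the same id
        let queued: Vec<Arc<String>> = vec![Arc::clone(&tx_id)];

        // all three handles share one allocation
        assert_eq!(Arc::strong_count(&tx_id), 3);
        assert_eq!(trackers.get(&tx_id), Some(&"authorized"));
        assert_eq!(queued[0], tx_id);
    }
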
error!("cannon {env_id}.{cannon_id} missing authorization for {tx_id}"); // TODO: remove the auth anyway events.send(TransactionStatusEvent::ExecuteAborted); + TransactionEvent::ExecuteAborted(TransactionAbortReason::MissingAuthorization).with_cannon_ctx(&self, tx_id).emit(&self); continue; }; @@ -180,6 +184,9 @@ impl ExecutionContext { query_path: &str, events: TransactionStatusSender, ) -> Result<(), (Arc, CannonError)> { + TransactionEvent::ExecuteQueued + .with_cannon_ctx(self, tx_id.clone()) + .emit(self); events.send(TransactionStatusEvent::ExecuteQueued); match self .source @@ -190,6 +197,9 @@ impl ExecutionContext { // Can't execute the auth if no agents are available. // The transaction task will handle re-appending the auth. Err(CannonError::Source(SourceError::NoAvailableAgents(_))) => { + TransactionEvent::ExecuteAwaitingCompute + .with_cannon_ctx(self, tx_id.clone()) + .emit(self); events.send(TransactionStatusEvent::ExecuteAwaitingCompute); Ok(()) } @@ -197,6 +207,9 @@ impl ExecutionContext { // reset the transaction status to authorized so it can be re-executed self.write_tx_status(&tx_id, TransactionSendState::Authorized); events.send(TransactionStatusEvent::ExecuteFailed(e.to_string())); + TransactionEvent::ExecuteFailed(e.to_string()) + .with_cannon_ctx(self, tx_id.clone()) + .emit(self); Err((tx_id, e)) } res => res.map_err(|e| (tx_id, e)), @@ -360,3 +373,9 @@ impl ExecutionContext { Ok(tx_id) } } + +impl<'a> GetGlobalState<'a> for &'a ExecutionContext { + fn global_state(self) -> &'a GlobalState { + &self.state + } +} diff --git a/crates/controlplane/src/events/models.rs b/crates/controlplane/src/events/models.rs index caa5c950..d3e86911 100644 --- a/crates/controlplane/src/events/models.rs +++ b/crates/controlplane/src/events/models.rs @@ -1,12 +1,20 @@ +use std::sync::Arc; + use chrono::{DateTime, Utc}; use serde::{Deserialize, Serialize}; use snops_common::{ node_targets::NodeTargets, rpc::error::ReconcileError, - state::{AgentId, AgentState, EnvId, LatestBlockInfo, NodeKey, NodeStatus, ReconcileStatus}, + state::{ + AgentId, AgentState, EnvId, InternedId, LatestBlockInfo, NodeKey, NodeStatus, + ReconcileStatus, + }, }; -use crate::state::Agent; +use crate::{ + cannon::{context::ExecutionContext, status::TransactionSendState}, + state::{Agent, GetGlobalState}, +}; #[derive(Clone, Debug, Serialize, Deserialize)] pub struct Event { @@ -14,6 +22,8 @@ pub struct Event { pub agent: Option, pub node_key: Option, pub env: Option, + pub transaction: Option>, + pub cannon: Option, pub kind: EventKind, } @@ -21,6 +31,7 @@ pub struct Event { #[serde(tag = "type")] pub enum EventKind { Agent(AgentEvent), + Transaction(TransactionEvent), } #[derive(Clone, Debug, Serialize, Deserialize)] @@ -44,6 +55,23 @@ pub enum AgentEvent { BlockInfo(LatestBlockInfo), } +#[derive(Clone, Debug, Serialize, Deserialize)] +#[serde(tag = "kind")] +pub enum TransactionEvent { + ExecuteAborted(TransactionAbortReason), + ExecuteQueued, + ExecuteAwaitingCompute, + ExecuteFailed(String), +} + +#[derive(Clone, Debug, Serialize, Deserialize)] +#[serde(tag = "reason")] +pub enum TransactionAbortReason { + MissingTracker, + UnexpectedStatus(TransactionSendState), + MissingAuthorization, +} + #[derive(Clone, Copy, Debug, PartialEq)] #[repr(u8)] pub enum EventKindFilter { @@ -55,12 +83,17 @@ pub enum EventKindFilter { AgentReconcileError, AgentNodeStatus, AgentBlockInfo, + TransactionExecuteAborted, + TransactionExecuteQueued, + TransactionExecuteAwaitingCompute, + TransactionExecuteFailed, } impl EventKind { 
pub fn filter(&self) -> EventKindFilter { use AgentEvent::*; use EventKind::*; + use TransactionEvent::*; match self { Agent(Connected) => EventKindFilter::AgentConnected, @@ -71,6 +104,12 @@ impl EventKind { Agent(ReconcileError(_)) => EventKindFilter::AgentReconcileError, Agent(NodeStatus(_)) => EventKindFilter::AgentNodeStatus, Agent(BlockInfo(_)) => EventKindFilter::AgentBlockInfo, + Transaction(ExecuteAborted(_)) => EventKindFilter::TransactionExecuteAborted, + Transaction(ExecuteQueued) => EventKindFilter::TransactionExecuteQueued, + Transaction(ExecuteAwaitingCompute) => { + EventKindFilter::TransactionExecuteAwaitingCompute + } + Transaction(ExecuteFailed(_)) => EventKindFilter::TransactionExecuteFailed, } } } @@ -108,6 +147,8 @@ impl Event { agent: None, node_key: None, env: None, + transaction: None, + cannon: None, kind, } } @@ -118,22 +159,15 @@ impl Event { agent: self.agent, node_key: self.node_key.clone(), env: self.env, + transaction: self.transaction.clone(), + cannon: self.cannon, kind: kind.into().kind, } } - pub fn with_agent(mut self, agent: &Agent) -> Self { - self.agent = Some(agent.id); - if let AgentState::Node(env_id, node) = &agent.state { - self.node_key = Some(node.node_key.clone()); - self.env = Some(*env_id); - } - self - } - - pub fn with_env(mut self, env_id: EnvId) -> Self { - self.env = Some(env_id); - self + #[inline] + pub fn emit<'a>(self, state: impl GetGlobalState<'a>) { + state.global_state().events.emit(self) } } @@ -149,6 +183,9 @@ pub trait EventHelpers { fn with_agent_id(self, agent_id: AgentId) -> Event; fn with_node_key(self, node_key: NodeKey) -> Event; fn with_env_id(self, env_id: EnvId) -> Event; + fn with_transaction(self, transaction: Arc) -> Event; + fn with_cannon(self, cannon: InternedId) -> Event; + fn with_cannon_ctx(self, ctx: &ExecutionContext, transaction: Arc) -> Event; } impl> EventHelpers for T { @@ -183,6 +220,26 @@ impl> EventHelpers for T { event.env = Some(env_id); event } + + fn with_transaction(self, transaction: Arc) -> Event { + let mut event = self.into(); + event.transaction = Some(transaction); + event + } + + fn with_cannon(self, cannon: InternedId) -> Event { + let mut event = self.into(); + event.cannon = Some(cannon); + event + } + + fn with_cannon_ctx(self, ctx: &ExecutionContext, transaction: Arc) -> Event { + let mut event = self.into(); + event.cannon = Some(ctx.id); + event.env = Some(ctx.env_id); + event.transaction = Some(transaction); + event + } } impl From for Event { @@ -196,3 +253,9 @@ impl From for Event { Self::new(EventKind::Agent(kind)) } } + +impl From for Event { + fn from(kind: TransactionEvent) -> Self { + Self::new(EventKind::Transaction(kind)) + } +} diff --git a/crates/controlplane/src/events/test_filter.rs b/crates/controlplane/src/events/test_filter.rs index 6bb21ac2..6b900046 100644 --- a/crates/controlplane/src/events/test_filter.rs +++ b/crates/controlplane/src/events/test_filter.rs @@ -47,6 +47,8 @@ fn test_all_of() { agent: Some(*A), node_key: Some(NodeKey::from_str("client/foo").unwrap()), env: Some(*B), + transaction: None, + cannon: None, kind: Agent(Connected), }; @@ -72,6 +74,8 @@ fn test_any_of() { agent: Some(*A), node_key: Some(NodeKey::from_str("client/foo").unwrap()), env: Some(*B), + transaction: None, + cannon: None, kind: Agent(Connected), }; @@ -101,6 +105,8 @@ fn test_one_of() { agent: Some(*A), node_key: Some(NodeKey::from_str("client/foo").unwrap()), env: Some(*B), + transaction: None, + cannon: None, kind: Agent(Connected), }; diff --git 
a/crates/controlplane/src/server/rpc.rs b/crates/controlplane/src/server/rpc.rs index a7f95a54..2422a4c6 100644 --- a/crates/controlplane/src/server/rpc.rs +++ b/crates/controlplane/src/server/rpc.rs @@ -23,7 +23,7 @@ use tracing::warn; use crate::{ error::StateError, events::{AgentEvent, EventHelpers}, - state::{AddrMap, AgentAddrs, AppState}, + state::{AddrMap, AgentAddrs, AppState, GetGlobalState, GlobalState}, }; define_rpc_mux!(parent; @@ -143,9 +143,9 @@ impl ControlService for ControlRpcServer { update_time: Utc::now(), }; - self.state - .events - .emit(AgentEvent::BlockInfo(info.clone()).with_agent(&agent)); + AgentEvent::BlockInfo(info.clone()) + .with_agent(&agent) + .emit(&self); agent.status.block_info = Some(info.clone()); let agent_id = agent.id(); @@ -202,9 +202,9 @@ impl ControlService for ControlRpcServer { }; agent.status.node_status = status.clone(); - self.state - .events - .emit(AgentEvent::NodeStatus(status).with_agent(&agent)); + AgentEvent::NodeStatus(status) + .with_agent(&agent) + .emit(&self); } async fn post_reconcile_status( @@ -223,13 +223,14 @@ impl ControlService for ControlRpcServer { let ev = AgentEvent::ReconcileComplete.with_agent(&agent); let is_complete = status.as_ref().is_ok_and(|e| e.inner.is_some()); - self.state.events.emit(ev.replace_kind(match status { + ev.replace_kind(match status { Ok(res) => AgentEvent::Reconcile(res), Err(err) => AgentEvent::ReconcileError(err), - })); + }) + .emit(&self); if is_complete { - self.state.events.emit(ev); + ev.emit(&self); } } } @@ -273,3 +274,9 @@ fn resolve_addrs( }) .collect()) } + +impl<'a> GetGlobalState<'a> for &'a ControlRpcServer { + fn global_state(self) -> &'a GlobalState { + &self.state + } +} diff --git a/crates/controlplane/src/server/websocket.rs b/crates/controlplane/src/server/websocket.rs index 1ebe319e..af3b6075 100644 --- a/crates/controlplane/src/server/websocket.rs +++ b/crates/controlplane/src/server/websocket.rs @@ -151,7 +151,7 @@ async fn handle_socket( warn!("Connecting agent {id} is trying to identify with an invalid nonce"); break 'reconnect; } - state.events.emit(AgentEvent::Connected.with_agent(&agent)); + AgentEvent::Connected.with_agent(&agent).emit(&state); match agent.env() { Some(env) if !state.envs.contains_key(&env) => { @@ -225,7 +225,7 @@ async fn handle_socket( drop(agent); match client2.handshake(context::current(), handshake).await { - Ok(()) => state2.events.emit(event), + Ok(()) => event.emit(&state2), Err(e) => error!("failed to perform agent {id} handshake: {e}"), } diff --git a/crates/controlplane/src/state/global.rs b/crates/controlplane/src/state/global.rs index f213f373..0cc14132 100644 --- a/crates/controlplane/src/state/global.rs +++ b/crates/controlplane/src/state/global.rs @@ -357,3 +357,20 @@ impl GlobalState { Err(EnvRequestError::NoResponsiveNodes) } } + +pub trait GetGlobalState<'a> { + /// Returns the global state. 
+ fn global_state(self) -> &'a GlobalState; +} + +impl<'a> GetGlobalState<'a> for &'a GlobalState { + fn global_state(self) -> &'a GlobalState { + self + } +} + +impl<'a> GetGlobalState<'a> for &'a Arc { + fn global_state(self) -> &'a GlobalState { + self + } +} From 32af512f9c3ce5f08375f4c5618621f616e66874 Mon Sep 17 00:00:00 2001 From: Meshiest Date: Sun, 1 Dec 2024 05:01:16 -0500 Subject: [PATCH 46/68] refactor(controlplane): replace old transaction status tracking with event streams --- crates/controlplane/src/cannon/context.rs | 33 +++++---- crates/controlplane/src/cannon/error.rs | 3 +- crates/controlplane/src/cannon/mod.rs | 14 ++-- crates/controlplane/src/cannon/router.rs | 23 +++--- crates/controlplane/src/cannon/source.rs | 63 +++++----------- crates/controlplane/src/cannon/status.rs | 48 +----------- crates/controlplane/src/env/cache.rs | 5 ++ crates/controlplane/src/events/filter.rs | 4 + crates/controlplane/src/events/models.rs | 67 +++++++++++++---- .../controlplane/src/server/actions/deploy.rs | 36 +++------ .../src/server/actions/execute.rs | 74 +++++++++---------- crates/controlplane/src/server/error.rs | 7 +- crates/controlplane/src/state/transactions.rs | 54 +++++++++----- 13 files changed, 202 insertions(+), 229 deletions(-) diff --git a/crates/controlplane/src/cannon/context.rs b/crates/controlplane/src/cannon/context.rs index c07f5a61..a209eeea 100644 --- a/crates/controlplane/src/cannon/context.rs +++ b/crates/controlplane/src/cannon/context.rs @@ -6,7 +6,7 @@ use futures_util::{stream::FuturesUnordered, StreamExt}; use lazysort::SortedBy; use snops_common::{ aot_cmds::Authorization, - state::{CannonId, EnvId, NetworkId}, + state::{AgentId, CannonId, EnvId, NetworkId}, }; use tracing::{error, trace, warn}; @@ -15,7 +15,7 @@ use super::{ file::TransactionSink, sink::TxSink, source::TxSource, - status::{TransactionSendState, TransactionStatusEvent, TransactionStatusSender}, + status::TransactionSendState, tracker::TransactionTracker, CannonReceivers, }; @@ -94,11 +94,10 @@ impl ExecutionContext { // ------------------------ // receive authorizations and forward the executions to the compute target - Some((tx_id, events)) = rx.authorizations.recv() => { + Some(tx_id) = rx.authorizations.recv() => { // ensure the transaction tracker exists let Some(tracker) = self.transactions.get(&tx_id) else { error!("cannon {env_id}.{cannon_id} missing transaction tracker for {tx_id}"); - events.send(TransactionStatusEvent::ExecuteAborted); TransactionEvent::ExecuteAborted(TransactionAbortReason::MissingTracker).with_cannon_ctx(&self, tx_id).emit(&self); continue; }; @@ -106,7 +105,6 @@ impl ExecutionContext { if tracker.status != TransactionSendState::Authorized { error!("cannon {env_id}.{cannon_id} unexpected status for {tx_id}: {:?}", tracker.status); // TODO: remove this auth and log it somewhere - events.send(TransactionStatusEvent::ExecuteAborted); TransactionEvent::ExecuteAborted(TransactionAbortReason::UnexpectedStatus(tracker.status)).with_cannon_ctx(&self, tx_id).emit(&self); continue; } @@ -114,12 +112,11 @@ impl ExecutionContext { let Some(auth) = &tracker.authorization else { error!("cannon {env_id}.{cannon_id} missing authorization for {tx_id}"); // TODO: remove the auth anyway - events.send(TransactionStatusEvent::ExecuteAborted); TransactionEvent::ExecuteAborted(TransactionAbortReason::MissingAuthorization).with_cannon_ctx(&self, tx_id).emit(&self); continue; }; - auth_execs.push(self.execute_auth(tx_id, Arc::clone(auth), &query_path, events)); + 
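
The GetGlobalState trait introduced above lets event-emitting call sites pass whatever context they have (a bare &GlobalState, an &Arc<GlobalState>, or a wrapper such as the RPC server or execution context) into a single emit() path. A stand-alone sketch of the same trick with simplified stand-in types:

    use std::sync::Arc;

    struct State {
        name: &'static str,
    }

    trait GetState<'a> {
        fn state(self) -> &'a State;
    }

    impl<'a> GetState<'a> for &'a State {
        fn state(self) -> &'a State {
            self
        }
    }

    impl<'a> GetState<'a> for &'a Arc<State> {
        fn state(self) -> &'a State {
            self // deref-coerces &Arc<State> to &State
        }
    }

    fn emit<'a>(ctx: impl GetState<'a>, msg: &str) {
        println!("[{}] {msg}", ctx.state().name);
    }

    fn main() {
        let state = Arc::new(State { name: "controlplane" });
        emit(&state, "event via &Arc<State>");
        emit(state.as_ref(), "event via &State");
    }
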
auth_execs.push(self.execute_auth(tx_id, Arc::clone(auth), &query_path)); } // receive transaction ids and forward them to the sink target Some(tx) = rx.transactions.recv() => { @@ -182,16 +179,14 @@ impl ExecutionContext { tx_id: Arc, auth: Arc, query_path: &str, - events: TransactionStatusSender, ) -> Result<(), (Arc, CannonError)> { - TransactionEvent::ExecuteQueued + TransactionEvent::AuthorizationReceived(Arc::clone(&auth)) .with_cannon_ctx(self, tx_id.clone()) .emit(self); - events.send(TransactionStatusEvent::ExecuteQueued); match self .source .compute - .execute(self, query_path, &tx_id, &auth, &events) + .execute(self, query_path, &tx_id, &auth) .await { // Can't execute the auth if no agents are available. @@ -200,13 +195,11 @@ impl ExecutionContext { TransactionEvent::ExecuteAwaitingCompute .with_cannon_ctx(self, tx_id.clone()) .emit(self); - events.send(TransactionStatusEvent::ExecuteAwaitingCompute); Ok(()) } Err(e) => { // reset the transaction status to authorized so it can be re-executed self.write_tx_status(&tx_id, TransactionSendState::Authorized); - events.send(TransactionStatusEvent::ExecuteFailed(e.to_string())); TransactionEvent::ExecuteFailed(e.to_string()) .with_cannon_ctx(self, tx_id.clone()) .emit(self); @@ -283,11 +276,19 @@ impl ExecutionContext { let network = self.network; // update the transaction status and increment the broadcast attempts - let update_status = || { + let update_status = |agent: Option| { self.write_tx_status( &tx_id, TransactionSendState::Broadcasted(latest_height, Utc::now()), ); + let mut ev = TransactionEvent::Broadcasted { + height: latest_height, + timestamp: Utc::now(), + } + .with_cannon_ctx(self, Arc::clone(&tx_id)); + ev.agent = agent; + ev.emit(self); + if let Err(e) = TransactionTracker::inc_attempts( &self.state, &(env_id, cannon_id, tx_id.to_owned()), @@ -311,7 +312,7 @@ impl ExecutionContext { continue; } - update_status(); + update_status(agent); return Ok(tx_id); } @@ -355,7 +356,7 @@ impl ExecutionContext { } } - update_status(); + update_status(None); return Ok(tx_id); } } diff --git a/crates/controlplane/src/cannon/error.rs b/crates/controlplane/src/cannon/error.rs index 53ecdace..a86f3297 100644 --- a/crates/controlplane/src/cannon/error.rs +++ b/crates/controlplane/src/cannon/error.rs @@ -12,7 +12,6 @@ use snops_common::{ use strum_macros::AsRefStr; use thiserror::Error; -use super::status::TransactionStatusSender; use crate::{env::error::EnvRequestError, error::StateError}; #[derive(Debug, Error, AsRefStr)] @@ -170,7 +169,7 @@ pub enum CannonError { #[error("send `auth` error for cannon `{0}`: {1}")] SendAuthError( CannonId, - #[source] tokio::sync::mpsc::error::SendError<(Arc, TransactionStatusSender)>, + #[source] tokio::sync::mpsc::error::SendError>, ), #[error("send `tx` error for cannon `{0}`: {1}")] SendTxError( diff --git a/crates/controlplane/src/cannon/mod.rs b/crates/controlplane/src/cannon/mod.rs index bfaf54be..eecaf649 100644 --- a/crates/controlplane/src/cannon/mod.rs +++ b/crates/controlplane/src/cannon/mod.rs @@ -23,7 +23,7 @@ use snops_common::{ format::PackedUint, state::{CannonId, EnvId, NetworkId, StorageId}, }; -use status::{TransactionSendState, TransactionStatusSender}; +use status::TransactionSendState; use tokio::{ sync::{ mpsc::{UnboundedReceiver, UnboundedSender}, @@ -103,7 +103,7 @@ pub struct CannonInstance { /// channel to send transaction ids to the the task pub(crate) tx_sender: UnboundedSender>, /// channel to send authorizations (by transaction id) to the the task - pub(crate) 
auth_sender: UnboundedSender<(Arc, TransactionStatusSender)>, + pub(crate) auth_sender: UnboundedSender>, /// transaction ids that are currently being processed pub(crate) transactions: Arc, TransactionTracker>>, @@ -113,7 +113,7 @@ pub struct CannonInstance { pub struct CannonReceivers { transactions: UnboundedReceiver>, - authorizations: UnboundedReceiver<(Arc, TransactionStatusSender)>, + authorizations: UnboundedReceiver>, } pub type CannonInstanceMeta = (EnvId, NetworkId, StorageId, PathBuf); @@ -444,11 +444,7 @@ impl CannonInstance { } /// Called by axum to forward /cannon//auth to a listen source - pub async fn proxy_auth( - &self, - body: Authorization, - events: TransactionStatusSender, - ) -> Result, CannonError> { + pub async fn proxy_auth(&self, body: Authorization) -> Result, CannonError> { let Some(storage) = self .global_state .get_env(self.env_id) @@ -502,7 +498,7 @@ impl CannonInstance { trace!("cannon {}.{} received auth {tx_id}", self.env_id, self.id); self.auth_sender - .send((Arc::clone(&tx_id), events)) + .send(Arc::clone(&tx_id)) .map_err(|e| CannonError::SendAuthError(self.id, e))?; Ok(tx_id) diff --git a/crates/controlplane/src/cannon/router.rs b/crates/controlplane/src/cannon/router.rs index 3e78a6ee..7f03ab6c 100644 --- a/crates/controlplane/src/cannon/router.rs +++ b/crates/controlplane/src/cannon/router.rs @@ -13,9 +13,8 @@ use snops_common::{ key_source::KeySource, state::{id_or_none, KeyState, NetworkId}, }; -use tokio::sync::mpsc; -use super::{source::QueryTarget, status::TransactionStatusSender, Authorization}; +use super::{source::QueryTarget, Authorization}; use crate::{ server::{actions::execute::execute_status, error::ServerError}, state::AppState, @@ -362,22 +361,20 @@ async fn authorization( }; if query.is_async() { - return match cannon - .proxy_auth(body, TransactionStatusSender::empty()) - .await - { + return match cannon.proxy_auth(body).await { Ok(tx_id) => (StatusCode::ACCEPTED, Json(tx_id)).into_response(), Err(e) => ServerError::from(e).into_response(), }; } - let (tx, rx) = mpsc::channel(10); - - match cannon - .proxy_auth(body, TransactionStatusSender::new(tx)) - .await - { - Ok(tx_id) => execute_status(tx_id, rx).await.into_response(), + match cannon.proxy_auth(body).await { + Ok(tx_id) => { + use crate::events::EventFilter::*; + let subscriber = state + .events + .subscribe_on(TransactionIs(tx_id.clone()) & EnvIs(env_id) & CannonIs(cannon_id)); + execute_status(tx_id, subscriber).await.into_response() + } Err(e) => ServerError::from(e).into_response(), } } diff --git a/crates/controlplane/src/cannon/source.rs b/crates/controlplane/src/cannon/source.rs index 252fa63d..84c20be7 100644 --- a/crates/controlplane/src/cannon/source.rs +++ b/crates/controlplane/src/cannon/source.rs @@ -11,11 +11,14 @@ use tracing::error; use super::{ error::{CannonError, SourceError}, net::get_available_port, - status::{TransactionSendState, TransactionStatusEvent, TransactionStatusSender}, + status::TransactionSendState, tracker::TransactionTracker, ExecutionContext, }; -use crate::env::set::find_compute_agent; +use crate::{ + env::set::find_compute_agent, + events::{EventHelpers, TransactionEvent}, +}; /// Represents an instance of a local query service. 
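
In proxy_auth above, the tracker is written to the store before the transaction id is pushed onto the auth channel, which is what the "prevent data loss" comment refers to: a consumer never picks up an id before its tracker exists. A reduced sketch of that ordering, with a mutex-guarded HashMap standing in for the database:

    use std::{
        collections::HashMap,
        sync::{Arc, Mutex},
    };
    use tokio::sync::mpsc;

    #[derive(Debug)]
    struct Tracker {
        status: &'static str,
    }

    fn receive_auth(
        store: &Mutex<HashMap<Arc<String>, Tracker>>,
        queue: &mpsc::UnboundedSender<Arc<String>>,
        tx_id: Arc<String>,
    ) -> Result<(), String> {
        // 1. durably record the authorization first
        store
            .lock()
            .unwrap()
            .insert(Arc::clone(&tx_id), Tracker { status: "authorized" });
        // 2. only then hand the id to the execution task
        queue
            .send(tx_id)
            .map_err(|e| format!("execution task is gone: {e}"))
    }

    fn main() {
        let store = Mutex::new(HashMap::new());
        let (queue, mut rx) = mpsc::unbounded_channel();
        let tx_id = Arc::new("at1exampletransactionid".to_string());

        receive_auth(&store, &queue, Arc::clone(&tx_id)).unwrap();

        // the consumer finds a tracker for every id it receives
        let received = rx.try_recv().unwrap();
        assert_eq!(store.lock().unwrap().get(&received).unwrap().status, "authorized");
    }
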
#[derive(Clone, Debug, Serialize, Deserialize)] @@ -155,7 +158,6 @@ impl ComputeTarget { query_path: &str, tx_id: &Arc, auth: &Authorization, - events: &TransactionStatusSender, ) -> Result<(), CannonError> { match self { ComputeTarget::Agent { labels } => { @@ -165,7 +167,10 @@ impl ComputeTarget { .ok_or(SourceError::NoAvailableAgents("authorization"))?; // emit status updates & increment attempts - events.send(TransactionStatusEvent::Executing(agent_id)); + TransactionEvent::Executing + .with_cannon_ctx(ctx, Arc::clone(tx_id)) + .with_agent_id(agent_id) + .emit(ctx); ctx.write_tx_status(tx_id, TransactionSendState::Executing(Utc::now())); if let Err(e) = TransactionTracker::inc_attempts( &ctx.state, @@ -191,9 +196,12 @@ impl ComputeTarget { let transaction = match serde_json::from_str::>(&transaction_json) { Ok(transaction) => transaction, Err(e) => { - events.send(TransactionStatusEvent::ExecuteFailed(format!( - "failed to parse transaction JSON: {transaction_json}", - ))); + TransactionEvent::ExecuteFailed(format!( + "failed to parse transaction JSON: {e}\n{transaction_json}" + )) + .with_cannon_ctx(ctx, Arc::clone(tx_id)) + .with_agent_id(agent_id) + .emit(ctx); return Err(CannonError::Source(SourceError::Json( "parse compute tx", e, @@ -235,7 +243,10 @@ impl ComputeTarget { tx.status = TransactionSendState::Unsent; tx.transaction = Some(Arc::clone(&transaction)); } - events.send(TransactionStatusEvent::ExecuteComplete(transaction)); + TransactionEvent::ExecuteComplete(Arc::clone(&transaction)) + .with_cannon_ctx(ctx, Arc::clone(tx_id)) + .with_agent_id(agent_id) + .emit(ctx); Ok(()) } @@ -266,39 +277,3 @@ impl ComputeTarget { } } } - -// I use this to generate example yaml... -/* #[cfg(test)] -mod test { - use super::*; - use crate::{ - cannon::source::{ComputeTarget, CreditsTxMode, LocalService, TxMode}, - schema::nodes::KeySource, - }; - use std::str::FromStr; - - #[test] - fn what_does_it_look_like() { - println!( - "{}", - serde_yaml::to_string(&TxSource::Playback { - file_name: "test".to_string(), - }) - .unwrap() - ); - println!( - "{}", - serde_yaml::to_string(&TxSource::RealTime { - query: QueryTarget::Local(LocalService { sync_from: None }), - compute: ComputeTarget::Agent { labels: None }, - tx_modes: [TxMode::Credits(CreditsTxMode::TransferPublic)] - .into_iter() - .collect(), - private_keys: vec![KeySource::from_str("committee.$").unwrap()], - addresses: vec![KeySource::from_str("committee.$").unwrap()], - }) - .unwrap() - ); - } -} - */ diff --git a/crates/controlplane/src/cannon/status.rs b/crates/controlplane/src/cannon/status.rs index ff1b43d1..1a78f662 100644 --- a/crates/controlplane/src/cannon/status.rs +++ b/crates/controlplane/src/cannon/status.rs @@ -1,52 +1,6 @@ -use std::sync::Arc; - use chrono::{DateTime, Utc}; use serde::{Deserialize, Serialize}; -use snops_common::{format::DataFormat, state::AgentId}; -use tokio::sync::mpsc::Sender; - -pub struct TransactionStatusSender(Option>); - -impl TransactionStatusSender { - pub fn new(sender: Sender) -> Self { - Self(Some(sender)) - } - - pub fn empty() -> Self { - Self(None) - } - - pub fn send(&self, status: TransactionStatusEvent) { - if let Some(sender) = &self.0 { - let _ = sender.try_send(status); - } - } -} - -/// An event that represents the latest status of a transaction. -pub enum TransactionStatusEvent { - /// Authorization has been aborted - ExecuteAborted, - /// Authorization has been queued for execution. 
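
The compute-target execute above parses the execution output as JSON and, when that fails, folds both the error and the raw payload into the failure message. A tiny sketch of that defensive parse (serde_json::Value used as a stand-in for the real transaction type):

    use serde_json::Value;

    /// Keep the raw payload alongside the parse error so an
    /// ExecuteFailed-style event says what the executor actually printed.
    fn parse_tx(raw: &str) -> Result<Value, String> {
        serde_json::from_str(raw)
            .map_err(|e| format!("failed to parse transaction JSON: {e}\n{raw}"))
    }

    fn main() {
        assert!(parse_tx(r#"{"type":"execute"}"#).is_ok());
        assert!(parse_tx("Installation - 100.00% complete").is_err());
    }
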
- ExecuteQueued, - /// No agents are available for the execution - ExecuteAwaitingCompute, - /// An agent was found and the authorization is being executed - Executing(AgentId), - /// Execute RPC failed - ExecuteFailed(String), - /// Agent has completed the execution - ExecuteComplete(Arc), - // TODO: Implement the following statuses - // /// API has received the transaction broadcast - // BroadcastReceived, - // /// Control plane has forwarded the transaction to a peer - // BroadcastForwarded, - // /// An error occurred while broadcasting the transaction - // BroadcastFailed, - // /// Transaction was found in the network, return the block hash - // TransactionConfirmed(String), -} +use snops_common::format::DataFormat; /// Status of a transaction as presented internally for tracking and /// preventing data loss. diff --git a/crates/controlplane/src/env/cache.rs b/crates/controlplane/src/env/cache.rs index 0272dcbc..3a954837 100644 --- a/crates/controlplane/src/env/cache.rs +++ b/crates/controlplane/src/env/cache.rs @@ -177,6 +177,11 @@ impl NetworkCache { self.transaction_to_block_hash.contains_key(tx_id) } + /// Find a block hash given a transaction id + pub fn find_transaction(&self, tx_id: &str) -> Option<&ABlockHash> { + self.transaction_to_block_hash.get(tx_id) + } + /// Check if the latest stored info is within the range of the provided /// height pub fn is_recent_block(&self, height: u32) -> bool { diff --git a/crates/controlplane/src/events/filter.rs b/crates/controlplane/src/events/filter.rs index c6b9f6bf..05c3a8c0 100644 --- a/crates/controlplane/src/events/filter.rs +++ b/crates/controlplane/src/events/filter.rs @@ -10,6 +10,10 @@ impl Event { EventFilter::Not(f) => !self.matches(f), EventFilter::AgentIs(agent) => self.agent == Some(*agent), EventFilter::EnvIs(env) => self.env == Some(*env), + EventFilter::TransactionIs(transaction) => { + self.transaction.as_ref() == Some(transaction) + } + EventFilter::CannonIs(cannon) => self.cannon == Some(*cannon), EventFilter::EventIs(kind) => self.kind.filter() == *kind, EventFilter::NodeKeyIs(node_key) => self.node_key.as_ref() == Some(node_key), EventFilter::NodeTargetIs(node_targets) => self diff --git a/crates/controlplane/src/events/models.rs b/crates/controlplane/src/events/models.rs index d3e86911..94236ddc 100644 --- a/crates/controlplane/src/events/models.rs +++ b/crates/controlplane/src/events/models.rs @@ -3,6 +3,7 @@ use std::sync::Arc; use chrono::{DateTime, Utc}; use serde::{Deserialize, Serialize}; use snops_common::{ + aot_cmds::Authorization, node_targets::NodeTargets, rpc::error::ReconcileError, state::{ @@ -58,10 +59,29 @@ pub enum AgentEvent { #[derive(Clone, Debug, Serialize, Deserialize)] #[serde(tag = "kind")] pub enum TransactionEvent { + /// The authorization was inserted into the cannon + AuthorizationReceived(Arc), + /// The transaction execution was aborted ExecuteAborted(TransactionAbortReason), - ExecuteQueued, + /// The transaction is awaiting compute resources ExecuteAwaitingCompute, + /// An execution failed to complete after multiple attempts + ExecuteExceeded { attempts: u32 }, + /// The transaction execution failed ExecuteFailed(String), + /// The transaction is currently executing + Executing, + /// The transaction execution is complete + ExecuteComplete(Arc), + /// The transaction has been broadcasted + Broadcasted { + height: Option, + timestamp: DateTime, + }, + /// The transaction broadcast has exceeded the maximum number of attempts + BroadcastExceeded { attempts: u32 }, + /// The transaction 
has been confirmed by the network + Confirmed { hash: String }, } #[derive(Clone, Debug, Serialize, Deserialize)] @@ -83,33 +103,44 @@ pub enum EventKindFilter { AgentReconcileError, AgentNodeStatus, AgentBlockInfo, + TransactionAuthorizationReceived, TransactionExecuteAborted, - TransactionExecuteQueued, TransactionExecuteAwaitingCompute, + TransactionExecuteExceeded, TransactionExecuteFailed, + TransactionExecuting, + TransactionExecuteComplete, + TransactionBroadcasted, + TransactionBroadcastExceeded, + TransactionConfirmed, } impl EventKind { pub fn filter(&self) -> EventKindFilter { use AgentEvent::*; use EventKind::*; + use EventKindFilter::*; use TransactionEvent::*; match self { - Agent(Connected) => EventKindFilter::AgentConnected, - Agent(HandshakeComplete) => EventKindFilter::AgentHandshakeComplete, - Agent(Disconnected) => EventKindFilter::AgentDisconnected, - Agent(ReconcileComplete) => EventKindFilter::AgentReconcileComplete, - Agent(Reconcile(_)) => EventKindFilter::AgentReconcile, - Agent(ReconcileError(_)) => EventKindFilter::AgentReconcileError, - Agent(NodeStatus(_)) => EventKindFilter::AgentNodeStatus, - Agent(BlockInfo(_)) => EventKindFilter::AgentBlockInfo, - Transaction(ExecuteAborted(_)) => EventKindFilter::TransactionExecuteAborted, - Transaction(ExecuteQueued) => EventKindFilter::TransactionExecuteQueued, - Transaction(ExecuteAwaitingCompute) => { - EventKindFilter::TransactionExecuteAwaitingCompute - } - Transaction(ExecuteFailed(_)) => EventKindFilter::TransactionExecuteFailed, + Agent(Connected) => AgentConnected, + Agent(HandshakeComplete) => AgentHandshakeComplete, + Agent(Disconnected) => AgentDisconnected, + Agent(ReconcileComplete) => AgentReconcileComplete, + Agent(Reconcile(_)) => AgentReconcile, + Agent(ReconcileError(_)) => AgentReconcileError, + Agent(NodeStatus(_)) => AgentNodeStatus, + Agent(BlockInfo(_)) => AgentBlockInfo, + Transaction(AuthorizationReceived(_)) => TransactionAuthorizationReceived, + Transaction(ExecuteAborted(_)) => TransactionExecuteAborted, + Transaction(ExecuteAwaitingCompute) => TransactionExecuteAwaitingCompute, + Transaction(ExecuteExceeded { .. }) => TransactionExecuteExceeded, + Transaction(ExecuteFailed(_)) => TransactionExecuteFailed, + Transaction(Executing) => TransactionExecuting, + Transaction(ExecuteComplete(_)) => TransactionExecuteComplete, + Transaction(Broadcasted { .. }) => TransactionBroadcasted, + Transaction(BroadcastExceeded { .. }) => TransactionBroadcastExceeded, + Transaction(Confirmed { .. 
}) => TransactionConfirmed, } } } @@ -132,6 +163,10 @@ pub enum EventFilter { AgentIs(AgentId), /// Filter by environment ID EnvIs(EnvId), + /// Filter by transaction ID + TransactionIs(Arc), + /// Filter by cannon ID + CannonIs(InternedId), /// Filter by event kind EventIs(EventKindFilter), /// Filter by node key diff --git a/crates/controlplane/src/server/actions/deploy.rs b/crates/controlplane/src/server/actions/deploy.rs index 9ced5b61..fee9ee95 100644 --- a/crates/controlplane/src/server/actions/deploy.rs +++ b/crates/controlplane/src/server/actions/deploy.rs @@ -11,11 +11,10 @@ use snops_common::{ aot_cmds::{AotCmd, Authorization}, state::KeyState, }; -use tokio::sync::mpsc; use super::{execute::execute_status, Env}; use crate::{ - cannon::{error::AuthorizeError, router::AuthQuery, status::TransactionStatusSender}, + cannon::{error::AuthorizeError, router::AuthQuery}, env::{error::ExecutionError, Environment}, server::error::ServerError, state::GlobalState, @@ -28,33 +27,23 @@ pub async fn deploy( Json(action): Json, ) -> Response { let query_addr = env.cannons.get(&action.cannon).map(|c| c.get_local_query()); + let cannon_id = action.cannon; if query.is_async() { - return match deploy_inner( - &state, - action, - &env, - TransactionStatusSender::empty(), - query_addr, - ) - .await - { + return match deploy_inner(&state, action, &env, query_addr).await { Ok(tx_id) => (StatusCode::ACCEPTED, Json(tx_id)).into_response(), Err(e) => ServerError::from(e).into_response(), }; } - let (tx, rx) = mpsc::channel(10); - match deploy_inner( - &state, - action, - &env, - TransactionStatusSender::new(tx), - query_addr, - ) - .await - { - Ok(tx_id) => execute_status(tx_id, rx).await.into_response(), + match deploy_inner(&state, action, &env, query_addr).await { + Ok(tx_id) => { + use crate::events::EventFilter::*; + let subscriber = state + .events + .subscribe_on(TransactionIs(tx_id.clone()) & EnvIs(env.id) & CannonIs(cannon_id)); + execute_status(tx_id, subscriber).await.into_response() + } Err(e) => ServerError::from(e).into_response(), } } @@ -63,7 +52,6 @@ pub async fn deploy_inner( state: &GlobalState, action: DeployAction, env: &Environment, - events: TransactionStatusSender, query: Option, ) -> Result, ExecutionError> { let DeployAction { @@ -119,7 +107,7 @@ pub async fn deploy_inner( serde_json::from_str(&auth_str).map_err(AuthorizeError::Json)?; // proxy it to a listen cannon - let tx_id = cannon.proxy_auth(authorization, events).await?; + let tx_id = cannon.proxy_auth(authorization).await?; Ok(tx_id) } diff --git a/crates/controlplane/src/server/actions/execute.rs b/crates/controlplane/src/server/actions/execute.rs index 77fd0bef..69ec52cb 100644 --- a/crates/controlplane/src/server/actions/execute.rs +++ b/crates/controlplane/src/server/actions/execute.rs @@ -12,25 +12,22 @@ use snops_common::{ aot_cmds::{AotCmd, Authorization}, state::KeyState, }; -use tokio::{select, sync::mpsc}; +use tokio::select; use super::Env; use crate::{ - cannon::{ - error::AuthorizeError, - router::AuthQuery, - status::{TransactionStatusEvent, TransactionStatusSender}, - }, + cannon::{error::AuthorizeError, router::AuthQuery}, env::{error::ExecutionError, Environment}, + events::{Event, EventKind, EventSubscriber}, server::error::{ActionError, ServerError}, state::GlobalState, }; pub async fn execute_status( tx_id: Arc, - mut rx: mpsc::Receiver, + mut rx: EventSubscriber, ) -> Result, ActionError> { - use TransactionStatusEvent::*; + use crate::events::TransactionEvent::*; let mut timeout = 
Box::pin(tokio::time::sleep(std::time::Duration::from_secs(30))); let mut agent_id = None; @@ -41,16 +38,28 @@ pub async fn execute_status( _ = &mut timeout => { return Err(ActionError::ExecuteStatusTimeout { tx_id: tx_id.to_string(), agent_id, retries }); }, - Some(msg) = rx.recv() => { - match msg { - ExecuteAborted => { - return Err(ActionError::ExecuteStatusAborted { tx_id: tx_id.to_string(), retries}); + Ok(ev) = rx.next() => { + let Event{ kind: EventKind::Transaction(ev), agent, .. } = ev.as_ref() else { + continue; + }; + + match ev { + ExecuteAborted(reason) => { + return Err(ActionError::ExecuteStatusAborted { + tx_id: tx_id.to_string(), + retries, + reason: reason.clone(), + }); }, - ExecuteFailed(msg) => { - return Err(ActionError::ExecuteStatusFailed { message: msg, tx_id: tx_id.to_string(), retries }); + ExecuteFailed(message) => { + return Err(ActionError::ExecuteStatusFailed { + message: message.to_string(), + tx_id: tx_id.to_string(), + retries, + }); }, - Executing(id) => { - agent_id = Some(id.to_string()); + Executing => { + agent_id = agent.map(|id| id.to_string()); }, ExecuteAwaitingCompute => { retries += 1; @@ -76,33 +85,23 @@ pub async fn execute( Json(action): Json, ) -> Response { let query_addr = env.cannons.get(&action.cannon).map(|c| c.get_local_query()); + let cannon_id = action.cannon; if query.is_async() { - return match execute_inner( - &state, - action, - &env, - TransactionStatusSender::empty(), - query_addr, - ) - .await - { + return match execute_inner(&state, action, &env, query_addr).await { Ok(tx_id) => (StatusCode::ACCEPTED, Json(tx_id)).into_response(), Err(e) => ServerError::from(e).into_response(), }; } - let (tx, rx) = mpsc::channel(10); - match execute_inner( - &state, - action, - &env, - TransactionStatusSender::new(tx), - query_addr, - ) - .await - { - Ok(tx_id) => execute_status(tx_id, rx).await.into_response(), + match execute_inner(&state, action, &env, query_addr).await { + Ok(tx_id) => { + use crate::events::EventFilter::*; + let subscriber = state + .events + .subscribe_on(TransactionIs(tx_id.clone()) & EnvIs(env.id) & CannonIs(cannon_id)); + execute_status(tx_id, subscriber).await.into_response() + } Err(e) => ServerError::from(e).into_response(), } } @@ -111,7 +110,6 @@ pub async fn execute_inner( state: &GlobalState, action: ExecuteAction, env: &Environment, - events: TransactionStatusSender, query: Option, ) -> Result, ExecutionError> { let ExecuteAction { @@ -185,7 +183,7 @@ pub async fn execute_inner( serde_json::from_str(&auth_str).map_err(AuthorizeError::Json)?; // proxy it to a listen cannon - let tx_id = cannon.proxy_auth(authorization, events).await?; + let tx_id = cannon.proxy_auth(authorization).await?; Ok(tx_id) } diff --git a/crates/controlplane/src/server/error.rs b/crates/controlplane/src/server/error.rs index d4699ce4..df7d3ce7 100644 --- a/crates/controlplane/src/server/error.rs +++ b/crates/controlplane/src/server/error.rs @@ -11,6 +11,7 @@ use crate::{ cannon::error::CannonError, env::error::{EnvError, EnvRequestError, ExecutionError}, error::DeserializeError, + events::TransactionAbortReason, schema::error::{SchemaError, StorageError}, }; @@ -116,7 +117,11 @@ pub enum ActionError { retries: i32, }, #[error("execution aborted")] - ExecuteStatusAborted { tx_id: String, retries: i32 }, + ExecuteStatusAborted { + tx_id: String, + retries: i32, + reason: TransactionAbortReason, + }, #[error("execution failed")] ExecuteStatusFailed { message: String, diff --git a/crates/controlplane/src/state/transactions.rs 
b/crates/controlplane/src/state/transactions.rs index 748bb616..8d1ec028 100644 --- a/crates/controlplane/src/state/transactions.rs +++ b/crates/controlplane/src/state/transactions.rs @@ -7,9 +7,9 @@ use tokio::time::timeout; use tracing::{info, trace}; use super::GlobalState; -use crate::cannon::{ - status::{TransactionSendState, TransactionStatusSender}, - tracker::TransactionTracker, +use crate::{ + cannon::{status::TransactionSendState, tracker::TransactionTracker}, + events::{EventHelpers, TransactionEvent}, }; /// This task re-sends all transactions that have not been confirmed, @@ -33,7 +33,7 @@ pub async fn tracking_task(state: Arc<GlobalState>) { for tx_id in pending.to_execute { if let Err(e) = cannon .auth_sender - .send((tx_id.clone(), TransactionStatusSender::empty())) + .send(tx_id.clone()) { tracing::error!( "cannon {env_id}.{cannon_id} failed to send auth {tx_id} to cannon: {e:?}" ); } } @@ -57,30 +57,33 @@ pub async fn tracking_task(state: Arc<GlobalState>) { let state = state.clone(); let cannon_target = cannon.sink.target.as_ref(); async move { - if let Some(cache) = state.env_network_cache.get(&env_id) { - if cache.has_transaction(&tx_id) { - trace!("cannon {env_id}.{cannon_id} confirmed transaction {tx_id} (cache hit)"); - return Some(tx_id); - } + let (tx_id, hash) = if let Some(hash) = state.env_network_cache.get(&env_id).and_then(|cache| cache.find_transaction(&tx_id).cloned()) { + trace!("cannon {env_id}.{cannon_id} confirmed transaction {tx_id} (cache hit)"); + (tx_id, hash.to_string()) } // check if the transaction is not in the cache, then check the peers - if let Some(target) = cannon_target { + else if let Some(target) = cannon_target { match timeout(Duration::from_secs(1), state.snarkos_get::<Option<String>>(env_id, format!("/find/blockHash/{tx_id}"), target)).await { - Ok(Ok(Some(_hash))) => { + Ok(Ok(Some(hash))) => { trace!("cannon {env_id}.{cannon_id} confirmed transaction {tx_id} (get request)"); - return Some(tx_id) - } - Ok(Ok(None)) => { - // the transaction is not in the cache + (tx_id, hash) } - _ => {} + // the transaction is not in the cache + _ => return None, } + } else { + return None; + }; - } + // Emit a confirmed event + TransactionEvent::Confirmed { hash } + .with_cannon(cannon_id) + .with_env_id(env_id) + .with_transaction(Arc::clone(&tx_id)).emit(&state); - None + Some(tx_id) }})).await; // remove all the transactions that are confirmed or expired @@ -125,15 +128,22 @@ fn get_pending_transactions(state: &GlobalState) -> Vec<((EnvId, CannonId), Pend for tx in cannon.transactions.iter() { let tx_id = tx.key().to_owned(); - let key = (env_id, cannon_id, tx_id.to_owned()); + let key = (env_id, cannon_id, Arc::clone(&tx_id)); let attempts = TransactionTracker::get_attempts(state, &key); + let ev = TransactionEvent::Executing + .with_cannon(cannon_id) + .with_env_id(env_id) + .with_transaction(Arc::clone(&tx_id)); + match tx.status { // any authorized transaction that is not started should be queued TransactionSendState::Authorized => { if cannon.sink.authorize_attempts.is_some_and(|a| attempts > a) { info!("cannon {env_id}.{cannon_id} removed auth {tx_id} (too many attempts)"); to_remove.push(tx_id); + ev.replace_kind(TransactionEvent::ExecuteExceeded { attempts }) + .emit(state); } else { to_execute.push((tx_id, tx.index)); } @@ -145,6 +155,8 @@ fn get_pending_transactions(state: &GlobalState) -> Vec<((EnvId, CannonId), Pend { if cannon.sink.authorize_attempts.is_some_and(|a| attempts > a) { info!("cannon {env_id}.{cannon_id} removed auth {tx_id} (too many attempts)"); +
ev.replace_kind(TransactionEvent::ExecuteExceeded { attempts }) + .emit(state); to_remove.push(tx_id); } else { to_execute.push((tx_id, tx.index)); @@ -154,6 +166,8 @@ fn get_pending_transactions(state: &GlobalState) -> Vec<((EnvId, CannonId), Pend TransactionSendState::Unsent => { if cannon.sink.broadcast_attempts.is_some_and(|a| attempts > a) { info!("cannon {env_id}.{cannon_id} removed broadcast {tx_id} (too many attempts)"); + ev.replace_kind(TransactionEvent::BroadcastExceeded { attempts }) + .emit(state); to_remove.push(tx_id); } else { to_broadcast.push((tx_id, tx.index)); @@ -190,6 +204,8 @@ fn get_pending_transactions(state: &GlobalState) -> Vec<((EnvId, CannonId), Pend { if cannon.sink.broadcast_attempts.is_some_and(|a| attempts > a) { info!("cannon {env_id}.{cannon_id} removed broadcast {tx_id} (too many attempts)"); + ev.replace_kind(TransactionEvent::BroadcastExceeded { attempts }) + .emit(state); to_remove.push(tx_id); } else { to_broadcast.push((tx_id, tx.index)); From 8974e79325dd79882302cfa00041b9869d8e60c8 Mon Sep 17 00:00:00 2001 From: Meshiest Date: Sun, 1 Dec 2024 23:13:59 -0500 Subject: [PATCH 47/68] fix(aot,controlplane): use correct cost calculation for isonets, use --cost-v1 in aot to specify using deprecated cost calculation --- crates/aot/src/auth/auth_fee.rs | 60 +++++++++++++------ crates/aot/src/auth/auth_program.rs | 5 +- crates/aot/src/auth/mod.rs | 22 ++++++- crates/aot/src/program/cost.rs | 6 +- crates/common/src/aot_cmds/mod.rs | 15 +++++ .../controlplane/src/server/actions/deploy.rs | 2 + .../src/server/actions/execute.rs | 2 + 7 files changed, 90 insertions(+), 22 deletions(-) diff --git a/crates/aot/src/auth/auth_fee.rs b/crates/aot/src/auth/auth_fee.rs index d7260a61..a05edfa9 100644 --- a/crates/aot/src/auth/auth_fee.rs +++ b/crates/aot/src/auth/auth_fee.rs @@ -1,10 +1,10 @@ -use anyhow::{bail, Ok, Result}; +use anyhow::{anyhow, bail, Ok, Result}; use clap::Args; use clap_stdin::MaybeStdin; use rand::{CryptoRng, Rng}; use snarkvm::{ ledger::Deployment, - prelude::Field, + prelude::{cost_in_microcredits_v1, Field}, synthesizer::{ process::{cost_in_microcredits_v2, deployment_cost}, Process, @@ -52,6 +52,9 @@ pub struct AuthorizeFee { /// The seed to use for the authorization generation #[clap(long)] pub seed: Option, + /// Enable cost v1 for the transaction cost estimation (v2 by default) + #[clap(long, default_value_t = false)] + pub cost_v1: bool, } impl AuthorizeFee { @@ -65,7 +68,10 @@ impl AuthorizeFee { query::add_many_programs_to_process(&mut process, programs, query)?; } - (auth.to_execution_id()?, estimate_cost(&process, &auth)?) + ( + auth.to_execution_id()?, + estimate_cost(&process, &auth, !self.cost_v1)?, + ) } (None, Some(deployment), None, None) => { let deployment = deployment.into_inner(); @@ -128,7 +134,11 @@ pub fn fee_auth( Ok(Some(fee)) } -pub fn estimate_cost(process: &Process, func: &Authorization) -> Result { +pub fn estimate_cost( + process: &Process, + func: &Authorization, + use_cost_v2: bool, +) -> Result { let transitions = func.transitions(); let storage_cost = { @@ -174,21 +184,35 @@ pub fn estimate_cost(process: &Process, func: &Authorization) }; //execution.size_in_bytes().map_err(|e| e.to_string())?; - // Compute the finalize cost in microcredits. - let mut finalize_cost = 0u64; - // Iterate over the transitions to accumulate the finalize cost. - for (_key, transition) in transitions { - // Retrieve the function name, program id, and program. 
- let function_name = *transition.function_name(); + let finalize_cost = if use_cost_v2 { + // cost v2 uses the finalize cost of the first transition + let transition = transitions + .values() + .next() + .ok_or(anyhow!("No transitions"))?; let stack = process.get_stack(transition.program_id())?; - let cost = cost_in_microcredits_v2(stack, &function_name)?; + cost_in_microcredits_v2(stack, transition.function_name())? + } else { + // Compute the finalize cost in microcredits. + let mut finalize_cost = 0u64; + + // Iterate over the transitions to accumulate the finalize cost. + for (_key, transition) in transitions { + // Retrieve the function name, program id, and program. + let function_name = *transition.function_name(); + let stack = process.get_stack(transition.program_id())?; + let cost = cost_in_microcredits_v1(stack, &function_name)?; + + // Accumulate the finalize cost. + if let Some(cost) = finalize_cost.checked_add(cost) { + finalize_cost = cost; + } else { + bail!("The finalize cost computation overflowed for an execution") + }; + } + + finalize_cost + }; - // Accumulate the finalize cost. - if let Some(cost) = finalize_cost.checked_add(cost) { - finalize_cost = cost; - } else { - bail!("The finalize cost computation overflowed for an execution") - }; - } Ok(storage_cost + finalize_cost) } diff --git a/crates/aot/src/auth/auth_program.rs b/crates/aot/src/auth/auth_program.rs index 7b941ebd..d15477eb 100644 --- a/crates/aot/src/auth/auth_program.rs +++ b/crates/aot/src/auth/auth_program.rs @@ -26,6 +26,9 @@ pub struct AuthorizeProgram { /// The seed to use for the authorization generation #[clap(long)] pub seed: Option, + /// Enable cost v1 for the transaction cost estimation (v2 by default) + #[clap(long, default_value_t = false)] + pub cost_v1: bool, } impl AuthorizeProgram { @@ -51,7 +54,7 @@ impl AuthorizeProgram { &mut super::rng_from_seed(self.seed), )?; - let cost = estimate_cost(&process, &auth)?; + let cost = estimate_cost(&process, &auth, !self.cost_v1)?; Ok((auth, cost)) } diff --git a/crates/aot/src/auth/mod.rs b/crates/aot/src/auth/mod.rs index 51ea6707..ee270d3a 100644 --- a/crates/aot/src/auth/mod.rs +++ b/crates/aot/src/auth/mod.rs @@ -46,6 +46,9 @@ pub struct CostCommand { query: Option, #[clap(flatten)] auth: AuthArgs, + /// Enable cost v1 for the transaction cost estimation (v2 by default) + #[clap(long, default_value_t = false)] + pub cost_v1: bool, } /// Authorize a program execution. @@ -65,6 +68,9 @@ pub struct AuthProgramCommand { /// The seed to use for the authorization generation #[clap(long)] pub seed: Option, + /// Enable cost v1 for the transaction cost estimation (v2 by default) + #[clap(long, default_value_t = false)] + pub cost_v1: bool, } /// Deploy a program to the network. @@ -84,6 +90,9 @@ pub struct AuthDeployCommand { /// The seed to use for the authorization generation #[clap(long)] pub seed: Option, + /// Enable cost v1 for the transaction cost estimation (v2 by default) + #[clap(long, default_value_t = false)] + pub cost_v1: bool, } impl AuthCommand { @@ -118,7 +127,11 @@ impl AuthCommand { println!("{id}"); Ok(()) } - AuthCommand::Cost(CostCommand { query, auth }) => { + AuthCommand::Cost(CostCommand { + query, + auth, + cost_v1, + }) => { let cost = match auth.pick()? { AuthBlob::Program { auth, .. } => { let auth = auth.into(); @@ -132,7 +145,7 @@ impl AuthCommand { query::add_many_programs_to_process(&mut process, programs, query)?; } - estimate_cost(&process, &auth)? + estimate_cost(&process, &auth, !cost_v1)? 
} AuthBlob::Deploy { deployment, .. } => deployment_cost(&deployment)?.0, }; @@ -146,6 +159,7 @@ impl AuthCommand { program_opts, fee_opts, seed, + cost_v1, }) => { let query = program_opts.query.clone(); @@ -154,6 +168,7 @@ impl AuthCommand { key: key.clone(), options: program_opts, seed, + cost_v1, } .parse()?; @@ -172,6 +187,7 @@ impl AuthCommand { id: Some(auth.to_execution_id()?), cost: Some(cost), seed, + cost_v1, } .parse()?; @@ -191,6 +207,7 @@ impl AuthCommand { deploy_opts, fee_opts, seed, + cost_v1, }) => { // authorize the deployment without a fee let AuthBlob::Deploy { @@ -227,6 +244,7 @@ impl AuthCommand { id: Some(deployment.to_deployment_id()?), cost: Some(deployment_cost(&deployment)?.0), seed, + cost_v1, } .parse()? .map(Into::into); diff --git a/crates/aot/src/program/cost.rs b/crates/aot/src/program/cost.rs index 60c86cef..c372a33f 100644 --- a/crates/aot/src/program/cost.rs +++ b/crates/aot/src/program/cost.rs @@ -25,6 +25,9 @@ pub struct CostCommand { /// Program inputs (eg. 1u64 5field) #[clap(num_args = 1, value_delimiter = ' ')] inputs: Vec>, + /// Enable cost v1 for the transaction cost estimation (v2 by default) + #[clap(long, default_value_t = false)] + pub cost_v1: bool, } impl CostCommand { @@ -34,6 +37,7 @@ impl CostCommand { program, function, inputs, + cost_v1, } = self; let program = program.contents()?; @@ -57,7 +61,7 @@ impl CostCommand { &mut rand::thread_rng(), )?; - estimate_cost(&process, &auth) + estimate_cost(&process, &auth, !cost_v1) } else { let deployment = process.deploy::(&program, &mut rand::thread_rng())?; Ok(deployment_cost(&deployment)?.0) diff --git a/crates/common/src/aot_cmds/mod.rs b/crates/common/src/aot_cmds/mod.rs index d03f1191..ded792c2 100644 --- a/crates/common/src/aot_cmds/mod.rs +++ b/crates/common/src/aot_cmds/mod.rs @@ -74,6 +74,7 @@ impl AotCmd { query: Option<&String>, priority_fee: Option, fee_record: Option<&String>, + cost_v1: bool, ) -> Result { let mut command = Command::new(&self.bin); command @@ -101,6 +102,10 @@ impl AotCmd { command.arg("--record").arg(fee_record); } + if cost_v1 { + command.arg("--cost-v1"); + } + command .arg(format!("{program_id}/{function_name}")) .args(inputs); @@ -122,6 +127,7 @@ impl AotCmd { query: Option<&String>, priority_fee: Option, fee_record: Option<&String>, + cost_v1: bool, ) -> Result { let mut command = Command::new(&self.bin); command @@ -150,6 +156,10 @@ impl AotCmd { command.arg("--record").arg(fee_record); } + if cost_v1 { + command.arg("--cost-v1"); + } + command.arg("-"); let mut child = command @@ -204,6 +214,7 @@ impl AotCmd { authorization: &str, priority_fee: Option, fee_record: Option<&String>, + cost_v1: bool, ) -> Result { let mut command = Command::new(&self.bin); command @@ -223,6 +234,10 @@ impl AotCmd { command.arg("--record").arg(fee_record); } + if cost_v1 { + command.arg("--cost-v1"); + } + Self::handle_output( command.output().await, "output", diff --git a/crates/controlplane/src/server/actions/deploy.rs b/crates/controlplane/src/server/actions/deploy.rs index fee9ee95..3806866a 100644 --- a/crates/controlplane/src/server/actions/deploy.rs +++ b/crates/controlplane/src/server/actions/deploy.rs @@ -99,6 +99,8 @@ pub async fn deploy_inner( query.as_ref(), priority_fee, fee_record.as_ref(), + // use cost_v1 when we are not using the native genesis + !env.storage.native_genesis, ) .await?; diff --git a/crates/controlplane/src/server/actions/execute.rs b/crates/controlplane/src/server/actions/execute.rs index 69ec52cb..916243a7 100644 --- 
a/crates/controlplane/src/server/actions/execute.rs +++ b/crates/controlplane/src/server/actions/execute.rs @@ -175,6 +175,8 @@ pub async fn execute_inner( query.as_ref(), priority_fee, fee_record.as_ref(), + // use cost_v1 when we are not using the native genesis + !env.storage.native_genesis, ) .await?; From 373f289fb76766b50549b321c960cc5d616b73f5 Mon Sep 17 00:00:00 2001 From: Meshiest Date: Mon, 2 Dec 2024 02:24:54 -0500 Subject: [PATCH 48/68] feat(controlplane): events string parsing --- crates/controlplane/src/events/filter.rs | 40 ++- .../controlplane/src/events/filter_parse.rs | 320 ++++++++++++++++++ crates/controlplane/src/events/mod.rs | 9 +- crates/controlplane/src/events/models.rs | 191 ++++------- crates/controlplane/src/events/stream.rs | 4 + .../src/events/test_filter_parse.rs | 169 +++++++++ crates/controlplane/src/events/traits.rs | 95 ++++++ 7 files changed, 695 insertions(+), 133 deletions(-) create mode 100644 crates/controlplane/src/events/filter_parse.rs create mode 100644 crates/controlplane/src/events/test_filter_parse.rs create mode 100644 crates/controlplane/src/events/traits.rs diff --git a/crates/controlplane/src/events/filter.rs b/crates/controlplane/src/events/filter.rs index 05c3a8c0..a0b70eda 100644 --- a/crates/controlplane/src/events/filter.rs +++ b/crates/controlplane/src/events/filter.rs @@ -1,4 +1,42 @@ -use super::{Event, EventFilter}; +use std::sync::Arc; + +use snops_common::{ + node_targets::NodeTargets, + state::{AgentId, EnvId, InternedId, NodeKey}, +}; + +use super::{Event, EventKindFilter}; + +#[derive(Clone, Debug, PartialEq)] + +pub enum EventFilter { + /// No filter + Unfiltered, + + /// Logical AND of filters + AllOf(Vec), + /// Logical OR of filters + AnyOf(Vec), + /// Logical XOR of filters + OneOf(Vec), + /// Logical NOT of filter + Not(Box), + + /// Filter by agent ID + AgentIs(AgentId), + /// Filter by environment ID + EnvIs(EnvId), + /// Filter by transaction ID + TransactionIs(Arc), + /// Filter by cannon ID + CannonIs(InternedId), + /// Filter by event kind + EventIs(EventKindFilter), + /// Filter by node key + NodeKeyIs(NodeKey), + /// Filter by node target + NodeTargetIs(NodeTargets), +} impl Event { pub fn matches(&self, filter: &EventFilter) -> bool { diff --git a/crates/controlplane/src/events/filter_parse.rs b/crates/controlplane/src/events/filter_parse.rs new file mode 100644 index 00000000..40be55dc --- /dev/null +++ b/crates/controlplane/src/events/filter_parse.rs @@ -0,0 +1,320 @@ +use std::{fmt::Display, str::FromStr, sync::Arc}; + +use snops_common::node_targets::{NodeTarget, NodeTargets}; + +use super::EventFilter; +use crate::events::EventKindFilter; + +/* + +Example EventFilter string representation: + +unfiltered +any-of(agent-connected, agent-disconnected) +all-of(not(agent-is(foo-bar)), env-is(default)) +node-key-is(client/foo) +node-target-is(client/test-*@*) +node-target-is(client/any) +not(unfiltered) + +*/ + +#[derive(Debug, Copy, Clone)] +enum Token<'a> { + OpenParen, + CloseParen, + Comma, + Whitespace, + Text(&'a str), +} + +impl<'a> Token<'a> { + fn label(self) -> &'static str { + match self { + Token::OpenParen => "open paren", + Token::CloseParen => "close paren", + Token::Comma => "comma", + Token::Whitespace => "whitespace", + Token::Text(_) => "text", + } + } + + fn text(self) -> Option<&'a str> { + match self { + Token::Text(s) => Some(s), + _ => None, + } + } + + fn parsed_text(self) -> Option> { + self.text().map(|s| s.trim().parse()) + } + + fn open_paren(self) -> Option<()> { + matches!(self, 
Token::OpenParen).then(|| ()) + } + + fn close_paren(self) -> Option<()> { + matches!(self, Token::CloseParen).then(|| ()) + } +} + +struct Lexer<'a> { + string: &'a str, + chars: std::iter::Peekable>>, +} + +impl<'a> Lexer<'a> { + fn new(string: &'a str) -> Lexer<'a> { + Lexer { + string, + chars: string.chars().enumerate().peekable(), + } + } +} + +impl<'a> Iterator for Lexer<'a> { + type Item = Token<'a>; + + fn next(&mut self) -> Option { + let (index, c) = self.chars.next()?; + Some(match c { + '(' => Token::OpenParen, + ')' => Token::CloseParen, + ',' => Token::Comma, + c if c.is_whitespace() => { + while let Some((_, c)) = self.chars.peek() { + if !c.is_whitespace() { + break; + } + self.chars.next(); + } + // In the future, we might want to return the whitespace + + // let end = self + // .chars + // .peek() + // .map_or_else(|| self.string.len(), |(i, _)| *i); + // Token::Whitespace(&self.string[index..end]) + + Token::Whitespace + } + _ => { + while let Some((_, c)) = self.chars.peek() { + if c == &'(' || c == &')' || c == &',' { + break; + } + self.chars.next(); + } + let end = self + .chars + .peek() + .map_or_else(|| self.string.len(), |(i, _)| *i); + Token::Text(&self.string[index..end]) + } + }) + } +} + +#[derive(Debug, thiserror::Error)] +pub enum EventFilterParseError { + #[error("invalid filter: {0}")] + InvalidFilter(String), + #[error("expected token {0:?}, received {1}")] + ExpectedToken(EventFilterParsable, String), + #[error("error parsing {0:?}: {1}")] + ParseError(EventFilterParsable, String), + #[error("unexpected trailing tokens")] + TrailingTokens, +} + +#[derive(Debug, Clone, Copy, Eq, PartialEq)] +pub enum EventFilterParsable { + OpenParen, + CloseParen, + CommaOrCloseParen, + FilterName, + AgentId, + EnvId, + TransactionId, + CannonId, + EventKind, + NodeKey, + NodeTarget, +} + +struct FilterParser<'a> { + tokens: std::iter::Peekable>, +} + +fn expect_token<'a, T>( + token: Option>, + label: EventFilterParsable, + matcher: impl Fn(Token<'a>) -> Option, +) -> Result { + use EventFilterParseError::*; + let token = token.ok_or_else(|| ExpectedToken(label, "EOF".to_string()))?; + matcher(token).ok_or_else(|| ExpectedToken(label, token.label().to_string())) +} + +fn expect_parsed_text( + token: Option, + label: EventFilterParsable, +) -> Result +where + ::Err: Display, +{ + expect_token(token, label, |token| token.parsed_text::())? 
+ .map_err(|e| EventFilterParseError::ParseError(label, e.to_string())) +} + +fn expect_open_paren(token: Option) -> Result<(), EventFilterParseError> { + expect_token(token, EventFilterParsable::OpenParen, |token| { + token.open_paren() + }) +} + +fn expect_close_paren(token: Option) -> Result<(), EventFilterParseError> { + expect_token(token, EventFilterParsable::CloseParen, |token| { + token.close_paren() + }) +} + +impl<'a> FilterParser<'a> { + fn new(str: &'a str) -> Self { + Self { + tokens: Lexer::new(str).peekable(), + } + } + + fn next(&mut self) -> Option> { + self.tokens.next() + } + + fn expect_parens( + &mut self, + filter: impl Fn(&mut Self) -> Result, + ) -> Result { + self.trim_whitespace(); + expect_open_paren(self.next())?; + self.trim_whitespace(); + let filter = filter(self)?; + expect_close_paren(self.next())?; + Ok(filter) + } + + fn expect_filter(&mut self) -> Result { + self.trim_whitespace(); + use EventFilterParsable as P; + use EventFilterParseError::*; + + let filter_name = expect_token(self.next(), P::FilterName, |token| token.text())?; + + match filter_name.trim() { + "unfiltered" => Ok(EventFilter::Unfiltered), + "any-of" => self.expect_parens(|t| t.expect_filter_vec().map(EventFilter::AnyOf)), + "all-of" => self.expect_parens(|t| t.expect_filter_vec().map(EventFilter::AllOf)), + "one-of" => self.expect_parens(|t| t.expect_filter_vec().map(EventFilter::OneOf)), + "not" => self.expect_parens(|t| Ok(EventFilter::Not(Box::new(t.expect_filter()?)))), + "agent-is" => self.expect_parens(|t| { + expect_parsed_text(t.next(), P::AgentId).map(EventFilter::AgentIs) + }), + "env-is" => self + .expect_parens(|t| expect_parsed_text(t.next(), P::EnvId).map(EventFilter::EnvIs)), + "transaction-is" => self.expect_parens(|t| { + Ok(EventFilter::TransactionIs(Arc::new( + expect_token(t.next(), P::TransactionId, |token| token.text())?.to_string(), + ))) + }), + "cannon-is" => self.expect_parens(|t| { + expect_parsed_text(t.next(), P::CannonId).map(EventFilter::CannonIs) + }), + "event-is" => self.expect_parens(|t| { + expect_parsed_text(t.next(), P::EventKind).map(EventFilter::EventIs) + }), + "node-key-is" => self.expect_parens(|t| { + expect_parsed_text(t.next(), P::NodeKey).map(EventFilter::NodeKeyIs) + }), + "node-target-is" => self.expect_parens(|t| { + expect_parsed_text::(t.next(), P::NodeTarget) + .map(|t| EventFilter::NodeTargetIs(NodeTargets::One(t))) + }), + + // Try to parse as an event kind filter as a fallback + unknown => unknown + .parse::() + .map(EventFilter::EventIs) + .map_err(|_| InvalidFilter(unknown.to_string())), + } + } + + fn expect_filter_vec(&mut self) -> Result, EventFilterParseError> { + self.trim_whitespace(); + let mut filters = Vec::new(); + loop { + match self.tokens.peek() { + Some(Token::CloseParen) => break, + Some(_) => { + filters.push(self.expect_filter()?); + self.trim_whitespace(); + + // Expect either a comma or a close paren + match self.tokens.peek() { + // This also supports trailing commas + Some(Token::Comma) => { + self.tokens.next(); + self.trim_whitespace(); + } + Some(Token::CloseParen) => break, + Some(_) => { + return Err(EventFilterParseError::ExpectedToken( + EventFilterParsable::CommaOrCloseParen, + self.tokens.peek().unwrap().label().to_string(), + )) + } + None => { + return Err(EventFilterParseError::ExpectedToken( + EventFilterParsable::CommaOrCloseParen, + "EOF".to_string(), + )) + } + } + } + None => { + return Err(EventFilterParseError::ExpectedToken( + EventFilterParsable::CloseParen, + "EOF".to_string(), + )) + } 
+ } + } + Ok(filters) + } + + /// Remove leading whitespace tokens from the token stream. + fn trim_whitespace(&mut self) { + while let Some(Token::Whitespace) = self.tokens.peek() { + self.tokens.next(); + } + } + + fn trailing_tokens(&mut self) -> Result<(), EventFilterParseError> { + self.trim_whitespace(); + if self.tokens.next().is_some() { + Err(EventFilterParseError::TrailingTokens) + } else { + Ok(()) + } + } +} + +impl FromStr for EventFilter { + type Err = EventFilterParseError; + + fn from_str(s: &str) -> Result { + let mut parser = FilterParser::new(s); + let filter = parser.expect_filter()?; + parser.trailing_tokens()?; + Ok(filter) + } +} diff --git a/crates/controlplane/src/events/mod.rs b/crates/controlplane/src/events/mod.rs index 675d4556..fb60b7b6 100644 --- a/crates/controlplane/src/events/mod.rs +++ b/crates/controlplane/src/events/mod.rs @@ -2,12 +2,15 @@ mod models; pub use models::*; mod stream; pub use stream::*; - +mod filter_parse; +mod traits; +pub use traits::*; mod filter; +pub use filter::*; mod filter_ops; pub mod prelude { - pub use super::models::EventFilter::*; + pub use super::filter::EventFilter::*; pub use super::models::EventKindFilter::*; pub use super::models::*; } @@ -17,4 +20,6 @@ mod test_filter; #[cfg(test)] mod test_filter_ops; #[cfg(test)] +mod test_filter_parse; +#[cfg(test)] mod test_stream; diff --git a/crates/controlplane/src/events/models.rs b/crates/controlplane/src/events/models.rs index 94236ddc..45ff918f 100644 --- a/crates/controlplane/src/events/models.rs +++ b/crates/controlplane/src/events/models.rs @@ -1,21 +1,14 @@ -use std::sync::Arc; +use std::{fmt::Display, str::FromStr, sync::Arc}; use chrono::{DateTime, Utc}; use serde::{Deserialize, Serialize}; use snops_common::{ aot_cmds::Authorization, - node_targets::NodeTargets, rpc::error::ReconcileError, - state::{ - AgentId, AgentState, EnvId, InternedId, LatestBlockInfo, NodeKey, NodeStatus, - ReconcileStatus, - }, + state::{AgentId, EnvId, InternedId, LatestBlockInfo, NodeKey, NodeStatus, ReconcileStatus}, }; -use crate::{ - cannon::{context::ExecutionContext, status::TransactionSendState}, - state::{Agent, GetGlobalState}, -}; +use crate::{cannon::status::TransactionSendState, state::GetGlobalState}; #[derive(Clone, Debug, Serialize, Deserialize)] pub struct Event { @@ -29,14 +22,14 @@ pub struct Event { } #[derive(Clone, Debug, Serialize, Deserialize)] -#[serde(tag = "type")] +#[serde(tag = "type", rename_all = "snake_case")] pub enum EventKind { Agent(AgentEvent), Transaction(TransactionEvent), } #[derive(Clone, Debug, Serialize, Deserialize)] -#[serde(tag = "kind")] +#[serde(tag = "kind", rename_all = "snake_case")] pub enum AgentEvent { /// An agent connects to the control plane Connected, @@ -57,7 +50,7 @@ pub enum AgentEvent { } #[derive(Clone, Debug, Serialize, Deserialize)] -#[serde(tag = "kind")] +#[serde(tag = "kind", rename_all = "snake_case")] pub enum TransactionEvent { /// The authorization was inserted into the cannon AuthorizationReceived(Arc), @@ -85,7 +78,7 @@ pub enum TransactionEvent { } #[derive(Clone, Debug, Serialize, Deserialize)] -#[serde(tag = "reason")] +#[serde(tag = "reason", rename_all = "snake_case")] pub enum TransactionAbortReason { MissingTracker, UnexpectedStatus(TransactionSendState), @@ -93,7 +86,6 @@ pub enum TransactionAbortReason { } #[derive(Clone, Copy, Debug, PartialEq)] -#[repr(u8)] pub enum EventKindFilter { AgentConnected, AgentHandshakeComplete, @@ -145,34 +137,62 @@ impl EventKind { } } -#[derive(Clone, Debug, PartialEq)] -pub enum 
EventFilter { - /// No filter - Unfiltered, +impl FromStr for EventKindFilter { + type Err = String; + + fn from_str(s: &str) -> Result { + match s { + // kebab-case + "agent-connected" => Ok(Self::AgentConnected), + "agent-handshake-complete" => Ok(Self::AgentHandshakeComplete), + "agent-disconnected" => Ok(Self::AgentDisconnected), + "agent-reconcile-complete" => Ok(Self::AgentReconcileComplete), + "agent-reconcile" => Ok(Self::AgentReconcile), + "agent-reconcile-error" => Ok(Self::AgentReconcileError), + "agent-node-status" => Ok(Self::AgentNodeStatus), + "agent-block-info" => Ok(Self::AgentBlockInfo), + "transaction-authorization-received" => Ok(Self::TransactionAuthorizationReceived), + "transaction-execute-aborted" => Ok(Self::TransactionExecuteAborted), + "transaction-execute-awaiting-compute" => Ok(Self::TransactionExecuteAwaitingCompute), + "transaction-execute-exceeded" => Ok(Self::TransactionExecuteExceeded), + "transaction-execute-failed" => Ok(Self::TransactionExecuteFailed), + "transaction-executing" => Ok(Self::TransactionExecuting), + "transaction-execute-complete" => Ok(Self::TransactionExecuteComplete), + "transaction-broadcasted" => Ok(Self::TransactionBroadcasted), + "transaction-broadcast-exceeded" => Ok(Self::TransactionBroadcastExceeded), + "transaction-confirmed" => Ok(Self::TransactionConfirmed), + _ => Err(format!("invalid event kind: {s}")), + } + } +} - /// Logical AND of filters - AllOf(Vec), - /// Logical OR of filters - AnyOf(Vec), - /// Logical XOR of filters - OneOf(Vec), - /// Logical NOT of filter - Not(Box), +impl Display for EventKindFilter { + fn fmt(&self, f: &mut std::fmt::Formatter<'_>) -> std::fmt::Result { + use EventKindFilter::*; - /// Filter by agent ID - AgentIs(AgentId), - /// Filter by environment ID - EnvIs(EnvId), - /// Filter by transaction ID - TransactionIs(Arc), - /// Filter by cannon ID - CannonIs(InternedId), - /// Filter by event kind - EventIs(EventKindFilter), - /// Filter by node key - NodeKeyIs(NodeKey), - /// Filter by node target - NodeTargetIs(NodeTargets), + let s = match self { + AgentConnected => "agent-connected", + AgentHandshakeComplete => "agent-handshake-complete", + AgentDisconnected => "agent-disconnected", + AgentReconcileComplete => "agent-reconcile-complete", + AgentReconcile => "agent-reconcile", + AgentReconcileError => "agent-reconcile-error", + AgentNodeStatus => "agent-node-status", + AgentBlockInfo => "agent-block-info", + TransactionAuthorizationReceived => "transaction-authorization-received", + TransactionExecuteAborted => "transaction-execute-aborted", + TransactionExecuteAwaitingCompute => "transaction-execute-awaiting-compute", + TransactionExecuteExceeded => "transaction-execute-exceeded", + TransactionExecuteFailed => "transaction-execute-failed", + TransactionExecuting => "transaction-executing", + TransactionExecuteComplete => "transaction-execute-complete", + TransactionBroadcasted => "transaction-broadcasted", + TransactionBroadcastExceeded => "transaction-broadcast-exceeded", + TransactionConfirmed => "transaction-confirmed", + }; + + write!(f, "{}", s) + } } impl Event { @@ -205,92 +225,3 @@ impl Event { state.global_state().events.emit(self) } } - -impl From for EventFilter { - fn from(kind: EventKindFilter) -> Self { - EventFilter::EventIs(kind) - } -} - -pub trait EventHelpers { - fn event(self) -> Event; - fn with_agent(self, agent: &Agent) -> Event; - fn with_agent_id(self, agent_id: AgentId) -> Event; - fn with_node_key(self, node_key: NodeKey) -> Event; - fn with_env_id(self, env_id: 
EnvId) -> Event; - fn with_transaction(self, transaction: Arc) -> Event; - fn with_cannon(self, cannon: InternedId) -> Event; - fn with_cannon_ctx(self, ctx: &ExecutionContext, transaction: Arc) -> Event; -} - -impl> EventHelpers for T { - fn event(self) -> Event { - self.into() - } - - fn with_agent(self, agent: &Agent) -> Event { - let mut event = self.into(); - event.agent = Some(agent.id); - if let AgentState::Node(env_id, node) = &agent.state { - event.node_key = Some(node.node_key.clone()); - event.env = Some(*env_id); - } - event - } - - fn with_agent_id(self, agent_id: AgentId) -> Event { - let mut event = self.into(); - event.agent = Some(agent_id); - event - } - - fn with_node_key(self, node_key: NodeKey) -> Event { - let mut event = self.into(); - event.node_key = Some(node_key); - event - } - - fn with_env_id(self, env_id: EnvId) -> Event { - let mut event = self.into(); - event.env = Some(env_id); - event - } - - fn with_transaction(self, transaction: Arc) -> Event { - let mut event = self.into(); - event.transaction = Some(transaction); - event - } - - fn with_cannon(self, cannon: InternedId) -> Event { - let mut event = self.into(); - event.cannon = Some(cannon); - event - } - - fn with_cannon_ctx(self, ctx: &ExecutionContext, transaction: Arc) -> Event { - let mut event = self.into(); - event.cannon = Some(ctx.id); - event.env = Some(ctx.env_id); - event.transaction = Some(transaction); - event - } -} - -impl From for Event { - fn from(kind: EventKind) -> Self { - Self::new(kind) - } -} - -impl From for Event { - fn from(kind: AgentEvent) -> Self { - Self::new(EventKind::Agent(kind)) - } -} - -impl From for Event { - fn from(kind: TransactionEvent) -> Self { - Self::new(EventKind::Transaction(kind)) - } -} diff --git a/crates/controlplane/src/events/stream.rs b/crates/controlplane/src/events/stream.rs index 8e59b905..c1c8c608 100644 --- a/crates/controlplane/src/events/stream.rs +++ b/crates/controlplane/src/events/stream.rs @@ -80,6 +80,10 @@ impl EventSubscriber { } events } + + pub fn set_filter(&mut self, filter: impl Into) { + self.filter = filter.into(); + } } impl Stream for EventSubscriber { diff --git a/crates/controlplane/src/events/test_filter_parse.rs b/crates/controlplane/src/events/test_filter_parse.rs new file mode 100644 index 00000000..186e34c2 --- /dev/null +++ b/crates/controlplane/src/events/test_filter_parse.rs @@ -0,0 +1,169 @@ +use std::sync::Arc; + +use snops_common::{node_targets::NodeTargets, state::InternedId}; + +use super::{ + filter_parse::EventFilterParseError, + EventFilter::{self, *}, + EventKindFilter::*, +}; +use crate::events::filter_parse::EventFilterParsable; + +macro_rules! eq { + ($s:expr, $f:expr) => { + assert_eq!($s.parse::()?, $f); + }; +} + +macro_rules! err { + ($s:expr, $pattern:pat $(if $guard:expr)?) => { + assert!(match $s.parse::() { + $pattern $(if $guard)? 
=> true, + other => { + eprintln!("Received {other:?}"); + false + } + }) + }; +} + +#[test] +fn test_each_filter() -> Result<(), EventFilterParseError> { + eq!("unfiltered", Unfiltered); + eq!("all-of(unfiltered)", AllOf(vec![Unfiltered])); + eq!("any-of(unfiltered)", AnyOf(vec![Unfiltered])); + eq!("one-of(unfiltered)", OneOf(vec![Unfiltered])); + eq!("not(unfiltered)", Not(Box::new(Unfiltered))); + eq!("agent-is(default)", AgentIs(InternedId::default())); + eq!("env-is(default)", EnvIs(InternedId::default())); + eq!( + "transaction-is(foo)", + TransactionIs(Arc::new(String::from("foo"))) + ); + eq!("cannon-is(default)", CannonIs(InternedId::default())); + eq!("event-is(agent-connected)", EventIs(AgentConnected)); + eq!( + "node-key-is(client/foo)", + NodeKeyIs("client/foo".parse().unwrap()) + ); + eq!( + "node-target-is(client/any)", + NodeTargetIs(NodeTargets::One("client/any".parse().unwrap())) + ); + + Ok(()) +} + +#[test] +fn test_array() -> Result<(), EventFilterParseError> { + eq!( + "all-of(unfiltered, unfiltered)", + AllOf(vec![Unfiltered, Unfiltered]) + ); + eq!( + "any-of(unfiltered, unfiltered)", + AnyOf(vec![Unfiltered, Unfiltered]) + ); + eq!( + "one-of(unfiltered, unfiltered)", + OneOf(vec![Unfiltered, Unfiltered]) + ); + + eq!( + "any-of( + unfiltered, + all-of(unfiltered), + any-of(unfiltered), + one-of(unfiltered), + not(unfiltered), + agent-is(default), + env-is(default), + transaction-is(foo), + cannon-is(default), + event-is(agent-connected), + node-key-is(client/foo), + node-target-is(client/any) + )", + AnyOf(vec![ + Unfiltered, + AllOf(vec![Unfiltered]), + AnyOf(vec![Unfiltered]), + OneOf(vec![Unfiltered]), + Not(Box::new(Unfiltered)), + AgentIs(InternedId::default()), + EnvIs(InternedId::default()), + TransactionIs(Arc::new(String::from("foo"))), + CannonIs(InternedId::default()), + EventIs(AgentConnected), + NodeKeyIs("client/foo".parse().unwrap()), + NodeTargetIs(NodeTargets::One("client/any".parse().unwrap())), + ]) + ); + + Ok(()) +} + +#[test] +fn test_whitespace_ignore() -> Result<(), EventFilterParseError> { + eq!( + " all-of ( unfiltered , unfiltered ) ", + AllOf(vec![Unfiltered, Unfiltered]) + ); + Ok(()) +} + +#[test] +fn test_trailing_commas() -> Result<(), EventFilterParseError> { + eq!("all-of(unfiltered,)", AllOf(vec![Unfiltered])); + Ok(()) +} + +#[test] +fn test_deep_nesting() -> Result<(), EventFilterParseError> { + eq!( + "all-of(all-of(all-of(all-of(all-of(all-of(unfiltered))))))", + AllOf(vec![AllOf(vec![AllOf(vec![AllOf(vec![AllOf(vec![ + AllOf(vec![Unfiltered]) + ])])])])]) + ); + + // not + eq!("not(not(not(not(not(not(unfiltered))))))", !!!!!!Unfiltered); + + Ok(()) +} + +#[test] +fn test_invalid() { + err!( + "invalid", + Err(EventFilterParseError::InvalidFilter(e)) if e == "invalid" + ); +} + +#[test] +fn test_expected_parens() { + use EventFilterParsable::*; + + err!( + "all-of", + Err(EventFilterParseError::ExpectedToken(a, b)) if a == OpenParen && b == "EOF" + ); + err!( + "all-of(", + Err(EventFilterParseError::ExpectedToken(a, b)) if a == CloseParen && b == "EOF" + ); + err!( + "all-of(unfiltered", + Err(EventFilterParseError::ExpectedToken(a, b)) if a == CommaOrCloseParen && b == "EOF" + ); +} + +#[test] +fn test_failed_agent_parse() { + err!( + "agent-is(|)", + Err(EventFilterParseError::ParseError(EventFilterParsable::AgentId, e)) + if e.starts_with("invalid InternedId expected pattern") + ); +} diff --git a/crates/controlplane/src/events/traits.rs b/crates/controlplane/src/events/traits.rs new file mode 100644 index 
00000000..e41fc5c9 --- /dev/null +++ b/crates/controlplane/src/events/traits.rs @@ -0,0 +1,95 @@ +use std::sync::Arc; + +use snops_common::state::{AgentId, AgentState, EnvId, InternedId, NodeKey}; + +use super::{AgentEvent, Event, EventFilter, EventKind, EventKindFilter, TransactionEvent}; +use crate::{cannon::context::ExecutionContext, state::Agent}; + +impl From for EventFilter { + fn from(kind: EventKindFilter) -> Self { + EventFilter::EventIs(kind) + } +} + +pub trait EventHelpers { + fn event(self) -> Event; + fn with_agent(self, agent: &Agent) -> Event; + fn with_agent_id(self, agent_id: AgentId) -> Event; + fn with_node_key(self, node_key: NodeKey) -> Event; + fn with_env_id(self, env_id: EnvId) -> Event; + fn with_transaction(self, transaction: Arc) -> Event; + fn with_cannon(self, cannon: InternedId) -> Event; + fn with_cannon_ctx(self, ctx: &ExecutionContext, transaction: Arc) -> Event; +} + +impl> EventHelpers for T { + fn event(self) -> Event { + self.into() + } + + fn with_agent(self, agent: &Agent) -> Event { + let mut event = self.into(); + event.agent = Some(agent.id); + if let AgentState::Node(env_id, node) = &agent.state { + event.node_key = Some(node.node_key.clone()); + event.env = Some(*env_id); + } + event + } + + fn with_agent_id(self, agent_id: AgentId) -> Event { + let mut event = self.into(); + event.agent = Some(agent_id); + event + } + + fn with_node_key(self, node_key: NodeKey) -> Event { + let mut event = self.into(); + event.node_key = Some(node_key); + event + } + + fn with_env_id(self, env_id: EnvId) -> Event { + let mut event = self.into(); + event.env = Some(env_id); + event + } + + fn with_transaction(self, transaction: Arc) -> Event { + let mut event = self.into(); + event.transaction = Some(transaction); + event + } + + fn with_cannon(self, cannon: InternedId) -> Event { + let mut event = self.into(); + event.cannon = Some(cannon); + event + } + + fn with_cannon_ctx(self, ctx: &ExecutionContext, transaction: Arc) -> Event { + let mut event = self.into(); + event.cannon = Some(ctx.id); + event.env = Some(ctx.env_id); + event.transaction = Some(transaction); + event + } +} + +impl From for Event { + fn from(kind: EventKind) -> Self { + Self::new(kind) + } +} + +impl From for Event { + fn from(kind: AgentEvent) -> Self { + Self::new(EventKind::Agent(kind)) + } +} + +impl From for Event { + fn from(kind: TransactionEvent) -> Self { + Self::new(EventKind::Transaction(kind)) + } +} From e3d55d9230726ad668ae847b100ae67dfa975695 Mon Sep 17 00:00:00 2001 From: Meshiest Date: Mon, 2 Dec 2024 18:44:10 -0500 Subject: [PATCH 49/68] fix(events): node-target-is filter now supports vec node targets --- crates/common/src/node_targets.rs | 12 +- crates/controlplane/src/events/filter.rs | 29 +++- .../controlplane/src/events/filter_parse.rs | 130 ++++++++---------- .../src/events/test_filter_parse.rs | 34 +++++ 4 files changed, 126 insertions(+), 79 deletions(-) diff --git a/crates/common/src/node_targets.rs b/crates/common/src/node_targets.rs index 7d6fe6b6..4191137d 100644 --- a/crates/common/src/node_targets.rs +++ b/crates/common/src/node_targets.rs @@ -135,13 +135,13 @@ impl fmt::Display for NodeTargets { fn fmt(&self, f: &mut fmt::Formatter<'_>) -> fmt::Result { match self { NodeTargets::None => write!(f, ""), - NodeTargets::One(target) => write!(f, "{}", target), + NodeTargets::One(target) => write!(f, "{target}"), NodeTargets::Many(targets) => { let mut iter = targets.iter(); if let Some(target) = iter.next() { - write!(f, "{}", target)?; + write!(f, "{target}")?; 
for target in iter { - write!(f, ", {}", target)?; + write!(f, ", {target}")?; } } Ok(()) @@ -228,16 +228,16 @@ impl fmt::Display for NodeTarget { f, "{}/{}{}", match self.ty { - NodeTargetType::All => "*".to_owned(), + NodeTargetType::All => "any".to_owned(), NodeTargetType::One(ty) => ty.to_string(), }, match &self.id { - NodeTargetId::All => "*".to_owned(), + NodeTargetId::All => "any".to_owned(), NodeTargetId::WildcardPattern(pattern) => pattern.to_string(), NodeTargetId::Literal(id) => id.to_owned(), }, match &self.ns { - NodeTargetNamespace::All => "@*".to_owned(), + NodeTargetNamespace::All => "@any".to_owned(), NodeTargetNamespace::Local => "".to_owned(), NodeTargetNamespace::Literal(ns) => format!("@{}", ns), } diff --git a/crates/controlplane/src/events/filter.rs b/crates/controlplane/src/events/filter.rs index a0b70eda..6434e071 100644 --- a/crates/controlplane/src/events/filter.rs +++ b/crates/controlplane/src/events/filter.rs @@ -1,4 +1,4 @@ -use std::sync::Arc; +use std::{fmt::Display, sync::Arc}; use snops_common::{ node_targets::NodeTargets, @@ -61,3 +61,30 @@ impl Event { } } } + +fn event_filter_vec(filters: &[EventFilter]) -> String { + filters + .iter() + .map(|f| f.to_string()) + .collect::>() + .join(", ") +} + +impl Display for EventFilter { + fn fmt(&self, f: &mut std::fmt::Formatter<'_>) -> std::fmt::Result { + match self { + EventFilter::Unfiltered => write!(f, "unfiltered"), + EventFilter::AllOf(vec) => write!(f, "all-of({})", event_filter_vec(vec)), + EventFilter::AnyOf(vec) => write!(f, "any-of({})", event_filter_vec(vec)), + EventFilter::OneOf(vec) => write!(f, "one-of({})", event_filter_vec(vec)), + EventFilter::Not(event_filter) => write!(f, "not({})", event_filter), + EventFilter::AgentIs(id) => write!(f, "agent-is({id})"), + EventFilter::EnvIs(id) => write!(f, "env-is({id})"), + EventFilter::TransactionIs(str) => write!(f, "transaction-is({str})"), + EventFilter::CannonIs(id) => write!(f, "cannon-is({id})"), + EventFilter::EventIs(event) => write!(f, "event-is({event})"), + EventFilter::NodeKeyIs(node_key) => write!(f, "node-key-is({node_key})"), + EventFilter::NodeTargetIs(node_targets) => write!(f, "node-target-is({node_targets})"), + } + } +} diff --git a/crates/controlplane/src/events/filter_parse.rs b/crates/controlplane/src/events/filter_parse.rs index 40be55dc..33ac5adc 100644 --- a/crates/controlplane/src/events/filter_parse.rs +++ b/crates/controlplane/src/events/filter_parse.rs @@ -5,18 +5,15 @@ use snops_common::node_targets::{NodeTarget, NodeTargets}; use super::EventFilter; use crate::events::EventKindFilter; -/* - -Example EventFilter string representation: - -unfiltered -any-of(agent-connected, agent-disconnected) -all-of(not(agent-is(foo-bar)), env-is(default)) -node-key-is(client/foo) -node-target-is(client/test-*@*) -node-target-is(client/any) -not(unfiltered) - +/* Example EventFilter string representation: + + unfiltered + any-of(agent-connected, agent-disconnected) + all-of(not(agent-is(foo-bar)), env-is(default)) + node-key-is(client/foo) + node-target-is(client/test-*@*) + node-target-is(client/any) + not(unfiltered) */ #[derive(Debug, Copy, Clone)] @@ -157,7 +154,7 @@ fn expect_token<'a, T>( matcher(token).ok_or_else(|| ExpectedToken(label, token.label().to_string())) } -fn expect_parsed_text( +fn expect_parsed( token: Option, label: EventFilterParsable, ) -> Result @@ -191,7 +188,7 @@ impl<'a> FilterParser<'a> { self.tokens.next() } - fn expect_parens( + fn parens( &mut self, filter: impl Fn(&mut Self) -> Result, ) -> Result { @@ 
-204,88 +201,77 @@ impl<'a> FilterParser<'a> { } fn expect_filter(&mut self) -> Result { - self.trim_whitespace(); + use EventFilter::*; use EventFilterParsable as P; - use EventFilterParseError::*; + use EventFilterParseError::InvalidFilter; + + self.trim_whitespace(); let filter_name = expect_token(self.next(), P::FilterName, |token| token.text())?; match filter_name.trim() { - "unfiltered" => Ok(EventFilter::Unfiltered), - "any-of" => self.expect_parens(|t| t.expect_filter_vec().map(EventFilter::AnyOf)), - "all-of" => self.expect_parens(|t| t.expect_filter_vec().map(EventFilter::AllOf)), - "one-of" => self.expect_parens(|t| t.expect_filter_vec().map(EventFilter::OneOf)), - "not" => self.expect_parens(|t| Ok(EventFilter::Not(Box::new(t.expect_filter()?)))), - "agent-is" => self.expect_parens(|t| { - expect_parsed_text(t.next(), P::AgentId).map(EventFilter::AgentIs) + "unfiltered" => Ok(Unfiltered), + "any-of" => self.parens(|t| t.vec_of(|s| s.expect_filter()).map(AnyOf)), + "all-of" => self.parens(|t| t.vec_of(|s| s.expect_filter()).map(AllOf)), + "one-of" => self.parens(|t| t.vec_of(|s| s.expect_filter()).map(OneOf)), + "not" => self.parens(|t| Ok(Not(Box::new(t.expect_filter()?)))), + + "agent-is" => self.parens(|t| expect_parsed(t.next(), P::AgentId).map(AgentIs)), + "env-is" => self.parens(|t| expect_parsed(t.next(), P::EnvId).map(EnvIs)), + "transaction-is" => self.parens(|t| { + expect_token(t.next(), P::TransactionId, |token| token.text()) + .map(|t| TransactionIs(Arc::new(t.to_string()))) }), - "env-is" => self - .expect_parens(|t| expect_parsed_text(t.next(), P::EnvId).map(EventFilter::EnvIs)), - "transaction-is" => self.expect_parens(|t| { - Ok(EventFilter::TransactionIs(Arc::new( - expect_token(t.next(), P::TransactionId, |token| token.text())?.to_string(), - ))) - }), - "cannon-is" => self.expect_parens(|t| { - expect_parsed_text(t.next(), P::CannonId).map(EventFilter::CannonIs) - }), - "event-is" => self.expect_parens(|t| { - expect_parsed_text(t.next(), P::EventKind).map(EventFilter::EventIs) - }), - "node-key-is" => self.expect_parens(|t| { - expect_parsed_text(t.next(), P::NodeKey).map(EventFilter::NodeKeyIs) - }), - "node-target-is" => self.expect_parens(|t| { - expect_parsed_text::(t.next(), P::NodeTarget) - .map(|t| EventFilter::NodeTargetIs(NodeTargets::One(t))) + "cannon-is" => self.parens(|t| expect_parsed(t.next(), P::CannonId).map(CannonIs)), + "event-is" => self.parens(|t| expect_parsed(t.next(), P::EventKind).map(EventIs)), + "node-key-is" => self.parens(|t| expect_parsed(t.next(), P::NodeKey).map(NodeKeyIs)), + "node-target-is" => self.parens(|t| { + t.vec_of(|t| expect_parsed::(t.next(), P::NodeTarget)) + .map(|v| NodeTargetIs(NodeTargets::from(v))) }), // Try to parse as an event kind filter as a fallback unknown => unknown .parse::() - .map(EventFilter::EventIs) + .map(EventIs) .map_err(|_| InvalidFilter(unknown.to_string())), } } - fn expect_filter_vec(&mut self) -> Result, EventFilterParseError> { + fn vec_of( + &mut self, + matcher: impl Fn(&mut Self) -> Result, + ) -> Result, EventFilterParseError> { + use EventFilterParsable::*; + use EventFilterParseError::ExpectedToken; + self.trim_whitespace(); let mut filters = Vec::new(); loop { match self.tokens.peek() { Some(Token::CloseParen) => break, - Some(_) => { - filters.push(self.expect_filter()?); - self.trim_whitespace(); + None => return Err(ExpectedToken(CloseParen, "EOF".to_string())), + Some(_) => {} + } - // Expect either a comma or a close paren - match self.tokens.peek() { - // This also supports 
trailing commas - Some(Token::Comma) => { - self.tokens.next(); - self.trim_whitespace(); - } - Some(Token::CloseParen) => break, - Some(_) => { - return Err(EventFilterParseError::ExpectedToken( - EventFilterParsable::CommaOrCloseParen, - self.tokens.peek().unwrap().label().to_string(), - )) - } - None => { - return Err(EventFilterParseError::ExpectedToken( - EventFilterParsable::CommaOrCloseParen, - "EOF".to_string(), - )) - } - } + filters.push(matcher(self)?); + self.trim_whitespace(); + + // Expect either a comma or a close paren + match self.tokens.peek() { + // This also supports trailing commas + Some(Token::Comma) => { + self.tokens.next(); + self.trim_whitespace(); } - None => { - return Err(EventFilterParseError::ExpectedToken( - EventFilterParsable::CloseParen, - "EOF".to_string(), + Some(Token::CloseParen) => break, + Some(_) => { + return Err(ExpectedToken( + CommaOrCloseParen, + self.tokens.peek().unwrap().label().to_string(), )) } + None => return Err(ExpectedToken(CommaOrCloseParen, "EOF".to_string())), } } Ok(filters) diff --git a/crates/controlplane/src/events/test_filter_parse.rs b/crates/controlplane/src/events/test_filter_parse.rs index 186e34c2..8a7e7a0e 100644 --- a/crates/controlplane/src/events/test_filter_parse.rs +++ b/crates/controlplane/src/events/test_filter_parse.rs @@ -100,6 +100,14 @@ fn test_array() -> Result<(), EventFilterParseError> { ]) ); + eq!( + "node-target-is(client/any,validator/any)", + NodeTargetIs(NodeTargets::Many(vec![ + "client/any".parse().unwrap(), + "validator/any".parse().unwrap(), + ])) + ); + Ok(()) } @@ -167,3 +175,29 @@ fn test_failed_agent_parse() { if e.starts_with("invalid InternedId expected pattern") ); } + +#[test] +fn test_str() { + macro_rules! test { + ($s:expr) => { + assert_eq!($s.parse::().unwrap().to_string(), $s); + }; + } + + test!("unfiltered"); + test!("any-of(unfiltered)"); + test!("all-of(unfiltered)"); + test!("one-of(unfiltered)"); + test!("not(unfiltered)"); + test!("agent-is(default)"); + test!("env-is(default)"); + test!("transaction-is(foo)"); + test!("cannon-is(default)"); + test!("event-is(agent-connected)"); + test!("node-key-is(client/foo)"); + test!("node-target-is(client/any)"); + test!("node-target-is(client/any, validator/any)"); + + test!("any-of(unfiltered, unfiltered)"); + test!("any-of(agent-is(foo), cannon-is(bar))"); +} From 8c860dd9d7a25fc01a8c2d31635bae716279adb2 Mon Sep 17 00:00:00 2001 From: Meshiest Date: Mon, 2 Dec 2024 18:45:10 -0500 Subject: [PATCH 50/68] refactor(events): move event filters and models to common --- .../src/events/filter.rs | 5 ++-- .../src/events/filter_ops.rs | 0 .../src/events/filter_parse.rs | 3 +-- crates/common/src/events/mod.rs | 21 ++++++++++++++++ .../src/events/models.rs | 15 +++++------- .../src/events/test_filter.rs | 8 +++---- .../src/events/test_filter_ops.rs | 2 +- .../src/events/test_filter_parse.rs | 3 +-- .../src/events/traits.rs | 24 +------------------ crates/common/src/lib.rs | 1 + crates/common/src/state/mod.rs | 2 ++ .../src/state/transaction_status.rs} | 15 ++++++------ crates/controlplane/src/cannon/context.rs | 21 ++++++++++++---- crates/controlplane/src/cannon/mod.rs | 4 +--- crates/controlplane/src/cannon/router.rs | 2 +- crates/controlplane/src/cannon/source.rs | 10 ++++---- crates/controlplane/src/cannon/tracker.rs | 4 ++-- crates/controlplane/src/db.rs | 3 +-- crates/controlplane/src/events/mod.rs | 20 ---------------- crates/controlplane/src/events/stream.rs | 3 +-- crates/controlplane/src/events/test_stream.rs | 4 ++-- 
crates/controlplane/src/persist/env.rs | 5 ++-- .../controlplane/src/server/actions/deploy.rs | 2 +- .../src/server/actions/execute.rs | 7 +++--- .../controlplane/src/server/actions/power.rs | 4 ++-- crates/controlplane/src/server/error.rs | 4 ++-- crates/controlplane/src/server/rpc.rs | 3 ++- crates/controlplane/src/server/websocket.rs | 4 ++-- crates/controlplane/src/state/agent.rs | 17 +++++++++++++ crates/controlplane/src/state/global.rs | 12 ++++++++++ crates/controlplane/src/state/transactions.rs | 12 +++++----- 31 files changed, 128 insertions(+), 112 deletions(-) rename crates/{controlplane => common}/src/events/filter.rs (99%) rename crates/{controlplane => common}/src/events/filter_ops.rs (100%) rename crates/{controlplane => common}/src/events/filter_parse.rs (99%) create mode 100644 crates/common/src/events/mod.rs rename crates/{controlplane => common}/src/events/models.rs (96%) rename crates/{controlplane => common}/src/events/test_filter.rs (99%) rename crates/{controlplane => common}/src/events/test_filter_ops.rs (98%) rename crates/{controlplane => common}/src/events/test_filter_parse.rs (98%) rename crates/{controlplane => common}/src/events/traits.rs (68%) rename crates/{controlplane/src/cannon/status.rs => common/src/state/transaction_status.rs} (91%) diff --git a/crates/controlplane/src/events/filter.rs b/crates/common/src/events/filter.rs similarity index 99% rename from crates/controlplane/src/events/filter.rs rename to crates/common/src/events/filter.rs index 6434e071..da13bf12 100644 --- a/crates/controlplane/src/events/filter.rs +++ b/crates/common/src/events/filter.rs @@ -1,12 +1,11 @@ use std::{fmt::Display, sync::Arc}; -use snops_common::{ +use super::{Event, EventKindFilter}; +use crate::{ node_targets::NodeTargets, state::{AgentId, EnvId, InternedId, NodeKey}, }; -use super::{Event, EventKindFilter}; - #[derive(Clone, Debug, PartialEq)] pub enum EventFilter { diff --git a/crates/controlplane/src/events/filter_ops.rs b/crates/common/src/events/filter_ops.rs similarity index 100% rename from crates/controlplane/src/events/filter_ops.rs rename to crates/common/src/events/filter_ops.rs diff --git a/crates/controlplane/src/events/filter_parse.rs b/crates/common/src/events/filter_parse.rs similarity index 99% rename from crates/controlplane/src/events/filter_parse.rs rename to crates/common/src/events/filter_parse.rs index 33ac5adc..333800bc 100644 --- a/crates/controlplane/src/events/filter_parse.rs +++ b/crates/common/src/events/filter_parse.rs @@ -1,9 +1,8 @@ use std::{fmt::Display, str::FromStr, sync::Arc}; -use snops_common::node_targets::{NodeTarget, NodeTargets}; - use super::EventFilter; use crate::events::EventKindFilter; +use crate::node_targets::{NodeTarget, NodeTargets}; /* Example EventFilter string representation: diff --git a/crates/common/src/events/mod.rs b/crates/common/src/events/mod.rs new file mode 100644 index 00000000..f0cfa860 --- /dev/null +++ b/crates/common/src/events/mod.rs @@ -0,0 +1,21 @@ +mod models; +pub use models::*; +mod filter_parse; +mod traits; +pub use traits::*; +mod filter; +pub use filter::*; +mod filter_ops; + +pub mod prelude { + pub use super::filter::EventFilter::*; + pub use super::models::EventKindFilter::*; + pub use super::models::*; +} + +#[cfg(test)] +mod test_filter; +#[cfg(test)] +mod test_filter_ops; +#[cfg(test)] +mod test_filter_parse; diff --git a/crates/controlplane/src/events/models.rs b/crates/common/src/events/models.rs similarity index 96% rename from crates/controlplane/src/events/models.rs rename to 
crates/common/src/events/models.rs index 45ff918f..fe33573c 100644 --- a/crates/controlplane/src/events/models.rs +++ b/crates/common/src/events/models.rs @@ -2,14 +2,16 @@ use std::{fmt::Display, str::FromStr, sync::Arc}; use chrono::{DateTime, Utc}; use serde::{Deserialize, Serialize}; -use snops_common::{ + +use crate::{ aot_cmds::Authorization, rpc::error::ReconcileError, - state::{AgentId, EnvId, InternedId, LatestBlockInfo, NodeKey, NodeStatus, ReconcileStatus}, + state::{ + AgentId, EnvId, InternedId, LatestBlockInfo, NodeKey, NodeStatus, ReconcileStatus, + TransactionSendState, + }, }; -use crate::{cannon::status::TransactionSendState, state::GetGlobalState}; - #[derive(Clone, Debug, Serialize, Deserialize)] pub struct Event { pub created_at: DateTime, @@ -219,9 +221,4 @@ impl Event { kind: kind.into().kind, } } - - #[inline] - pub fn emit<'a>(self, state: impl GetGlobalState<'a>) { - state.global_state().events.emit(self) - } } diff --git a/crates/controlplane/src/events/test_filter.rs b/crates/common/src/events/test_filter.rs similarity index 99% rename from crates/controlplane/src/events/test_filter.rs rename to crates/common/src/events/test_filter.rs index 6b900046..3173b173 100644 --- a/crates/controlplane/src/events/test_filter.rs +++ b/crates/common/src/events/test_filter.rs @@ -2,15 +2,15 @@ use std::str::FromStr; use chrono::Utc; use lazy_static::lazy_static; -use snops_common::{ + +use super::{AgentEvent::*, EventFilter::*, EventKind::*, EventKindFilter::*}; +use crate::events::{Event, EventHelpers}; +use crate::{ node_targets::NodeTargets, rpc::error::ReconcileError, state::{InternedId, LatestBlockInfo, NodeKey, NodeStatus, ReconcileStatus}, }; -use super::{AgentEvent::*, EventFilter::*, EventKind::*, EventKindFilter::*}; -use crate::events::{Event, EventHelpers}; - lazy_static! { static ref A: InternedId = InternedId::from_str("a").unwrap(); static ref B: InternedId = InternedId::from_str("b").unwrap(); diff --git a/crates/controlplane/src/events/test_filter_ops.rs b/crates/common/src/events/test_filter_ops.rs similarity index 98% rename from crates/controlplane/src/events/test_filter_ops.rs rename to crates/common/src/events/test_filter_ops.rs index 2870be53..42ab1a25 100644 --- a/crates/controlplane/src/events/test_filter_ops.rs +++ b/crates/common/src/events/test_filter_ops.rs @@ -1,10 +1,10 @@ use std::str::FromStr; use lazy_static::lazy_static; -use snops_common::state::InternedId; use super::EventFilter::*; use super::EventKindFilter::*; +use crate::state::InternedId; lazy_static! { static ref A: InternedId = InternedId::from_str("a").unwrap(); diff --git a/crates/controlplane/src/events/test_filter_parse.rs b/crates/common/src/events/test_filter_parse.rs similarity index 98% rename from crates/controlplane/src/events/test_filter_parse.rs rename to crates/common/src/events/test_filter_parse.rs index 8a7e7a0e..e5069fab 100644 --- a/crates/controlplane/src/events/test_filter_parse.rs +++ b/crates/common/src/events/test_filter_parse.rs @@ -1,13 +1,12 @@ use std::sync::Arc; -use snops_common::{node_targets::NodeTargets, state::InternedId}; - use super::{ filter_parse::EventFilterParseError, EventFilter::{self, *}, EventKindFilter::*, }; use crate::events::filter_parse::EventFilterParsable; +use crate::{node_targets::NodeTargets, state::InternedId}; macro_rules! 
eq { ($s:expr, $f:expr) => { diff --git a/crates/controlplane/src/events/traits.rs b/crates/common/src/events/traits.rs similarity index 68% rename from crates/controlplane/src/events/traits.rs rename to crates/common/src/events/traits.rs index e41fc5c9..82042820 100644 --- a/crates/controlplane/src/events/traits.rs +++ b/crates/common/src/events/traits.rs @@ -1,9 +1,7 @@ use std::sync::Arc; -use snops_common::state::{AgentId, AgentState, EnvId, InternedId, NodeKey}; - use super::{AgentEvent, Event, EventFilter, EventKind, EventKindFilter, TransactionEvent}; -use crate::{cannon::context::ExecutionContext, state::Agent}; +use crate::state::{AgentId, EnvId, InternedId, NodeKey}; impl From for EventFilter { fn from(kind: EventKindFilter) -> Self { @@ -13,13 +11,11 @@ impl From for EventFilter { pub trait EventHelpers { fn event(self) -> Event; - fn with_agent(self, agent: &Agent) -> Event; fn with_agent_id(self, agent_id: AgentId) -> Event; fn with_node_key(self, node_key: NodeKey) -> Event; fn with_env_id(self, env_id: EnvId) -> Event; fn with_transaction(self, transaction: Arc) -> Event; fn with_cannon(self, cannon: InternedId) -> Event; - fn with_cannon_ctx(self, ctx: &ExecutionContext, transaction: Arc) -> Event; } impl> EventHelpers for T { @@ -27,16 +23,6 @@ impl> EventHelpers for T { self.into() } - fn with_agent(self, agent: &Agent) -> Event { - let mut event = self.into(); - event.agent = Some(agent.id); - if let AgentState::Node(env_id, node) = &agent.state { - event.node_key = Some(node.node_key.clone()); - event.env = Some(*env_id); - } - event - } - fn with_agent_id(self, agent_id: AgentId) -> Event { let mut event = self.into(); event.agent = Some(agent_id); @@ -66,14 +52,6 @@ impl> EventHelpers for T { event.cannon = Some(cannon); event } - - fn with_cannon_ctx(self, ctx: &ExecutionContext, transaction: Arc) -> Event { - let mut event = self.into(); - event.cannon = Some(ctx.id); - event.env = Some(ctx.env_id); - event.transaction = Some(transaction); - event - } } impl From for Event { diff --git a/crates/common/src/lib.rs b/crates/common/src/lib.rs index b433e967..948881e8 100644 --- a/crates/common/src/lib.rs +++ b/crates/common/src/lib.rs @@ -9,6 +9,7 @@ pub mod api; pub mod binaries; pub mod constant; pub mod db; +pub mod events; pub mod format; pub mod key_source; pub mod node_targets; diff --git a/crates/common/src/state/mod.rs b/crates/common/src/state/mod.rs index b39fe99a..2f13ae83 100644 --- a/crates/common/src/state/mod.rs +++ b/crates/common/src/state/mod.rs @@ -14,6 +14,7 @@ mod port_config; mod reconcile; pub mod snarkos_status; pub mod strings; +mod transaction_status; pub use agent_mode::*; pub use agent_state::*; @@ -26,6 +27,7 @@ pub use node_state::*; pub use node_type::*; pub use port_config::*; pub use reconcile::*; +pub use transaction_status::*; lazy_static! { static ref NODE_KEY_REGEX: Regex = Regex::new( diff --git a/crates/controlplane/src/cannon/status.rs b/crates/common/src/state/transaction_status.rs similarity index 91% rename from crates/controlplane/src/cannon/status.rs rename to crates/common/src/state/transaction_status.rs index 1a78f662..8e9e5ec0 100644 --- a/crates/controlplane/src/cannon/status.rs +++ b/crates/common/src/state/transaction_status.rs @@ -1,6 +1,7 @@ use chrono::{DateTime, Utc}; use serde::{Deserialize, Serialize}; -use snops_common::format::DataFormat; + +use crate::format::DataFormat; /// Status of a transaction as presented internally for tracking and /// preventing data loss. 
@@ -46,7 +47,7 @@ impl DataFormat for TransactionSendState { fn write_data( &self, writer: &mut W, - ) -> Result { + ) -> Result { Ok(match self { TransactionSendState::Authorized => 0u8.write_data(writer)?, TransactionSendState::Executing(timestamp) => { @@ -64,9 +65,9 @@ impl DataFormat for TransactionSendState { fn read_data( reader: &mut R, header: &Self::Header, - ) -> Result { + ) -> Result { if *header != Self::LATEST_HEADER { - return Err(snops_common::format::DataReadError::unsupported( + return Err(crate::format::DataReadError::unsupported( "CannonTransactionStatus", Self::LATEST_HEADER, *header, @@ -83,7 +84,7 @@ impl DataFormat for TransactionSendState { DateTime::::read_data(reader, &())?, ), _ => { - return Err(snops_common::format::DataReadError::Custom( + return Err(crate::format::DataReadError::Custom( "Invalid CannonTransactionStatus tag".to_string(), )) } @@ -94,9 +95,9 @@ impl DataFormat for TransactionSendState { #[cfg(test)] mod test { use chrono::DateTime; - use snops_common::format::DataFormat; - use crate::cannon::status::TransactionSendState; + use super::TransactionSendState; + use crate::format::DataFormat; macro_rules! case { ($name:ident, $ty:ty, $a:expr, $b:expr) => { diff --git a/crates/controlplane/src/cannon/context.rs b/crates/controlplane/src/cannon/context.rs index a209eeea..6f85bd21 100644 --- a/crates/controlplane/src/cannon/context.rs +++ b/crates/controlplane/src/cannon/context.rs @@ -6,7 +6,8 @@ use futures_util::{stream::FuturesUnordered, StreamExt}; use lazysort::SortedBy; use snops_common::{ aot_cmds::Authorization, - state::{AgentId, CannonId, EnvId, NetworkId}, + events::{Event, TransactionAbortReason, TransactionEvent}, + state::{AgentId, CannonId, EnvId, NetworkId, TransactionSendState}, }; use tracing::{error, trace, warn}; @@ -15,14 +16,12 @@ use super::{ file::TransactionSink, sink::TxSink, source::TxSource, - status::TransactionSendState, tracker::TransactionTracker, CannonReceivers, }; use crate::{ cannon::source::ComputeTarget, - events::{EventHelpers, TransactionAbortReason, TransactionEvent}, - state::{GetGlobalState, GlobalState, REST_CLIENT}, + state::{EmitEvent, GetGlobalState, GlobalState, REST_CLIENT}, }; /// Information a transaction cannon needs for execution via spawned task @@ -380,3 +379,17 @@ impl<'a> GetGlobalState<'a> for &'a ExecutionContext { &self.state } } + +pub trait CtxEventHelper { + fn with_cannon_ctx(self, ctx: &ExecutionContext, transaction: Arc) -> Event; +} + +impl> CtxEventHelper for T { + fn with_cannon_ctx(self, ctx: &ExecutionContext, transaction: Arc) -> Event { + let mut event = self.into(); + event.cannon = Some(ctx.id); + event.env = Some(ctx.env_id); + event.transaction = Some(transaction); + event + } +} diff --git a/crates/controlplane/src/cannon/mod.rs b/crates/controlplane/src/cannon/mod.rs index eecaf649..ae8d3d37 100644 --- a/crates/controlplane/src/cannon/mod.rs +++ b/crates/controlplane/src/cannon/mod.rs @@ -5,7 +5,6 @@ mod net; pub mod router; pub mod sink; pub mod source; -pub mod status; pub mod tracker; use std::{ @@ -21,9 +20,8 @@ use dashmap::DashMap; use snops_common::{ aot_cmds::{AotCmd, Authorization}, format::PackedUint, - state::{CannonId, EnvId, NetworkId, StorageId}, + state::{CannonId, EnvId, NetworkId, StorageId, TransactionSendState}, }; -use status::TransactionSendState; use tokio::{ sync::{ mpsc::{UnboundedReceiver, UnboundedSender}, diff --git a/crates/controlplane/src/cannon/router.rs b/crates/controlplane/src/cannon/router.rs index 7f03ab6c..18e85d82 100644 --- 
a/crates/controlplane/src/cannon/router.rs +++ b/crates/controlplane/src/cannon/router.rs @@ -369,7 +369,7 @@ async fn authorization( match cannon.proxy_auth(body).await { Ok(tx_id) => { - use crate::events::EventFilter::*; + use snops_common::events::EventFilter::*; let subscriber = state .events .subscribe_on(TransactionIs(tx_id.clone()) & EnvIs(env_id) & CannonIs(cannon_id)); diff --git a/crates/controlplane/src/cannon/source.rs b/crates/controlplane/src/cannon/source.rs index 84c20be7..3e0b85c2 100644 --- a/crates/controlplane/src/cannon/source.rs +++ b/crates/controlplane/src/cannon/source.rs @@ -3,22 +3,22 @@ use std::sync::Arc; use chrono::Utc; use serde::{Deserialize, Serialize}; use serde_json::{json, Value}; +use snops_common::events::{EventHelpers, TransactionEvent}; +use snops_common::state::TransactionSendState; use snops_common::{ aot_cmds::Authorization, lasso::Spur, node_targets::NodeTargets, state::NetworkId, INTERN, }; use tracing::error; +use super::context::CtxEventHelper; use super::{ error::{CannonError, SourceError}, net::get_available_port, - status::TransactionSendState, tracker::TransactionTracker, ExecutionContext, }; -use crate::{ - env::set::find_compute_agent, - events::{EventHelpers, TransactionEvent}, -}; +use crate::env::set::find_compute_agent; +use crate::state::EmitEvent; /// Represents an instance of a local query service. #[derive(Clone, Debug, Serialize, Deserialize)] diff --git a/crates/controlplane/src/cannon/tracker.rs b/crates/controlplane/src/cannon/tracker.rs index f954de2f..c0bfae51 100644 --- a/crates/controlplane/src/cannon/tracker.rs +++ b/crates/controlplane/src/cannon/tracker.rs @@ -1,8 +1,8 @@ use std::sync::Arc; -use snops_common::{aot_cmds::Authorization, format::PackedUint}; +use snops_common::{aot_cmds::Authorization, format::PackedUint, state::TransactionSendState}; -use super::{error::CannonError, status::TransactionSendState}; +use super::error::CannonError; use crate::{db::TxEntry, state::GlobalState}; #[derive(Debug, Clone)] diff --git a/crates/controlplane/src/db.rs b/crates/controlplane/src/db.rs index f18e8301..b9b9679a 100644 --- a/crates/controlplane/src/db.rs +++ b/crates/controlplane/src/db.rs @@ -4,11 +4,10 @@ use snops_common::{ aot_cmds::Authorization, db::{error::DatabaseError, tree::DbTree, Database as DatabaseTrait}, format::PackedUint, - state::{AgentId, CannonId, EnvId, NetworkId, StorageId}, + state::{AgentId, CannonId, EnvId, NetworkId, StorageId, TransactionSendState}, }; use crate::{ - cannon::status::TransactionSendState, persist::{PersistEnv, PersistStorage}, state::Agent, }; diff --git a/crates/controlplane/src/events/mod.rs b/crates/controlplane/src/events/mod.rs index fb60b7b6..70f059fd 100644 --- a/crates/controlplane/src/events/mod.rs +++ b/crates/controlplane/src/events/mod.rs @@ -1,25 +1,5 @@ -mod models; -pub use models::*; mod stream; pub use stream::*; -mod filter_parse; -mod traits; -pub use traits::*; -mod filter; -pub use filter::*; -mod filter_ops; -pub mod prelude { - pub use super::filter::EventFilter::*; - pub use super::models::EventKindFilter::*; - pub use super::models::*; -} - -#[cfg(test)] -mod test_filter; -#[cfg(test)] -mod test_filter_ops; -#[cfg(test)] -mod test_filter_parse; #[cfg(test)] mod test_stream; diff --git a/crates/controlplane/src/events/stream.rs b/crates/controlplane/src/events/stream.rs index c1c8c608..f0d285ef 100644 --- a/crates/controlplane/src/events/stream.rs +++ b/crates/controlplane/src/events/stream.rs @@ -1,10 +1,9 @@ use std::{sync::Arc, task::Poll}; use 
futures_util::Stream; +use snops_common::events::{Event, EventFilter}; use tokio::sync::broadcast::{self, error::TryRecvError}; -use super::{Event, EventFilter}; - #[derive(Debug)] pub struct Events { tx: broadcast::Sender>, diff --git a/crates/controlplane/src/events/test_stream.rs b/crates/controlplane/src/events/test_stream.rs index 1d7ffc66..89e62631 100644 --- a/crates/controlplane/src/events/test_stream.rs +++ b/crates/controlplane/src/events/test_stream.rs @@ -1,10 +1,10 @@ use std::str::FromStr; use lazy_static::lazy_static; +use snops_common::events::{AgentEvent::*, EventFilter::*, EventHelpers, EventKindFilter::*}; use snops_common::state::InternedId; -use super::{AgentEvent::*, EventFilter::*, EventKindFilter::*, Events}; -use crate::events::EventHelpers; +use crate::events::Events; lazy_static! { static ref A: InternedId = InternedId::from_str("a").unwrap(); diff --git a/crates/controlplane/src/persist/env.rs b/crates/controlplane/src/persist/env.rs index 4dee32d8..4c85889c 100644 --- a/crates/controlplane/src/persist/env.rs +++ b/crates/controlplane/src/persist/env.rs @@ -2,15 +2,14 @@ use std::sync::Arc; use bimap::BiMap; use dashmap::DashMap; +use snops_common::state::TransactionSendState; use snops_common::state::{CannonId, EnvId, NetworkId, NodeKey, StorageId}; use tokio::sync::Semaphore; use super::prelude::*; use super::PersistNode; use crate::{ - cannon::{ - sink::TxSink, source::TxSource, status::TransactionSendState, tracker::TransactionTracker, - }, + cannon::{sink::TxSink, source::TxSource, tracker::TransactionTracker}, env::{ error::{EnvError, PrepareError}, prepare_cannons, EnvNodeState, EnvPeer, Environment, diff --git a/crates/controlplane/src/server/actions/deploy.rs b/crates/controlplane/src/server/actions/deploy.rs index 3806866a..033d6d2e 100644 --- a/crates/controlplane/src/server/actions/deploy.rs +++ b/crates/controlplane/src/server/actions/deploy.rs @@ -38,7 +38,7 @@ pub async fn deploy( match deploy_inner(&state, action, &env, query_addr).await { Ok(tx_id) => { - use crate::events::EventFilter::*; + use snops_common::events::EventFilter::*; let subscriber = state .events .subscribe_on(TransactionIs(tx_id.clone()) & EnvIs(env.id) & CannonIs(cannon_id)); diff --git a/crates/controlplane/src/server/actions/execute.rs b/crates/controlplane/src/server/actions/execute.rs index 916243a7..0d8b62e5 100644 --- a/crates/controlplane/src/server/actions/execute.rs +++ b/crates/controlplane/src/server/actions/execute.rs @@ -10,6 +10,7 @@ use serde_json::json; use snops_common::{ action_models::{AleoValue, ExecuteAction}, aot_cmds::{AotCmd, Authorization}, + events::{Event, EventKind}, state::KeyState, }; use tokio::select; @@ -18,7 +19,7 @@ use super::Env; use crate::{ cannon::{error::AuthorizeError, router::AuthQuery}, env::{error::ExecutionError, Environment}, - events::{Event, EventKind, EventSubscriber}, + events::EventSubscriber, server::error::{ActionError, ServerError}, state::GlobalState, }; @@ -27,7 +28,7 @@ pub async fn execute_status( tx_id: Arc, mut rx: EventSubscriber, ) -> Result, ActionError> { - use crate::events::TransactionEvent::*; + use snops_common::events::TransactionEvent::*; let mut timeout = Box::pin(tokio::time::sleep(std::time::Duration::from_secs(30))); let mut agent_id = None; @@ -96,7 +97,7 @@ pub async fn execute( match execute_inner(&state, action, &env, query_addr).await { Ok(tx_id) => { - use crate::events::EventFilter::*; + use snops_common::events::EventFilter::*; let subscriber = state .events 
.subscribe_on(TransactionIs(tx_id.clone()) & EnvIs(env.id) & CannonIs(cannon_id)); diff --git a/crates/controlplane/src/server/actions/power.rs b/crates/controlplane/src/server/actions/power.rs index 8e9e3f32..1cc79f97 100644 --- a/crates/controlplane/src/server/actions/power.rs +++ b/crates/controlplane/src/server/actions/power.rs @@ -25,7 +25,7 @@ async fn wait_for_nodes( // create the subscriber before updating agent states in order to // avoid missing any events - use crate::events::prelude::*; + use snops_common::events::prelude::*; let mut subscriber = state .events .subscribe_on(NodeTargetIs(nodes) & EnvIs(env_id) & AgentReconcileComplete); @@ -103,7 +103,7 @@ pub async fn reboot( // create the subscriber before updating agent states in order to // avoid missing any events - use crate::events::prelude::*; + use snops_common::events::prelude::*; let mut subscriber = state .events .subscribe_on(NodeTargetIs(nodes) & EnvIs(env.id) & AgentReconcileComplete); diff --git a/crates/controlplane/src/server/error.rs b/crates/controlplane/src/server/error.rs index df7d3ce7..8c37c1ee 100644 --- a/crates/controlplane/src/server/error.rs +++ b/crates/controlplane/src/server/error.rs @@ -3,7 +3,8 @@ use http::StatusCode; use serde::{ser::SerializeStruct, Serialize, Serializer}; use serde_json::json; use snops_common::{ - aot_cmds::AotCmdError, db::error::DatabaseError, impl_into_status_code, impl_into_type_str, + aot_cmds::AotCmdError, db::error::DatabaseError, events::TransactionAbortReason, + impl_into_status_code, impl_into_type_str, }; use thiserror::Error; @@ -11,7 +12,6 @@ use crate::{ cannon::error::CannonError, env::error::{EnvError, EnvRequestError, ExecutionError}, error::DeserializeError, - events::TransactionAbortReason, schema::error::{SchemaError, StorageError}, }; diff --git a/crates/controlplane/src/server/rpc.rs b/crates/controlplane/src/server/rpc.rs index 2422a4c6..741c07a3 100644 --- a/crates/controlplane/src/server/rpc.rs +++ b/crates/controlplane/src/server/rpc.rs @@ -1,6 +1,7 @@ use std::{collections::HashMap, net::IpAddr, time::Instant}; use chrono::Utc; +use snops_common::events::AgentEvent; use snops_common::{ api::AgentEnvInfo, define_rpc_mux, @@ -20,9 +21,9 @@ use snops_common::{ use tarpc::context; use tracing::warn; +use crate::state::{AgentEventHelpers, EmitEvent}; use crate::{ error::StateError, - events::{AgentEvent, EventHelpers}, state::{AddrMap, AgentAddrs, AppState, GetGlobalState, GlobalState}, }; diff --git a/crates/controlplane/src/server/websocket.rs b/crates/controlplane/src/server/websocket.rs index af3b6075..4c2f6796 100644 --- a/crates/controlplane/src/server/websocket.rs +++ b/crates/controlplane/src/server/websocket.rs @@ -13,6 +13,7 @@ use futures_util::stream::StreamExt; use http::StatusCode; use semver::Version; use serde::Deserialize; +use snops_common::events::AgentEvent; use snops_common::{ constant::HEADER_AGENT_KEY, prelude::*, @@ -28,12 +29,11 @@ use tracing::{error, info, warn}; use super::{jwt::Claims, rpc::ControlRpcServer}; use crate::{ agent_version::agent_version_ok, - events::{AgentEvent, EventHelpers}, server::{ jwt::JWT_SECRET, rpc::{MuxedMessageIncoming, MuxedMessageOutgoing}, }, - state::{Agent, AgentFlags, AppState}, + state::{Agent, AgentEventHelpers, AgentFlags, AppState, EmitEvent}, }; #[derive(Debug, Deserialize)] diff --git a/crates/controlplane/src/state/agent.rs b/crates/controlplane/src/state/agent.rs index bcb11b73..7e25f87b 100644 --- a/crates/controlplane/src/state/agent.rs +++ b/crates/controlplane/src/state/agent.rs 
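// Illustrative sketch (assumed values, not lines from the diff): with the
// helper traits now split across crates, a typical emit site composes them as
//
//     AgentEvent::Connected.with_agent(&agent).emit(&state);
//
// `with_agent` (added in the hunk below) fills in the agent id and, when the
// agent is assigned a node, its node key and env id, while `EmitEvent::emit`
// (state/global.rs) forwards the finished event to the control plane's event bus.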
@@ -11,6 +11,7 @@ use rand::{Rng, SeedableRng}; use rand_chacha::ChaChaRng; use serde::{Deserialize, Serialize}; use snops_common::{ + events::Event, lasso::Spur, rpc::control::agent::AgentServiceClient, state::{ @@ -348,3 +349,19 @@ impl AgentAddrs { self.external.is_some() || !self.internal.is_empty() } } + +pub trait AgentEventHelpers { + fn with_agent(self, agent: &Agent) -> Event; +} + +impl> AgentEventHelpers for T { + fn with_agent(self, agent: &Agent) -> Event { + let mut event = self.into(); + event.agent = Some(agent.id); + if let AgentState::Node(env_id, node) = &agent.state { + event.node_key = Some(node.node_key.clone()); + event.env = Some(*env_id); + } + event + } +} diff --git a/crates/controlplane/src/state/global.rs b/crates/controlplane/src/state/global.rs index 0cc14132..f945bc71 100644 --- a/crates/controlplane/src/state/global.rs +++ b/crates/controlplane/src/state/global.rs @@ -7,6 +7,7 @@ use prometheus_http_query::Client as PrometheusClient; use serde::de::DeserializeOwned; use snops_common::{ constant::ENV_AGENT_KEY, + events::Event, node_targets::NodeTargets, state::{ AgentId, AgentPeer, AgentState, EnvId, LatestBlockInfo, NetworkId, NodeType, StorageId, @@ -374,3 +375,14 @@ impl<'a> GetGlobalState<'a> for &'a Arc { self } } + +pub trait EmitEvent { + fn emit<'a>(self, state: impl GetGlobalState<'a>); +} + +impl EmitEvent for Event { + #[inline] + fn emit<'a>(self, state: impl GetGlobalState<'a>) { + state.global_state().events.emit(self); + } +} diff --git a/crates/controlplane/src/state/transactions.rs b/crates/controlplane/src/state/transactions.rs index 8d1ec028..a63715c9 100644 --- a/crates/controlplane/src/state/transactions.rs +++ b/crates/controlplane/src/state/transactions.rs @@ -2,15 +2,15 @@ use std::{sync::Arc, time::Duration}; use chrono::{TimeDelta, Utc}; use futures_util::future; -use snops_common::state::{CannonId, EnvId}; +use snops_common::{ + events::{EventHelpers, TransactionEvent}, + state::{CannonId, EnvId, TransactionSendState}, +}; use tokio::time::timeout; use tracing::{info, trace}; -use super::GlobalState; -use crate::{ - cannon::{status::TransactionSendState, tracker::TransactionTracker}, - events::{EventHelpers, TransactionEvent}, -}; +use super::{EmitEvent, GlobalState}; +use crate::cannon::tracker::TransactionTracker; /// This task re-sends all transactions that have not been confirmed, /// re-computes all transactions that have not been computed, and removes From a800c5a8b08836338af1963c5864c348b68230dd Mon Sep 17 00:00:00 2001 From: Meshiest Date: Mon, 2 Dec 2024 19:26:35 -0500 Subject: [PATCH 51/68] feat(controlplane): events websocket API --- crates/common/src/binaries.rs | 2 +- crates/common/src/events/filter_parse.rs | 22 ++++ crates/common/src/events/models.rs | 8 ++ .../src/server/{websocket.rs => agent_ws.rs} | 1 - crates/controlplane/src/server/api.rs | 3 +- crates/controlplane/src/server/event_ws.rs | 120 ++++++++++++++++++ crates/controlplane/src/server/mod.rs | 5 +- 7 files changed, 156 insertions(+), 5 deletions(-) rename crates/controlplane/src/server/{websocket.rs => agent_ws.rs} (99%) create mode 100644 crates/controlplane/src/server/event_ws.rs diff --git a/crates/common/src/binaries.rs b/crates/common/src/binaries.rs index 9d94c75d..9543d3d3 100644 --- a/crates/common/src/binaries.rs +++ b/crates/common/src/binaries.rs @@ -137,7 +137,7 @@ impl Serialize for BinarySource { } impl<'de> Deserialize<'de> for BinarySource { - fn deserialize(deserializer: D) -> Result + fn deserialize(deserializer: D) -> Result where D: 
Deserializer<'de>, { diff --git a/crates/common/src/events/filter_parse.rs b/crates/common/src/events/filter_parse.rs index 333800bc..6e29efad 100644 --- a/crates/common/src/events/filter_parse.rs +++ b/crates/common/src/events/filter_parse.rs @@ -1,5 +1,7 @@ use std::{fmt::Display, str::FromStr, sync::Arc}; +use serde::{Deserialize, Serialize, Serializer}; + use super::EventFilter; use crate::events::EventKindFilter; use crate::node_targets::{NodeTarget, NodeTargets}; @@ -303,3 +305,23 @@ impl FromStr for EventFilter { Ok(filter) } } + +impl<'de> Deserialize<'de> for EventFilter { + fn deserialize(deserializer: D) -> Result + where + D: serde::Deserializer<'de>, + { + String::deserialize(deserializer)? + .parse() + .map_err(serde::de::Error::custom) + } +} + +impl Serialize for EventFilter { + fn serialize(&self, serializer: S) -> Result + where + S: Serializer, + { + serializer.serialize_str(&self.to_string()) + } +} diff --git a/crates/common/src/events/models.rs b/crates/common/src/events/models.rs index fe33573c..a6e7a491 100644 --- a/crates/common/src/events/models.rs +++ b/crates/common/src/events/models.rs @@ -3,6 +3,7 @@ use std::{fmt::Display, str::FromStr, sync::Arc}; use chrono::{DateTime, Utc}; use serde::{Deserialize, Serialize}; +use super::EventFilter; use crate::{ aot_cmds::Authorization, rpc::error::ReconcileError, @@ -12,6 +13,13 @@ use crate::{ }, }; +#[derive(Clone, Debug, Serialize, Deserialize)] +#[serde(tag = "action", rename_all = "snake_case")] +pub enum EventWsRequest { + Subscribe { id: u32, filter: EventFilter }, + Unsubscribe { id: u32 }, +} + #[derive(Clone, Debug, Serialize, Deserialize)] pub struct Event { pub created_at: DateTime, diff --git a/crates/controlplane/src/server/websocket.rs b/crates/controlplane/src/server/agent_ws.rs similarity index 99% rename from crates/controlplane/src/server/websocket.rs rename to crates/controlplane/src/server/agent_ws.rs index 4c2f6796..ff2d25d2 100644 --- a/crates/controlplane/src/server/websocket.rs +++ b/crates/controlplane/src/server/agent_ws.rs @@ -70,7 +70,6 @@ pub async fn agent_ws_handler( } ws.on_upgrade(|socket| handle_socket(socket, headers, state, query)) - .into_response() } async fn handle_socket( diff --git a/crates/controlplane/src/server/api.rs b/crates/controlplane/src/server/api.rs index 8ba54d29..a779a71c 100644 --- a/crates/controlplane/src/server/api.rs +++ b/crates/controlplane/src/server/api.rs @@ -19,7 +19,7 @@ use snops_common::{ }; use tarpc::context; -use super::{actions, error::ServerError, models::AgentStatusResponse}; +use super::{actions, error::ServerError, event_ws, models::AgentStatusResponse}; use crate::{ cannon::{router::redirect_cannon_routes, source::QueryTarget}, make_env_filter, @@ -42,6 +42,7 @@ macro_rules! 
unwrap_or_not_found { pub(super) fn routes() -> Router { Router::new() + .route("/events", get(event_ws::event_ws_handler)) .route("/log/:level", post(set_log_level)) .route("/agents", get(get_agents)) .route("/agents/:id", get(get_agent)) diff --git a/crates/controlplane/src/server/event_ws.rs b/crates/controlplane/src/server/event_ws.rs new file mode 100644 index 00000000..b0af7513 --- /dev/null +++ b/crates/controlplane/src/server/event_ws.rs @@ -0,0 +1,120 @@ +use std::collections::HashMap; + +use axum::{ + extract::{ + ws::{Message, WebSocket}, + Query, State, WebSocketUpgrade, + }, + response::Response, +}; +use serde::Deserialize; +use snops_common::events::{EventFilter, EventWsRequest}; +use tokio::select; + +use crate::{events::EventSubscriber, state::AppState}; + +#[derive(Debug, Deserialize)] +pub struct EventWsQuery { + #[serde(default)] + pub filter: Option, +} + +pub async fn event_ws_handler( + ws: WebSocketUpgrade, + State(state): State, + Query(query): Query, +) -> Response { + ws.on_upgrade(|socket| async { + EventWsHandler::new(state, query.filter) + .handle_ws(socket) + .await + }) +} + +struct EventWsHandler { + base_filter: Option, + subscriber: EventSubscriber, + extra_filters: HashMap, +} + +impl EventWsHandler { + fn new(state: AppState, base_filter: Option) -> Self { + let subscriber = match base_filter.clone() { + Some(filter) => state.events.subscribe_on(filter), + // Listen to no events by default + None => state.events.subscribe_on(!EventFilter::Unfiltered), + }; + Self { + base_filter, + subscriber, + extra_filters: Default::default(), + } + } + + /// Update the subscriber filter based on the base filter and extra filters + fn update_subscriber(&mut self) { + if self.extra_filters.is_empty() && self.base_filter.is_none() { + self.subscriber.set_filter(!EventFilter::Unfiltered); + return; + } + + let base_filter = self.base_filter.clone().unwrap_or(EventFilter::Unfiltered); + + self.subscriber.set_filter( + base_filter + & EventFilter::AnyOf(self.extra_filters.values().cloned().collect::>()), + ); + } + + /// Handle a request from the websocket to subscribe or unsubscribe from + /// events + fn handle_request(&mut self, req: EventWsRequest) { + match req { + EventWsRequest::Subscribe { id, filter } => { + self.extra_filters.insert(id, filter); + self.update_subscriber(); + } + EventWsRequest::Unsubscribe { id } => { + self.extra_filters.remove(&id); + self.update_subscriber(); + } + } + } + + /// Handle the websocket connection, sending events to the client and + /// handling requests to subscribe or unsubscribe from the client + async fn handle_ws(&mut self, mut socket: WebSocket) { + loop { + select! 
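// This select loop multiplexes two directions: inbound subscription requests
// from the websocket client and outbound events from the control plane.
// Client requests are JSON in the shape of `EventWsRequest` (internally tagged
// with "action", snake_case variants), with the filter given in its string
// form, e.g.:
//
//     {"action": "subscribe", "id": 0, "filter": "agent-connected"}
//     {"action": "unsubscribe", "id": 0}
//
// (the id values above are illustrative; clients pick their own ids).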
{ + msg = socket.recv() => { + // Parse the message + let req = match msg { + Some(Ok(Message::Text(text))) => serde_json::from_str::(&text), + Some(Ok(Message::Binary(bin))) => serde_json::from_slice::(&bin), + Some(Err(_)) | None => break, + _ => continue, + }; + // Handle the request + match req { + Ok(req) => self.handle_request(req), + Err(_e) => break, + } + } + // Forward events to the client + Ok(event) = self.subscriber.next() => { + let json = match serde_json::to_string(&event) { + Ok(json) => json, + Err(e) => { + tracing::error!("failed to serialize event for websocket: {e}"); + break; + } + }; + if let Err(e) = socket.send(Message::Text(json)).await { + tracing::error!("failed to send event to websocket: {e}"); + break; + } + } + } + } + } +} diff --git a/crates/controlplane/src/server/mod.rs b/crates/controlplane/src/server/mod.rs index 2ee47179..277cceaf 100644 --- a/crates/controlplane/src/server/mod.rs +++ b/crates/controlplane/src/server/mod.rs @@ -9,18 +9,19 @@ use crate::{ }; pub mod actions; +mod agent_ws; mod api; mod content; pub mod error; +mod event_ws; pub mod jwt; pub mod models; pub mod prometheus; mod rpc; -mod websocket; pub async fn start(state: Arc, socket_addr: SocketAddr) -> Result<(), StartError> { let app = Router::new() - .route("/agent", get(websocket::agent_ws_handler)) + .route("/agent", get(agent_ws::agent_ws_handler)) .nest("/api/v1", api::routes()) .nest("/prometheus", prometheus::routes()) .nest("/content", content::init_routes(&state).await) From b7ba17673f14d5e33ffa4eb69a4cf90e79e6734b Mon Sep 17 00:00:00 2001 From: Meshiest Date: Mon, 2 Dec 2024 20:26:09 -0500 Subject: [PATCH 52/68] feat(cli): event listening --- Cargo.lock | 6 + Cargo.toml | 1 + crates/agent/Cargo.toml | 2 +- crates/cli/Cargo.toml | 6 + crates/cli/src/commands/mod.rs | 26 +++- crates/cli/src/events.rs | 124 ++++++++++++++++++ crates/cli/src/lib.rs | 1 + crates/cli/src/main.rs | 6 + crates/common/src/events/filter.rs | 2 +- crates/common/src/events/models.rs | 15 ++- crates/common/src/events/test_filter.rs | 6 +- .../src/server/actions/execute.rs | 2 +- crates/controlplane/src/server/rpc.rs | 2 +- crates/controlplane/src/state/transactions.rs | 12 +- 14 files changed, 192 insertions(+), 19 deletions(-) create mode 100644 crates/cli/src/events.rs create mode 100644 crates/cli/src/lib.rs diff --git a/Cargo.lock b/Cargo.lock index 6521cd21..84ec8aa4 100644 --- a/Cargo.lock +++ b/Cargo.lock @@ -4637,9 +4637,15 @@ dependencies = [ "clap", "clap-stdin", "clap_complete", + "futures-util", + "http 1.1.0", "reqwest 0.12.8", + "rustls 0.23.15", + "serde", "serde_json", "snops-common", + "tokio", + "tokio-tungstenite", ] [[package]] diff --git a/Cargo.toml b/Cargo.toml index 910513c1..68b2224b 100644 --- a/Cargo.toml +++ b/Cargo.toml @@ -81,6 +81,7 @@ reqwest = { version = "0.12", default-features = false, features = [ ] } # Can't update this cause snarkos/vm rocksdb = { version = "0.21", default-features = false } +rustls = { version = "0.23.15", features = ["ring"] } semver = { version = "1.0", features = ["serde"] } serde = { version = "1", default-features = false, features = [ "alloc", diff --git a/crates/agent/Cargo.toml b/crates/agent/Cargo.toml index 6f01822c..de721390 100644 --- a/crates/agent/Cargo.toml +++ b/crates/agent/Cargo.toml @@ -28,6 +28,7 @@ lazysort.workspace = true local-ip-address.workspace = true nix = { workspace = true, features = ["signal"] } reqwest = { workspace = true, features = ["json", "stream"] } +rustls.workspace = true serde_json.workspace = true 
sha2.workspace = true simple_moving_average.workspace = true @@ -46,4 +47,3 @@ tracing-appender.workspace = true tracing.workspace = true tracing-subscriber.workspace = true url.workspace = true -rustls = { version = "0.23.15", features = ["ring"] } diff --git a/crates/cli/Cargo.toml b/crates/cli/Cargo.toml index fb71f540..14fbae9c 100644 --- a/crates/cli/Cargo.toml +++ b/crates/cli/Cargo.toml @@ -16,6 +16,12 @@ anyhow.workspace = true clap.workspace = true clap_complete.workspace = true clap-stdin.workspace = true +futures-util.workspace = true +http.workspace = true reqwest = { workspace = true, features = ["blocking", "json"] } +rustls.workspace = true +serde.workspace = true serde_json.workspace = true snops-common = { workspace = true, features = ["aot_cmds"] } +tokio = { workspace = true, features = ["macros", "signal", "rt-multi-thread"] } +tokio-tungstenite.workspace = true diff --git a/crates/cli/src/commands/mod.rs b/crates/cli/src/commands/mod.rs index a1f9edf7..92931062 100644 --- a/crates/cli/src/commands/mod.rs +++ b/crates/cli/src/commands/mod.rs @@ -1,8 +1,9 @@ use anyhow::Result; use clap::{CommandFactory, Parser}; use serde_json::Value; +use snops_common::events::EventFilter; -use crate::Cli; +use crate::{events::EventsClient, Cli}; /// The dummy value for the ids to hack around the missing required argument. pub(crate) static DUMMY_ID: &str = "dummy_value___"; @@ -25,6 +26,12 @@ pub enum Commands { SetLogLevel { level: String, }, + /// Listen to events from the control plane, optionally filtered. + Events { + /// The event filter to apply, such as `agent-connected` or + /// `all-of(env-is(default),node-target-is(validator/any))` + filter: Option, + }, #[cfg(feature = "mangen")] Man(snops_common::mangen::Mangen), #[cfg(feature = "clipages")] @@ -32,7 +39,8 @@ pub enum Commands { } impl Commands { - pub fn run(self, url: &str) -> Result<()> { + #[tokio::main] + pub async fn run(self, url: &str) -> Result<()> { let client = reqwest::blocking::Client::new(); let response = match self { @@ -49,6 +57,20 @@ impl Commands { client.post(format!("{url}/api/v1/log/{level}")).send()?; return Ok(()); } + Commands::Events { filter } => { + let mut client = EventsClient::open_with_filter(url, filter).await?; + loop { + tokio::select! 
{ + _ = tokio::signal::ctrl_c() => break, + res = client.next() => { + let event = res?; + println!("{}", serde_json::to_string_pretty(&event)?); + } + } + } + client.close().await?; + return Ok(()); + } #[cfg(feature = "mangen")] Commands::Man(mangen) => { mangen.run( diff --git a/crates/cli/src/events.rs b/crates/cli/src/events.rs new file mode 100644 index 00000000..24f020e2 --- /dev/null +++ b/crates/cli/src/events.rs @@ -0,0 +1,124 @@ +// subscription code is not in use yet +#![allow(dead_code)] + +use std::{collections::HashSet, str::FromStr, time::Duration}; + +use anyhow::{bail, Context, Result}; +use futures_util::{SinkExt, StreamExt}; +use http::Uri; +use snops_common::events::{Event, EventFilter, EventWsRequest}; +use tokio::{net::TcpStream, select}; +use tokio_tungstenite::{ + connect_async, + tungstenite::{self, client::IntoClientRequest}, + MaybeTlsStream, WebSocketStream, +}; + +pub struct EventsClient { + counter: u32, + stream: WebSocketStream>, + subscriptions: HashSet, + ping_interval: tokio::time::Interval, +} + +impl EventsClient { + pub async fn open(url: &str) -> Result { + Self::open_with_filter(url, None).await + } + + pub async fn open_with_filter(url: &str, filter: Option) -> Result { + let (proto, hostname) = url.split_once("://").unwrap_or(("http", url)); + let proto = match proto { + "wss" | "https" => "wss", + _ => "ws", + }; + + let req = Uri::from_str(&match filter { + Some(filter) => format!("{proto}://{hostname}/api/v1/events?filter={filter}"), + None => format!("{proto}://{hostname}/api/v1/events"), + }) + .context("Invalid URI")? + .into_client_request() + .context("Invalid websocket request")?; + + let stream = match connect_async(req).await { + Ok((stream, _)) => stream, + Err(tungstenite::Error::Io(e)) if e.kind() == std::io::ErrorKind::ConnectionRefused => { + bail!("Failed to connect to websocket: Connection refused") + } + Err(e) => bail!("Failed to connect to websocket: {}", e), + }; + + Ok(Self { + counter: 0, + stream, + subscriptions: Default::default(), + ping_interval: tokio::time::interval(Duration::from_secs(10)), + }) + } + + async fn send_json(&mut self, msg: impl serde::Serialize) -> Result<()> { + self.stream + .send(tungstenite::Message::Text( + serde_json::to_string(&msg).context("Failed to serialize message")?, + )) + .await + .context("Failed to send message") + } + + /// Add an additional filter to the current subscription + pub async fn subscribe(&mut self, filter: EventFilter) -> Result { + let id = self.counter; + self.send_json(EventWsRequest::Subscribe { id, filter }) + .await?; + self.counter = self.counter.saturating_add(1); + self.subscriptions.insert(id); + Ok(id) + } + + /// Remove a filter from the current subscription + pub async fn unsubscribe(&mut self, id: u32) -> Result<()> { + if !self.subscriptions.remove(&id) { + bail!("Subscription not found: {}", id); + } + self.send_json(EventWsRequest::Unsubscribe { id }).await?; + Ok(()) + } + + /// Remove all filters from the current subscription + pub async fn unsubscribe_all(&mut self) -> Result<()> { + // Collect the ids to avoid borrowing issues + for id in self.subscriptions.drain().collect::>() { + self.send_json(EventWsRequest::Unsubscribe { id }).await?; + } + Ok(()) + } + + /// Get the next event from the stream + pub async fn next(&mut self) -> Result { + loop { + select! 
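// Usage sketch for this client (the URL and filter are illustrative
// assumptions, mirroring what the `Commands::Events` handler in
// crates/cli/src/commands/mod.rs does):
//
//     let mut client = EventsClient::open_with_filter(
//         "http://localhost:1234",
//         Some("agent-connected".parse().unwrap()),
//     )
//     .await?;
//     while let Ok(event) = client.next().await {
//         println!("{}", serde_json::to_string_pretty(&event)?);
//     }
//     client.close().await?;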
{ + _ = self.ping_interval.tick() => { + self.stream.send(tungstenite::Message::Ping(vec![b'p', b'i', b'n', b'g'])).await.context("Failed to send ping")?; + } + msg = self.stream.next() => { + match msg { + Some(Ok(tungstenite::Message::Text(text))) => + return serde_json::from_str(&text).context("Failed to parse event"), + Some(Ok(tungstenite::Message::Binary(bin))) => + return serde_json::from_slice(&bin).context("Failed to parse event"), + None | Some(Err(_)) => bail!("Websocket closed"), + Some(Ok(_)) => continue, + + } + } + } + } + } + + /// Close the websocket connection + pub async fn close(mut self) -> Result<()> { + self.stream.close(None).await?; + Ok(()) + } +} diff --git a/crates/cli/src/lib.rs b/crates/cli/src/lib.rs new file mode 100644 index 00000000..a9970c28 --- /dev/null +++ b/crates/cli/src/lib.rs @@ -0,0 +1 @@ +pub mod events; diff --git a/crates/cli/src/main.rs b/crates/cli/src/main.rs index 5025c0f0..24e2b2a8 100644 --- a/crates/cli/src/main.rs +++ b/crates/cli/src/main.rs @@ -6,10 +6,16 @@ use clap::Parser; mod cli; pub(crate) use cli::*; +mod events; + mod commands; pub(crate) use commands::*; fn main() -> Result<()> { + rustls::crypto::ring::default_provider() + .install_default() + .expect("Failed to install rustls crypto provider"); + let cli = cli::Cli::parse(); if let Err(err) = cli.run() { diff --git a/crates/common/src/events/filter.rs b/crates/common/src/events/filter.rs index da13bf12..0828ff10 100644 --- a/crates/common/src/events/filter.rs +++ b/crates/common/src/events/filter.rs @@ -51,7 +51,7 @@ impl Event { self.transaction.as_ref() == Some(transaction) } EventFilter::CannonIs(cannon) => self.cannon == Some(*cannon), - EventFilter::EventIs(kind) => self.kind.filter() == *kind, + EventFilter::EventIs(kind) => self.content.filter() == *kind, EventFilter::NodeKeyIs(node_key) => self.node_key.as_ref() == Some(node_key), EventFilter::NodeTargetIs(node_targets) => self .node_key diff --git a/crates/common/src/events/models.rs b/crates/common/src/events/models.rs index a6e7a491..6acf49e1 100644 --- a/crates/common/src/events/models.rs +++ b/crates/common/src/events/models.rs @@ -23,12 +23,17 @@ pub enum EventWsRequest { #[derive(Clone, Debug, Serialize, Deserialize)] pub struct Event { pub created_at: DateTime, + #[serde(skip_serializing_if = "Option::is_none")] pub agent: Option, + #[serde(skip_serializing_if = "Option::is_none")] pub node_key: Option, + #[serde(skip_serializing_if = "Option::is_none")] pub env: Option, + #[serde(skip_serializing_if = "Option::is_none")] pub transaction: Option>, + #[serde(skip_serializing_if = "Option::is_none")] pub cannon: Option, - pub kind: EventKind, + pub content: EventKind, } #[derive(Clone, Debug, Serialize, Deserialize)] @@ -206,7 +211,7 @@ impl Display for EventKindFilter { } impl Event { - pub fn new(kind: EventKind) -> Self { + pub fn new(content: EventKind) -> Self { Self { created_at: Utc::now(), agent: None, @@ -214,11 +219,11 @@ impl Event { env: None, transaction: None, cannon: None, - kind, + content, } } - pub fn replace_kind(&self, kind: impl Into) -> Self { + pub fn replace_content(&self, content: impl Into) -> Self { Self { created_at: Utc::now(), agent: self.agent, @@ -226,7 +231,7 @@ impl Event { env: self.env, transaction: self.transaction.clone(), cannon: self.cannon, - kind: kind.into().kind, + content: content.into().content, } } } diff --git a/crates/common/src/events/test_filter.rs b/crates/common/src/events/test_filter.rs index 3173b173..f162f986 100644 --- 
a/crates/common/src/events/test_filter.rs +++ b/crates/common/src/events/test_filter.rs @@ -49,7 +49,7 @@ fn test_all_of() { env: Some(*B), transaction: None, cannon: None, - kind: Agent(Connected), + content: Agent(Connected), }; assert!(e.matches(&(AgentConnected & AgentIs(*A)))); @@ -76,7 +76,7 @@ fn test_any_of() { env: Some(*B), transaction: None, cannon: None, - kind: Agent(Connected), + content: Agent(Connected), }; assert!(e.matches(&(AgentConnected | AgentIs(*A)))); @@ -107,7 +107,7 @@ fn test_one_of() { env: Some(*B), transaction: None, cannon: None, - kind: Agent(Connected), + content: Agent(Connected), }; assert!(e.matches(&(AgentConnected ^ AgentIs(*B)))); diff --git a/crates/controlplane/src/server/actions/execute.rs b/crates/controlplane/src/server/actions/execute.rs index 0d8b62e5..05210309 100644 --- a/crates/controlplane/src/server/actions/execute.rs +++ b/crates/controlplane/src/server/actions/execute.rs @@ -40,7 +40,7 @@ pub async fn execute_status( return Err(ActionError::ExecuteStatusTimeout { tx_id: tx_id.to_string(), agent_id, retries }); }, Ok(ev) = rx.next() => { - let Event{ kind: EventKind::Transaction(ev), agent, .. } = ev.as_ref() else { + let Event{ content: EventKind::Transaction(ev), agent, .. } = ev.as_ref() else { continue; }; diff --git a/crates/controlplane/src/server/rpc.rs b/crates/controlplane/src/server/rpc.rs index 741c07a3..aa8502ea 100644 --- a/crates/controlplane/src/server/rpc.rs +++ b/crates/controlplane/src/server/rpc.rs @@ -224,7 +224,7 @@ impl ControlService for ControlRpcServer { let ev = AgentEvent::ReconcileComplete.with_agent(&agent); let is_complete = status.as_ref().is_ok_and(|e| e.inner.is_some()); - ev.replace_kind(match status { + ev.replace_content(match status { Ok(res) => AgentEvent::Reconcile(res), Err(err) => AgentEvent::ReconcileError(err), }) diff --git a/crates/controlplane/src/state/transactions.rs b/crates/controlplane/src/state/transactions.rs index a63715c9..6f54d1a1 100644 --- a/crates/controlplane/src/state/transactions.rs +++ b/crates/controlplane/src/state/transactions.rs @@ -142,7 +142,7 @@ fn get_pending_transactions(state: &GlobalState) -> Vec<((EnvId, CannonId), Pend if cannon.sink.authorize_attempts.is_some_and(|a| attempts > a) { info!("cannon {env_id}.{cannon_id} removed auth {tx_id} (too many attempts)"); to_remove.push(tx_id); - ev.replace_kind(TransactionEvent::ExecuteExceeded { attempts }) + ev.replace_content(TransactionEvent::ExecuteExceeded { attempts }) .emit(state); } else { to_execute.push((tx_id, tx.index)); @@ -155,7 +155,7 @@ fn get_pending_transactions(state: &GlobalState) -> Vec<((EnvId, CannonId), Pend { if cannon.sink.authorize_attempts.is_some_and(|a| attempts > a) { info!("cannon {env_id}.{cannon_id} removed auth {tx_id} (too many attempts)"); - ev.replace_kind(TransactionEvent::ExecuteExceeded { attempts }) + ev.replace_content(TransactionEvent::ExecuteExceeded { attempts }) .emit(state); to_remove.push(tx_id); } else { @@ -166,7 +166,7 @@ fn get_pending_transactions(state: &GlobalState) -> Vec<((EnvId, CannonId), Pend TransactionSendState::Unsent => { if cannon.sink.broadcast_attempts.is_some_and(|a| attempts > a) { info!("cannon {env_id}.{cannon_id} removed broadcast {tx_id} (too many attempts)"); - ev.replace_kind(TransactionEvent::BroadcastExceeded { attempts }) + ev.replace_content(TransactionEvent::BroadcastExceeded { attempts }) .emit(state); to_remove.push(tx_id); } else { @@ -204,8 +204,10 @@ fn get_pending_transactions(state: &GlobalState) -> Vec<((EnvId, CannonId), Pend { if 
cannon.sink.broadcast_attempts.is_some_and(|a| attempts > a) { info!("cannon {env_id}.{cannon_id} removed broadcast {tx_id} (too many attempts)"); - ev.replace_kind(TransactionEvent::BroadcastExceeded { attempts }) - .emit(state); + ev.replace_content(TransactionEvent::BroadcastExceeded { + attempts, + }) + .emit(state); to_remove.push(tx_id); } else { to_broadcast.push((tx_id, tx.index)); From fee3dcaeea8c58660b33aa773e3b81009ab26d50 Mon Sep 17 00:00:00 2001 From: Meshiest Date: Mon, 2 Dec 2024 20:39:43 -0500 Subject: [PATCH 53/68] feat(events): has-* event filters --- crates/common/src/events/filter.rs | 20 ++++++++++++++++++++ crates/common/src/events/filter_parse.rs | 5 +++++ 2 files changed, 25 insertions(+) diff --git a/crates/common/src/events/filter.rs b/crates/common/src/events/filter.rs index 0828ff10..ba506cad 100644 --- a/crates/common/src/events/filter.rs +++ b/crates/common/src/events/filter.rs @@ -23,15 +23,25 @@ pub enum EventFilter { /// Filter by agent ID AgentIs(AgentId), + /// Filter by events that have any agent + HasAgent, /// Filter by environment ID EnvIs(EnvId), + /// Filter by events that have any environment + HasEnv, /// Filter by transaction ID TransactionIs(Arc), + /// Filter by events that have any transaction + HasTransaction, /// Filter by cannon ID CannonIs(InternedId), + /// Filter by events that have any cannon + HasCannon, /// Filter by event kind EventIs(EventKindFilter), /// Filter by node key + HasNodeKey, + /// Filter by node key NodeKeyIs(NodeKey), /// Filter by node target NodeTargetIs(NodeTargets), @@ -46,13 +56,18 @@ impl Event { EventFilter::OneOf(filters) => filters.iter().filter(|f| self.matches(f)).count() == 1, EventFilter::Not(f) => !self.matches(f), EventFilter::AgentIs(agent) => self.agent == Some(*agent), + EventFilter::HasAgent => self.agent.is_some(), EventFilter::EnvIs(env) => self.env == Some(*env), + EventFilter::HasEnv => self.env.is_some(), EventFilter::TransactionIs(transaction) => { self.transaction.as_ref() == Some(transaction) } + EventFilter::HasTransaction => self.transaction.is_some(), EventFilter::CannonIs(cannon) => self.cannon == Some(*cannon), + EventFilter::HasCannon => self.cannon.is_some(), EventFilter::EventIs(kind) => self.content.filter() == *kind, EventFilter::NodeKeyIs(node_key) => self.node_key.as_ref() == Some(node_key), + EventFilter::HasNodeKey => self.node_key.is_some(), EventFilter::NodeTargetIs(node_targets) => self .node_key .as_ref() @@ -78,11 +93,16 @@ impl Display for EventFilter { EventFilter::OneOf(vec) => write!(f, "one-of({})", event_filter_vec(vec)), EventFilter::Not(event_filter) => write!(f, "not({})", event_filter), EventFilter::AgentIs(id) => write!(f, "agent-is({id})"), + EventFilter::HasAgent => write!(f, "has-agent"), EventFilter::EnvIs(id) => write!(f, "env-is({id})"), + EventFilter::HasEnv => write!(f, "has-env"), EventFilter::TransactionIs(str) => write!(f, "transaction-is({str})"), + EventFilter::HasTransaction => write!(f, "has-transaction"), EventFilter::CannonIs(id) => write!(f, "cannon-is({id})"), + EventFilter::HasCannon => write!(f, "has-cannon"), EventFilter::EventIs(event) => write!(f, "event-is({event})"), EventFilter::NodeKeyIs(node_key) => write!(f, "node-key-is({node_key})"), + EventFilter::HasNodeKey => write!(f, "has-node-key"), EventFilter::NodeTargetIs(node_targets) => write!(f, "node-target-is({node_targets})"), } } diff --git a/crates/common/src/events/filter_parse.rs b/crates/common/src/events/filter_parse.rs index 6e29efad..252926e3 100644 --- 
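// Editor's sketch (not part of the patch): a self-contained picture of the new
// `has-*` filters. `MiniEvent`/`MiniFilter` are stand-ins for the real
// `Event`/`EventFilter` types, but the matching rules mirror the hunk above:
// `AgentIs` compares against a specific id, while `HasAgent` only checks
// presence of any agent on the event.
#[derive(Debug, Clone, PartialEq)]
enum MiniFilter {
    AgentIs(u32),
    HasAgent,
}

struct MiniEvent {
    agent: Option<u32>,
}

impl MiniEvent {
    fn matches(&self, filter: &MiniFilter) -> bool {
        match filter {
            MiniFilter::AgentIs(id) => self.agent == Some(*id),
            MiniFilter::HasAgent => self.agent.is_some(),
        }
    }
}

fn main() {
    let ev = MiniEvent { agent: Some(7) };
    assert!(ev.matches(&MiniFilter::HasAgent));
    assert!(ev.matches(&MiniFilter::AgentIs(7)));
    assert!(!ev.matches(&MiniFilter::AgentIs(8)));
}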
a/crates/common/src/events/filter_parse.rs +++ b/crates/common/src/events/filter_parse.rs @@ -218,14 +218,19 @@ impl<'a> FilterParser<'a> { "not" => self.parens(|t| Ok(Not(Box::new(t.expect_filter()?)))), "agent-is" => self.parens(|t| expect_parsed(t.next(), P::AgentId).map(AgentIs)), + "has-agent" => Ok(HasAgent), "env-is" => self.parens(|t| expect_parsed(t.next(), P::EnvId).map(EnvIs)), + "has-env" => Ok(HasEnv), "transaction-is" => self.parens(|t| { expect_token(t.next(), P::TransactionId, |token| token.text()) .map(|t| TransactionIs(Arc::new(t.to_string()))) }), + "has-transaction" => Ok(HasTransaction), "cannon-is" => self.parens(|t| expect_parsed(t.next(), P::CannonId).map(CannonIs)), + "has-cannon" => Ok(HasCannon), "event-is" => self.parens(|t| expect_parsed(t.next(), P::EventKind).map(EventIs)), "node-key-is" => self.parens(|t| expect_parsed(t.next(), P::NodeKey).map(NodeKeyIs)), + "has-node-key" => Ok(HasNodeKey), "node-target-is" => self.parens(|t| { t.vec_of(|t| expect_parsed::(t.next(), P::NodeTarget)) .map(|v| NodeTargetIs(NodeTargets::from(v))) From 9aab506e752c20b9a0951dce79170facee957d66 Mon Sep 17 00:00:00 2001 From: Meshiest Date: Mon, 2 Dec 2024 20:39:59 -0500 Subject: [PATCH 54/68] fix(cli): fix filters not being url encoded --- Cargo.lock | 1 + crates/cli/Cargo.toml | 1 + crates/cli/src/events.rs | 9 ++++++--- 3 files changed, 8 insertions(+), 3 deletions(-) diff --git a/Cargo.lock b/Cargo.lock index 84ec8aa4..ac72427f 100644 --- a/Cargo.lock +++ b/Cargo.lock @@ -4646,6 +4646,7 @@ dependencies = [ "snops-common", "tokio", "tokio-tungstenite", + "urlencoding", ] [[package]] diff --git a/crates/cli/Cargo.toml b/crates/cli/Cargo.toml index 14fbae9c..9b5d7ac1 100644 --- a/crates/cli/Cargo.toml +++ b/crates/cli/Cargo.toml @@ -25,3 +25,4 @@ serde_json.workspace = true snops-common = { workspace = true, features = ["aot_cmds"] } tokio = { workspace = true, features = ["macros", "signal", "rt-multi-thread"] } tokio-tungstenite.workspace = true +urlencoding = "2.1.3" diff --git a/crates/cli/src/events.rs b/crates/cli/src/events.rs index 24f020e2..22a5bf7e 100644 --- a/crates/cli/src/events.rs +++ b/crates/cli/src/events.rs @@ -34,7 +34,10 @@ impl EventsClient { }; let req = Uri::from_str(&match filter { - Some(filter) => format!("{proto}://{hostname}/api/v1/events?filter={filter}"), + Some(filter) => format!( + "{proto}://{hostname}/api/v1/events?filter={}", + urlencoding::encode(&filter.to_string()) + ), None => format!("{proto}://{hostname}/api/v1/events"), }) .context("Invalid URI")? 
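// Editor's sketch (not part of the patch): why the filter now gets
// percent-encoded before being placed in the events query string. Filter
// expressions such as `all-of(env-is(default),node-target-is(validator/any))`
// contain parentheses, commas and slashes that are not safe to embed verbatim
// in a URL. Uses the `urlencoding` crate added by this commit; the hostname is
// a stand-in.
fn main() {
    let hostname = "localhost:1234";
    let filter = "all-of(env-is(default),node-target-is(validator/any))";
    let url = format!(
        "ws://{hostname}/api/v1/events?filter={}",
        urlencoding::encode(filter)
    );
    // Parentheses, commas and slashes come out as %28, %2C, %2F, etc.
    println!("{url}");
}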
@@ -104,9 +107,9 @@ impl EventsClient { msg = self.stream.next() => { match msg { Some(Ok(tungstenite::Message::Text(text))) => - return serde_json::from_str(&text).context("Failed to parse event"), + return serde_json::from_str(&text).with_context(|| format!("Failed to parse event: {text}")), Some(Ok(tungstenite::Message::Binary(bin))) => - return serde_json::from_slice(&bin).context("Failed to parse event"), + return serde_json::from_slice(&bin).with_context(|| format!("Failed to parse event: {}", String::from_utf8_lossy(&bin))), None | Some(Err(_)) => bail!("Websocket closed"), Some(Ok(_)) => continue, From a195ce077e4fbdf016d55a7e2d778e2927f8b60f Mon Sep 17 00:00:00 2001 From: Meshiest Date: Mon, 2 Dec 2024 20:43:31 -0500 Subject: [PATCH 55/68] fix(events): fix auth and transactions serialization collision on untagged values --- crates/common/src/events/models.rs | 8 ++++---- crates/controlplane/src/cannon/context.rs | 8 +++++--- crates/controlplane/src/cannon/source.rs | 10 ++++++---- crates/controlplane/src/server/actions/execute.rs | 2 +- 4 files changed, 16 insertions(+), 12 deletions(-) diff --git a/crates/common/src/events/models.rs b/crates/common/src/events/models.rs index 6acf49e1..3e0570c1 100644 --- a/crates/common/src/events/models.rs +++ b/crates/common/src/events/models.rs @@ -68,7 +68,7 @@ pub enum AgentEvent { #[serde(tag = "kind", rename_all = "snake_case")] pub enum TransactionEvent { /// The authorization was inserted into the cannon - AuthorizationReceived(Arc), + AuthorizationReceived { authorization: Arc }, /// The transaction execution was aborted ExecuteAborted(TransactionAbortReason), /// The transaction is awaiting compute resources @@ -80,7 +80,7 @@ pub enum TransactionEvent { /// The transaction is currently executing Executing, /// The transaction execution is complete - ExecuteComplete(Arc), + ExecuteComplete { transaction: Arc }, /// The transaction has been broadcasted Broadcasted { height: Option, @@ -138,13 +138,13 @@ impl EventKind { Agent(ReconcileError(_)) => AgentReconcileError, Agent(NodeStatus(_)) => AgentNodeStatus, Agent(BlockInfo(_)) => AgentBlockInfo, - Transaction(AuthorizationReceived(_)) => TransactionAuthorizationReceived, + Transaction(AuthorizationReceived { .. }) => TransactionAuthorizationReceived, Transaction(ExecuteAborted(_)) => TransactionExecuteAborted, Transaction(ExecuteAwaitingCompute) => TransactionExecuteAwaitingCompute, Transaction(ExecuteExceeded { .. }) => TransactionExecuteExceeded, Transaction(ExecuteFailed(_)) => TransactionExecuteFailed, Transaction(Executing) => TransactionExecuting, - Transaction(ExecuteComplete(_)) => TransactionExecuteComplete, + Transaction(ExecuteComplete { .. }) => TransactionExecuteComplete, Transaction(Broadcasted { .. }) => TransactionBroadcasted, Transaction(BroadcastExceeded { .. }) => TransactionBroadcastExceeded, Transaction(Confirmed { .. 
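// Editor's sketch (not part of the patch): the shape of the serialization fix
// above. Wrapping the payload in a named field (`authorization: ...`) keeps it
// nested under its own key, separate from the enum tag. With the previous
// newtype variant, a free-form JSON payload shared the top-level object with
// the tag, which is likely the "collision" this commit refers to. The types
// below are stand-ins, not the real TransactionEvent/Authorization.
use serde::Serialize;
use serde_json::json;

#[derive(Serialize)]
#[serde(tag = "kind", rename_all = "snake_case")]
enum SketchEvent {
    AuthorizationReceived { authorization: serde_json::Value },
}

fn main() -> serde_json::Result<()> {
    let ev = SketchEvent::AuthorizationReceived {
        authorization: json!({ "type": "program", "auth": "..." }),
    };
    // Prints {"kind":"authorization_received","authorization":{...}} —
    // the payload stays under "authorization" instead of being merged into
    // the same object as the "kind" tag.
    println!("{}", serde_json::to_string(&ev)?);
    Ok(())
}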
}) => TransactionConfirmed, diff --git a/crates/controlplane/src/cannon/context.rs b/crates/controlplane/src/cannon/context.rs index 6f85bd21..54952e2d 100644 --- a/crates/controlplane/src/cannon/context.rs +++ b/crates/controlplane/src/cannon/context.rs @@ -179,9 +179,11 @@ impl ExecutionContext { auth: Arc, query_path: &str, ) -> Result<(), (Arc, CannonError)> { - TransactionEvent::AuthorizationReceived(Arc::clone(&auth)) - .with_cannon_ctx(self, tx_id.clone()) - .emit(self); + TransactionEvent::AuthorizationReceived { + authorization: Arc::clone(&auth), + } + .with_cannon_ctx(self, tx_id.clone()) + .emit(self); match self .source .compute diff --git a/crates/controlplane/src/cannon/source.rs b/crates/controlplane/src/cannon/source.rs index 3e0b85c2..1f191160 100644 --- a/crates/controlplane/src/cannon/source.rs +++ b/crates/controlplane/src/cannon/source.rs @@ -243,10 +243,12 @@ impl ComputeTarget { tx.status = TransactionSendState::Unsent; tx.transaction = Some(Arc::clone(&transaction)); } - TransactionEvent::ExecuteComplete(Arc::clone(&transaction)) - .with_cannon_ctx(ctx, Arc::clone(tx_id)) - .with_agent_id(agent_id) - .emit(ctx); + TransactionEvent::ExecuteComplete { + transaction: Arc::clone(&transaction), + } + .with_cannon_ctx(ctx, Arc::clone(tx_id)) + .with_agent_id(agent_id) + .emit(ctx); Ok(()) } diff --git a/crates/controlplane/src/server/actions/execute.rs b/crates/controlplane/src/server/actions/execute.rs index 05210309..c2a82974 100644 --- a/crates/controlplane/src/server/actions/execute.rs +++ b/crates/controlplane/src/server/actions/execute.rs @@ -65,7 +65,7 @@ pub async fn execute_status( ExecuteAwaitingCompute => { retries += 1; }, - ExecuteComplete(transaction) => { + ExecuteComplete { transaction } => { return Ok(Json(json!({ "agent_id": agent_id, "retries": retries, From e9aca2367c2c9cbaf4d13cd3085f533867322066 Mon Sep 17 00:00:00 2001 From: Meshiest Date: Tue, 3 Dec 2024 01:53:34 -0500 Subject: [PATCH 56/68] refactor(rpc): convert rpc codec from bincode to json, cleanup some enum structures for json --- Cargo.lock | 2 -- Cargo.toml | 1 + crates/agent/Cargo.toml | 1 - crates/agent/src/client.rs | 6 ++-- crates/agent/src/reconcile/files.rs | 36 ++++++++++--------- crates/agent/src/reconcile/storage.rs | 30 ++++++++-------- crates/agent/src/server.rs | 6 ++-- crates/aot/src/runner/mod.rs | 2 ++ crates/aot/src/runner/rpc/mod.rs | 6 ++-- crates/cli/src/commands/env/mod.rs | 3 +- crates/common/Cargo.toml | 3 +- crates/common/src/aot_cmds/mod.rs | 4 +-- crates/common/src/db/error.rs | 4 --- crates/common/src/events/models.rs | 26 +++++++------- crates/common/src/rpc/codec.rs | 20 +++++++++++ crates/common/src/rpc/error.rs | 1 + crates/common/src/rpc/mod.rs | 1 + crates/common/src/state/agent_status.rs | 7 ++-- .../src/{aot_cmds => state}/authorization.rs | 2 +- crates/common/src/state/mod.rs | 2 ++ crates/common/src/state/reconcile.rs | 30 +++++++++++++--- crates/controlplane/src/cannon/context.rs | 5 ++- crates/controlplane/src/cannon/mod.rs | 4 +-- crates/controlplane/src/cannon/router.rs | 4 +-- crates/controlplane/src/cannon/source.rs | 8 ++--- crates/controlplane/src/cannon/tracker.rs | 5 ++- crates/controlplane/src/db.rs | 3 +- .../controlplane/src/server/actions/deploy.rs | 4 +-- .../src/server/actions/execute.rs | 4 +-- crates/controlplane/src/server/agent_ws.rs | 6 ++-- crates/controlplane/src/server/rpc.rs | 16 +++++++++ 31 files changed, 155 insertions(+), 97 deletions(-) create mode 100644 crates/common/src/rpc/codec.rs rename crates/common/src/{aot_cmds => 
state}/authorization.rs (98%) diff --git a/Cargo.lock b/Cargo.lock index ac72427f..269cc026 100644 --- a/Cargo.lock +++ b/Cargo.lock @@ -4582,7 +4582,6 @@ version = "0.2.0" dependencies = [ "anyhow", "axum", - "bincode", "bytes", "chrono", "clap", @@ -4654,7 +4653,6 @@ name = "snops-common" version = "0.2.0" dependencies = [ "anyhow", - "bincode", "bytes", "chrono", "clap", diff --git a/Cargo.toml b/Cargo.toml index 68b2224b..07884df6 100644 --- a/Cargo.toml +++ b/Cargo.toml @@ -79,6 +79,7 @@ reqwest = { version = "0.12", default-features = false, features = [ "default-tls", "http2", ] } +rmp-serde = "1.3.0" # Can't update this cause snarkos/vm rocksdb = { version = "0.21", default-features = false } rustls = { version = "0.23.15", features = ["ring"] } diff --git a/crates/agent/Cargo.toml b/crates/agent/Cargo.toml index de721390..2845d20f 100644 --- a/crates/agent/Cargo.toml +++ b/crates/agent/Cargo.toml @@ -14,7 +14,6 @@ mangen = ["snops-common/mangen"] [dependencies] anyhow.workspace = true axum = { workspace = true, features = ["http2", "json", "tokio", "ws"] } -bincode.workspace = true bytes.workspace = true chrono.workspace = true clap.workspace = true diff --git a/crates/agent/src/client.rs b/crates/agent/src/client.rs index 72975fe5..ea94b9c4 100644 --- a/crates/agent/src/client.rs +++ b/crates/agent/src/client.rs @@ -118,7 +118,7 @@ pub async fn ws_connection(ws_req: Request, state: Arc) { error!("internal agent RPC channel closed"); break; }; - let bin = match bincode::serialize(&control::MuxedMessageOutgoing::Child(msg)) { + let bin = match snops_common::rpc::codec::encode(&control::MuxedMessageOutgoing::Child(msg)) { Ok(bin) => bin, Err(e) => { error!("failed to serialize response: {e}"); @@ -139,7 +139,7 @@ pub async fn ws_connection(ws_req: Request, state: Arc) { error!("internal agent RPC channel closed"); break; }; - let bin = match bincode::serialize(&control::MuxedMessageOutgoing::Parent(msg)) { + let bin = match snops_common::rpc::codec::encode(&control::MuxedMessageOutgoing::Parent(msg)) { Ok(bin) => bin, Err(e) => { error!("failed to serialize request: {e}"); @@ -193,7 +193,7 @@ pub async fn ws_connection(ws_req: Request, state: Arc) { } Some(Ok(tungstenite::Message::Binary(bin))) => { - let msg = match bincode::deserialize(&bin) { + let msg = match snops_common::rpc::codec::decode(&bin) { Ok(msg) => msg, Err(e) => { error!("failed to deserialize a message from the control plane: {e}"); diff --git a/crates/agent/src/reconcile/files.rs b/crates/agent/src/reconcile/files.rs index be3e8bba..d470b01a 100644 --- a/crates/agent/src/reconcile/files.rs +++ b/crates/agent/src/reconcile/files.rs @@ -139,10 +139,10 @@ impl Reconcile for FileReconciler { if entry.is_pending() { return Ok(ReconcileStatus::empty() - .add_condition(ReconcileCondition::PendingTransfer( - self.src.to_string(), - tx_id, - )) + .add_condition(ReconcileCondition::PendingTransfer { + source: self.src.to_string(), + id: tx_id, + }) .requeue_after(Duration::from_secs(1))); } @@ -152,11 +152,11 @@ impl Reconcile for FileReconciler { < TimeDelta::seconds(60) { return Ok(ReconcileStatus::empty() - .add_condition(ReconcileCondition::InterruptedTransfer( - self.src.to_string(), - tx_id, - entry.interruption.clone().unwrap_or_default(), - )) + .add_condition(ReconcileCondition::InterruptedTransfer { + source: self.src.to_string(), + id: tx_id, + reason: entry.interruption.clone(), + }) .requeue_after(Duration::from_secs(60))); } @@ -194,9 +194,9 @@ impl Reconcile for FileReconciler { ); return 
Ok(ReconcileStatus::empty() - .add_condition(ReconcileCondition::MissingFile( - self.dst.display().to_string(), - )) + .add_condition(ReconcileCondition::MissingFile { + path: self.dst.display().to_string(), + }) .requeue_after(Duration::from_secs(1))); } @@ -218,7 +218,9 @@ impl Reconcile for FileReconciler { self.tx_id = None; return Ok(ReconcileStatus::empty() - .add_condition(ReconcileCondition::MissingFile(self.src.to_string())) + .add_condition(ReconcileCondition::MissingFile { + path: self.dst.display().to_string(), + }) .requeue_after(Duration::from_secs(1))); } @@ -266,10 +268,10 @@ impl Reconcile for FileReconciler { // transfer is pending - requeue after 1 second with the pending condition Ok(ReconcileStatus::empty() - .add_condition(ReconcileCondition::PendingTransfer( - self.src.to_string(), - tx_id, - )) + .add_condition(ReconcileCondition::PendingTransfer { + source: self.src.to_string(), + id: tx_id, + }) .requeue_after(Duration::from_secs(1))) } } diff --git a/crates/agent/src/reconcile/storage.rs b/crates/agent/src/reconcile/storage.rs index 5b3bbd8b..90383671 100644 --- a/crates/agent/src/reconcile/storage.rs +++ b/crates/agent/src/reconcile/storage.rs @@ -102,7 +102,9 @@ impl<'a> Reconcile<(), ReconcileError> for BinaryReconciler<'a> { trace!("binary is not OK, waiting for the endpoint to come back online..."); Ok(ReconcileStatus::empty() .add_condition(ReconcileCondition::PendingConnection) - .add_condition(ReconcileCondition::MissingFile(SNARKOS_FILE.to_string())) + .add_condition(ReconcileCondition::MissingFile { + path: SNARKOS_FILE.to_string(), + }) .add_scope("binary/offline") .requeue_after(Duration::from_secs(5))) } @@ -181,9 +183,9 @@ impl<'a> Reconcile<(), ReconcileError> for GenesisReconciler<'a> { trace!("genesis is not OK, waiting for the endpoint to come back online..."); Ok(ReconcileStatus::empty() .add_condition(ReconcileCondition::PendingConnection) - .add_condition(ReconcileCondition::MissingFile( - SNARKOS_GENESIS_FILE.to_string(), - )) + .add_condition(ReconcileCondition::MissingFile { + path: SNARKOS_GENESIS_FILE.to_string(), + }) .add_scope("genesis/offline") .requeue_after(Duration::from_secs(5))) } @@ -403,10 +405,9 @@ impl<'a> Reconcile<(), ReconcileError> for LedgerReconciler<'a> { trace!("Pending ledger modification to height {}", target_height.1); return Ok(ReconcileStatus::empty() - .add_condition(ReconcileCondition::PendingProcess(format!( - "ledger modification to height {}", - target_height.1 - ))) + .add_condition(ReconcileCondition::PendingProcess { + process: format!("ledger modification to height {}", target_height.1), + }) .requeue_after(Duration::from_secs(5))); } @@ -416,9 +417,9 @@ impl<'a> Reconcile<(), ReconcileError> for LedgerReconciler<'a> { error!("modify handle missing for pending height"); *self.pending_height = None; return Ok(ReconcileStatus::empty() - .add_condition(ReconcileCondition::InterruptedModify(String::from( - "modify handle missing", - ))) + .add_condition(ReconcileCondition::InterruptedModify { + reason: String::from("modify handle missing"), + }) .requeue_after(Duration::from_secs(1))); }; @@ -426,10 +427,9 @@ impl<'a> Reconcile<(), ReconcileError> for LedgerReconciler<'a> { let Ok(Some(handle)) = modify_handle.1.try_lock().map(|r| r.clone()) else { trace!("Waiting for modify handle to unlock..."); return Ok(ReconcileStatus::empty() - .add_condition(ReconcileCondition::PendingProcess(format!( - "ledger modification to height {}", - target_height.1 - ))) + .add_condition(ReconcileCondition::PendingProcess { + 
process: format!("ledger modification to height {}", target_height.1), + }) .requeue_after(Duration::from_secs(1))); }; diff --git a/crates/agent/src/server.rs b/crates/agent/src/server.rs index d3aaa815..f10ab506 100644 --- a/crates/agent/src/server.rs +++ b/crates/agent/src/server.rs @@ -85,7 +85,7 @@ async fn handle_socket(mut socket: WebSocket, state: AppState) { match msg { Some(Err(_)) | None => break, Some(Ok(Message::Binary(bin))) => { - let msg = match bincode::deserialize(&bin) { + let msg = match snops_common::rpc::codec::decode(&bin) { Ok(msg) => msg, Err(e) => { error!("failed to deserialize a message from node: {e}"); @@ -115,7 +115,7 @@ async fn handle_socket(mut socket: WebSocket, state: AppState) { // handle outgoing requests msg = client_request_out.recv() => { let Some(msg) = msg else { error!("internal node RPC channel closed"); break; }; - let bin = match bincode::serialize(&MuxedMessageOutgoing::Child(msg)) { + let bin = match snops_common::rpc::codec::encode(&MuxedMessageOutgoing::Child(msg)) { Ok(bin) => bin, Err(e) => { error!("failed to serialize a request to node: {e}"); @@ -130,7 +130,7 @@ async fn handle_socket(mut socket: WebSocket, state: AppState) { // handle outgoing response msg = server_response_out.recv() => { let Some(msg) = msg else { error!("internal node RPC channel closed"); break; }; - let bin = match bincode::serialize(&MuxedMessageOutgoing::Parent(msg)) { + let bin = match snops_common::rpc::codec::encode(&MuxedMessageOutgoing::Parent(msg)) { Ok(bin) => bin, Err(e) => { error!("failed to serialize a response to node: {e}"); diff --git a/crates/aot/src/runner/mod.rs b/crates/aot/src/runner/mod.rs index d2f43798..fd3dd59d 100644 --- a/crates/aot/src/runner/mod.rs +++ b/crates/aot/src/runner/mod.rs @@ -226,6 +226,8 @@ impl Runner { .map_err(|e| e.context("create client"))?, }; + agent.status(SnarkOSStatus::Started); + // only monitor block updates if we have a checkpoint manager or agent status // API if manager.is_some() || agent.is_enabled() { diff --git a/crates/aot/src/runner/rpc/mod.rs b/crates/aot/src/runner/rpc/mod.rs index 50a84ccf..3722caf7 100644 --- a/crates/aot/src/runner/rpc/mod.rs +++ b/crates/aot/src/runner/rpc/mod.rs @@ -114,7 +114,7 @@ impl RpcClient { error!("internal RPC channel closed"); break 'event; }; - let bin = bincode::serialize(&MuxedMessageOutgoing::Child(msg)).expect("failed to serialize response"); + let bin = snops_common::rpc::codec::encode(&MuxedMessageOutgoing::Child(msg)).expect("failed to serialize response"); let send = ws_stream.send(tungstenite::Message::Binary(bin)); if tokio::time::timeout(Duration::from_secs(10), send).await.is_err() { error!("The connection to the agent was interrupted while sending node message"); @@ -128,7 +128,7 @@ impl RpcClient { error!("internal RPC channel closed"); break 'event; }; - let bin = bincode::serialize(&MuxedMessageOutgoing::Parent(msg)).expect("failed to serialize request"); + let bin = snops_common::rpc::codec::encode(&MuxedMessageOutgoing::Parent(msg)).expect("failed to serialize request"); let send = ws_stream.send(tungstenite::Message::Binary(bin)); if tokio::time::timeout(Duration::from_secs(10), send).await.is_err() { error!("The connection to the agent was interrupted while sending node message"); @@ -174,7 +174,7 @@ impl RpcClient { } Some(Ok(tungstenite::Message::Binary(bin))) => { - let msg = match bincode::deserialize(&bin) { + let msg = match snops_common::rpc::codec::decode(&bin) { Ok(msg) => msg, Err(e) => { error!("failed to deserialize a message from the 
agent: {e}"); diff --git a/crates/cli/src/commands/env/mod.rs b/crates/cli/src/commands/env/mod.rs index d3b443f6..1aff92a9 100644 --- a/crates/cli/src/commands/env/mod.rs +++ b/crates/cli/src/commands/env/mod.rs @@ -6,9 +6,8 @@ use clap_stdin::FileOrStdin; use reqwest::blocking::{Client, Response}; use snops_common::{ action_models::AleoValue, - aot_cmds::Authorization, key_source::KeySource, - state::{CannonId, InternedId, NodeKey}, + state::{Authorization, CannonId, InternedId, NodeKey}, }; mod action; diff --git a/crates/common/Cargo.toml b/crates/common/Cargo.toml index 1563ef32..3d5268da 100644 --- a/crates/common/Cargo.toml +++ b/crates/common/Cargo.toml @@ -13,7 +13,6 @@ mangen = ["anyhow", "clap_mangen"] [dependencies] anyhow = { workspace = true, optional = true } -bincode.workspace = true bytes.workspace = true chrono = { workspace = true, features = ["serde"] } clap.workspace = true @@ -25,8 +24,8 @@ indexmap = { workspace = true, features = ["std", "serde"] } lasso.workspace = true lazy_static.workspace = true paste.workspace = true -regex.workspace = true rand.workspace = true +regex.workspace = true serde.workspace = true serde_json.workspace = true sha2.workspace = true diff --git a/crates/common/src/aot_cmds/mod.rs b/crates/common/src/aot_cmds/mod.rs index ded792c2..d12dfcb7 100644 --- a/crates/common/src/aot_cmds/mod.rs +++ b/crates/common/src/aot_cmds/mod.rs @@ -5,15 +5,13 @@ use tokio::{ process::{Child, Command}, }; -mod authorization; pub mod error; -pub use authorization::*; pub use error::AotCmdError; use self::error::CommandError; use crate::{ constant::{LEDGER_BASE_DIR, SNARKOS_GENESIS_FILE}, - state::NetworkId, + state::{Authorization, NetworkId}, }; pub struct AotCmd { diff --git a/crates/common/src/db/error.rs b/crates/common/src/db/error.rs index 7760ba39..cce3ffbc 100644 --- a/crates/common/src/db/error.rs +++ b/crates/common/src/db/error.rs @@ -8,10 +8,6 @@ pub enum DatabaseError { DeleteError(String, String, sled::Error), #[error("save error key {0} in {1}: {2}")] SaveError(String, String, sled::Error), - #[error("deserialize value {0} in {1}: {2}")] - DeserializeError(String, String, bincode::Error), - #[error("serialize value {0} in {1}: {2}")] - SerializeError(String, String, bincode::Error), #[error("missing key {0} in {1}")] MissingKey(String, String), #[error("unknown document version {2} for: {1} in {0}")] diff --git a/crates/common/src/events/models.rs b/crates/common/src/events/models.rs index 3e0570c1..fa6b2a2d 100644 --- a/crates/common/src/events/models.rs +++ b/crates/common/src/events/models.rs @@ -5,11 +5,10 @@ use serde::{Deserialize, Serialize}; use super::EventFilter; use crate::{ - aot_cmds::Authorization, rpc::error::ReconcileError, state::{ - AgentId, EnvId, InternedId, LatestBlockInfo, NodeKey, NodeStatus, ReconcileStatus, - TransactionSendState, + AgentId, Authorization, EnvId, InternedId, LatestBlockInfo, NodeKey, NodeStatus, + ReconcileStatus, TransactionSendState, }, }; @@ -23,28 +22,29 @@ pub enum EventWsRequest { #[derive(Clone, Debug, Serialize, Deserialize)] pub struct Event { pub created_at: DateTime, - #[serde(skip_serializing_if = "Option::is_none")] + #[serde(default, skip_serializing_if = "Option::is_none")] pub agent: Option, - #[serde(skip_serializing_if = "Option::is_none")] + #[serde(default, skip_serializing_if = "Option::is_none")] pub node_key: Option, - #[serde(skip_serializing_if = "Option::is_none")] + #[serde(default, skip_serializing_if = "Option::is_none")] pub env: Option, - #[serde(skip_serializing_if = 
"Option::is_none")] + #[serde(default, skip_serializing_if = "Option::is_none")] pub transaction: Option>, - #[serde(skip_serializing_if = "Option::is_none")] + #[serde(default, skip_serializing_if = "Option::is_none")] pub cannon: Option, + #[serde(flatten)] pub content: EventKind, } #[derive(Clone, Debug, Serialize, Deserialize)] -#[serde(tag = "type", rename_all = "snake_case")] +#[serde(tag = "event_kind", rename_all = "snake_case")] pub enum EventKind { Agent(AgentEvent), Transaction(TransactionEvent), } #[derive(Clone, Debug, Serialize, Deserialize)] -#[serde(tag = "kind", rename_all = "snake_case")] +#[serde(tag = "event_name", content = "data", rename_all = "snake_case")] pub enum AgentEvent { /// An agent connects to the control plane Connected, @@ -65,7 +65,7 @@ pub enum AgentEvent { } #[derive(Clone, Debug, Serialize, Deserialize)] -#[serde(tag = "kind", rename_all = "snake_case")] +#[serde(tag = "event_name", content = "data", rename_all = "snake_case")] pub enum TransactionEvent { /// The authorization was inserted into the cannon AuthorizationReceived { authorization: Arc }, @@ -96,7 +96,9 @@ pub enum TransactionEvent { #[serde(tag = "reason", rename_all = "snake_case")] pub enum TransactionAbortReason { MissingTracker, - UnexpectedStatus(TransactionSendState), + UnexpectedStatus { + transaction_status: TransactionSendState, + }, MissingAuthorization, } diff --git a/crates/common/src/rpc/codec.rs b/crates/common/src/rpc/codec.rs new file mode 100644 index 00000000..957a341f --- /dev/null +++ b/crates/common/src/rpc/codec.rs @@ -0,0 +1,20 @@ +// rmp_serde and bincode have various limitations and are troublesome to debug. +// the overhead of JSON for messages is not a concern for the RPC layer. + +pub fn encode(msg: &T) -> serde_json::Result> { + serde_json::to_vec(msg) +} + +pub fn decode<'de, T: serde::Deserialize<'de>>(msg: &'de [u8]) -> serde_json::Result { + serde_json::from_slice(msg) +} + +// pub fn encode(msg: &T) -> Result, +// rmp_serde::encode::Error> { rmp_serde::to_vec(msg) +// } + +// pub fn decode<'de, T: serde::Deserialize<'de>>( +// msg: &'de [u8], +// ) -> Result { +// rmp_serde::from_slice(msg) +// } diff --git a/crates/common/src/rpc/error.rs b/crates/common/src/rpc/error.rs index beb87aac..3a6ecb24 100644 --- a/crates/common/src/rpc/error.rs +++ b/crates/common/src/rpc/error.rs @@ -124,6 +124,7 @@ pub enum ResolveError { } #[derive(Debug, Clone, Error, Serialize, Deserialize, AsRefStr)] +#[serde(tag = "error", content = "message")] pub enum ReconcileError { #[error("node is not connected to the controlplane")] Offline, diff --git a/crates/common/src/rpc/mod.rs b/crates/common/src/rpc/mod.rs index c724dabb..14108486 100644 --- a/crates/common/src/rpc/mod.rs +++ b/crates/common/src/rpc/mod.rs @@ -22,6 +22,7 @@ use tarpc::transport::channel::ChannelError; use tokio::sync::mpsc; pub mod agent; +pub mod codec; pub mod control; pub mod error; diff --git a/crates/common/src/state/agent_status.rs b/crates/common/src/state/agent_status.rs index c9e3217c..5ced9634 100644 --- a/crates/common/src/state/agent_status.rs +++ b/crates/common/src/state/agent_status.rs @@ -9,6 +9,7 @@ use super::{snarkos_status::SnarkOSStatus, ReconcileStatus}; use crate::{format::DataFormat, rpc::error::ReconcileError}; #[derive(Debug, Default, Clone, PartialEq, Eq, Serialize, Deserialize)] +#[serde(rename_all = "snake_case", tag = "status")] pub enum NodeStatus { /// The last known status of the node is unknown #[default] @@ -18,7 +19,7 @@ pub enum NodeStatus { /// The node waiting on 
other tasks to complete before starting PendingStart, /// The node is running - Running(SnarkOSStatus), + Running { running_status: SnarkOSStatus }, /// The node has exited with a status code Exited(u8), /// The node was online and is in the process of shutting down @@ -30,7 +31,9 @@ pub enum NodeStatus { impl From for NodeStatus { fn from(status: SnarkOSStatus) -> Self { - NodeStatus::Running(status) + NodeStatus::Running { + running_status: status, + } } } diff --git a/crates/common/src/aot_cmds/authorization.rs b/crates/common/src/state/authorization.rs similarity index 98% rename from crates/common/src/aot_cmds/authorization.rs rename to crates/common/src/state/authorization.rs index c5d1a2e8..9fb0e2ee 100644 --- a/crates/common/src/aot_cmds/authorization.rs +++ b/crates/common/src/state/authorization.rs @@ -15,7 +15,7 @@ pub enum Authorization { Deploy { owner: Value, deployment: Value, - #[serde(skip_serializing_if = "Option::is_none", default)] + #[serde(default, skip_serializing_if = "Option::is_none")] fee_auth: Option, }, } diff --git a/crates/common/src/state/mod.rs b/crates/common/src/state/mod.rs index 2f13ae83..f228edfe 100644 --- a/crates/common/src/state/mod.rs +++ b/crates/common/src/state/mod.rs @@ -4,6 +4,7 @@ use regex::Regex; mod agent_mode; mod agent_state; mod agent_status; +mod authorization; mod height_request; mod id; mod network; @@ -19,6 +20,7 @@ mod transaction_status; pub use agent_mode::*; pub use agent_state::*; pub use agent_status::*; +pub use authorization::*; pub use height_request::*; pub use id::*; pub use network::*; diff --git a/crates/common/src/state/reconcile.rs b/crates/common/src/state/reconcile.rs index c57b367a..0744dc5f 100644 --- a/crates/common/src/state/reconcile.rs +++ b/crates/common/src/state/reconcile.rs @@ -26,18 +26,24 @@ impl ReconcileOptions { } #[derive(Debug, Clone, PartialEq, Eq, PartialOrd, Ord, Hash, Serialize, Deserialize)] +#[serde(tag = "condition", rename_all = "snake_case")] pub enum ReconcileCondition { /// A file is being transferred. - PendingTransfer(String, TransferId), + PendingTransfer { source: String, id: TransferId }, /// A process is being spawned / confirmed. Could be starting the node or /// manipulating the ledger - PendingProcess(String), + PendingProcess { process: String }, /// A tranfer was started and interrupted. - InterruptedTransfer(String, TransferId, String), + InterruptedTransfer { + source: String, + id: TransferId, + #[serde(default, skip_serializing_if = "Option::is_none")] + reason: Option, + }, /// A modify operation was started and interrupted. - InterruptedModify(String), + InterruptedModify { reason: String }, /// A file is missing and cannot be downloaded at the moment. 
- MissingFile(String), + MissingFile { path: String }, /// Waiting to reconnect to the controlplane PendingConnection, /// Waiting for the node to be shut down @@ -48,12 +54,26 @@ pub enum ReconcileCondition { #[derive(Clone, Debug, Serialize, Deserialize)] pub struct ReconcileStatus { + #[serde(default, skip_serializing_if = "Vec::is_empty")] pub scopes: Vec, + #[serde(default, skip_serializing_if = "Option::is_none")] pub inner: Option, + #[serde(default, skip_serializing_if = "Option::is_none")] pub requeue_after: Option, + #[serde(default, skip_serializing_if = "IndexSet::is_empty")] pub conditions: IndexSet, } +impl Eq for ReconcileStatus {} +impl PartialEq for ReconcileStatus { + fn eq(&self, other: &Self) -> bool { + self.inner == other.inner + && self.conditions == other.conditions + && self.scopes == other.scopes + && self.requeue_after == other.requeue_after + } +} + impl Default for ReconcileStatus { fn default() -> Self { Self::new(Some(Default::default())) diff --git a/crates/controlplane/src/cannon/context.rs b/crates/controlplane/src/cannon/context.rs index 54952e2d..13833bc8 100644 --- a/crates/controlplane/src/cannon/context.rs +++ b/crates/controlplane/src/cannon/context.rs @@ -5,9 +5,8 @@ use dashmap::DashMap; use futures_util::{stream::FuturesUnordered, StreamExt}; use lazysort::SortedBy; use snops_common::{ - aot_cmds::Authorization, events::{Event, TransactionAbortReason, TransactionEvent}, - state::{AgentId, CannonId, EnvId, NetworkId, TransactionSendState}, + state::{AgentId, Authorization, CannonId, EnvId, NetworkId, TransactionSendState}, }; use tracing::{error, trace, warn}; @@ -104,7 +103,7 @@ impl ExecutionContext { if tracker.status != TransactionSendState::Authorized { error!("cannon {env_id}.{cannon_id} unexpected status for {tx_id}: {:?}", tracker.status); // TODO: remove this auth and log it somewhere - TransactionEvent::ExecuteAborted(TransactionAbortReason::UnexpectedStatus(tracker.status)).with_cannon_ctx(&self, tx_id).emit(&self); + TransactionEvent::ExecuteAborted(TransactionAbortReason::UnexpectedStatus{ transaction_status: tracker.status}).with_cannon_ctx(&self, tx_id).emit(&self); continue; } // ensure the transaction has an authorization (more than likely unreachable) diff --git a/crates/controlplane/src/cannon/mod.rs b/crates/controlplane/src/cannon/mod.rs index ae8d3d37..f70b04ee 100644 --- a/crates/controlplane/src/cannon/mod.rs +++ b/crates/controlplane/src/cannon/mod.rs @@ -18,9 +18,9 @@ use std::{ use context::ExecutionContext; use dashmap::DashMap; use snops_common::{ - aot_cmds::{AotCmd, Authorization}, + aot_cmds::AotCmd, format::PackedUint, - state::{CannonId, EnvId, NetworkId, StorageId, TransactionSendState}, + state::{Authorization, CannonId, EnvId, NetworkId, StorageId, TransactionSendState}, }; use tokio::{ sync::{ diff --git a/crates/controlplane/src/cannon/router.rs b/crates/controlplane/src/cannon/router.rs index 18e85d82..14cf8e96 100644 --- a/crates/controlplane/src/cannon/router.rs +++ b/crates/controlplane/src/cannon/router.rs @@ -11,10 +11,10 @@ use serde::Deserialize; use serde_json::json; use snops_common::{ key_source::KeySource, - state::{id_or_none, KeyState, NetworkId}, + state::{id_or_none, Authorization, KeyState, NetworkId}, }; -use super::{source::QueryTarget, Authorization}; +use super::source::QueryTarget; use crate::{ server::{actions::execute::execute_status, error::ServerError}, state::AppState, diff --git a/crates/controlplane/src/cannon/source.rs b/crates/controlplane/src/cannon/source.rs index 
1f191160..21878b0a 100644 --- a/crates/controlplane/src/cannon/source.rs +++ b/crates/controlplane/src/cannon/source.rs @@ -4,10 +4,8 @@ use chrono::Utc; use serde::{Deserialize, Serialize}; use serde_json::{json, Value}; use snops_common::events::{EventHelpers, TransactionEvent}; -use snops_common::state::TransactionSendState; -use snops_common::{ - aot_cmds::Authorization, lasso::Spur, node_targets::NodeTargets, state::NetworkId, INTERN, -}; +use snops_common::state::{Authorization, TransactionSendState}; +use snops_common::{lasso::Spur, node_targets::NodeTargets, state::NetworkId, INTERN}; use tracing::error; use super::context::CtxEventHelper; @@ -32,7 +30,7 @@ pub struct LocalService { /// if the node is out of sync, it will corrupt the ledger... /// /// requires cannon to have an associated env_id - #[serde(skip_serializing_if = "Option::is_none")] + #[serde(default, skip_serializing_if = "Option::is_none")] pub sync_from: Option, } diff --git a/crates/controlplane/src/cannon/tracker.rs b/crates/controlplane/src/cannon/tracker.rs index c0bfae51..04a89305 100644 --- a/crates/controlplane/src/cannon/tracker.rs +++ b/crates/controlplane/src/cannon/tracker.rs @@ -1,6 +1,9 @@ use std::sync::Arc; -use snops_common::{aot_cmds::Authorization, format::PackedUint, state::TransactionSendState}; +use snops_common::{ + format::PackedUint, + state::{Authorization, TransactionSendState}, +}; use super::error::CannonError; use crate::{db::TxEntry, state::GlobalState}; diff --git a/crates/controlplane/src/db.rs b/crates/controlplane/src/db.rs index b9b9679a..b9600c17 100644 --- a/crates/controlplane/src/db.rs +++ b/crates/controlplane/src/db.rs @@ -1,10 +1,9 @@ use std::{path::Path, sync::Arc}; use snops_common::{ - aot_cmds::Authorization, db::{error::DatabaseError, tree::DbTree, Database as DatabaseTrait}, format::PackedUint, - state::{AgentId, CannonId, EnvId, NetworkId, StorageId, TransactionSendState}, + state::{AgentId, Authorization, CannonId, EnvId, NetworkId, StorageId, TransactionSendState}, }; use crate::{ diff --git a/crates/controlplane/src/server/actions/deploy.rs b/crates/controlplane/src/server/actions/deploy.rs index 033d6d2e..43175054 100644 --- a/crates/controlplane/src/server/actions/deploy.rs +++ b/crates/controlplane/src/server/actions/deploy.rs @@ -8,8 +8,8 @@ use axum::{ use http::StatusCode; use snops_common::{ action_models::DeployAction, - aot_cmds::{AotCmd, Authorization}, - state::KeyState, + aot_cmds::AotCmd, + state::{Authorization, KeyState}, }; use super::{execute::execute_status, Env}; diff --git a/crates/controlplane/src/server/actions/execute.rs b/crates/controlplane/src/server/actions/execute.rs index c2a82974..854fb0cc 100644 --- a/crates/controlplane/src/server/actions/execute.rs +++ b/crates/controlplane/src/server/actions/execute.rs @@ -9,9 +9,9 @@ use http::StatusCode; use serde_json::json; use snops_common::{ action_models::{AleoValue, ExecuteAction}, - aot_cmds::{AotCmd, Authorization}, + aot_cmds::AotCmd, events::{Event, EventKind}, - state::KeyState, + state::{Authorization, KeyState}, }; use tokio::select; diff --git a/crates/controlplane/src/server/agent_ws.rs b/crates/controlplane/src/server/agent_ws.rs index ff2d25d2..4a587ecf 100644 --- a/crates/controlplane/src/server/agent_ws.rs +++ b/crates/controlplane/src/server/agent_ws.rs @@ -299,7 +299,7 @@ async fn handle_socket( } None => break, Some(Ok(Message::Binary(bin))) => { - let msg = match bincode::deserialize(&bin) { + let msg = match snops_common::rpc::codec::decode(&bin) { Ok(msg) => msg, Err(e) 
=> { error!("Agent {id} failed to deserialize a message: {e}"); @@ -332,7 +332,7 @@ async fn handle_socket( error!("Agent {id} internal RPC channel closed"); break; }; - let bin = match bincode::serialize(&MuxedMessageOutgoing::Child(msg)) { + let bin = match snops_common::rpc::codec::encode(&MuxedMessageOutgoing::Child(msg)) { Ok(bin) => bin, Err(e) => { error!("Agent {id} failed to serialize request: {e}"); @@ -351,7 +351,7 @@ async fn handle_socket( error!("Agent {id} internal RPC channel closed"); break; }; - let bin = match bincode::serialize(&MuxedMessageOutgoing::Parent(msg)) { + let bin = match snops_common::rpc::codec::encode(&MuxedMessageOutgoing::Parent(msg)) { Ok(bin) => bin, Err(e) => { error!("Agent {id} failed to serialize response: {e}"); diff --git a/crates/controlplane/src/server/rpc.rs b/crates/controlplane/src/server/rpc.rs index aa8502ea..c3f4d997 100644 --- a/crates/controlplane/src/server/rpc.rs +++ b/crates/controlplane/src/server/rpc.rs @@ -202,6 +202,11 @@ impl ControlService for ControlRpcServer { return; }; + // Prevent redundant events + if agent.status.node_status == status { + return; + } + agent.status.node_status = status.clone(); AgentEvent::NodeStatus(status) .with_agent(&agent) @@ -217,8 +222,19 @@ impl ControlService for ControlRpcServer { return; }; + let changed = match (agent.status.reconcile.as_ref(), status.as_ref()) { + (Some((_, Ok(old))), Ok(new)) => old != new, + (Some((_, Err(old))), Err(err)) => old.to_string() != err.to_string(), + _ => true, + }; + agent.status.reconcile = Some((Instant::now(), status.clone())); + // Prevent redundant events + if !changed { + return; + } + // Emit events for this reconcile let ev = AgentEvent::ReconcileComplete.with_agent(&agent); From 3e543a28fde4a7f718aadcd75ab80d5bda0e6e09 Mon Sep 17 00:00:00 2001 From: Meshiest Date: Tue, 3 Dec 2024 02:00:28 -0500 Subject: [PATCH 57/68] feat(agent): cleanup reconcile serialization --- crates/common/src/state/reconcile.rs | 27 +++++++++++++++++++++++++-- 1 file changed, 25 insertions(+), 2 deletions(-) diff --git a/crates/common/src/state/reconcile.rs b/crates/common/src/state/reconcile.rs index 0744dc5f..2328ed2f 100644 --- a/crates/common/src/state/reconcile.rs +++ b/crates/common/src/state/reconcile.rs @@ -26,7 +26,7 @@ impl ReconcileOptions { } #[derive(Debug, Clone, PartialEq, Eq, PartialOrd, Ord, Hash, Serialize, Deserialize)] -#[serde(tag = "condition", rename_all = "snake_case")] +#[serde(tag = "name", rename_all = "snake_case")] pub enum ReconcileCondition { /// A file is being transferred. 
PendingTransfer { source: String, id: TransferId }, @@ -58,12 +58,35 @@ pub struct ReconcileStatus { pub scopes: Vec, #[serde(default, skip_serializing_if = "Option::is_none")] pub inner: Option, - #[serde(default, skip_serializing_if = "Option::is_none")] + #[serde( + default, + skip_serializing_if = "Option::is_none", + serialize_with = "ser_duration_as_secs", + deserialize_with = "deser_duration_from_secs" + )] pub requeue_after: Option, #[serde(default, skip_serializing_if = "IndexSet::is_empty")] pub conditions: IndexSet, } +fn ser_duration_as_secs(duration: &Option, serializer: S) -> Result +where + S: serde::Serializer, +{ + match duration { + Some(duration) => serializer.serialize_some(&duration.as_secs()), + None => serializer.serialize_none(), + } +} + +fn deser_duration_from_secs<'de, D>(deserializer: D) -> Result, D::Error> +where + D: serde::Deserializer<'de>, +{ + let secs = Option::deserialize(deserializer)?; + Ok(secs.map(Duration::from_secs)) +} + impl Eq for ReconcileStatus {} impl PartialEq for ReconcileStatus { fn eq(&self, other: &Self) -> bool { From e35f9dc345cb96efbec020258171206289c7b228 Mon Sep 17 00:00:00 2001 From: Meshiest Date: Tue, 3 Dec 2024 22:34:48 -0500 Subject: [PATCH 58/68] fix(snops): fix reconcile complete not sending, add version to connected event --- crates/agent/src/reconcile/agent.rs | 13 +++++- crates/common/src/events/models.rs | 10 +++-- crates/common/src/events/test_filter.rs | 42 +++++++++++++------ crates/common/src/rpc/control/mod.rs | 2 +- crates/common/src/state/agent_status.rs | 2 +- crates/common/src/state/reconcile.rs | 9 ++++ crates/controlplane/src/env/mod.rs | 14 +++---- crates/controlplane/src/events/test_stream.rs | 7 +++- crates/controlplane/src/server/agent_ws.rs | 6 ++- crates/controlplane/src/server/api.rs | 5 +-- crates/controlplane/src/server/rpc.rs | 17 ++------ 11 files changed, 82 insertions(+), 45 deletions(-) diff --git a/crates/agent/src/reconcile/agent.rs b/crates/agent/src/reconcile/agent.rs index 371583a0..a9a0655e 100644 --- a/crates/agent/src/reconcile/agent.rs +++ b/crates/agent/src/reconcile/agent.rs @@ -176,8 +176,11 @@ impl AgentStateReconciler { trace!("Reconciling agent state..."); let res = self.reconcile().await; + + // If this reconcile was triggered by a reconcile request, post the status if let Some(client) = self.state.get_ws_client().await { - let res = res.clone(); + let res = res.clone().map(|s| s.replace_inner(self.is_node_running())); + // TODO: throttle this broadcast tokio::spawn(async move { if let Err(e) = client.post_reconcile_status(context::current(), res).await { @@ -185,6 +188,7 @@ impl AgentStateReconciler { } }); } + match res { Ok(status) => { if status.inner.is_some() { @@ -246,6 +250,13 @@ impl AgentStateReconciler { self.context.process.is_some() } + pub fn is_node_running(&mut self) -> bool { + self.context + .process + .as_mut() + .is_some_and(|p| p.is_running()) + } + pub fn is_shutdown_pending(&self, node: &NodeState, env_info: &AgentEnvInfo) -> bool { // Ensure the process is running if !self.has_process() { diff --git a/crates/common/src/events/models.rs b/crates/common/src/events/models.rs index fa6b2a2d..b030a6b5 100644 --- a/crates/common/src/events/models.rs +++ b/crates/common/src/events/models.rs @@ -47,7 +47,7 @@ pub enum EventKind { #[serde(tag = "event_name", content = "data", rename_all = "snake_case")] pub enum AgentEvent { /// An agent connects to the control plane - Connected, + Connected { version: String }, /// An agent completes a handshake with the control 
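// Editor's sketch (not part of the patch): a stand-alone version of the
// `requeue_after` helpers above, which serialize the duration as whole
// seconds instead of serde's default {secs, nanos} struct. The `Status` type
// is a stand-in for ReconcileStatus, and the generic bounds are reconstructed
// by the editor; the helper bodies mirror the patch.
use std::time::Duration;

use serde::{Deserialize, Deserializer, Serialize, Serializer};

fn ser_duration_as_secs<S: Serializer>(d: &Option<Duration>, s: S) -> Result<S::Ok, S::Error> {
    match d {
        Some(d) => s.serialize_some(&d.as_secs()),
        None => s.serialize_none(),
    }
}

fn deser_duration_from_secs<'de, D: Deserializer<'de>>(d: D) -> Result<Option<Duration>, D::Error> {
    let secs: Option<u64> = Option::deserialize(d)?;
    Ok(secs.map(Duration::from_secs))
}

#[derive(Serialize, Deserialize)]
struct Status {
    #[serde(
        default,
        skip_serializing_if = "Option::is_none",
        serialize_with = "ser_duration_as_secs",
        deserialize_with = "deser_duration_from_secs"
    )]
    requeue_after: Option<Duration>,
}

fn main() -> serde_json::Result<()> {
    let json = serde_json::to_string(&Status {
        requeue_after: Some(Duration::from_secs(5)),
    })?;
    assert_eq!(json, r#"{"requeue_after":5}"#);
    let back: Status = serde_json::from_str(&json)?;
    assert_eq!(back.requeue_after, Some(Duration::from_secs(5)));
    Ok(())
}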
plane HandshakeComplete, /// An agent disconnects from the control plane @@ -55,7 +55,7 @@ pub enum AgentEvent { /// An agent finishes a reconcile ReconcileComplete, /// An agent updates its reconcile status - Reconcile(ReconcileStatus<()>), + Reconcile(ReconcileStatus), /// An error occurs during reconcile ReconcileError(ReconcileError), /// An agent emits a node status @@ -132,7 +132,7 @@ impl EventKind { use TransactionEvent::*; match self { - Agent(Connected) => AgentConnected, + Agent(Connected { .. }) => AgentConnected, Agent(HandshakeComplete) => AgentHandshakeComplete, Agent(Disconnected) => AgentDisconnected, Agent(ReconcileComplete) => AgentReconcileComplete, @@ -225,6 +225,10 @@ impl Event { } } + pub fn kind(&self) -> EventKindFilter { + self.content.filter() + } + pub fn replace_content(&self, content: impl Into) -> Self { Self { created_at: Utc::now(), diff --git a/crates/common/src/events/test_filter.rs b/crates/common/src/events/test_filter.rs index f162f986..9ec2e81c 100644 --- a/crates/common/src/events/test_filter.rs +++ b/crates/common/src/events/test_filter.rs @@ -20,7 +20,11 @@ lazy_static! { #[test] fn test_unfiltered() { - assert!(Connected.event().matches(&Unfiltered)); + assert!(Connected { + version: "0.0.0".to_string() + } + .event() + .matches(&Unfiltered)); assert!(HandshakeComplete.event().matches(&Unfiltered)); assert!(Disconnected.event().matches(&Unfiltered)); assert!(ReconcileComplete.event().matches(&Unfiltered)); @@ -38,9 +42,11 @@ fn test_unfiltered() { #[test] fn test_all_of() { - assert!(Connected - .event() - .matches(&AllOf(vec![EventIs(AgentConnected)]))); + assert!(Connected { + version: "0.0.0".to_string() + } + .event() + .matches(&AllOf(vec![EventIs(AgentConnected)]))); let e = Event { created_at: Utc::now(), @@ -49,7 +55,9 @@ fn test_all_of() { env: Some(*B), transaction: None, cannon: None, - content: Agent(Connected), + content: Agent(Connected { + version: "0.0.0".to_string(), + }), }; assert!(e.matches(&(AgentConnected & AgentIs(*A)))); @@ -65,9 +73,11 @@ fn test_all_of() { #[test] fn test_any_of() { - assert!(Connected - .event() - .matches(&AnyOf(vec![EventIs(AgentConnected)]))); + assert!(Connected { + version: "0.0.0".to_string() + } + .event() + .matches(&AnyOf(vec![EventIs(AgentConnected)]))); let e = Event { created_at: Utc::now(), @@ -76,7 +86,9 @@ fn test_any_of() { env: Some(*B), transaction: None, cannon: None, - content: Agent(Connected), + content: Agent(Connected { + version: "0.0.0".to_string(), + }), }; assert!(e.matches(&(AgentConnected | AgentIs(*A)))); @@ -96,9 +108,11 @@ fn test_any_of() { #[test] fn test_one_of() { - assert!(Connected - .event() - .matches(&OneOf(vec![EventIs(AgentConnected)]))); + assert!(Connected { + version: "0.0.0".to_string() + } + .event() + .matches(&OneOf(vec![EventIs(AgentConnected)]))); let e = Event { created_at: Utc::now(), @@ -107,7 +121,9 @@ fn test_one_of() { env: Some(*B), transaction: None, cannon: None, - content: Agent(Connected), + content: Agent(Connected { + version: "0.0.0".to_string(), + }), }; assert!(e.matches(&(AgentConnected ^ AgentIs(*B)))); diff --git a/crates/common/src/rpc/control/mod.rs b/crates/common/src/rpc/control/mod.rs index 4e83ed02..c8e17536 100644 --- a/crates/common/src/rpc/control/mod.rs +++ b/crates/common/src/rpc/control/mod.rs @@ -37,5 +37,5 @@ pub trait ControlService { async fn post_node_status(update: NodeStatus); /// Emit an agent reconcile status update. 
- async fn post_reconcile_status(status: Result, ReconcileError>); + async fn post_reconcile_status(status: Result, ReconcileError>); } diff --git a/crates/common/src/state/agent_status.rs b/crates/common/src/state/agent_status.rs index 5ced9634..a882b66f 100644 --- a/crates/common/src/state/agent_status.rs +++ b/crates/common/src/state/agent_status.rs @@ -171,7 +171,7 @@ pub struct AgentStatus { /// A map of transfers in progress pub transfers: IndexMap, /// Latest reconcile status of the agent - pub reconcile: Option<(Instant, Result, ReconcileError>)>, + pub reconcile: Option<(Instant, Result, ReconcileError>)>, } impl DataFormat for LatestBlockInfo { diff --git a/crates/common/src/state/reconcile.rs b/crates/common/src/state/reconcile.rs index 2328ed2f..d62568bc 100644 --- a/crates/common/src/state/reconcile.rs +++ b/crates/common/src/state/reconcile.rs @@ -125,6 +125,15 @@ impl ReconcileStatus { self.requeue_after.is_some() } + pub fn replace_inner(self, inner: U) -> ReconcileStatus { + ReconcileStatus { + inner: Some(inner), + scopes: self.scopes, + requeue_after: self.requeue_after, + conditions: self.conditions, + } + } + pub fn emptied(&self) -> ReconcileStatus { ReconcileStatus { inner: None, diff --git a/crates/controlplane/src/env/mod.rs b/crates/controlplane/src/env/mod.rs index ca0bc991..d1ec3e9c 100644 --- a/crates/controlplane/src/env/mod.rs +++ b/crates/controlplane/src/env/mod.rs @@ -116,7 +116,7 @@ impl Environment { env_id: EnvId, documents: Vec, state: Arc, - ) -> Result { + ) -> Result, EnvError> { let prev_env = state.get_env(env_id); let mut storage_doc = None; @@ -422,17 +422,16 @@ impl Environment { ..Default::default() }, ) - .await?; - - Ok(env_id) + .await } async fn update_all_agents( &self, state: &GlobalState, opts: ReconcileOptions, - ) -> Result<(), EnvError> { + ) -> Result, EnvError> { let mut pending_changes = vec![]; + let mut node_map = HashMap::new(); for entry in self.node_states.iter() { let key = entry.key(); @@ -464,13 +463,14 @@ impl Environment { AgentState::Inventory => {} } - let agent_state = AgentState::Node(self.id, Box::new(next_state)); + node_map.insert(next_state.node_key.clone(), agent_id); + let agent_state = AgentState::Node(self.id, Box::new(next_state)); pending_changes.push((agent_id, agent_state)); } state.update_agent_states_opts(pending_changes, opts).await; - Ok(()) + Ok(node_map) } pub async fn cleanup(id: EnvId, state: &GlobalState) -> Result<(), EnvError> { diff --git a/crates/controlplane/src/events/test_stream.rs b/crates/controlplane/src/events/test_stream.rs index 89e62631..f19a8324 100644 --- a/crates/controlplane/src/events/test_stream.rs +++ b/crates/controlplane/src/events/test_stream.rs @@ -27,7 +27,12 @@ fn test_stream_filtering() { assert_eq!(sub_b.collect_many().len(), 0); assert_eq!(sub_connected.collect_many().len(), 0); - events.emit(Connected.with_agent_id(*A)); + events.emit( + Connected { + version: "0.0.0".to_string(), + } + .with_agent_id(*A), + ); events.emit(Disconnected.with_agent_id(*A)); events.emit(BlockInfo(Default::default()).with_agent_id(*B)); diff --git a/crates/controlplane/src/server/agent_ws.rs b/crates/controlplane/src/server/agent_ws.rs index 4a587ecf..e249c82b 100644 --- a/crates/controlplane/src/server/agent_ws.rs +++ b/crates/controlplane/src/server/agent_ws.rs @@ -150,7 +150,11 @@ async fn handle_socket( warn!("Connecting agent {id} is trying to identify with an invalid nonce"); break 'reconnect; } - AgentEvent::Connected.with_agent(&agent).emit(&state); + AgentEvent::Connected { + 
version: agent_version.to_string(), + } + .with_agent(&agent) + .emit(&state); match agent.env() { Some(env) if !state.envs.contains_key(&env) => { diff --git a/crates/controlplane/src/server/api.rs b/crates/controlplane/src/server/api.rs index a779a71c..c95567e3 100644 --- a/crates/controlplane/src/server/api.rs +++ b/crates/controlplane/src/server/api.rs @@ -610,11 +610,8 @@ async fn post_env_prepare( Err(e) => return ServerError::from(e).into_response(), }; - // TODO: some live state to report to the calling CLI or something would be - // really nice - match Environment::prepare(env_id, documents, state).await { - Ok(env_id) => (StatusCode::OK, Json(json!({ "id": env_id }))).into_response(), + Ok(node_map) => Json(json!(node_map)).into_response(), Err(e) => ServerError::from(e).into_response(), } } diff --git a/crates/controlplane/src/server/rpc.rs b/crates/controlplane/src/server/rpc.rs index c3f4d997..367a12b5 100644 --- a/crates/controlplane/src/server/rpc.rs +++ b/crates/controlplane/src/server/rpc.rs @@ -216,29 +216,20 @@ impl ControlService for ControlRpcServer { async fn post_reconcile_status( self, _: context::Context, - status: Result, ReconcileError>, + status: Result, ReconcileError>, ) { let Some(mut agent) = self.state.pool.get_mut(&self.agent) else { return; }; - let changed = match (agent.status.reconcile.as_ref(), status.as_ref()) { - (Some((_, Ok(old))), Ok(new)) => old != new, - (Some((_, Err(old))), Err(err)) => old.to_string() != err.to_string(), - _ => true, - }; - agent.status.reconcile = Some((Instant::now(), status.clone())); - // Prevent redundant events - if !changed { - return; - } - // Emit events for this reconcile let ev = AgentEvent::ReconcileComplete.with_agent(&agent); - let is_complete = status.as_ref().is_ok_and(|e| e.inner.is_some()); + let is_complete = status + .as_ref() + .is_ok_and(|e| e.requeue_after.is_none() && e.inner.is_some()); ev.replace_content(match status { Ok(res) => AgentEvent::Reconcile(res), From b88a9c7c09423cf85d2cfce5a1553f5d5f62d121 Mon Sep 17 00:00:00 2001 From: Meshiest Date: Tue, 3 Dec 2024 22:35:12 -0500 Subject: [PATCH 59/68] feat(cli): actions and prepare now wait for reconcile to complete unless --async --- crates/cli/src/commands/env/action/mod.rs | 73 ++++++++++++++---- crates/cli/src/commands/env/mod.rs | 91 ++++++++++++++++++++--- crates/cli/src/commands/mod.rs | 15 ++-- crates/cli/src/events.rs | 15 ++-- 4 files changed, 155 insertions(+), 39 deletions(-) diff --git a/crates/cli/src/commands/env/action/mod.rs b/crates/cli/src/commands/env/action/mod.rs index 6bfe6efb..26e11a16 100644 --- a/crates/cli/src/commands/env/action/mod.rs +++ b/crates/cli/src/commands/env/action/mod.rs @@ -12,6 +12,8 @@ use snops_common::{ state::{CannonId, DocHeightRequest, EnvId, InternedId}, }; +use crate::commands::env::post_and_wait; + //scli env canary action online client/* //scli env canary action offline client/* @@ -58,12 +60,30 @@ impl From for NodeTargets { pub enum Action { /// Turn the specified agents(and nodes) offline. #[clap(alias = "off")] - Offline(Nodes), + Offline { + #[clap(num_args = 1, value_delimiter = ' ')] + nodes: Vec, + /// When present, don't wait for reconciles to finish before returning + #[clap(long = "async")] + async_mode: bool, + }, /// Turn the specified agents(and nodes) online. 
#[clap(alias = "on")] - Online(Nodes), + Online { + #[clap(num_args = 1, value_delimiter = ' ')] + nodes: Vec, + /// When present, don't wait for reconciles to finish before returning + #[clap(long = "async")] + async_mode: bool, + }, /// Reboot the specified agents(and nodes). - Reboot(Nodes), + Reboot { + #[clap(num_args = 1, value_delimiter = ' ')] + nodes: Vec, + /// When present, don't wait for reconciles to finish before returning + #[clap(long = "async")] + async_mode: bool, + }, /// Execute an aleo program function on the environment. i.e. /// credits.aleo/transfer_public Execute { @@ -147,6 +167,8 @@ pub enum Action { /// Configure the private key for a node. #[clap(long, short)] private_key: Option, + #[clap(long = "async")] + async_mode: bool, }, } @@ -169,23 +191,38 @@ impl KeyEqValue { } impl Action { - pub fn execute(self, url: &str, env_id: EnvId, client: Client) -> Result { + pub async fn execute(self, url: &str, env_id: EnvId, client: Client) -> Result { use Action::*; Ok(match self { - Offline(Nodes { nodes }) => { + Offline { nodes, async_mode } => { let ep = format!("{url}/api/v1/env/{env_id}/action/offline"); - - client.post(ep).json(&WithTargets::from(nodes)).send()? + let req = client.post(ep).json(&WithTargets::from(nodes)); + if async_mode { + req.send()? + } else { + post_and_wait(url, req, env_id).await?; + std::process::exit(0); + } } - Online(Nodes { nodes }) => { + Online { nodes, async_mode } => { let ep = format!("{url}/api/v1/env/{env_id}/action/online"); - - client.post(ep).json(&WithTargets::from(nodes)).send()? + let req = client.post(ep).json(&WithTargets::from(nodes)); + if async_mode { + req.send()? + } else { + post_and_wait(url, req, env_id).await?; + std::process::exit(0); + } } - Reboot(Nodes { nodes }) => { + Reboot { nodes, async_mode } => { let ep = format!("{url}/api/v1/env/{env_id}/action/reboot"); - - client.post(ep).json(&WithTargets::from(nodes)).send()? + let req = client.post(ep).json(&WithTargets::from(nodes)); + if async_mode { + req.send()? + } else { + post_and_wait(url, req, env_id).await?; + std::process::exit(0); + } } Execute { @@ -285,6 +322,7 @@ impl Action { del_env, binary, private_key, + async_mode, } => { let ep = format!("{url}/api/v1/env/{env_id}/action/config"); @@ -321,7 +359,14 @@ impl Action { } // this api accepts a list of json objects - client.post(ep).json(&json!(vec![json])).send()? + let req = client.post(ep).json(&json!(vec![json])); + + if async_mode { + req.send()? + } else { + post_and_wait(url, req, env_id).await?; + std::process::exit(0); + } } }) } diff --git a/crates/cli/src/commands/env/mod.rs b/crates/cli/src/commands/env/mod.rs index 1aff92a9..4b50467a 100644 --- a/crates/cli/src/commands/env/mod.rs +++ b/crates/cli/src/commands/env/mod.rs @@ -1,13 +1,15 @@ -use std::path::PathBuf; +use std::collections::HashMap; use anyhow::Result; use clap::{Parser, ValueHint}; use clap_stdin::FileOrStdin; -use reqwest::blocking::{Client, Response}; +use reqwest::blocking::{Client, RequestBuilder, Response}; +use snops_cli::events::EventsClient; use snops_common::{ action_models::AleoValue, + events::{AgentEvent, Event, EventKind}, key_source::KeySource, - state::{Authorization, CannonId, InternedId, NodeKey}, + state::{AgentId, Authorization, CannonId, EnvId, InternedId, NodeKey, ReconcileStatus}, }; mod action; @@ -101,7 +103,10 @@ enum EnvCommands { Prepare { /// The test spec file. 
#[clap(value_hint = ValueHint::AnyPath)] - spec: PathBuf, + spec: FileOrStdin, + /// When present, don't wait for reconciles to finish before returning + #[clap(long = "async")] + async_mode: bool, }, /// Lookup a mapping by program id and mapping name. @@ -127,11 +132,11 @@ enum EnvCommands { } impl Env { - pub fn run(self, url: &str, client: Client) -> Result { + pub async fn run(self, url: &str, client: Client) -> Result { let id = self.id; use EnvCommands::*; Ok(match self.command { - Action(action) => action.execute(url, id, client)?, + Action(action) => action.execute(url, id, client).await?, Agent { key } => { let ep = format!("{url}/api/v1/env/{id}/agents/{key}"); @@ -192,11 +197,15 @@ impl Env { client.get(ep).send()? } - Prepare { spec } => { + Prepare { spec, async_mode } => { let ep = format!("{url}/api/v1/env/{id}/prepare"); - let file: String = std::fs::read_to_string(spec)?; - - client.post(ep).body(file).send()? + let req = client.post(ep).body(spec.contents()?); + if async_mode { + req.send()? + } else { + post_and_wait(url, req, id).await?; + std::process::exit(0); + } } Mapping { program, @@ -252,3 +261,65 @@ impl Env { }) } } + +pub async fn post_and_wait(url: &str, req: RequestBuilder, env_id: EnvId) -> Result<()> { + use snops_common::events::EventFilter::*; + use snops_common::events::EventKindFilter::*; + + let mut events = EventsClient::open_with_filter( + url, + EnvIs(env_id) + & (AgentConnected + | AgentDisconnected + | AgentReconcile + | AgentReconcileComplete + | AgentReconcileError), + ) + .await?; + + let mut node_map: HashMap = req.send()?.json()?; + println!("{}", serde_json::to_string_pretty(&node_map)?); + + while let Some(event) = events.next().await? { + if let Event { + node_key: Some(node), + content: EventKind::Agent(e), + .. + } = &event + { + match &e { + AgentEvent::Reconcile(ReconcileStatus { + scopes, conditions, .. + }) => { + println!( + "{node}: {} {}", + scopes.join(";"), + conditions + .iter() + // unwrap safety - it was literally just serialized + .map(|s| serde_json::to_string(s).unwrap()) + .collect::>() + .join(",") + ); + } + AgentEvent::ReconcileError(err) => { + println!("{node}: error: {err}"); + } + AgentEvent::ReconcileComplete => { + println!("{node}: done"); + } + _ => {} + } + } + if let (Some(node_key), true) = ( + event.node_key.as_ref(), + event.matches(&AgentReconcileComplete.into()), + ) { + node_map.remove(node_key); + if node_map.is_empty() { + break; + } + } + } + events.close().await +} diff --git a/crates/cli/src/commands/mod.rs b/crates/cli/src/commands/mod.rs index 92931062..59021a22 100644 --- a/crates/cli/src/commands/mod.rs +++ b/crates/cli/src/commands/mod.rs @@ -30,7 +30,8 @@ pub enum Commands { Events { /// The event filter to apply, such as `agent-connected` or /// `all-of(env-is(default),node-target-is(validator/any))` - filter: Option, + #[clap(default_value = "unfiltered")] + filter: EventFilter, }, #[cfg(feature = "mangen")] Man(snops_common::mangen::Mangen), @@ -52,21 +53,15 @@ impl Commands { return Ok(()); } Commands::Agent(agent) => agent.run(url, client), - Commands::Env(env) => env.run(url, client), + Commands::Env(env) => env.run(url, client).await, Commands::SetLogLevel { level } => { client.post(format!("{url}/api/v1/log/{level}")).send()?; return Ok(()); } Commands::Events { filter } => { let mut client = EventsClient::open_with_filter(url, filter).await?; - loop { - tokio::select! 
{ - _ = tokio::signal::ctrl_c() => break, - res = client.next() => { - let event = res?; - println!("{}", serde_json::to_string_pretty(&event)?); - } - } + while let Some(event) = client.next().await? { + println!("{}", serde_json::to_string_pretty(&event)?); } client.close().await?; return Ok(()); diff --git a/crates/cli/src/events.rs b/crates/cli/src/events.rs index 22a5bf7e..a99bd797 100644 --- a/crates/cli/src/events.rs +++ b/crates/cli/src/events.rs @@ -23,10 +23,14 @@ pub struct EventsClient { impl EventsClient { pub async fn open(url: &str) -> Result { - Self::open_with_filter(url, None).await + Self::new(url, None).await } - pub async fn open_with_filter(url: &str, filter: Option) -> Result { + pub async fn open_with_filter(url: &str, filter: EventFilter) -> Result { + Self::new(url, Some(filter)).await + } + + pub async fn new(url: &str, filter: Option) -> Result { let (proto, hostname) = url.split_once("://").unwrap_or(("http", url)); let proto = match proto { "wss" | "https" => "wss", @@ -98,18 +102,19 @@ impl EventsClient { } /// Get the next event from the stream - pub async fn next(&mut self) -> Result { + pub async fn next(&mut self) -> Result> { loop { select! { + _ = tokio::signal::ctrl_c() => return Ok(None), _ = self.ping_interval.tick() => { self.stream.send(tungstenite::Message::Ping(vec![b'p', b'i', b'n', b'g'])).await.context("Failed to send ping")?; } msg = self.stream.next() => { match msg { Some(Ok(tungstenite::Message::Text(text))) => - return serde_json::from_str(&text).with_context(|| format!("Failed to parse event: {text}")), + return serde_json::from_str(&text).map(Some).with_context(|| format!("Failed to parse event: {text}")), Some(Ok(tungstenite::Message::Binary(bin))) => - return serde_json::from_slice(&bin).with_context(|| format!("Failed to parse event: {}", String::from_utf8_lossy(&bin))), + return serde_json::from_slice(&bin).map(Some).with_context(|| format!("Failed to parse event: {}", String::from_utf8_lossy(&bin))), None | Some(Err(_)) => bail!("Websocket closed"), Some(Ok(_)) => continue, From 58ba2a875e3cb225acdef6670414c3102f1d826f Mon Sep 17 00:00:00 2001 From: Meshiest Date: Tue, 3 Dec 2024 22:56:27 -0500 Subject: [PATCH 60/68] feat(cli): monitor event stream for transactions --- crates/cli/src/commands/env/action/mod.rs | 111 ++++++++++++++++++++-- crates/cli/src/commands/env/mod.rs | 8 +- 2 files changed, 108 insertions(+), 11 deletions(-) diff --git a/crates/cli/src/commands/env/action/mod.rs b/crates/cli/src/commands/env/action/mod.rs index 26e11a16..7f806154 100644 --- a/crates/cli/src/commands/env/action/mod.rs +++ b/crates/cli/src/commands/env/action/mod.rs @@ -1,12 +1,14 @@ -use std::{collections::HashMap, str::FromStr}; +use std::{collections::HashMap, str::FromStr, sync::Arc}; use anyhow::Result; use clap::Parser; use clap_stdin::FileOrStdin; -use reqwest::blocking::{Client, Response}; +use reqwest::blocking::{Client, RequestBuilder, Response}; use serde_json::json; +use snops_cli::events::EventsClient; use snops_common::{ action_models::{AleoValue, WithTargets}, + events::{Event, EventKind, TransactionEvent}, key_source::KeySource, node_targets::{NodeTarget, NodeTargetError, NodeTargets}, state::{CannonId, DocHeightRequest, EnvId, InternedId}, @@ -267,12 +269,13 @@ impl Action { json["program"] = program.into(); } - let mut builder = client.post(ep); + let req = client.post(ep).query(&[("async", "true")]).json(&json); if async_mode { - let query = [("async", "true")]; - builder = builder.query(&query); + req.send()? 
+ } else { + post_and_wait_tx(url, req).await?; + std::process::exit(0); } - builder.json(&json).send()? } Deploy { private_key, @@ -305,12 +308,13 @@ impl Action { json["fee_record"] = fee_record.into(); } - let mut builder = client.post(ep); + let req = client.post(ep).query(&[("async", "true")]).json(&json); if async_mode { - let query = [("async", "true")]; - builder = builder.query(&query); + req.send()? + } else { + post_and_wait_tx(url, req).await?; + std::process::exit(0); } - builder.json(&json).send()? } Config { online, @@ -371,3 +375,90 @@ impl Action { }) } } + +pub async fn post_and_wait_tx(url: &str, req: RequestBuilder) -> Result<()> { + use snops_common::events::EventFilter::*; + + let tx_id: String = req.send()?.json()?; + + let mut events = EventsClient::open_with_filter(url, TransactionIs(Arc::new(tx_id))).await?; + + let mut tx = None; + let mut block_hash = None; + let mut broadcast_height = None; + let mut broadcast_time = None; + + while let Some(event) = events.next().await? { + let Event { + content: EventKind::Transaction(e), + agent, + .. + } = event + else { + continue; + }; + + match e { + TransactionEvent::AuthorizationReceived { .. } => { + // ignore output of this event + } + TransactionEvent::Executing => { + eprintln!( + "executing on {}", + agent + .map(|a| a.to_string()) + .unwrap_or_else(|| "unknown".to_string()) + ); + } + TransactionEvent::ExecuteAwaitingCompute => { + eprintln!("waiting for compute...",); + } + TransactionEvent::ExecuteExceeded { attempts } => { + eprintln!("execution failed after {attempts} attempts"); + break; + } + TransactionEvent::ExecuteFailed(reason) => { + eprintln!("execution failed: {reason}"); + } + TransactionEvent::ExecuteAborted(reason) => { + eprintln!( + "execution aborted: {}", + serde_json::to_string_pretty(&reason)? + ); + } + TransactionEvent::ExecuteComplete { transaction } => { + eprintln!("execution complete"); + tx = Some(transaction); + } + TransactionEvent::BroadcastExceeded { attempts } => { + eprintln!("broadcast failed after {attempts} attempts"); + break; + } + TransactionEvent::Broadcasted { height, timestamp } => { + eprintln!( + "broadcasted at height {} at {timestamp}", + height + .map(|h| h.to_string()) + .unwrap_or_else(|| "unknown".to_string()), + ); + broadcast_height = height; + broadcast_time = Some(timestamp); + } + TransactionEvent::Confirmed { hash } => { + eprintln!("confirmed with hash {hash}"); + block_hash = Some(hash); + break; + } + } + } + println!( + "{}", + serde_json::to_string_pretty(&json!({ + "transaction": tx, + "broadcast_height": broadcast_height, + "broadcast_time": broadcast_time, + "block_hash": block_hash, + }))? + ); + events.close().await +} diff --git a/crates/cli/src/commands/env/mod.rs b/crates/cli/src/commands/env/mod.rs index 4b50467a..c3571536 100644 --- a/crates/cli/src/commands/env/mod.rs +++ b/crates/cli/src/commands/env/mod.rs @@ -1,5 +1,6 @@ use std::collections::HashMap; +use action::post_and_wait_tx; use anyhow::Result; use clap::{Parser, ValueHint}; use clap_stdin::FileOrStdin; @@ -160,7 +161,12 @@ impl Env { req = req.query(&[("async", "true")]); } - req.send()? + if async_mode { + req.send()? 
+ } else { + post_and_wait_tx(url, req).await?; + std::process::exit(0); + } } Balance { address: key } => { let ep = format!("{url}/api/v1/env/{id}/balance/{key}"); From 88c780cb422f1979a9e1e499a79f2a3a118e92c6 Mon Sep 17 00:00:00 2001 From: Meshiest Date: Wed, 4 Dec 2024 19:26:22 -0500 Subject: [PATCH 61/68] feat(controlplane): rename prepare to apply and clean to delete --- crates/cli/src/commands/env/mod.rs | 28 ++++++++++++++++++--------- crates/controlplane/src/env/mod.rs | 6 ++++-- crates/controlplane/src/server/api.rs | 6 +++--- sdk_ts/index.ts | 6 +++--- 4 files changed, 29 insertions(+), 17 deletions(-) diff --git a/crates/cli/src/commands/env/mod.rs b/crates/cli/src/commands/env/mod.rs index c3571536..070d45b2 100644 --- a/crates/cli/src/commands/env/mod.rs +++ b/crates/cli/src/commands/env/mod.rs @@ -78,9 +78,9 @@ enum EnvCommands { #[clap(alias = "tx-details")] TransactionDetails { id: String }, - /// Clean a specific environment. - #[clap(alias = "c")] - Clean, + /// Delete a specific environment. + #[clap(alias = "d")] + Delete, /// Get an env's latest block/state root info. Info, @@ -99,10 +99,10 @@ enum EnvCommands { #[clap(alias = "top-res")] TopologyResolved, - /// Prepare a (test) environment. + /// Apply an environment spec. #[clap(alias = "p")] - Prepare { - /// The test spec file. + Apply { + /// The environment spec file. #[clap(value_hint = ValueHint::AnyPath)] spec: FileOrStdin, /// When present, don't wait for reconciles to finish before returning @@ -178,7 +178,7 @@ impl Env { client.get(ep).send()? } - Clean => { + Delete => { let ep = format!("{url}/api/v1/env/{id}"); client.delete(ep).send()? @@ -203,8 +203,8 @@ impl Env { client.get(ep).send()? } - Prepare { spec, async_mode } => { - let ep = format!("{url}/api/v1/env/{id}/prepare"); + Apply { spec, async_mode } => { + let ep = format!("{url}/api/v1/env/{id}/apply"); let req = client.post(ep).body(spec.contents()?); if async_mode { req.send()? @@ -286,7 +286,17 @@ pub async fn post_and_wait(url: &str, req: RequestBuilder, env_id: EnvId) -> Res let mut node_map: HashMap = req.send()?.json()?; println!("{}", serde_json::to_string_pretty(&node_map)?); + let filter = node_map + .values() + .copied() + .fold(!Unfiltered, |id, filter| (id | AgentIs(filter))); + while let Some(event) = events.next().await? { + // Ensure the event is based on the response + if !event.matches(&filter) { + continue; + } + if let Event { node_key: Some(node), content: EventKind::Agent(e), diff --git a/crates/controlplane/src/env/mod.rs b/crates/controlplane/src/env/mod.rs index d1ec3e9c..3da59028 100644 --- a/crates/controlplane/src/env/mod.rs +++ b/crates/controlplane/src/env/mod.rs @@ -108,11 +108,13 @@ impl Environment { .collect() } - /// Prepare a test. This will set the current test on the GlobalState. + /// Apply an environment spec. This will attempt to delegate the given node + /// configurations to available agents, or update existing agents with new + /// configurations. 
/// /// **This will error if the current env is not unset before calling to /// ensure tests are properly cleaned up.** - pub async fn prepare( + pub async fn apply( env_id: EnvId, documents: Vec, state: Arc, diff --git a/crates/controlplane/src/server/api.rs b/crates/controlplane/src/server/api.rs index c95567e3..128ca728 100644 --- a/crates/controlplane/src/server/api.rs +++ b/crates/controlplane/src/server/api.rs @@ -68,7 +68,7 @@ pub(super) fn routes() -> Router { // get(get_env_agent_key), // ) // .route("/env/:env_id/metric/:prom_ql", get()) - .route("/env/:env_id/prepare", post(post_env_prepare)) + .route("/env/:env_id/apply", post(post_env_apply)) .route("/env/:env_id/info", get(get_env_info)) .route("/env/:env_id/height", get(get_latest_height)) .route("/env/:env_id/block_info", get(get_env_block_info)) @@ -598,7 +598,7 @@ async fn get_env_agent_key( Json(AgentStatusResponse::from(agent.value())).into_response() } -async fn post_env_prepare( +async fn post_env_apply( // This env_id is allowed to be in the Path because it would be allocated // anyway Path(env_id): Path, @@ -610,7 +610,7 @@ async fn post_env_prepare( Err(e) => return ServerError::from(e).into_response(), }; - match Environment::prepare(env_id, documents, state).await { + match Environment::apply(env_id, documents, state).await { Ok(node_map) => Json(json!(node_map)).into_response(), Err(e) => ServerError::from(e).into_response(), } diff --git a/sdk_ts/index.ts b/sdk_ts/index.ts index 0e7f215d..b34db8a1 100644 --- a/sdk_ts/index.ts +++ b/sdk_ts/index.ts @@ -141,8 +141,8 @@ class SnopsApi { return await this.get(`env/${env_id}/agents/${node_ty}/${node_key}`); } - async envPrepare(env_id: string, prepare: any): Promise { - return this.post(`env/${env_id}/prepare`, prepare); + async envApply(env_id: string, prepare: any): Promise { + return this.post(`env/${env_id}/apply`, prepare); } async envInfo(env_id: string): Promise { @@ -332,7 +332,7 @@ class Env { } async prepare(prepare: any) { - return await this.api.envPrepare(this.env_id, prepare); + return await this.api.envApply(this.env_id, prepare); } async info() { From c765afc9c482c8062e1e0b38bcf9056d38a6035c Mon Sep 17 00:00:00 2001 From: Meshiest Date: Thu, 5 Dec 2024 01:05:34 -0500 Subject: [PATCH 62/68] refactor(snops): replace DocHeightRequest with HeightRequest because bincode is begone --- crates/cli/src/commands/env/action/mod.rs | 4 +- crates/common/src/action_models.rs | 4 +- crates/common/src/api.rs | 12 --- crates/common/src/state/height_request.rs | 90 ++----------------- crates/controlplane/src/env/set.rs | 9 -- crates/controlplane/src/persist/node.rs | 6 +- crates/controlplane/src/schema/nodes.rs | 12 +-- .../controlplane/src/server/actions/config.rs | 1 - 8 files changed, 21 insertions(+), 117 deletions(-) diff --git a/crates/cli/src/commands/env/action/mod.rs b/crates/cli/src/commands/env/action/mod.rs index 7f806154..9f4fa957 100644 --- a/crates/cli/src/commands/env/action/mod.rs +++ b/crates/cli/src/commands/env/action/mod.rs @@ -11,7 +11,7 @@ use snops_common::{ events::{Event, EventKind, TransactionEvent}, key_source::KeySource, node_targets::{NodeTarget, NodeTargetError, NodeTargets}, - state::{CannonId, DocHeightRequest, EnvId, InternedId}, + state::{CannonId, EnvId, HeightRequest, InternedId}, }; use crate::commands::env::post_and_wait; @@ -147,7 +147,7 @@ pub enum Action { online: Option, /// Configure the height of the target nodes. #[clap(long)] - height: Option, + height: Option, /// Configure the peers of the target nodes, or `none`. 
#[clap(long, short)] peers: Option, diff --git a/crates/common/src/action_models.rs b/crates/common/src/action_models.rs index 5e8e950b..af4b2005 100644 --- a/crates/common/src/action_models.rs +++ b/crates/common/src/action_models.rs @@ -6,7 +6,7 @@ use serde::{Deserialize, Serialize}; use crate::{ key_source::KeySource, node_targets::{NodeTarget, NodeTargets}, - state::{CannonId, DocHeightRequest, InternedId}, + state::{CannonId, HeightRequest, InternedId}, }; #[derive(Deserialize, Serialize, Clone)] @@ -116,7 +116,7 @@ pub struct Reconfig { #[serde(default, skip_serializing_if = "Option::is_none")] pub online: Option, #[serde(default, skip_serializing_if = "Option::is_none")] - pub height: Option, + pub height: Option, #[serde(default, skip_serializing_if = "Option::is_none")] pub peers: Option, #[serde(default, skip_serializing_if = "Option::is_none")] diff --git a/crates/common/src/api.rs b/crates/common/src/api.rs index 8bdede66..fec625e0 100644 --- a/crates/common/src/api.rs +++ b/crates/common/src/api.rs @@ -259,18 +259,6 @@ impl DataFormat for StorageInfo { ) -> Result { let mut written = self.id.write_data(writer)?; written += self.retention_policy.write_data(writer)?; - // written += self - // .checkpoints - // .iter() - // .map( - // |CheckpointMeta { - // height, - // timestamp, - // filename, - // }| (*height, *timestamp, filename.to_owned()), - // ) - // .collect::>() - // .write_data(writer)?; written += self.persist.write_data(writer)?; written += self.version.write_data(writer)?; written += self.native_genesis.write_data(writer)?; diff --git a/crates/common/src/state/height_request.rs b/crates/common/src/state/height_request.rs index 30ec7484..162b1c9e 100644 --- a/crates/common/src/state/height_request.rs +++ b/crates/common/src/state/height_request.rs @@ -4,41 +4,19 @@ use snops_checkpoint::RetentionSpan; use crate::format::{DataFormat, DataFormatReader, DataHeaderOf, DataReadError}; -/// for some reason bincode does not allow deserialize_any so if i want to allow -/// end users to type "top", 42, or "persist" i need to do have to copies of -/// this where one is not untagged. -/// -/// bincode. please. 
-#[derive(Debug, Copy, Default, Clone, serde::Serialize, serde::Deserialize, PartialEq, Eq)] -#[serde(rename_all = "lowercase", untagged)] -pub enum DocHeightRequest { - #[default] - /// Use the latest height for the ledger - #[serde(with = "super::strings::top")] - Top, - /// Set the height to the given block (there must be a checkpoint at this - /// height) Setting to 0 will reset the height to the genesis block - Absolute(u32), - /// Use the next checkpoint that matches this checkpoint span - Checkpoint(snops_checkpoint::RetentionSpan), - // the control plane doesn't know the heights the nodes are at - // TruncateHeight(u32), - // TruncateTime(i64), -} - -impl FromStr for DocHeightRequest { +impl FromStr for HeightRequest { type Err = String; fn from_str(s: &str) -> Result { match s { - "top" => Ok(DocHeightRequest::Top), + "top" => Ok(HeightRequest::Top), s => { if let Ok(height) = s.parse() { - Ok(DocHeightRequest::Absolute(height)) + Ok(HeightRequest::Absolute(height)) } else if let Ok(span) = s.parse() { - Ok(DocHeightRequest::Checkpoint(span)) + Ok(HeightRequest::Checkpoint(span)) } else { - Err(format!("invalid DocHeightRequest: {}", s)) + Err(format!("invalid HeightRequest: {}", s)) } } } @@ -55,52 +33,12 @@ impl Display for HeightRequest { } } -impl DataFormat for DocHeightRequest { - type Header = (u8, DataHeaderOf); - const LATEST_HEADER: Self::Header = (1, RetentionSpan::LATEST_HEADER); - - fn write_data( - &self, - writer: &mut W, - ) -> Result { - match self { - DocHeightRequest::Top => 0u8.write_data(writer), - DocHeightRequest::Absolute(height) => { - Ok(1u8.write_data(writer)? + height.write_data(writer)?) - } - DocHeightRequest::Checkpoint(retention) => { - Ok(2u8.write_data(writer)? + retention.write_data(writer)?) - } - } - } - - fn read_data( - reader: &mut R, - header: &Self::Header, - ) -> Result { - if header.0 != Self::LATEST_HEADER.0 { - return Err(DataReadError::unsupported( - "DocHeightRequest", - Self::LATEST_HEADER.0, - header.0, - )); - } - match reader.read_data(&())? 
{ - 0u8 => Ok(DocHeightRequest::Top), - 1u8 => Ok(DocHeightRequest::Absolute(reader.read_data(&())?)), - 2u8 => Ok(DocHeightRequest::Checkpoint(reader.read_data(&header.1)?)), - n => Err(DataReadError::Custom(format!( - "invalid DocHeightRequest discrminant: {n}" - ))), - } - } -} - #[derive(Debug, Default, Copy, Clone, serde::Serialize, serde::Deserialize, PartialEq, Eq)] -#[serde(rename_all = "lowercase")] +#[serde(rename_all = "lowercase", untagged)] pub enum HeightRequest { #[default] /// Use the latest height for the ledger + #[serde(with = "super::strings::top")] Top, /// Set the height to the given block (there must be a checkpoint at this /// height) Setting to 0 will reset the height to the genesis block @@ -112,8 +50,6 @@ pub enum HeightRequest { // TruncateTime(i64), } -// TODO: now that we don't use bincode for storage format, we should be able to -// make remove HeightRequest and rename DocHeightRequest to HeightRequest impl DataFormat for HeightRequest { type Header = (u8, DataHeaderOf); const LATEST_HEADER: Self::Header = (1, RetentionSpan::LATEST_HEADER); @@ -149,7 +85,7 @@ impl DataFormat for HeightRequest { 1u8 => Ok(HeightRequest::Absolute(reader.read_data(&())?)), 2u8 => Ok(HeightRequest::Checkpoint(reader.read_data(&header.1)?)), n => Err(DataReadError::Custom(format!( - "invalid HeightRequest discrminant: {n}" + "invalid HeightRequest discriminant: {n}" ))), } } @@ -166,13 +102,3 @@ impl HeightRequest { *self == Self::Absolute(0) || *self == Self::Checkpoint(RetentionSpan::Unlimited) } } - -impl From for HeightRequest { - fn from(req: DocHeightRequest) -> Self { - match req { - DocHeightRequest::Top => Self::Top, - DocHeightRequest::Absolute(h) => Self::Absolute(h), - DocHeightRequest::Checkpoint(c) => Self::Checkpoint(c), - } - } -} diff --git a/crates/controlplane/src/env/set.rs b/crates/controlplane/src/env/set.rs index 8259fcac..8f8ffe3e 100644 --- a/crates/controlplane/src/env/set.rs +++ b/crates/controlplane/src/env/set.rs @@ -207,15 +207,6 @@ pub fn pair_with_nodes( )]); } - // another optimization that could be made is to sort nodes based on the number - // of agents with the specific labels. this would be useful for when some - // agents have unique labels as well as other common labels and - // there are nodes asking for agents with either. - - // TODO: potential performance improvement by splitting this agent map up - // available modes eg. 
client map, prover map, validator map, then pick by - // the key.ty - // handle the nodes that want specific agents first let agent_map = agents.iter().map(|a| (a.id, a)).collect::>(); diff --git a/crates/controlplane/src/persist/node.rs b/crates/controlplane/src/persist/node.rs index 5feff317..5cd638bc 100644 --- a/crates/controlplane/src/persist/node.rs +++ b/crates/controlplane/src/persist/node.rs @@ -90,7 +90,7 @@ mod tests { use snops_common::{ format::DataFormat, node_targets::NodeTargets, - state::{DocHeightRequest, InternedId}, + state::{HeightRequest, InternedId}, }; use crate::{ @@ -140,7 +140,7 @@ mod tests { online: true, replicas: None, key: None, - height: DocHeightRequest::Top, + height: HeightRequest::Top, labels: Default::default(), agent: None, validators: NodeTargets::None, @@ -156,7 +156,7 @@ mod tests { online: true, replicas: None, key: None, - height: DocHeightRequest::Top, + height: HeightRequest::Top, labels: Default::default(), agent: None, validators: NodeTargets::None, diff --git a/crates/controlplane/src/schema/nodes.rs b/crates/controlplane/src/schema/nodes.rs index e94e1d31..da16b984 100644 --- a/crates/controlplane/src/schema/nodes.rs +++ b/crates/controlplane/src/schema/nodes.rs @@ -8,7 +8,7 @@ use snops_common::{ lasso::Spur, node_targets::NodeTargets, set::{MaskBit, MASK_PREFIX_LEN}, - state::{AgentId, DocHeightRequest, InternedId, NetworkId, NodeState}, + state::{AgentId, HeightRequest, InternedId, NetworkId, NodeState}, INTERN, }; @@ -181,7 +181,7 @@ pub struct Node { /// * When zero, the ledger is empty and only the genesis block is /// inherited. #[serde(default)] - pub height: DocHeightRequest, + pub height: HeightRequest, /// When specified, agents must have these labels #[serde( @@ -217,7 +217,7 @@ impl Node { NodeState { node_key, private_key: Default::default(), - height: (0, self.height.into()), + height: (0, self.height), online: self.online, env: self.env.clone(), binary: self.binary, @@ -252,7 +252,7 @@ impl Node { #[derive(Debug, Clone)] pub struct NodeFormatHeader { pub(crate) key_source: DataHeaderOf, - pub(crate) height_request: DataHeaderOf, + pub(crate) height_request: DataHeaderOf, pub(crate) node_targets: DataHeaderOf, pub has_binaries: bool, } @@ -285,7 +285,7 @@ impl DataFormat for NodeFormatHeader { } let key_source = KeySource::read_header(reader)?; - let height_request = DocHeightRequest::read_header(reader)?; + let height_request = HeightRequest::read_header(reader)?; let node_targets = NodeTargets::read_header(reader)?; Ok(NodeFormatHeader { key_source, @@ -300,7 +300,7 @@ impl DataFormat for Node { type Header = NodeFormatHeader; const LATEST_HEADER: Self::Header = NodeFormatHeader { key_source: KeySource::LATEST_HEADER, - height_request: DocHeightRequest::LATEST_HEADER, + height_request: HeightRequest::LATEST_HEADER, node_targets: NodeTargets::LATEST_HEADER, has_binaries: true, }; diff --git a/crates/controlplane/src/server/actions/config.rs b/crates/controlplane/src/server/actions/config.rs index fcb267fd..77aebab9 100644 --- a/crates/controlplane/src/server/actions/config.rs +++ b/crates/controlplane/src/server/actions/config.rs @@ -56,7 +56,6 @@ pub async fn config( for WithTargets { nodes, data } in configs { for agent in env.matching_agents(&nodes, &state.pool) { if let Some(h) = data.height { - let h = h.into(); set_node_field!(agent, height = (height.0 + 1, h)); } From 0f4cb0b4c8a6281bb0cbc8b02818355d645f4990 Mon Sep 17 00:00:00 2001 From: Meshiest Date: Thu, 5 Dec 2024 01:05:53 -0500 Subject: [PATCH 63/68] fix(cli): 
fix env errors not printing --- crates/cli/src/commands/env/mod.rs | 12 +++++++++++- 1 file changed, 11 insertions(+), 1 deletion(-) diff --git a/crates/cli/src/commands/env/mod.rs b/crates/cli/src/commands/env/mod.rs index 070d45b2..bb09c783 100644 --- a/crates/cli/src/commands/env/mod.rs +++ b/crates/cli/src/commands/env/mod.rs @@ -283,7 +283,17 @@ pub async fn post_and_wait(url: &str, req: RequestBuilder, env_id: EnvId) -> Res ) .await?; - let mut node_map: HashMap = req.send()?.json()?; + let res = req.send()?; + + if !res.status().is_success() { + println!( + "{}", + serde_json::to_string_pretty(&res.json::()?)? + ); + std::process::exit(1); + } + + let mut node_map: HashMap = res.json()?; println!("{}", serde_json::to_string_pretty(&node_map)?); let filter = node_map From 55f3ee35f44a80296cad4ac3cddf8e7855080310 Mon Sep 17 00:00:00 2001 From: Meshiest Date: Thu, 5 Dec 2024 01:22:01 -0500 Subject: [PATCH 64/68] feat(agent): ensure node actually starts up for reconcile to succeed --- crates/agent/src/main.rs | 1 + crates/agent/src/reconcile/agent.rs | 30 ++++++++++++++++++++--- crates/agent/src/rpc/agent.rs | 3 +++ crates/agent/src/state.rs | 14 ++++++++++- crates/common/src/state/snarkos_status.rs | 16 ++++++++++++ 5 files changed, 59 insertions(+), 5 deletions(-) diff --git a/crates/agent/src/main.rs b/crates/agent/src/main.rs index 439fca8e..e8d7d8d0 100644 --- a/crates/agent/src/main.rs +++ b/crates/agent/src/main.rs @@ -86,6 +86,7 @@ async fn main() { endpoint, queue_reconcile_tx, loki: Mutex::new(db.loki_url()), + last_node_status: RwLock::new(None), env_info: RwLock::new( db.env_info() .inspect_err(|e| { diff --git a/crates/agent/src/reconcile/agent.rs b/crates/agent/src/reconcile/agent.rs index a9a0655e..d645028d 100644 --- a/crates/agent/src/reconcile/agent.rs +++ b/crates/agent/src/reconcile/agent.rs @@ -179,7 +179,14 @@ impl AgentStateReconciler { // If this reconcile was triggered by a reconcile request, post the status if let Some(client) = self.state.get_ws_client().await { - let res = res.clone().map(|s| s.replace_inner(self.is_node_running())); + let node_is_started = self + .state + .get_node_status() + .await + .is_some_and(|s| s.is_started()); + let res = res + .clone() + .map(|s| s.replace_inner(self.is_node_running() && node_is_started)); // TODO: throttle this broadcast tokio::spawn(async move { @@ -221,6 +228,8 @@ impl AgentStateReconciler { // If the process has exited, clear the process context if res.inner.is_some() { self.context.process = None; + self.state.set_node_status(None).await; + self.context.shutdown_pending = false; } }); } @@ -347,6 +356,7 @@ impl Reconcile<(), ReconcileError> for AgentStateReconciler { // If the process has exited, clear the process context if res.inner.is_some() { self.context.process = None; + self.state.set_node_status(None).await; self.context.shutdown_pending = false; } }); @@ -403,7 +413,17 @@ impl Reconcile<(), ReconcileError> for AgentStateReconciler { // Prevent other reconcilers from running while the node is running if self.state.is_node_online() { - return Ok(ReconcileStatus::default().add_scope("agent_state/running")); + let Some(node_status) = self.state.get_node_status().await else { + return Ok(ReconcileStatus::empty().add_scope("agent_state/node/booting")); + }; + + let rec = if node_status.is_started() { + ReconcileStatus::default() + } else { + ReconcileStatus::empty() + }; + + return Ok(rec.add_scope(format!("agent_state/node/{}", node_status.label()))); } // If the node is not online, the process is still 
running, but the node @@ -413,7 +433,7 @@ impl Reconcile<(), ReconcileError> for AgentStateReconciler { return Ok(ReconcileStatus::empty() .requeue_after(Duration::from_secs(1)) .add_condition(ReconcileCondition::PendingStartup) - .add_scope("agent_state/starting")); + .add_scope("agent_state/node/booting")); } let storage_path = self @@ -496,10 +516,12 @@ impl Reconcile<(), ReconcileError> for AgentStateReconciler { .await?; let process = ProcessContext::new(command)?; + // Clear the last node running status (it was shut down) + self.state.set_node_status(None).await; self.context.process = Some(process); self.context.shutdown_pending = false; Ok(ReconcileStatus::empty() - .add_scope("agent_state/starting") + .add_scope("agent_state/node/booting") .requeue_after(Duration::from_secs(1))) } } diff --git a/crates/agent/src/rpc/agent.rs b/crates/agent/src/rpc/agent.rs index 0af005a2..e2ab05f8 100644 --- a/crates/agent/src/rpc/agent.rs +++ b/crates/agent/src/rpc/agent.rs @@ -57,6 +57,9 @@ impl AgentNodeService for AgentNodeRpcServer { return Ok(()); // ignore if client is not available }; + // Update the last node status + self.state.set_node_status(Some(status.clone())).await; + client .post_node_status(context::current(), status.into()) .await diff --git a/crates/agent/src/state.rs b/crates/agent/src/state.rs index 8f6b0da4..06361a80 100644 --- a/crates/agent/src/state.rs +++ b/crates/agent/src/state.rs @@ -11,7 +11,10 @@ use reqwest::Url; use snops_common::{ api::AgentEnvInfo, rpc::{agent::node::NodeServiceClient, control::ControlServiceClient, error::ReconcileError}, - state::{AgentId, AgentPeer, AgentState, EnvId, ReconcileOptions, TransferId, TransferStatus}, + state::{ + snarkos_status::SnarkOSStatus, AgentId, AgentPeer, AgentState, EnvId, ReconcileOptions, + TransferId, TransferStatus, + }, util::OpaqueDebug, }; use tarpc::context; @@ -52,6 +55,7 @@ pub struct GlobalState { pub transfers: Arc>, pub node_client: RwLock>, + pub last_node_status: RwLock>, pub log_level_handler: ReloadHandler, /// A oneshot sender to shutdown the agent. 
pub shutdown: RwLock>>, @@ -222,4 +226,12 @@ impl GlobalState { error!("failed to save resolved addrs to db: {e}"); } } + + pub async fn set_node_status(&self, status: Option) { + *self.last_node_status.write().await = status.map(|s| (Instant::now(), s)); + } + + pub async fn get_node_status(&self) -> Option { + self.last_node_status.read().await.clone().map(|(_, s)| s) + } } diff --git a/crates/common/src/state/snarkos_status.rs b/crates/common/src/state/snarkos_status.rs index cbfd72c0..52903fc8 100644 --- a/crates/common/src/state/snarkos_status.rs +++ b/crates/common/src/state/snarkos_status.rs @@ -20,6 +20,22 @@ pub enum SnarkOSStatus { Halted(Option), } +impl SnarkOSStatus { + pub fn is_started(&self) -> bool { + matches!(self, SnarkOSStatus::Started) + } + + pub fn label(&self) -> &'static str { + match self { + SnarkOSStatus::Starting => "starting", + SnarkOSStatus::LedgerLoading => "loading", + SnarkOSStatus::LedgerFailure(_) => "failure", + SnarkOSStatus::Started => "started", + SnarkOSStatus::Halted(_) => "halted", + } + } +} + /// Messages from snarkos to the agent, containing information about the status /// of the node #[derive(Debug, Clone, PartialEq, Eq, Serialize, Deserialize)] From ecab658eac410590ef99b39c2f028ef3c34e7361 Mon Sep 17 00:00:00 2001 From: Meshiest Date: Thu, 5 Dec 2024 23:51:27 -0500 Subject: [PATCH 65/68] docs: update docs from latest changes --- README.md | 12 ++++++------ index.html | 16 ++++++++-------- snops_book/architecture/CONTROL_PLANE.md | 2 +- snops_book/user_guide/envs/CANNONS.md | 2 +- snops_book/user_guide/envs/STORAGE.md | 2 +- snops_book/user_guide/running/README.md | 4 ++-- specs/example-multi-binaries.yaml | 2 +- 7 files changed, 20 insertions(+), 20 deletions(-) diff --git a/README.md b/README.md index 99603d85..834dc7fc 100644 --- a/README.md +++ b/README.md @@ -37,7 +37,7 @@ To learn more about `snops` we recommend checking out the mdbook [here](https:// The controlplane is the webserver that communicates to agents how to run snarkOS, or what transactions to execute. -1. In another terminal, build the cli: `cargo install --path ./crates/snops-cli` +1. In another terminal, install the cli: `cargo install --path ./crates/snops-cli`, or build with `cargo xtask build cli` and use from `target/release-big/snops-cli`. The cli is used to interact with the controlplane and manage environments. It provides JSON based output. We recommend pairing our cli with [`jq`](https://jqlang.github.io/jq/) when leveraging other scripts and tools @@ -64,15 +64,15 @@ To learn more about `snops` we recommend checking out the mdbook [here](https:// Each of these can be dynamically configured as snarkos nodes. The default agent configuration should connect to a locally operated controlplane. -### Local Isonets +### Local Isolated Networks (Isonets) -This example requires 4 agents and the control plane to be running. +This example requires 4 agents and the control plane to be running. It allows you to run a devnet with a custom genesis block. -1. Start the environment: `snops-cli env prepare specs/test-4-validators.yaml` +1. Start the environment: `snops-cli env apply specs/test-4-validators.yaml` 1. Check the current network height: `snops-cli env height` 1. Look at the latest block: `snops-cli env block` 1. Look at the genesis block: `snops-cli env block 0` -1. Stop the environment: `snops-cli env clean` +1. 
Stop the environment: `snops-cli env delete` ### Isonet Transfers @@ -156,7 +156,7 @@ Deploying and executing Aleo programs on your isonets is easiest with snops. You `snarkos-aot` provides various CLI tools to help with developing and executing Aleo programs as well as interact with snarkOS ledgers. -Build `snarkos-aot` with: `cargo install --profile release-big -p snarkos-aot`. +Build `snarkos-aot` with: `cargo xtask build aot`. The compiled binary can be found in `target/release-big/snarkos-aot`. Use the `NETWORK` environment variable to specify `mainnet` (default), diff --git a/index.html b/index.html index 10cc8d26..33179779 100644 --- a/index.html +++ b/index.html @@ -208,17 +208,17 @@

[index.html hunks omitted: the HTML markup of this file's diff was lost to extraction, leaving only stray bullet text and detached `-`/`+` markers. Per the diffstat (16 lines changed), these hunks mirror the README.md edits above — the `cargo xtask build cli|agent|aot` build instructions, the "Local Isolated Networks (Isonets)" heading, and the renamed `snops-cli env apply` / `snops-cli env delete` commands.]
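For reference, a minimal sketch of the renamed CLI workflow these doc changes describe (`env apply`/`env delete`, plus the `--async` flag introduced in PATCH 59). The spec path is the one used in the README example, and the flag placement is assumed from the clap definitions above rather than verified against a built binary.

```sh
# Apply an environment spec and wait for every agent to report ReconcileComplete
snops-cli env apply specs/test-4-validators.yaml

# Return immediately instead of waiting on the reconcile event stream
snops-cli env apply specs/test-4-validators.yaml --async

# Inspect the environment, then tear it down (formerly `env clean`)
snops-cli env height
snops-cli env block 0
snops-cli env delete
```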

    diff --git a/snops_book/architecture/CONTROL_PLANE.md b/snops_book/architecture/CONTROL_PLANE.md index 02a2099e..2724e6ab 100644 --- a/snops_book/architecture/CONTROL_PLANE.md +++ b/snops_book/architecture/CONTROL_PLANE.md @@ -83,7 +83,7 @@ have connected to it. Agents have two States: - _Inventoried_: An agent is in inventory mode if it is not currently running a snarkOS node. -- _Associated_: It becomes associated with an **environment** when one is prepared. As the control plane will delegate agents in inventory to the **environment**. +- _Associated_: It becomes associated with an **environment** when one is applied. As the control plane will delegate agents in inventory to the **environment**. ### Metrics and Logging diff --git a/snops_book/user_guide/envs/CANNONS.md b/snops_book/user_guide/envs/CANNONS.md index 0b3d6963..9a28f8de 100644 --- a/snops_book/user_guide/envs/CANNONS.md +++ b/snops_book/user_guide/envs/CANNONS.md @@ -7,7 +7,7 @@ The cannon document is an optional where you can specify: - where to send transactions (to a file, or a node in the topology) -The cannon document is not required for a `environment` to run, but the document needs to be present at `prepare` time to work. +The cannon document is not required for a `environment` to run, but the document needs to be present at `apply` time to work. This document is required if you want to use a [cannon timeline action](TIMELINES.md#cannon). diff --git a/snops_book/user_guide/envs/STORAGE.md b/snops_book/user_guide/envs/STORAGE.md index 15a70aa6..1f2ccfa9 100644 --- a/snops_book/user_guide/envs/STORAGE.md +++ b/snops_book/user_guide/envs/STORAGE.md @@ -94,7 +94,7 @@ The size of the binary in bytes. An optional number used if you want to wipe the old storage. -The recommendation is to increment this number, `clean` the env, and then `prepare` it again. +The recommendation is to increment this number, `delete` the env, and then `apply` it again. The default value is `0`. diff --git a/snops_book/user_guide/running/README.md b/snops_book/user_guide/running/README.md index 77a32e27..6a3ee8f9 100644 --- a/snops_book/user_guide/running/README.md +++ b/snops_book/user_guide/running/README.md @@ -38,7 +38,7 @@ Additionally you can enable [metrics and logging](./METRICS_AND_LOGGING.md), to ### Starting agents -