diff --git a/Cargo.lock b/Cargo.lock index a99ee2d4..4b688f90 100644 --- a/Cargo.lock +++ b/Cargo.lock @@ -3106,6 +3106,9 @@ name = "semver" version = "1.0.23" source = "registry+https://github.com/rust-lang/crates.io-index" checksum = "61697e0a1c7e512e84a621326239844a24d8207b4669b41bc18b32ea5cbf988b" +dependencies = [ + "serde", +] [[package]] name = "serde" @@ -3299,7 +3302,7 @@ checksum = "1b6b67fb9a61334225b5b790716f609cd58395f895b3fe8b328786812a40bc3b" [[package]] name = "snarkos-account" version = "3.0.0" -source = "git+https://github.com/AleoNet/snarkOS?rev=c6de459#c6de459a31387f1700c859ce3842fed2a228a495" +source = "git+https://github.com/AleoNet/snarkOS?rev=ba41197#ba41197d9eb6b5412e6b451508c069baf26f8a0d" dependencies = [ "anyhow", "colored", @@ -3309,7 +3312,7 @@ dependencies = [ [[package]] name = "snarkos-aot" -version = "0.1.0" +version = "0.1.1" dependencies = [ "aleo-std", "anyhow", @@ -3353,7 +3356,7 @@ dependencies = [ [[package]] name = "snarkos-node" version = "3.0.0" -source = "git+https://github.com/AleoNet/snarkOS?rev=c6de459#c6de459a31387f1700c859ce3842fed2a228a495" +source = "git+https://github.com/AleoNet/snarkOS?rev=ba41197#ba41197d9eb6b5412e6b451508c069baf26f8a0d" dependencies = [ "aleo-std", "anyhow", @@ -3361,6 +3364,7 @@ dependencies = [ "colored", "futures-util", "indexmap 2.6.0", + "lru", "num_cpus", "once_cell", "parking_lot 0.12.3", @@ -3386,7 +3390,7 @@ dependencies = [ [[package]] name = "snarkos-node-bft" version = "3.0.0" -source = "git+https://github.com/AleoNet/snarkOS?rev=c6de459#c6de459a31387f1700c859ce3842fed2a228a495" +source = "git+https://github.com/AleoNet/snarkOS?rev=ba41197#ba41197d9eb6b5412e6b451508c069baf26f8a0d" dependencies = [ "aleo-std", "anyhow", @@ -3419,7 +3423,7 @@ dependencies = [ [[package]] name = "snarkos-node-bft-events" version = "3.0.0" -source = "git+https://github.com/AleoNet/snarkOS?rev=c6de459#c6de459a31387f1700c859ce3842fed2a228a495" +source = "git+https://github.com/AleoNet/snarkOS?rev=ba41197#ba41197d9eb6b5412e6b451508c069baf26f8a0d" dependencies = [ "anyhow", "bytes", @@ -3436,7 +3440,7 @@ dependencies = [ [[package]] name = "snarkos-node-bft-ledger-service" version = "3.0.0" -source = "git+https://github.com/AleoNet/snarkOS?rev=c6de459#c6de459a31387f1700c859ce3842fed2a228a495" +source = "git+https://github.com/AleoNet/snarkOS?rev=ba41197#ba41197d9eb6b5412e6b451508c069baf26f8a0d" dependencies = [ "async-trait", "indexmap 2.6.0", @@ -3452,7 +3456,7 @@ dependencies = [ [[package]] name = "snarkos-node-bft-storage-service" version = "3.0.0" -source = "git+https://github.com/AleoNet/snarkOS?rev=c6de459#c6de459a31387f1700c859ce3842fed2a228a495" +source = "git+https://github.com/AleoNet/snarkOS?rev=ba41197#ba41197d9eb6b5412e6b451508c069baf26f8a0d" dependencies = [ "aleo-std", "anyhow", @@ -3466,7 +3470,7 @@ dependencies = [ [[package]] name = "snarkos-node-cdn" version = "3.0.0" -source = "git+https://github.com/AleoNet/snarkOS?rev=c6de459#c6de459a31387f1700c859ce3842fed2a228a495" +source = "git+https://github.com/AleoNet/snarkOS?rev=ba41197#ba41197d9eb6b5412e6b451508c069baf26f8a0d" dependencies = [ "anyhow", "bincode", @@ -3485,7 +3489,7 @@ dependencies = [ [[package]] name = "snarkos-node-consensus" version = "3.0.0" -source = "git+https://github.com/AleoNet/snarkOS?rev=c6de459#c6de459a31387f1700c859ce3842fed2a228a495" +source = "git+https://github.com/AleoNet/snarkOS?rev=ba41197#ba41197d9eb6b5412e6b451508c069baf26f8a0d" dependencies = [ "aleo-std", "anyhow", @@ -3507,7 +3511,7 @@ dependencies = [ [[package]] name = 
"snarkos-node-metrics" version = "3.0.0" -source = "git+https://github.com/AleoNet/snarkOS?rev=c6de459#c6de459a31387f1700c859ce3842fed2a228a495" +source = "git+https://github.com/AleoNet/snarkOS?rev=ba41197#ba41197d9eb6b5412e6b451508c069baf26f8a0d" dependencies = [ "metrics-exporter-prometheus", "parking_lot 0.12.3", @@ -3520,7 +3524,7 @@ dependencies = [ [[package]] name = "snarkos-node-rest" version = "3.0.0" -source = "git+https://github.com/AleoNet/snarkOS?rev=c6de459#c6de459a31387f1700c859ce3842fed2a228a495" +source = "git+https://github.com/AleoNet/snarkOS?rev=ba41197#ba41197d9eb6b5412e6b451508c069baf26f8a0d" dependencies = [ "anyhow", "axum", @@ -3548,7 +3552,7 @@ dependencies = [ [[package]] name = "snarkos-node-router" version = "3.0.0" -source = "git+https://github.com/AleoNet/snarkOS?rev=c6de459#c6de459a31387f1700c859ce3842fed2a228a495" +source = "git+https://github.com/AleoNet/snarkOS?rev=ba41197#ba41197d9eb6b5412e6b451508c069baf26f8a0d" dependencies = [ "anyhow", "async-trait", @@ -3579,7 +3583,7 @@ dependencies = [ [[package]] name = "snarkos-node-router-messages" version = "3.0.0" -source = "git+https://github.com/AleoNet/snarkOS?rev=c6de459#c6de459a31387f1700c859ce3842fed2a228a495" +source = "git+https://github.com/AleoNet/snarkOS?rev=ba41197#ba41197d9eb6b5412e6b451508c069baf26f8a0d" dependencies = [ "anyhow", "bytes", @@ -3597,7 +3601,7 @@ dependencies = [ [[package]] name = "snarkos-node-sync" version = "3.0.0" -source = "git+https://github.com/AleoNet/snarkOS?rev=c6de459#c6de459a31387f1700c859ce3842fed2a228a495" +source = "git+https://github.com/AleoNet/snarkOS?rev=ba41197#ba41197d9eb6b5412e6b451508c069baf26f8a0d" dependencies = [ "anyhow", "indexmap 2.6.0", @@ -3610,6 +3614,7 @@ dependencies = [ "snarkos-node-router", "snarkos-node-sync-communication-service", "snarkos-node-sync-locators", + "snarkos-node-tcp", "snarkvm", "tokio", "tracing", @@ -3618,7 +3623,7 @@ dependencies = [ [[package]] name = "snarkos-node-sync-communication-service" version = "3.0.0" -source = "git+https://github.com/AleoNet/snarkOS?rev=c6de459#c6de459a31387f1700c859ce3842fed2a228a495" +source = "git+https://github.com/AleoNet/snarkOS?rev=ba41197#ba41197d9eb6b5412e6b451508c069baf26f8a0d" dependencies = [ "async-trait", "tokio", @@ -3627,7 +3632,7 @@ dependencies = [ [[package]] name = "snarkos-node-sync-locators" version = "3.0.0" -source = "git+https://github.com/AleoNet/snarkOS?rev=c6de459#c6de459a31387f1700c859ce3842fed2a228a495" +source = "git+https://github.com/AleoNet/snarkOS?rev=ba41197#ba41197d9eb6b5412e6b451508c069baf26f8a0d" dependencies = [ "anyhow", "indexmap 2.6.0", @@ -3639,7 +3644,7 @@ dependencies = [ [[package]] name = "snarkos-node-tcp" version = "3.0.0" -source = "git+https://github.com/AleoNet/snarkOS?rev=c6de459#c6de459a31387f1700c859ce3842fed2a228a495" +source = "git+https://github.com/AleoNet/snarkOS?rev=ba41197#ba41197d9eb6b5412e6b451508c069baf26f8a0d" dependencies = [ "async-trait", "bytes", @@ -3655,7 +3660,7 @@ dependencies = [ [[package]] name = "snarkvm" version = "1.0.0" -source = "git+https://github.com/AleoNet/snarkVM?rev=4eb83d7#4eb83d7f7276514baa9c44b920155750cfe855e7" +source = "git+https://github.com/AleoNet/snarkVM?rev=1de86e7#1de86e7d09b91dd1a78042053697dea80f600d87" dependencies = [ "anstyle", "anyhow", @@ -3686,7 +3691,7 @@ dependencies = [ [[package]] name = "snarkvm-algorithms" version = "1.0.0" -source = "git+https://github.com/AleoNet/snarkVM?rev=4eb83d7#4eb83d7f7276514baa9c44b920155750cfe855e7" +source = 
"git+https://github.com/AleoNet/snarkVM?rev=1de86e7#1de86e7d09b91dd1a78042053697dea80f600d87" dependencies = [ "aleo-std", "anyhow", @@ -3717,7 +3722,7 @@ dependencies = [ [[package]] name = "snarkvm-algorithms-cuda" version = "1.0.0" -source = "git+https://github.com/AleoNet/snarkVM?rev=4eb83d7#4eb83d7f7276514baa9c44b920155750cfe855e7" +source = "git+https://github.com/AleoNet/snarkVM?rev=1de86e7#1de86e7d09b91dd1a78042053697dea80f600d87" dependencies = [ "blst", "cc", @@ -3728,7 +3733,7 @@ dependencies = [ [[package]] name = "snarkvm-circuit" version = "1.0.0" -source = "git+https://github.com/AleoNet/snarkVM?rev=4eb83d7#4eb83d7f7276514baa9c44b920155750cfe855e7" +source = "git+https://github.com/AleoNet/snarkVM?rev=1de86e7#1de86e7d09b91dd1a78042053697dea80f600d87" dependencies = [ "snarkvm-circuit-account", "snarkvm-circuit-algorithms", @@ -3742,7 +3747,7 @@ dependencies = [ [[package]] name = "snarkvm-circuit-account" version = "1.0.0" -source = "git+https://github.com/AleoNet/snarkVM?rev=4eb83d7#4eb83d7f7276514baa9c44b920155750cfe855e7" +source = "git+https://github.com/AleoNet/snarkVM?rev=1de86e7#1de86e7d09b91dd1a78042053697dea80f600d87" dependencies = [ "snarkvm-circuit-algorithms", "snarkvm-circuit-network", @@ -3753,7 +3758,7 @@ dependencies = [ [[package]] name = "snarkvm-circuit-algorithms" version = "1.0.0" -source = "git+https://github.com/AleoNet/snarkVM?rev=4eb83d7#4eb83d7f7276514baa9c44b920155750cfe855e7" +source = "git+https://github.com/AleoNet/snarkVM?rev=1de86e7#1de86e7d09b91dd1a78042053697dea80f600d87" dependencies = [ "snarkvm-circuit-types", "snarkvm-console-algorithms", @@ -3763,7 +3768,7 @@ dependencies = [ [[package]] name = "snarkvm-circuit-collections" version = "1.0.0" -source = "git+https://github.com/AleoNet/snarkVM?rev=4eb83d7#4eb83d7f7276514baa9c44b920155750cfe855e7" +source = "git+https://github.com/AleoNet/snarkVM?rev=1de86e7#1de86e7d09b91dd1a78042053697dea80f600d87" dependencies = [ "snarkvm-circuit-algorithms", "snarkvm-circuit-types", @@ -3773,7 +3778,7 @@ dependencies = [ [[package]] name = "snarkvm-circuit-environment" version = "1.0.0" -source = "git+https://github.com/AleoNet/snarkVM?rev=4eb83d7#4eb83d7f7276514baa9c44b920155750cfe855e7" +source = "git+https://github.com/AleoNet/snarkVM?rev=1de86e7#1de86e7d09b91dd1a78042053697dea80f600d87" dependencies = [ "indexmap 2.6.0", "itertools 0.11.0", @@ -3791,12 +3796,12 @@ dependencies = [ [[package]] name = "snarkvm-circuit-environment-witness" version = "1.0.0" -source = "git+https://github.com/AleoNet/snarkVM?rev=4eb83d7#4eb83d7f7276514baa9c44b920155750cfe855e7" +source = "git+https://github.com/AleoNet/snarkVM?rev=1de86e7#1de86e7d09b91dd1a78042053697dea80f600d87" [[package]] name = "snarkvm-circuit-network" version = "1.0.0" -source = "git+https://github.com/AleoNet/snarkVM?rev=4eb83d7#4eb83d7f7276514baa9c44b920155750cfe855e7" +source = "git+https://github.com/AleoNet/snarkVM?rev=1de86e7#1de86e7d09b91dd1a78042053697dea80f600d87" dependencies = [ "snarkvm-circuit-algorithms", "snarkvm-circuit-collections", @@ -3807,7 +3812,7 @@ dependencies = [ [[package]] name = "snarkvm-circuit-program" version = "1.0.0" -source = "git+https://github.com/AleoNet/snarkVM?rev=4eb83d7#4eb83d7f7276514baa9c44b920155750cfe855e7" +source = "git+https://github.com/AleoNet/snarkVM?rev=1de86e7#1de86e7d09b91dd1a78042053697dea80f600d87" dependencies = [ "paste", "snarkvm-circuit-account", @@ -3822,7 +3827,7 @@ dependencies = [ [[package]] name = "snarkvm-circuit-types" version = "1.0.0" -source = 
"git+https://github.com/AleoNet/snarkVM?rev=4eb83d7#4eb83d7f7276514baa9c44b920155750cfe855e7" +source = "git+https://github.com/AleoNet/snarkVM?rev=1de86e7#1de86e7d09b91dd1a78042053697dea80f600d87" dependencies = [ "snarkvm-circuit-environment", "snarkvm-circuit-types-address", @@ -3837,7 +3842,7 @@ dependencies = [ [[package]] name = "snarkvm-circuit-types-address" version = "1.0.0" -source = "git+https://github.com/AleoNet/snarkVM?rev=4eb83d7#4eb83d7f7276514baa9c44b920155750cfe855e7" +source = "git+https://github.com/AleoNet/snarkVM?rev=1de86e7#1de86e7d09b91dd1a78042053697dea80f600d87" dependencies = [ "snarkvm-circuit-environment", "snarkvm-circuit-types-boolean", @@ -3850,7 +3855,7 @@ dependencies = [ [[package]] name = "snarkvm-circuit-types-boolean" version = "1.0.0" -source = "git+https://github.com/AleoNet/snarkVM?rev=4eb83d7#4eb83d7f7276514baa9c44b920155750cfe855e7" +source = "git+https://github.com/AleoNet/snarkVM?rev=1de86e7#1de86e7d09b91dd1a78042053697dea80f600d87" dependencies = [ "snarkvm-circuit-environment", "snarkvm-console-types-boolean", @@ -3859,7 +3864,7 @@ dependencies = [ [[package]] name = "snarkvm-circuit-types-field" version = "1.0.0" -source = "git+https://github.com/AleoNet/snarkVM?rev=4eb83d7#4eb83d7f7276514baa9c44b920155750cfe855e7" +source = "git+https://github.com/AleoNet/snarkVM?rev=1de86e7#1de86e7d09b91dd1a78042053697dea80f600d87" dependencies = [ "snarkvm-circuit-environment", "snarkvm-circuit-types-boolean", @@ -3869,7 +3874,7 @@ dependencies = [ [[package]] name = "snarkvm-circuit-types-group" version = "1.0.0" -source = "git+https://github.com/AleoNet/snarkVM?rev=4eb83d7#4eb83d7f7276514baa9c44b920155750cfe855e7" +source = "git+https://github.com/AleoNet/snarkVM?rev=1de86e7#1de86e7d09b91dd1a78042053697dea80f600d87" dependencies = [ "snarkvm-circuit-environment", "snarkvm-circuit-types-boolean", @@ -3881,7 +3886,7 @@ dependencies = [ [[package]] name = "snarkvm-circuit-types-integers" version = "1.0.0" -source = "git+https://github.com/AleoNet/snarkVM?rev=4eb83d7#4eb83d7f7276514baa9c44b920155750cfe855e7" +source = "git+https://github.com/AleoNet/snarkVM?rev=1de86e7#1de86e7d09b91dd1a78042053697dea80f600d87" dependencies = [ "snarkvm-circuit-environment", "snarkvm-circuit-types-boolean", @@ -3893,7 +3898,7 @@ dependencies = [ [[package]] name = "snarkvm-circuit-types-scalar" version = "1.0.0" -source = "git+https://github.com/AleoNet/snarkVM?rev=4eb83d7#4eb83d7f7276514baa9c44b920155750cfe855e7" +source = "git+https://github.com/AleoNet/snarkVM?rev=1de86e7#1de86e7d09b91dd1a78042053697dea80f600d87" dependencies = [ "snarkvm-circuit-environment", "snarkvm-circuit-types-boolean", @@ -3904,7 +3909,7 @@ dependencies = [ [[package]] name = "snarkvm-circuit-types-string" version = "1.0.0" -source = "git+https://github.com/AleoNet/snarkVM?rev=4eb83d7#4eb83d7f7276514baa9c44b920155750cfe855e7" +source = "git+https://github.com/AleoNet/snarkVM?rev=1de86e7#1de86e7d09b91dd1a78042053697dea80f600d87" dependencies = [ "snarkvm-circuit-environment", "snarkvm-circuit-types-boolean", @@ -3916,7 +3921,7 @@ dependencies = [ [[package]] name = "snarkvm-console" version = "1.0.0" -source = "git+https://github.com/AleoNet/snarkVM?rev=4eb83d7#4eb83d7f7276514baa9c44b920155750cfe855e7" +source = "git+https://github.com/AleoNet/snarkVM?rev=1de86e7#1de86e7d09b91dd1a78042053697dea80f600d87" dependencies = [ "snarkvm-console-account", "snarkvm-console-algorithms", @@ -3929,7 +3934,7 @@ dependencies = [ [[package]] name = "snarkvm-console-account" version = "1.0.0" -source = 
"git+https://github.com/AleoNet/snarkVM?rev=4eb83d7#4eb83d7f7276514baa9c44b920155750cfe855e7" +source = "git+https://github.com/AleoNet/snarkVM?rev=1de86e7#1de86e7d09b91dd1a78042053697dea80f600d87" dependencies = [ "bs58", "snarkvm-console-network", @@ -3940,7 +3945,7 @@ dependencies = [ [[package]] name = "snarkvm-console-algorithms" version = "1.0.0" -source = "git+https://github.com/AleoNet/snarkVM?rev=4eb83d7#4eb83d7f7276514baa9c44b920155750cfe855e7" +source = "git+https://github.com/AleoNet/snarkVM?rev=1de86e7#1de86e7d09b91dd1a78042053697dea80f600d87" dependencies = [ "blake2s_simd", "smallvec", @@ -3953,7 +3958,7 @@ dependencies = [ [[package]] name = "snarkvm-console-collections" version = "1.0.0" -source = "git+https://github.com/AleoNet/snarkVM?rev=4eb83d7#4eb83d7f7276514baa9c44b920155750cfe855e7" +source = "git+https://github.com/AleoNet/snarkVM?rev=1de86e7#1de86e7d09b91dd1a78042053697dea80f600d87" dependencies = [ "aleo-std", "rayon", @@ -3964,7 +3969,7 @@ dependencies = [ [[package]] name = "snarkvm-console-network" version = "1.0.0" -source = "git+https://github.com/AleoNet/snarkVM?rev=4eb83d7#4eb83d7f7276514baa9c44b920155750cfe855e7" +source = "git+https://github.com/AleoNet/snarkVM?rev=1de86e7#1de86e7d09b91dd1a78042053697dea80f600d87" dependencies = [ "anyhow", "indexmap 2.6.0", @@ -3987,7 +3992,7 @@ dependencies = [ [[package]] name = "snarkvm-console-network-environment" version = "1.0.0" -source = "git+https://github.com/AleoNet/snarkVM?rev=4eb83d7#4eb83d7f7276514baa9c44b920155750cfe855e7" +source = "git+https://github.com/AleoNet/snarkVM?rev=1de86e7#1de86e7d09b91dd1a78042053697dea80f600d87" dependencies = [ "anyhow", "bech32", @@ -4005,7 +4010,7 @@ dependencies = [ [[package]] name = "snarkvm-console-program" version = "1.0.0" -source = "git+https://github.com/AleoNet/snarkVM?rev=4eb83d7#4eb83d7f7276514baa9c44b920155750cfe855e7" +source = "git+https://github.com/AleoNet/snarkVM?rev=1de86e7#1de86e7d09b91dd1a78042053697dea80f600d87" dependencies = [ "enum-iterator", "enum_index", @@ -4027,7 +4032,7 @@ dependencies = [ [[package]] name = "snarkvm-console-types" version = "1.0.0" -source = "git+https://github.com/AleoNet/snarkVM?rev=4eb83d7#4eb83d7f7276514baa9c44b920155750cfe855e7" +source = "git+https://github.com/AleoNet/snarkVM?rev=1de86e7#1de86e7d09b91dd1a78042053697dea80f600d87" dependencies = [ "snarkvm-console-network-environment", "snarkvm-console-types-address", @@ -4042,7 +4047,7 @@ dependencies = [ [[package]] name = "snarkvm-console-types-address" version = "1.0.0" -source = "git+https://github.com/AleoNet/snarkVM?rev=4eb83d7#4eb83d7f7276514baa9c44b920155750cfe855e7" +source = "git+https://github.com/AleoNet/snarkVM?rev=1de86e7#1de86e7d09b91dd1a78042053697dea80f600d87" dependencies = [ "snarkvm-console-network-environment", "snarkvm-console-types-boolean", @@ -4053,7 +4058,7 @@ dependencies = [ [[package]] name = "snarkvm-console-types-boolean" version = "1.0.0" -source = "git+https://github.com/AleoNet/snarkVM?rev=4eb83d7#4eb83d7f7276514baa9c44b920155750cfe855e7" +source = "git+https://github.com/AleoNet/snarkVM?rev=1de86e7#1de86e7d09b91dd1a78042053697dea80f600d87" dependencies = [ "snarkvm-console-network-environment", ] @@ -4061,7 +4066,7 @@ dependencies = [ [[package]] name = "snarkvm-console-types-field" version = "1.0.0" -source = "git+https://github.com/AleoNet/snarkVM?rev=4eb83d7#4eb83d7f7276514baa9c44b920155750cfe855e7" +source = "git+https://github.com/AleoNet/snarkVM?rev=1de86e7#1de86e7d09b91dd1a78042053697dea80f600d87" dependencies = [ 
"snarkvm-console-network-environment", "snarkvm-console-types-boolean", @@ -4071,7 +4076,7 @@ dependencies = [ [[package]] name = "snarkvm-console-types-group" version = "1.0.0" -source = "git+https://github.com/AleoNet/snarkVM?rev=4eb83d7#4eb83d7f7276514baa9c44b920155750cfe855e7" +source = "git+https://github.com/AleoNet/snarkVM?rev=1de86e7#1de86e7d09b91dd1a78042053697dea80f600d87" dependencies = [ "snarkvm-console-network-environment", "snarkvm-console-types-boolean", @@ -4082,7 +4087,7 @@ dependencies = [ [[package]] name = "snarkvm-console-types-integers" version = "1.0.0" -source = "git+https://github.com/AleoNet/snarkVM?rev=4eb83d7#4eb83d7f7276514baa9c44b920155750cfe855e7" +source = "git+https://github.com/AleoNet/snarkVM?rev=1de86e7#1de86e7d09b91dd1a78042053697dea80f600d87" dependencies = [ "snarkvm-console-network-environment", "snarkvm-console-types-boolean", @@ -4093,7 +4098,7 @@ dependencies = [ [[package]] name = "snarkvm-console-types-scalar" version = "1.0.0" -source = "git+https://github.com/AleoNet/snarkVM?rev=4eb83d7#4eb83d7f7276514baa9c44b920155750cfe855e7" +source = "git+https://github.com/AleoNet/snarkVM?rev=1de86e7#1de86e7d09b91dd1a78042053697dea80f600d87" dependencies = [ "snarkvm-console-network-environment", "snarkvm-console-types-boolean", @@ -4104,7 +4109,7 @@ dependencies = [ [[package]] name = "snarkvm-console-types-string" version = "1.0.0" -source = "git+https://github.com/AleoNet/snarkVM?rev=4eb83d7#4eb83d7f7276514baa9c44b920155750cfe855e7" +source = "git+https://github.com/AleoNet/snarkVM?rev=1de86e7#1de86e7d09b91dd1a78042053697dea80f600d87" dependencies = [ "snarkvm-console-network-environment", "snarkvm-console-types-boolean", @@ -4115,7 +4120,7 @@ dependencies = [ [[package]] name = "snarkvm-curves" version = "1.0.0" -source = "git+https://github.com/AleoNet/snarkVM?rev=4eb83d7#4eb83d7f7276514baa9c44b920155750cfe855e7" +source = "git+https://github.com/AleoNet/snarkVM?rev=1de86e7#1de86e7d09b91dd1a78042053697dea80f600d87" dependencies = [ "rand", "rayon", @@ -4129,7 +4134,7 @@ dependencies = [ [[package]] name = "snarkvm-fields" version = "1.0.0" -source = "git+https://github.com/AleoNet/snarkVM?rev=4eb83d7#4eb83d7f7276514baa9c44b920155750cfe855e7" +source = "git+https://github.com/AleoNet/snarkVM?rev=1de86e7#1de86e7d09b91dd1a78042053697dea80f600d87" dependencies = [ "aleo-std", "anyhow", @@ -4146,7 +4151,7 @@ dependencies = [ [[package]] name = "snarkvm-ledger" version = "1.0.0" -source = "git+https://github.com/AleoNet/snarkVM?rev=4eb83d7#4eb83d7f7276514baa9c44b920155750cfe855e7" +source = "git+https://github.com/AleoNet/snarkVM?rev=1de86e7#1de86e7d09b91dd1a78042053697dea80f600d87" dependencies = [ "aleo-std", "anyhow", @@ -4170,7 +4175,7 @@ dependencies = [ [[package]] name = "snarkvm-ledger-authority" version = "1.0.0" -source = "git+https://github.com/AleoNet/snarkVM?rev=4eb83d7#4eb83d7f7276514baa9c44b920155750cfe855e7" +source = "git+https://github.com/AleoNet/snarkVM?rev=1de86e7#1de86e7d09b91dd1a78042053697dea80f600d87" dependencies = [ "anyhow", "rand", @@ -4182,7 +4187,7 @@ dependencies = [ [[package]] name = "snarkvm-ledger-block" version = "1.0.0" -source = "git+https://github.com/AleoNet/snarkVM?rev=4eb83d7#4eb83d7f7276514baa9c44b920155750cfe855e7" +source = "git+https://github.com/AleoNet/snarkVM?rev=1de86e7#1de86e7d09b91dd1a78042053697dea80f600d87" dependencies = [ "indexmap 2.6.0", "rayon", @@ -4202,7 +4207,7 @@ dependencies = [ [[package]] name = "snarkvm-ledger-committee" version = "1.0.0" -source = 
"git+https://github.com/AleoNet/snarkVM?rev=4eb83d7#4eb83d7f7276514baa9c44b920155750cfe855e7" +source = "git+https://github.com/AleoNet/snarkVM?rev=1de86e7#1de86e7d09b91dd1a78042053697dea80f600d87" dependencies = [ "indexmap 2.6.0", "rayon", @@ -4215,7 +4220,7 @@ dependencies = [ [[package]] name = "snarkvm-ledger-narwhal" version = "1.0.0" -source = "git+https://github.com/AleoNet/snarkVM?rev=4eb83d7#4eb83d7f7276514baa9c44b920155750cfe855e7" +source = "git+https://github.com/AleoNet/snarkVM?rev=1de86e7#1de86e7d09b91dd1a78042053697dea80f600d87" dependencies = [ "snarkvm-ledger-narwhal-batch-certificate", "snarkvm-ledger-narwhal-batch-header", @@ -4228,7 +4233,7 @@ dependencies = [ [[package]] name = "snarkvm-ledger-narwhal-batch-certificate" version = "1.0.0" -source = "git+https://github.com/AleoNet/snarkVM?rev=4eb83d7#4eb83d7f7276514baa9c44b920155750cfe855e7" +source = "git+https://github.com/AleoNet/snarkVM?rev=1de86e7#1de86e7d09b91dd1a78042053697dea80f600d87" dependencies = [ "indexmap 2.6.0", "rayon", @@ -4241,7 +4246,7 @@ dependencies = [ [[package]] name = "snarkvm-ledger-narwhal-batch-header" version = "1.0.0" -source = "git+https://github.com/AleoNet/snarkVM?rev=4eb83d7#4eb83d7f7276514baa9c44b920155750cfe855e7" +source = "git+https://github.com/AleoNet/snarkVM?rev=1de86e7#1de86e7d09b91dd1a78042053697dea80f600d87" dependencies = [ "indexmap 2.6.0", "rayon", @@ -4253,7 +4258,7 @@ dependencies = [ [[package]] name = "snarkvm-ledger-narwhal-data" version = "1.0.0" -source = "git+https://github.com/AleoNet/snarkVM?rev=4eb83d7#4eb83d7f7276514baa9c44b920155750cfe855e7" +source = "git+https://github.com/AleoNet/snarkVM?rev=1de86e7#1de86e7d09b91dd1a78042053697dea80f600d87" dependencies = [ "bytes", "serde_json", @@ -4264,7 +4269,7 @@ dependencies = [ [[package]] name = "snarkvm-ledger-narwhal-subdag" version = "1.0.0" -source = "git+https://github.com/AleoNet/snarkVM?rev=4eb83d7#4eb83d7f7276514baa9c44b920155750cfe855e7" +source = "git+https://github.com/AleoNet/snarkVM?rev=1de86e7#1de86e7d09b91dd1a78042053697dea80f600d87" dependencies = [ "indexmap 2.6.0", "rayon", @@ -4279,7 +4284,7 @@ dependencies = [ [[package]] name = "snarkvm-ledger-narwhal-transmission" version = "1.0.0" -source = "git+https://github.com/AleoNet/snarkVM?rev=4eb83d7#4eb83d7f7276514baa9c44b920155750cfe855e7" +source = "git+https://github.com/AleoNet/snarkVM?rev=1de86e7#1de86e7d09b91dd1a78042053697dea80f600d87" dependencies = [ "bytes", "serde_json", @@ -4292,7 +4297,7 @@ dependencies = [ [[package]] name = "snarkvm-ledger-narwhal-transmission-id" version = "1.0.0" -source = "git+https://github.com/AleoNet/snarkVM?rev=4eb83d7#4eb83d7f7276514baa9c44b920155750cfe855e7" +source = "git+https://github.com/AleoNet/snarkVM?rev=1de86e7#1de86e7d09b91dd1a78042053697dea80f600d87" dependencies = [ "snarkvm-console", "snarkvm-ledger-puzzle", @@ -4301,7 +4306,7 @@ dependencies = [ [[package]] name = "snarkvm-ledger-puzzle" version = "1.0.0" -source = "git+https://github.com/AleoNet/snarkVM?rev=4eb83d7#4eb83d7f7276514baa9c44b920155750cfe855e7" +source = "git+https://github.com/AleoNet/snarkVM?rev=1de86e7#1de86e7d09b91dd1a78042053697dea80f600d87" dependencies = [ "aleo-std", "anyhow", @@ -4321,7 +4326,7 @@ dependencies = [ [[package]] name = "snarkvm-ledger-puzzle-epoch" version = "1.0.0" -source = "git+https://github.com/AleoNet/snarkVM?rev=4eb83d7#4eb83d7f7276514baa9c44b920155750cfe855e7" +source = "git+https://github.com/AleoNet/snarkVM?rev=1de86e7#1de86e7d09b91dd1a78042053697dea80f600d87" dependencies = [ "aleo-std", "anyhow", @@ 
-4342,7 +4347,7 @@ dependencies = [ [[package]] name = "snarkvm-ledger-query" version = "1.0.0" -source = "git+https://github.com/AleoNet/snarkVM?rev=4eb83d7#4eb83d7f7276514baa9c44b920155750cfe855e7" +source = "git+https://github.com/AleoNet/snarkVM?rev=1de86e7#1de86e7d09b91dd1a78042053697dea80f600d87" dependencies = [ "async-trait", "reqwest 0.11.27", @@ -4355,7 +4360,7 @@ dependencies = [ [[package]] name = "snarkvm-ledger-store" version = "1.0.0" -source = "git+https://github.com/AleoNet/snarkVM?rev=4eb83d7#4eb83d7f7276514baa9c44b920155750cfe855e7" +source = "git+https://github.com/AleoNet/snarkVM?rev=1de86e7#1de86e7d09b91dd1a78042053697dea80f600d87" dependencies = [ "aleo-std-storage", "anyhow", @@ -4382,7 +4387,7 @@ dependencies = [ [[package]] name = "snarkvm-metrics" version = "1.0.0" -source = "git+https://github.com/AleoNet/snarkVM?rev=4eb83d7#4eb83d7f7276514baa9c44b920155750cfe855e7" +source = "git+https://github.com/AleoNet/snarkVM?rev=1de86e7#1de86e7d09b91dd1a78042053697dea80f600d87" dependencies = [ "metrics", "metrics-exporter-prometheus", @@ -4391,7 +4396,7 @@ dependencies = [ [[package]] name = "snarkvm-parameters" version = "1.0.0" -source = "git+https://github.com/AleoNet/snarkVM?rev=4eb83d7#4eb83d7f7276514baa9c44b920155750cfe855e7" +source = "git+https://github.com/AleoNet/snarkVM?rev=1de86e7#1de86e7d09b91dd1a78042053697dea80f600d87" dependencies = [ "aleo-std", "anyhow", @@ -4416,7 +4421,7 @@ dependencies = [ [[package]] name = "snarkvm-synthesizer" version = "1.0.0" -source = "git+https://github.com/AleoNet/snarkVM?rev=4eb83d7#4eb83d7f7276514baa9c44b920155750cfe855e7" +source = "git+https://github.com/AleoNet/snarkVM?rev=1de86e7#1de86e7d09b91dd1a78042053697dea80f600d87" dependencies = [ "aleo-std", "anyhow", @@ -4447,12 +4452,11 @@ dependencies = [ [[package]] name = "snarkvm-synthesizer-process" version = "1.0.0" -source = "git+https://github.com/AleoNet/snarkVM?rev=4eb83d7#4eb83d7f7276514baa9c44b920155750cfe855e7" +source = "git+https://github.com/AleoNet/snarkVM?rev=1de86e7#1de86e7d09b91dd1a78042053697dea80f600d87" dependencies = [ "aleo-std", "colored", "indexmap 2.6.0", - "lru", "once_cell", "parking_lot 0.12.3", "rand", @@ -4467,13 +4471,12 @@ dependencies = [ "snarkvm-synthesizer-program", "snarkvm-synthesizer-snark", "snarkvm-utilities", - "tracing", ] [[package]] name = "snarkvm-synthesizer-program" version = "1.0.0" -source = "git+https://github.com/AleoNet/snarkVM?rev=4eb83d7#4eb83d7f7276514baa9c44b920155750cfe855e7" +source = "git+https://github.com/AleoNet/snarkVM?rev=1de86e7#1de86e7d09b91dd1a78042053697dea80f600d87" dependencies = [ "indexmap 2.6.0", "paste", @@ -4487,7 +4490,7 @@ dependencies = [ [[package]] name = "snarkvm-synthesizer-snark" version = "1.0.0" -source = "git+https://github.com/AleoNet/snarkVM?rev=4eb83d7#4eb83d7f7276514baa9c44b920155750cfe855e7" +source = "git+https://github.com/AleoNet/snarkVM?rev=1de86e7#1de86e7d09b91dd1a78042053697dea80f600d87" dependencies = [ "bincode", "once_cell", @@ -4500,7 +4503,7 @@ dependencies = [ [[package]] name = "snarkvm-utilities" version = "1.0.0" -source = "git+https://github.com/AleoNet/snarkVM?rev=4eb83d7#4eb83d7f7276514baa9c44b920155750cfe855e7" +source = "git+https://github.com/AleoNet/snarkVM?rev=1de86e7#1de86e7d09b91dd1a78042053697dea80f600d87" dependencies = [ "aleo-std", "anyhow", @@ -4521,7 +4524,7 @@ dependencies = [ [[package]] name = "snarkvm-utilities-derives" version = "1.0.0" -source = "git+https://github.com/AleoNet/snarkVM?rev=4eb83d7#4eb83d7f7276514baa9c44b920155750cfe855e7" +source = 
"git+https://github.com/AleoNet/snarkVM?rev=1de86e7#1de86e7d09b91dd1a78042053697dea80f600d87" dependencies = [ "proc-macro2", "quote 1.0.37", @@ -4530,11 +4533,10 @@ dependencies = [ [[package]] name = "snops" -version = "0.1.0" +version = "0.2.0" dependencies = [ "axum", "bimap", - "bincode", "chrono", "clap", "dashmap 6.1.0", @@ -4552,6 +4554,7 @@ dependencies = [ "rand_chacha", "rayon", "reqwest 0.12.8", + "semver", "serde", "serde_json", "serde_yml", @@ -4574,11 +4577,10 @@ dependencies = [ [[package]] name = "snops-agent" -version = "0.1.0" +version = "0.2.0" dependencies = [ "anyhow", "axum", - "bincode", "chrono", "clap", "dashmap 6.1.0", @@ -4586,6 +4588,7 @@ dependencies = [ "futures-util", "http 1.1.0", "httpdate", + "indexmap 2.6.0", "local-ip-address", "nix", "reqwest 0.12.8", @@ -4613,6 +4616,7 @@ dependencies = [ "anyhow", "chrono", "glob", + "lazysort", "rayon", "serde", "snarkos-node", @@ -4629,17 +4633,23 @@ dependencies = [ "clap", "clap-stdin", "clap_complete", + "futures-util", + "http 1.1.0", "reqwest 0.12.8", + "rustls 0.23.15", + "serde", "serde_json", "snops-common", + "tokio", + "tokio-tungstenite", + "urlencoding", ] [[package]] name = "snops-common" -version = "0.1.0" +version = "0.2.0" dependencies = [ "anyhow", - "bincode", "bytes", "chrono", "clap", diff --git a/Cargo.toml b/Cargo.toml index b8a30609..68b2224b 100644 --- a/Cargo.toml +++ b/Cargo.toml @@ -81,6 +81,8 @@ reqwest = { version = "0.12", default-features = false, features = [ ] } # Can't update this cause snarkos/vm rocksdb = { version = "0.21", default-features = false } +rustls = { version = "0.23.15", features = ["ring"] } +semver = { version = "1.0", features = ["serde"] } serde = { version = "1", default-features = false, features = [ "alloc", "derive", @@ -129,9 +131,9 @@ snops-common = { path = "./crates/common" } # snarkos-node-metrics = { version = "3.0" } # snarkvm = { version = "1.0", features = ["rocks"] } -snarkos-account = { git = "https://github.com/AleoNet/snarkOS", rev = "c6de459" } -snarkos-node = { git = "https://github.com/AleoNet/snarkOS", rev = "c6de459" } -snarkos-node-metrics = { git = "https://github.com/AleoNet/snarkOS", rev = "c6de459" } -snarkvm = { git = "https://github.com/AleoNet/snarkVM", rev = "4eb83d7", default-features = false, features = [ +snarkos-account = { git = "https://github.com/AleoNet/snarkOS", rev = "ba41197" } +snarkos-node = { git = "https://github.com/AleoNet/snarkOS", rev = "ba41197" } +snarkos-node-metrics = { git = "https://github.com/AleoNet/snarkOS", rev = "ba41197" } +snarkvm = { git = "https://github.com/AleoNet/snarkVM", rev = "1de86e7", default-features = false, features = [ "rocks", ] } diff --git a/README.md b/README.md index 99603d85..834dc7fc 100644 --- a/README.md +++ b/README.md @@ -37,7 +37,7 @@ To learn more about `snops` we recommend checking out the mdbook [here](https:// The controlplane is the webserver that communicates to agents how to run snarkOS, or what transactions to execute. -1. In another terminal, build the cli: `cargo install --path ./crates/snops-cli` +1. In another terminal, install the cli: `cargo install --path ./crates/snops-cli`, or build with `cargo xtask build cli` and use from `target/release-big/snops-cli`. The cli is used to interact with the controlplane and manage environments. It provides JSON based output. 
We recommend pairing our cli with [`jq`](https://jqlang.github.io/jq/) when leveraging other scripts and tools @@ -64,15 +64,15 @@ To learn more about `snops` we recommend checking out the mdbook [here](https:// Each of these can be dynamically configured as snarkos nodes. The default agent configuration should connect to a locally operated controlplane. -### Local Isonets +### Local Isolated Networks (Isonets) -This example requires 4 agents and the control plane to be running. +This example requires 4 agents and the control plane to be running. It allows you to run a devnet with a custom genesis block. -1. Start the environment: `snops-cli env prepare specs/test-4-validators.yaml` +1. Start the environment: `snops-cli env apply specs/test-4-validators.yaml` 1. Check the current network height: `snops-cli env height` 1. Look at the latest block: `snops-cli env block` 1. Look at the genesis block: `snops-cli env block 0` -1. Stop the environment: `snops-cli env clean` +1. Stop the environment: `snops-cli env delete` ### Isonet Transfers @@ -156,7 +156,7 @@ Deploying and executing Aleo programs on your isonets is easiest with snops. You `snarkos-aot` provides various CLI tools to help with developing and executing Aleo programs as well as interact with snarkOS ledgers. -Build `snarkos-aot` with: `cargo install --profile release-big -p snarkos-aot`. +Build `snarkos-aot` with: `cargo xtask build aot`. The compiled binary can be found in `target/release-big/snarkos-aot`. Use the `NETWORK` environment variable to specify `mainnet` (default), diff --git a/crates/agent/Cargo.toml b/crates/agent/Cargo.toml index ccab7017..faa84dd1 100644 --- a/crates/agent/Cargo.toml +++ b/crates/agent/Cargo.toml @@ -1,6 +1,6 @@ [package] name = "snops-agent" -version = "0.1.0" +version = "0.2.0" edition = "2021" license = "MIT" description = "A snarkops agent for communicating with snarkos nodes and the control plane" @@ -14,7 +14,6 @@ mangen = ["snops-common/mangen"] [dependencies] anyhow.workspace = true axum = { workspace = true, features = ["http2", "json", "tokio", "ws"] } -bincode.workspace = true chrono.workspace = true clap.workspace = true dashmap.workspace = true @@ -22,9 +21,11 @@ futures.workspace = true futures-util.workspace = true http.workspace = true httpdate.workspace = true +indexmap.workspace = true local-ip-address.workspace = true nix = { workspace = true, features = ["signal"] } reqwest = { workspace = true, features = ["json", "stream"] } +rustls.workspace = true serde_json.workspace = true sha2.workspace = true simple_moving_average.workspace = true @@ -43,4 +44,3 @@ tracing-appender.workspace = true tracing.workspace = true tracing-subscriber.workspace = true url.workspace = true -rustls = { version = "0.23.15", features = ["ring"] } diff --git a/crates/agent/src/api.rs b/crates/agent/src/api.rs index 412e3458..f91f95a9 100644 --- a/crates/agent/src/api.rs +++ b/crates/agent/src/api.rs @@ -12,7 +12,8 @@ use reqwest::IntoUrl; use sha2::{Digest, Sha256}; use snops_common::{ binaries::{BinaryEntry, BinarySource}, - state::TransferStatusUpdate, + rpc::error::ReconcileError, + state::{TransferId, TransferStatusUpdate}, util::sha256_file, }; use tokio::{fs::File, io::AsyncWriteExt}; @@ -24,6 +25,7 @@ const TRANSFER_UPDATE_RATE: Duration = Duration::from_secs(2); /// Download a file. Returns a None if 404. 
pub async fn download_file( + tx_id: TransferId, client: &reqwest::Client, url: impl IntoUrl, to: impl AsRef<Path>, @@ -35,8 +37,7 @@ pub async fn download_file( return Ok(None); } - // create a new transfer - let tx_id = transfers::next_id(); + // start a new transfer transfer_tx.send(( tx_id, TransferStatusUpdate::Start { @@ -98,26 +99,6 @@ pub async fn download_file( Ok(Some((file, sha256, downloaded))) } -pub async fn check_file( - url: impl IntoUrl, - to: &Path, - transfer_tx: TransferTx, -) -> anyhow::Result<()> { - let client = reqwest::Client::new(); - - if !should_download_file(&client, url.as_str(), to, None) - .await - .unwrap_or(true) - { - return Ok(()); - } - - info!("downloading {to:?}"); - download_file(&client, url, to, transfer_tx).await?; - - Ok(()) -} - pub async fn check_binary( binary: &BinaryEntry, base_url: &str, @@ -136,23 +117,30 @@ pub async fn check_binary( // this also checks for sha256 differences, along with last modified time // against the target - if !should_download_file(&client, &source_url, path, Some(binary)) - .await - .unwrap_or(true) - { + let file_issues = get_file_issues( + &client, + &source_url, + path, + binary.size, + binary.sha256.as_deref(), + false, + ) + .await; + + if file_issues.is_ok_and(|issues| issues.is_none()) { // check permissions and ensure 0o755 let perms = path.metadata()?.permissions(); if perms.mode() != 0o755 { tokio::fs::set_permissions(path, std::fs::Permissions::from_mode(0o755)).await?; } - // TODO: check sha256 and size - return Ok(()); } info!("downloading binary update to {}: {binary}", path.display()); - let Some((file, sha256, size)) = download_file(&client, &source_url, path, transfer_tx).await? + let tx_id = transfers::next_id(); + let Some((file, sha256, size)) = + download_file(tx_id, &client, &source_url, path, transfer_tx).await? else { bail!("downloading binary returned 404"); }; @@ -186,47 +174,92 @@ pub async fn check_binary( Ok(()) } -pub async fn should_download_file( +#[derive(Debug)] +pub enum BadFileReason { + /// File is missing + NotFound, + /// File size mismatch + Size, + /// SHA256 mismatch + Sha256, + /// A new version is available based on modified header + Stale, +} + +pub async fn get_file_issues( client: &reqwest::Client, - loc: &str, - path: &Path, - binary: Option<&BinaryEntry>, -) -> anyhow::Result<bool> { - if !path.exists() { - return Ok(true); + src: &str, + dst: &Path, + size: Option<u64>, + sha256: Option<&str>, + offline: bool, +) -> Result<Option<BadFileReason>, ReconcileError> { + if !dst.try_exists().unwrap_or(false) { + return Ok(Some(BadFileReason::NotFound)); } - let meta = tokio::fs::metadata(&path).await?; + let meta = tokio::fs::metadata(&dst) + .await + .map_err(|e| ReconcileError::FileStatError(dst.to_path_buf(), e.to_string()))?; let local_content_length = meta.len(); // if the binary entry is provided, check if the file size and sha256 match - if let Some(binary) = binary { - // file size is incorrect - if binary.size.is_some_and(|s| s != local_content_length) { - return Ok(true); - } + // file size is incorrect + if size.is_some_and(|s| s != local_content_length) { + return Ok(Some(BadFileReason::Size)); + } - // if sha256 is present, only download if the sha256 is different - if let Some(sha256) = binary.sha256.as_ref() { - return Ok(sha256_file(&path.to_path_buf())? 
!= sha256.to_ascii_lowercase()); - } + // if sha256 is present, only download if the sha256 is different + if let Some(sha256) = sha256 { + let bad_sha256 = sha256_file(&dst.to_path_buf()) + .map_err(|e| ReconcileError::FileReadError(dst.to_path_buf(), e.to_string()))? + != sha256.to_ascii_lowercase(); + return Ok(bad_sha256.then_some(BadFileReason::Sha256)); + } + + // if we're offline, don't download + if offline { + return Ok(None); } // check last modified - let res = client.head(loc).send().await?; + let res = client + .head(src) + .send() + .await + .map_err(|e| ReconcileError::HttpError { + method: String::from("HEAD"), + url: src.to_owned(), + error: e.to_string(), + })?; - let Some(last_modified_header) = res.headers().get(http::header::LAST_MODIFIED) else { - return Ok(true); + let Some(last_modified_header) = res + .headers() + .get(http::header::LAST_MODIFIED) + // parse as a string + .and_then(|e| e.to_str().ok()) + else { + return Ok(Some(BadFileReason::Stale)); }; - let Some(content_length_header) = res.headers().get(http::header::CONTENT_LENGTH) else { - return Ok(true); + let Some(remote_content_length) = res + .headers() + .get(http::header::CONTENT_LENGTH) + // parse the header as a u64 + .and_then(|e| e.to_str().ok().and_then(|s| s.parse::<u64>().ok())) + else { + return Ok(Some(BadFileReason::Size)); }; - let remote_last_modified = httpdate::parse_http_date(last_modified_header.to_str()?)?; - let local_last_modified = meta.modified()?; - - let remote_content_length = content_length_header.to_str()?.parse::<u64>()?; - - Ok(remote_last_modified > local_last_modified || remote_content_length != local_content_length) + let remote_last_modified = httpdate::parse_http_date(last_modified_header); + let local_last_modified = meta + .modified() + .map_err(|e| ReconcileError::FileStatError(dst.to_path_buf(), e.to_string()))?; + + let is_stale = remote_last_modified + .map(|res| res > local_last_modified) + .unwrap_or(true); + Ok(is_stale + .then_some(BadFileReason::Stale) + .or_else(|| (remote_content_length != local_content_length).then_some(BadFileReason::Size))) } diff --git a/crates/agent/src/cli.rs b/crates/agent/src/cli.rs index 033b1619..9194a6be 100644 --- a/crates/agent/src/cli.rs +++ b/crates/agent/src/cli.rs @@ -10,9 +10,11 @@ use std::{ use clap::CommandFactory; use clap::Parser; use http::Uri; -use snops_common::state::{AgentId, AgentModeOptions, PortConfig}; +use snops_common::state::{AgentId, AgentModeOptions, NetworkId, PortConfig, StorageId}; use tracing::{info, warn}; +use crate::net; + pub const ENV_ENDPOINT: &str = "SNOPS_ENDPOINT"; pub const ENV_ENDPOINT_DEFAULT: &str = "127.0.0.1:1234"; @@ -119,6 +121,9 @@ impl Cli { let mut query = format!("/agent?mode={}", u8::from(self.modes)); + // Add agent version + query.push_str(&format!("&version={}", env!("CARGO_PKG_VERSION"))); + // add &id= query.push_str(&format!("&id={}", self.id)); @@ -127,13 +132,13 @@ impl Cli { if fs::metadata(file).is_ok() { query.push_str("&local_pk=true"); } else { - warn!("private-key-file flag ignored as the file was not found: {file:?}") + warn!("Private-key-file flag ignored as the file was not found: {file:?}") } } // add &labels= if id is present if let Some(labels) = &self.labels { - info!("using labels: {:?}", labels); + info!("Using labels: {:?}", labels); query.push_str(&format!( "&labels={}", labels @@ -167,4 +172,31 @@ impl Cli { ws_uri, ) } + + pub fn addrs(&self) -> (Vec<IpAddr>, Option<IpAddr>) { + let internal_addrs = match (self.internal, self.external) { // use specified internal address + 
(Some(internal), _) => vec![internal], + // use no internal address if the external address is loopback + (None, Some(external)) if external.is_loopback() => vec![], + // otherwise, get the local network interfaces available to this node + (None, _) => net::get_internal_addrs().expect("failed to get network interfaces"), + }; + + let external_addr = self.external; + if let Some(addr) = external_addr { + info!("Using external addr: {}", addr); + } else { + info!("Skipping external addr"); + } + + (internal_addrs, external_addr) + } + + pub fn storage_path(&self, network: NetworkId, storage_id: StorageId) -> PathBuf { + let mut path = self.path.join("storage"); + path.push(network.to_string()); + path.push(storage_id.to_string()); + path + } } diff --git a/crates/agent/src/client.rs b/crates/agent/src/client.rs new file mode 100644 index 00000000..ea94b9c4 --- /dev/null +++ b/crates/agent/src/client.rs @@ -0,0 +1,234 @@ +use std::{ + sync::Arc, + time::{Duration, Instant}, +}; + +use futures::{SinkExt, StreamExt}; + +use http::{HeaderValue, StatusCode, Uri}; +use snops_common::{ + constant::{ENV_AGENT_KEY, HEADER_AGENT_KEY}, + rpc::{ + control::{agent::AgentService, ControlServiceClient, PING_HEADER}, + RpcTransport, PING_INTERVAL_SEC, PING_LENGTH, + }, +}; +use tarpc::server::Channel; +use tokio::select; +use tokio_tungstenite::{ + connect_async, + tungstenite::{self, client::IntoClientRequest, handshake::client::Request}, +}; +use tracing::{error, info, warn}; + +use crate::{ + rpc::control::{self, AgentRpcServer}, + state::GlobalState, +}; + +pub fn new_ws_request(ws_uri: &Uri, jwt: Option<String>) -> Request { + let mut req = ws_uri.to_owned().into_client_request().unwrap(); + + // attach JWT if we have one + if let Some(jwt) = jwt { + req.headers_mut().insert( + "Authorization", + HeaderValue::from_bytes(format!("Bearer {jwt}").as_bytes()) + .expect("attach authorization header"), + ); + } + + // attach agent key if one is set in env vars + if let Ok(key) = std::env::var(ENV_AGENT_KEY) { + req.headers_mut().insert( + HEADER_AGENT_KEY, + HeaderValue::from_bytes(key.as_bytes()).expect("attach agent key header"), + ); + } + + req +} + +pub async fn ws_connection(ws_req: Request, state: Arc<GlobalState>) { + let (mut stream, _response) = match connect_async(ws_req).await { + Ok(res) => res, + Err(e) => { + match e { + // Ignore connection refused errors, we only care if something interesting is + // causing the connection to fail. + tungstenite::Error::Io(e) if e.kind() == std::io::ErrorKind::ConnectionRefused => { + return + } + // Shutdown the agent if the control plane requires an upgrade + tungstenite::Error::Http(e) if e.status() == StatusCode::UPGRADE_REQUIRED => { + error!("The control plane requires an agent upgrade. 
Shutting down..."); + state.shutdown().await; + return; + } + _ => error!("failed to connect to websocket: {e}"), + } + return; + } + }; + + info!("Connection established with the control plane"); + + // create rpc channels + let (client_response_in, client_transport, mut client_request_out) = RpcTransport::new(); + let (server_request_in, server_transport, mut server_response_out) = RpcTransport::new(); + + // set up the client, facing the control plane + let client = + ControlServiceClient::new(tarpc::client::Config::default(), client_transport).spawn(); + state.client.write().await.replace(client.clone()); + + let start_time = Instant::now(); + let mut interval = tokio::time::interval(Duration::from_secs(PING_INTERVAL_SEC)); + let mut num_pings: u32 = 0; + + // initialize and start the rpc server + let mut server_handle = Box::pin( + tarpc::server::BaseChannel::with_defaults(server_transport).execute( + AgentRpcServer { + client, + state: Arc::clone(&state), + version: env!("CARGO_PKG_VERSION"), + } + .serve(), + ), + ); + + loop { + select! { + _ = interval.tick() => { + // ping payload contains "snops-agent", number of pings, and uptime + let mut payload = Vec::from(PING_HEADER); + payload.extend_from_slice(&num_pings.to_le_bytes()); + payload.extend_from_slice(&start_time.elapsed().as_micros().to_le_bytes()); + + let send = stream.send(tungstenite::Message::Ping(payload)); + if tokio::time::timeout(Duration::from_secs(10), send).await.is_err() { + error!("The connection to the control plane was interrupted while sending ping"); + break + } + } + + // handle outgoing responses + msg = server_response_out.recv() => { + let Some(msg) = msg else { + error!("internal agent RPC channel closed"); + break; + }; + let bin = match snops_common::rpc::codec::encode(&control::MuxedMessageOutgoing::Child(msg)) { + Ok(bin) => bin, + Err(e) => { + error!("failed to serialize response: {e}"); + continue; + } + }; + + let send = stream.send(tungstenite::Message::Binary(bin)); + if tokio::time::timeout(Duration::from_secs(10), send).await.is_err() { + error!("The connection to the control plane was interrupted while sending agent message"); + break; + } + } + + // handle outgoing requests + msg = client_request_out.recv() => { + let Some(msg) = msg else { + error!("internal agent RPC channel closed"); + break; + }; + let bin = match snops_common::rpc::codec::encode(&control::MuxedMessageOutgoing::Parent(msg)) { + Ok(bin) => bin, + Err(e) => { + error!("failed to serialize request: {e}"); + continue; + } + }; + let send = stream.send(tungstenite::Message::Binary(bin)); + if tokio::time::timeout(Duration::from_secs(10), send).await.is_err() { + error!("The connection to the control plane was interrupted while sending control message"); + break; + } + } + + // handle incoming messages + msg = stream.next() => match msg { + Some(Ok(tungstenite::Message::Close(frame))) => { + if let Some(frame) = frame { + info!("The control plane has closed the connection: {frame}"); + } else { + info!("The control plane has closed the connection"); + } + break; + } + + Some(Ok(tungstenite::Message::Pong(payload))) => { + let mut payload = payload.as_slice(); + // check the header + if !payload.starts_with(PING_HEADER) { + warn!("Received a pong payload with an invalid header prefix"); + continue; + } + payload = &payload[PING_HEADER.len()..]; + if payload.len() != PING_LENGTH { + warn!("Received a pong payload with an invalid length {}, expected {PING_LENGTH}", payload.len()); + continue; + } + let (left, right) = 
payload.split_at(size_of::<u32>()); + let ping_index = u32::from_le_bytes(left.try_into().unwrap()); + let _uptime_start = u128::from_le_bytes(right.try_into().unwrap()); + + if ping_index != num_pings { + warn!("Received a pong payload with an invalid index {ping_index}, expected {num_pings}"); + continue; + } + + num_pings += 1; + + // when desired, we can add this as a metric + // let uptime_now = start_time.elapsed().as_micros(); + // let uptime_diff = uptime_now - uptime_start; + } + + Some(Ok(tungstenite::Message::Binary(bin))) => { + let msg = match snops_common::rpc::codec::decode(&bin) { + Ok(msg) => msg, + Err(e) => { + error!("failed to deserialize a message from the control plane: {e}"); + continue; + } + }; + + match msg { + control::MuxedMessageIncoming::Child(msg) => { + if let Err(e) = server_request_in.send(msg) { + error!("internal agent RPC channel closed: {e}"); + break; + } + }, + control::MuxedMessageIncoming::Parent(msg) => { + if let Err(e) = client_response_in.send(msg) { + error!("internal agent RPC channel closed: {e}"); + break; + } + } + } + } + + None | Some(Err(_)) => { + error!("The connection to the control plane was interrupted"); + break; + } + + Some(Ok(o)) => println!("{o:#?}"), + }, + + // handle server requests + Some(r) = server_handle.next() => { + tokio::spawn(r); + } + } + } +} diff --git a/crates/agent/src/db.rs b/crates/agent/src/db.rs index bfbb3952..c432d483 100644 --- a/crates/agent/src/db.rs +++ b/crates/agent/src/db.rs @@ -1,21 +1,44 @@ use std::{ io::{Read, Write}, + net::IpAddr, path::Path, - sync::Mutex, + sync::{Arc, Mutex}, }; +use indexmap::IndexMap; use snops_common::{ - db::{error::DatabaseError, tree::DbTree, Database as DatabaseTrait}, - format::{DataFormat, DataReadError, DataWriteError}, + api::AgentEnvInfo, + db::{ error::DatabaseError, tree::{DbRecords, DbTree}, Database as DatabaseTrait, }, + format::{DataFormat, DataReadError, DataWriteError, PackedUint}, + state::{AgentId, AgentState, EnvId, HeightRequest}, }; +use url::Url; + +use crate::reconcile::state::EnvState; #[derive(Debug, Clone, Copy, Hash, PartialEq, Eq)] #[repr(u8)] pub enum AgentDbString { /// JSON web token of agent. - Jwt, + Jwt = 0, /// Process ID of node. Used to keep track of zombie node processes. - NodePid, + NodePid = 1, + // Url to Loki instance, configured by the endpoint. + LokiUrl = 2, + /// Current state of the agent. + AgentState = 3, + /// Current environment state. + EnvState = 4, + /// Latest stored environment info. + EnvInfo = 5, + /// Agent addresses resolved by the controlplane. + ResolvedAddrs = 6, + /// Last height of the agent state + LastHeight = 7, } impl DataFormat for AgentDbString { @@ -34,6 +57,12 @@ impl DataFormat for AgentDbString { Ok(match u8::read_data(reader, &())? 
{ 0 => Self::Jwt, 1 => Self::NodePid, + 2 => Self::LokiUrl, + 3 => Self::AgentState, + 4 => Self::EnvState, + 5 => Self::EnvInfo, + 6 => Self::ResolvedAddrs, + 7 => Self::LastHeight, _ => return Err(DataReadError::custom("invalid agent DB string type")), }) } @@ -49,18 +78,21 @@ pub struct Database { pub jwt_mutex: Mutex<Option<String>>, pub strings: DbTree<AgentDbString, String>, + pub documents: DbRecords<AgentDbString>, } impl DatabaseTrait for Database { fn open(path: &Path) -> Result<Self, DatabaseError> { let db = sled::open(path)?; let strings = DbTree::new(db.open_tree(b"v1/strings")?); + let documents = DbRecords::new(db.open_tree(b"v1/documents")?); let jwt_mutex = Mutex::new(strings.restore(&AgentDbString::Jwt)?); Ok(Self { db, jwt_mutex, strings, + documents, }) } } @@ -77,4 +109,83 @@ impl Database { *lock = jwt; Ok(()) } + + pub fn set_loki_url(&self, url: Option<Url>) -> Result<(), DatabaseError> { + self.strings + .save_option(&AgentDbString::LokiUrl, url.as_ref()) + } + + pub fn loki_url(&self) -> Option<Url> { + self.strings + .restore(&AgentDbString::LokiUrl) + .ok()? + .and_then(|url| url.parse::<Url>().ok()) + } + + pub fn env_info(&self) -> Result<Option<(EnvId, Arc<AgentEnvInfo>)>, DatabaseError> { + self.documents + .restore(&AgentDbString::EnvInfo) + .map_err(DatabaseError::from) + } + + pub fn set_env_info( + &self, + info: Option<(EnvId, Arc<AgentEnvInfo>)>, + ) -> Result<(), DatabaseError> { + self.documents + .save_option(&AgentDbString::EnvInfo, info.as_ref()) + } + + pub fn agent_state(&self) -> Result<AgentState, DatabaseError> { + Ok(self + .documents + .restore(&AgentDbString::AgentState)? + .unwrap_or_default()) + } + + pub fn set_agent_state(&self, state: &AgentState) -> Result<(), DatabaseError> { + self.documents.save(&AgentDbString::AgentState, state) + } + + pub fn resolved_addrs(&self) -> Result<IndexMap<AgentId, IpAddr>, DatabaseError> { + Ok(self + .documents + .restore(&AgentDbString::ResolvedAddrs)? + .unwrap_or_default()) + } + + pub fn set_resolved_addrs( + &self, + addrs: Option<&IndexMap<AgentId, IpAddr>>, + ) -> Result<(), DatabaseError> { + self.documents + .save_option(&AgentDbString::ResolvedAddrs, addrs) + } + + pub fn env_state(&self) -> Result<Option<EnvState>, DatabaseError> { + self.documents.restore(&AgentDbString::EnvState) + } + + pub fn set_env_state(&self, state: Option<&EnvState>) -> Result<(), DatabaseError> { + self.documents.save_option(&AgentDbString::EnvState, state) + } + + pub fn last_height(&self) -> Result<Option<(usize, HeightRequest)>, DatabaseError> { + Ok(self + .documents + .restore::<(PackedUint, HeightRequest)>(&AgentDbString::LastHeight)? 
+ .map(|(counter, req)| (counter.into(), req))) + } + + pub fn set_last_height( + &self, + height: Option<(usize, HeightRequest)>, + ) -> Result<(), DatabaseError> { + self.documents.save_option( + &AgentDbString::LastHeight, + height + .map(|(counter, req)| (PackedUint::from(counter), req)) + .as_ref(), + ) + } } diff --git a/crates/agent/src/log.rs b/crates/agent/src/log.rs new file mode 100644 index 00000000..9d87492b --- /dev/null +++ b/crates/agent/src/log.rs @@ -0,0 +1,53 @@ +use tracing::level_filters::LevelFilter; +use tracing_appender::non_blocking::WorkerGuard; +use tracing_subscriber::{layer::SubscriberExt, reload, util::SubscriberInitExt, EnvFilter}; + +pub type ReloadHandler = reload::Handle<EnvFilter, tracing_subscriber::Registry>; + +pub fn make_env_filter(level: LevelFilter) -> EnvFilter { + EnvFilter::builder() + .with_env_var("SNOPS_AGENT_LOG") + .with_default_directive(level.into()) + .from_env_lossy() + .add_directive(level.into()) + .add_directive("neli=off".parse().unwrap()) + .add_directive("hyper_util=off".parse().unwrap()) + .add_directive("reqwest=off".parse().unwrap()) + .add_directive("tungstenite=off".parse().unwrap()) + .add_directive("tokio_tungstenite=off".parse().unwrap()) + .add_directive("tarpc::client=ERROR".parse().unwrap()) + .add_directive("tarpc::server=ERROR".parse().unwrap()) +} + +pub fn init_logging() -> (WorkerGuard, ReloadHandler) { + let (stdout, guard) = tracing_appender::non_blocking(std::io::stdout()); + + let output: tracing_subscriber::fmt::Layer< + _, + tracing_subscriber::fmt::format::DefaultFields, + tracing_subscriber::fmt::format::Format, + tracing_appender::non_blocking::NonBlocking, + > = tracing_subscriber::fmt::layer().with_writer(stdout); + + let output = if cfg!(debug_assertions) { + output.with_file(true).with_line_number(true) + } else { + output + }; + + let filter_level = if cfg!(debug_assertions) { + LevelFilter::TRACE + } else { + LevelFilter::INFO + }; + + let (env_filter, reload_handler) = reload::Layer::new(make_env_filter(filter_level)); + + tracing_subscriber::registry() + .with(env_filter) + .with(output) + .try_init() + .unwrap(); + + (guard, reload_handler) +} diff --git a/crates/agent/src/main.rs b/crates/agent/src/main.rs index 2b1b3cae..e8d7d8d0 100644 --- a/crates/agent/src/main.rs +++ b/crates/agent/src/main.rs @@ -1,5 +1,6 @@ mod api; mod cli; +mod client; mod db; mod metrics; mod net; @@ -10,7 +11,6 @@ mod state; mod transfers; use std::{ - mem::size_of, net::Ipv4Addr, sync::{Arc, Mutex}, time::{Duration, Instant}, @@ -18,49 +18,19 @@ use clap::Parser; use cli::Cli; -use futures::SinkExt; use futures_util::stream::{FuturesUnordered, StreamExt}; -use http::HeaderValue; -use rpc::control::{self, AgentRpcServer}; -use snops_common::{ - constant::{ENV_AGENT_KEY, HEADER_AGENT_KEY}, - db::Database, - rpc::{ - control::{agent::AgentService, ControlServiceClient, PING_HEADER}, - RpcTransport, PING_INTERVAL_SEC, PING_LENGTH, - }, - util::OpaqueDebug, -}; -use tarpc::server::Channel; +use log::init_logging; +use reconcile::agent::{AgentStateReconciler, AgentStateReconcilerContext}; +use snops_common::{db::Database, util::OpaqueDebug}; use tokio::{ select, signal::unix::{signal, Signal, SignalKind}, + sync::{mpsc, RwLock}, }; -use tokio_tungstenite::{ - connect_async, - tungstenite::{self, client::IntoClientRequest}, -}; -use tracing::{error, info, level_filters::LevelFilter, warn}; -use tracing_subscriber::{layer::SubscriberExt, reload, util::SubscriberInitExt, EnvFilter}; +use tracing::{error, info}; use crate::state::GlobalState; - -type 
ReloadHandler = reload::Handle; - -fn make_env_filter(level: LevelFilter) -> EnvFilter { - EnvFilter::builder() - .with_env_var("SNOPS_AGENT_LOG") - .with_default_directive(level.into()) - .from_env_lossy() - .add_directive(level.into()) - .add_directive("neli=off".parse().unwrap()) - .add_directive("hyper_util=off".parse().unwrap()) - .add_directive("reqwest=off".parse().unwrap()) - .add_directive("tungstenite=off".parse().unwrap()) - .add_directive("tokio_tungstenite=off".parse().unwrap()) - .add_directive("tarpc::client=ERROR".parse().unwrap()) - .add_directive("tarpc::server=ERROR".parse().unwrap()) -} +mod log; #[tokio::main] async fn main() { @@ -68,78 +38,31 @@ async fn main() { .install_default() .expect("Failed to install rustls crypto provider"); - let (stdout, _guard) = tracing_appender::non_blocking(std::io::stdout()); - let start_time = Instant::now(); - - let output: tracing_subscriber::fmt::Layer< - _, - tracing_subscriber::fmt::format::DefaultFields, - tracing_subscriber::fmt::format::Format, - tracing_appender::non_blocking::NonBlocking, - > = tracing_subscriber::fmt::layer().with_writer(stdout); - - let output = if cfg!(debug_assertions) { - output.with_file(true).with_line_number(true) - } else { - output - }; - - let filter_level = if cfg!(debug_assertions) { - LevelFilter::TRACE - } else { - LevelFilter::INFO - }; - - let (env_filter, reload_handler) = reload::Layer::new(make_env_filter(filter_level)); - - tracing_subscriber::registry() - .with(env_filter) - .with(output) - .try_init() - .unwrap(); - // For documentation purposes will exit after running the command. #[cfg(any(feature = "clipages", feature = "mangen"))] Cli::parse().run(); + + let (_guard, reload_handler) = init_logging(); + let args = Cli::parse(); - let internal_addrs = match (args.internal, args.external) { - // use specified internal address - (Some(internal), _) => vec![internal], - // use no internal address if the external address is loopback - (None, Some(external)) if external.is_loopback() => vec![], - // otherwise, get the local network interfaces available to this node - (None, _) => net::get_internal_addrs().expect("failed to get network interfaces"), - }; - let external_addr = args.external; - if let Some(addr) = external_addr { - info!("using external addr: {}", addr); - } else { - info!("skipping external addr"); - } + let (internal_addrs, external_addr) = args.addrs(); - // get the endpoint let (endpoint, ws_uri) = args.endpoint_and_uri(); - info!("connecting to {endpoint}"); + info!("Using endpoint {endpoint}"); - // create the data directory + // Create the data directory tokio::fs::create_dir_all(&args.path) .await .expect("failed to create data path"); - // open the database + // Open the database let db = db::Database::open(&args.path.join("store")).expect("failed to open database"); - // create rpc channels - let (client_response_in, client_transport, mut client_request_out) = RpcTransport::new(); - let (server_request_in, server_transport, mut server_response_out) = RpcTransport::new(); + let client = Default::default(); - // set up the client, facing the control plane - let client = - ControlServiceClient::new(tarpc::client::Config::default(), client_transport).spawn(); - - // start transfer monitor - let (transfer_tx, transfers) = transfers::start_monitor(client.clone()); + // Start transfer monitor + let (transfer_tx, transfers) = transfers::start_monitor(Arc::clone(&client)); let agent_rpc_listener = tokio::net::TcpListener::bind((Ipv4Addr::LOCALHOST, 0)) .await @@ -149,232 
+72,102 @@ async fn main() { .expect("failed to get status server port") .port(); - // create the client state + let (queue_reconcile_tx, reconcile_requests) = mpsc::channel(5); + + let (shutdown_tx, shutdown_rx) = tokio::sync::oneshot::channel(); + + // Create the client state let state = Arc::new(GlobalState { client, - db: OpaqueDebug(db), _started: Instant::now(), - connected: Mutex::new(Instant::now()), external_addr, internal_addrs, cli: args, endpoint, - loki: Default::default(), - env_info: Default::default(), - agent_state: Default::default(), - reconcilation_handle: Default::default(), - child: Default::default(), - resolved_addrs: Default::default(), + queue_reconcile_tx, + loki: Mutex::new(db.loki_url()), + last_node_status: RwLock::new(None), + env_info: RwLock::new( + db.env_info() + .inspect_err(|e| { + error!("failed to load env info from db: {e}"); + }) + .unwrap_or_default(), + ), + agent_state: RwLock::new( + db.agent_state() + .map(Arc::new) + .inspect_err(|e| { + error!("failed to load agent state from db: {e}"); + }) + .unwrap_or_default(), + ), + resolved_addrs: RwLock::new( + db.resolved_addrs() + .inspect_err(|e| { + error!("failed to load resolved addrs from db: {e}"); + }) + .unwrap_or_default(), + ), metrics: Default::default(), agent_rpc_port, transfer_tx, transfers, node_client: Default::default(), log_level_handler: reload_handler, + db: OpaqueDebug(db), + shutdown: RwLock::new(Some(shutdown_tx)), }); - // start the metrics watcher + // Start the metrics watcher metrics::init(Arc::clone(&state)); - // start the status server + // Start the status server let status_state = Arc::clone(&state); tokio::spawn(async move { - info!("starting status API server on port {agent_rpc_port}"); + info!("Starting status API server on port {agent_rpc_port}"); if let Err(e) = server::start(agent_rpc_listener, status_state).await { error!("status API server crashed: {e:?}"); std::process::exit(1); } }); - // initialize and start the rpc server - let rpc_server = tarpc::server::BaseChannel::with_defaults(server_transport); - tokio::spawn( - rpc_server - .execute( - AgentRpcServer { - state: state.to_owned(), - version: env!("CARGO_PKG_VERSION"), - } - .serve(), - ) - .for_each(|r| async move { - tokio::spawn(r); - }), - ); - - // get the interrupt signals to break the stream connection - let mut interrupt = Signals::new(&[SignalKind::terminate(), SignalKind::interrupt()]); - - 'process: loop { - 'connection: { - let mut req = ws_uri.to_owned().into_client_request().unwrap(); - - // invalidate env info cache - state.env_info.write().await.take(); - - // attach JWT if we have one - if let Some(jwt) = state.db.jwt() { - req.headers_mut().insert( - "Authorization", - HeaderValue::from_bytes(format!("Bearer {jwt}").as_bytes()) - .expect("attach authorization header"), - ); - } - - // attach agent key if one is set in env vars - if let Ok(key) = std::env::var(ENV_AGENT_KEY) { - req.headers_mut().insert( - HEADER_AGENT_KEY, - HeaderValue::from_bytes(key.as_bytes()).expect("attach agent key header"), - ); - } - - let (mut ws_stream, _) = select! 
{ - _ = interrupt.recv_any() => break 'process, - - res = connect_async(req) => match res { - Ok(c) => c, - Err(e) => { - error!("An error occurred establishing the connection: {e}"); - break 'connection; - }, - }, - }; + // Get the interrupt signals to break the stream connection + let mut interrupt = Signals::term_or_interrupt(); - *state.connected.lock().unwrap() = Instant::now(); - - info!("Connection established with the control plane"); - - let mut terminating = false; - let mut interval = tokio::time::interval(Duration::from_secs(PING_INTERVAL_SEC)); - let mut num_pings: u32 = 0; - - 'event: loop { - select! { - // terminate if an interrupt was triggered - _ = interrupt.recv_any() => { - terminating = true; - break 'event; - } - - _ = interval.tick() => { - // ping payload contains "snops-agent", number of pings, and uptime - let mut payload = Vec::from(PING_HEADER); - payload.extend_from_slice(&num_pings.to_le_bytes()); - payload.extend_from_slice(&start_time.elapsed().as_micros().to_le_bytes()); - - let send = ws_stream.send(tungstenite::Message::Ping(payload)); - if tokio::time::timeout(Duration::from_secs(10), send).await.is_err() { - error!("The connection to the control plane was interrupted while sending ping"); - break 'event; - } - } - - // handle outgoing responses - msg = server_response_out.recv() => { - let msg = msg.expect("internal RPC channel closed"); - let bin = bincode::serialize(&control::MuxedMessageOutgoing::Child(msg)).expect("failed to serialize response"); - let send = ws_stream.send(tungstenite::Message::Binary(bin)); - if tokio::time::timeout(Duration::from_secs(10), send).await.is_err() { - error!("The connection to the control plane was interrupted while sending agent message"); - break 'event; - } - } - - // handle outgoing requests - msg = client_request_out.recv() => { - let msg = msg.expect("internal RPC channel closed"); - let bin = bincode::serialize(&control::MuxedMessageOutgoing::Parent(msg)).expect("failed to serialize request"); - let send = ws_stream.send(tungstenite::Message::Binary(bin)); - if tokio::time::timeout(Duration::from_secs(10), send).await.is_err() { - error!("The connection to the control plane was interrupted while sending control message"); - break 'event; - } - } - - // handle incoming messages - msg = ws_stream.next() => match msg { - Some(Ok(tungstenite::Message::Close(frame))) => { - if let Some(frame) = frame { - info!("The control plane has closed the connection: {frame}"); - } else { - info!("The control plane has closed the connection"); - } - break 'event; - } - - Some(Ok(tungstenite::Message::Pong(payload))) => { - let mut payload = payload.as_slice(); - // check the header - if !payload.starts_with(PING_HEADER) { - warn!("Received a pong payload with an invalid header prefix"); - continue; - } - payload = &payload[PING_HEADER.len()..]; - if payload.len() != PING_LENGTH { - warn!("Received a pong payload with an invalid length {}, expected {PING_LENGTH}", payload.len()); - continue; - } - let (left, right) = payload.split_at(size_of::()); - let ping_index = u32::from_le_bytes(left.try_into().unwrap()); - let _uptime_start = u128::from_le_bytes(right.try_into().unwrap()); - - if ping_index != num_pings { - warn!("Received a pong payload with an invalid index {ping_index}, expected {num_pings}"); - continue; - } - - num_pings += 1; - - // when desired, we can add this as a metric - // let uptime_now = start_time.elapsed().as_micros(); - // let uptime_diff = uptime_now - uptime_start; - - } - - 
Some(Ok(tungstenite::Message::Binary(bin))) => { - let msg = match bincode::deserialize(&bin) { - Ok(msg) => msg, - Err(e) => { - error!("failed to deserialize a message from the control plane: {e}"); - continue; - } - }; - - match msg { - control::MuxedMessageIncoming::Child(msg) => server_request_in.send(msg).expect("internal RPC channel closed"), - control::MuxedMessageIncoming::Parent(msg) => client_response_in.send(msg).expect("internal RPC channel closed"), - } - } - - None | Some(Err(_)) => { - error!("The connection to the control plane was interrupted"); - break 'event; - } - - Some(Ok(o)) => println!("{o:#?}"), - }, - }; - } - - if terminating { - break 'process; - } + let state2 = Arc::clone(&state); + tokio::spawn(async move { + loop { + let req = client::new_ws_request(&ws_uri, state2.db.jwt()); + client::ws_connection(req, Arc::clone(&state2)).await; + // Remove the control client + state2.client.write().await.take(); + info!("Attempting to reconnect to the control plane..."); + tokio::time::sleep(Duration::from_secs(5)).await; } + }); - // wait some time before attempting to reconnect - select! { - _ = interrupt.recv_any() => break, + // Root reconciler that walks through configuring the agent. + // The context is mutated while reconciling to keep track of things + // like downloads, ledger manipulations, node command, and more. + let mut root = AgentStateReconciler { + agent_state: state.get_agent_state().await, + state: Arc::clone(&state), + // Recover context from previous state + context: AgentStateReconcilerContext::hydrate(&state.db), + }; - // TODO: dynamic time - _ = tokio::time::sleep(Duration::from_secs(5)) => { - info!("Attempting to reconnect..."); - }, - } + select! { + _ = root.loop_forever(reconcile_requests) => unreachable!(), + _ = interrupt.recv_any() => {}, + _ = shutdown_rx => {}, } - state.node_graceful_shutdown().await; - info!("snops agent has shut down gracefully :)"); + info!("Received interrupt signal, shutting down..."); + if let Some(process) = root.context.process.as_mut() { + process.graceful_shutdown().await; + info!("Agent has shut down gracefully"); + } } struct Signals { @@ -388,6 +181,10 @@ impl Signals { } } + pub fn term_or_interrupt() -> Self { + Self::new(&[SignalKind::terminate(), SignalKind::interrupt()]) + } + async fn recv_any(&mut self) { let mut futs = FuturesUnordered::new(); @@ -398,3 +195,12 @@ impl Signals { futs.next().await; } } + +#[cfg(test)] +mod test { + #[test] + // CI is failing because the agent has no tests + fn test_nothing() { + assert_eq!(1, 1) + } +} diff --git a/crates/agent/src/metrics/mod.rs b/crates/agent/src/metrics/mod.rs index 1dd66b21..2f9e9b2f 100644 --- a/crates/agent/src/metrics/mod.rs +++ b/crates/agent/src/metrics/mod.rs @@ -19,26 +19,20 @@ pub fn init(state: Arc) { tokio::spawn(async move { let mut interval = tokio::time::interval(UPDATE_RATE); let client = reqwest::Client::new(); + let route = format!( + "http://{}/", + SocketAddr::new(state.cli.get_local_ip(), state.cli.ports.metrics) + ); loop { interval.tick().await; - // TODO: this could probably be improved, but we want to avoid scraping metrics - // if the child doesn't exist - if state.child.read().await.is_none() { + if !state.is_node_online() { continue; } - // TODO: maybe this should use bind_addr let metrics_text = 'metrics: { - let response = match client - .get(format!( - "http://{}/", - SocketAddr::new(state.cli.get_local_ip(), state.cli.ports.metrics) - )) - .send() - .await - { + let response = match client.get(&route).send().await 
{ Ok(response) => response, Err(_e) => { break 'metrics Default::default(); diff --git a/crates/agent/src/net.rs b/crates/agent/src/net.rs index fb931d7a..8c18e196 100644 --- a/crates/agent/src/net.rs +++ b/crates/agent/src/net.rs @@ -12,7 +12,7 @@ pub fn get_internal_addrs() -> Result> { // loopback addresses can be used when the networks are calculated // to be the same, but they are not useful for peer to peer comms if ip.is_loopback() { - info!("skipping loopback iface {name}: {ip:?}"); + info!("Skipping loopback iface {name}: {ip:?}"); return None; } @@ -21,11 +21,11 @@ pub fn get_internal_addrs() -> Result> { // these addrs are about as useful as their v4 counterpart if let IpAddr::V6(v6) = ip { if (v6.segments()[0] & 0xffc0) == 0xfe80 { - info!("skipping link-local iface {name}: {ip:?}"); + info!("Skipping link-local iface {name}: {ip:?}"); return None; } } - info!("using iface {name}: {ip:?}"); + info!("Using iface {name}: {ip:?}"); Some(ip) }) .collect()) diff --git a/crates/agent/src/reconcile.rs b/crates/agent/src/reconcile.rs deleted file mode 100644 index 693c593a..00000000 --- a/crates/agent/src/reconcile.rs +++ /dev/null @@ -1,435 +0,0 @@ -use std::{ - collections::BTreeMap, - path::{Path, PathBuf}, -}; - -use snops_checkpoint::{CheckpointHeader, CheckpointManager, RetentionSpan}; -use snops_common::{ - api::{CheckpointMeta, EnvInfo}, - binaries::{BinaryEntry, BinarySource}, - constant::{ - LEDGER_BASE_DIR, LEDGER_PERSIST_DIR, LEDGER_STORAGE_FILE, SNARKOS_FILE, - SNARKOS_GENESIS_FILE, VERSION_FILE, - }, - rpc::error::ReconcileError, - state::{HeightRequest, InternedId, NetworkId, StorageId}, -}; -use tokio::process::Command; -use tracing::{debug, error, info, trace}; - -use crate::{api, state::GlobalState}; - -/// Ensure the correct binary is present for running snarkos -pub async fn ensure_correct_binary( - binary_id: Option, - state: &GlobalState, - info: &EnvInfo, -) -> Result<(), ReconcileError> { - let base_path = &state.cli.path; - - let default_entry = BinaryEntry { - source: BinarySource::Path(PathBuf::from(format!( - "/content/storage/{}/{}/binaries/default", - info.network, info.storage.id - ))), - sha256: None, - size: None, - }; - - // TODO: store binary based on binary id - // download the snarkOS binary - api::check_binary( - info.storage - .binaries - .get(&binary_id.unwrap_or_default()) - .unwrap_or(&default_entry), - &state.endpoint, - &base_path.join(SNARKOS_FILE), - state.transfer_tx(), - ) - .await - .map_err(|e| ReconcileError::BinaryAcquireError(e.to_string()))?; - - Ok(()) -} - -/// Ensure all required files are present in the storage directory -pub async fn check_files( - state: &GlobalState, - info: &EnvInfo, - height: &HeightRequest, -) -> Result<(), ReconcileError> { - let base_path = &state.cli.path; - let storage_id = &info.storage.id; - let network = info.network; - let storage_path = base_path - .join("storage") - .join(network.to_string()) - .join(storage_id.to_string()); - - // create the directory containing the storage files - tokio::fs::create_dir_all(&storage_path) - .await - .map_err(|_| ReconcileError::StorageSetupError("create storage directory".to_string()))?; - - let version_file = storage_path.join(VERSION_FILE); - - // wipe old storage when the version changes - if get_version_from_path(&version_file).await? 
!= Some(info.storage.version) - && storage_path.exists() - { - let _ = tokio::fs::remove_dir_all(&storage_path).await; - } - - std::fs::create_dir_all(&storage_path).map_err(|e| { - error!("failed to create storage directory: {e}"); - ReconcileError::StorageSetupError("create storage directory".to_string()) - })?; - - let genesis_path = storage_path.join(SNARKOS_GENESIS_FILE); - let genesis_url = format!( - "{}/content/storage/{network}/{storage_id}/{SNARKOS_GENESIS_FILE}", - &state.endpoint - ); - let ledger_path = storage_path.join(LEDGER_STORAGE_FILE); - let ledger_url = format!( - "{}/content/storage/{network}/{storage_id}/{LEDGER_STORAGE_FILE}", - &state.endpoint - ); - - // skip genesis download for native genesis storage - if !info.storage.native_genesis { - // download the genesis block - api::check_file(genesis_url, &genesis_path, state.transfer_tx()) - .await - .map_err(|e| { - error!("failed to download {SNARKOS_GENESIS_FILE} from the control plane: {e}"); - ReconcileError::StorageAcquireError(SNARKOS_GENESIS_FILE.to_owned()) - })?; - } - - // don't download - if height.reset() { - info!("skipping ledger check due to 0 height request"); - return Ok(()); - } - - // download the ledger file - api::check_file(ledger_url, &ledger_path, state.transfer_tx()) - .await - .map_err(|e| { - error!("failed to download {SNARKOS_GENESIS_FILE} from the control plane: {e}"); - ReconcileError::StorageAcquireError(LEDGER_STORAGE_FILE.to_owned()) - })?; - - // write the regen version to a "version" file - tokio::fs::write(&version_file, info.storage.version.to_string()) - .await - .map_err(|e| { - error!("failed to write storage version: {e}"); - ReconcileError::StorageSetupError("write storage version".to_string()) - })?; - - Ok(()) -} - -/// Untar the ledger file into the storage directory -pub async fn load_ledger( - state: &GlobalState, - info: &EnvInfo, - height: &HeightRequest, - is_new_env: bool, -) -> Result { - let base_path = &state.cli.path; - let storage_id = &info.storage.id; - let storage_path = base_path - .join("storage") - .join(info.network.to_string()) - .join(storage_id.to_string()); - - // use a persisted directory for the untar when configured - let (untar_base, untar_dir) = if info.storage.persist { - info!("using persisted ledger for {storage_id}"); - (&storage_path, LEDGER_PERSIST_DIR) - } else { - info!("using fresh ledger for {storage_id}"); - (base_path, LEDGER_BASE_DIR) - }; - - let ledger_dir = untar_base.join(untar_dir); - - tokio::fs::create_dir_all(&ledger_dir.join(".aleo")) - .await - .map_err(|_| ReconcileError::StorageSetupError("create local aleo home".to_string()))?; - - // skip the top request if the persisted ledger already exists - // this will prevent the ledger from getting wiped in the next step - if info.storage.persist && height.is_top() && ledger_dir.exists() { - info!("persisted ledger already exists for {storage_id}"); - return Ok(false); - } - - let mut changed = false; - - // If there's a retention policy, load the checkpoint manager - // this is so we can wipe all leftover checkpoints for non-persisted storage - // after resets or new environments - let mut manager = info - .storage - .retention_policy - .clone() - .map(|policy| { - debug!("loading checkpoints from {untar_base:?}..."); - CheckpointManager::load(ledger_dir.clone(), policy).map_err(|e| { - error!("failed to load checkpoints: {e}"); - ReconcileError::CheckpointLoadError - }) - }) - .transpose()?; - - if let Some(manager) = &manager { - info!("discovered checkpoints: {manager}"); - } 
- - // reload the storage if the height is reset or a new environment is created - if height.reset() || is_new_env { - // clean up old storage - if ledger_dir.exists() { - changed = true; - if let Err(err) = tokio::fs::remove_dir_all(&ledger_dir).await { - error!("failed to remove old ledger: {err}"); - } - } - - // cleanup old checkpoints for non-persisted ledgers as they are - // stored in a common location - // - // this also forces the rewind checkpoints to be fetched from the - // control plane - if !info.storage.persist { - if let Some(manager) = manager.as_mut() { - info!("wiping old checkpoints for {storage_id}"); - manager.wipe(); - } - } - } - - let tar_path = storage_path.join(LEDGER_STORAGE_FILE); - - // A reset height will not require untarring the ledger because it is - // created from the genesis block - if is_new_env && !height.reset() && tar_path.exists() { - changed = true; - - // ensure the storage directory exists - tokio::fs::create_dir_all(&ledger_dir) - .await - .map_err(|err| { - error!("failed to create storage directory: {err}"); - ReconcileError::StorageSetupError("create ledger directory".to_string()) - })?; - - trace!("untarring ledger..."); - - // use `tar` to decompress the storage to the untar dir - let status = Command::new("tar") - .current_dir(untar_base) - .arg("xzf") - .arg(&tar_path) - .arg("-C") // the untar_dir must exist. this will extract the contents of the tar to the - // directory - .arg(untar_dir) - .arg("--strip-components") // remove the parent "ledger" directory within the tar - .arg("1") - .kill_on_drop(true) - .spawn() - .map_err(|err| { - error!("failed to spawn tar process: {err}"); - ReconcileError::StorageSetupError("spawn tar process".to_string()) - })? - .wait() - .await - .map_err(|err| { - error!("failed to await tar process: {err}"); - ReconcileError::StorageSetupError("await tar process".to_string()) - })?; - - if !status.success() { - return Err(ReconcileError::StorageSetupError(format!( - "tar failed: {status}" - ))); - } - } - - if matches!(height, HeightRequest::Top | HeightRequest::Absolute(0)) { - return Ok(changed); - } - - // retention policies are required for the rewind operations - let Some(manager) = &manager.as_mut() else { - return Err(ReconcileError::MissingRetentionPolicy); - }; - - // determine which checkpoint to use by the next available height/time - let checkpoint = match height { - HeightRequest::Absolute(block_height) => { - find_checkpoint_by_height(manager, &info.storage.checkpoints, *block_height) - } - HeightRequest::Checkpoint(span) => { - find_checkpoint_by_span(manager, &info.storage.checkpoints, *span) - } - _ => unreachable!("handled by previous match"), - } - .ok_or(ReconcileError::CheckpointAcquireError)?; - - // download checkpoint if necessary, and get the path - let path = checkpoint - .acquire(state, &storage_path, *storage_id, info.network) - .await?; - - // apply the checkpoint to the ledger - let mut command = Command::new(state.cli.path.join(SNARKOS_FILE)); - command - .stdout(std::io::stdout()) - .stderr(std::io::stderr()) - .env("NETWORK", info.network.to_string()) - .arg("ledger") - .arg("--ledger") - .arg(&ledger_dir); - - if !info.storage.native_genesis { - command - .arg("--genesis") - .arg(storage_path.join(SNARKOS_GENESIS_FILE)); - } - - command.arg("checkpoint").arg("apply").arg(path); - - let res = command - .spawn() - .map_err(|e| { - error!("failed to spawn checkpoint apply process: {e}"); - ReconcileError::CheckpointApplyError("spawn checkpoint apply process".to_string()) - 
})? - .wait() - .await - .map_err(|e| { - error!("failed to await checkpoint apply process: {e}"); - ReconcileError::CheckpointApplyError("await checkpoint apply process".to_string()) - })?; - - if !res.success() { - return Err(ReconcileError::CheckpointApplyError(format!( - "checkpoint apply failed: {res}" - ))); - } - - Ok(true) -} - -enum CheckpointSource<'a> { - Manager(&'a CheckpointHeader, &'a PathBuf), - Meta(&'a CheckpointMeta), -} - -impl<'a> CheckpointSource<'a> { - async fn acquire( - self, - state: &GlobalState, - storage_path: &Path, - storage_id: StorageId, - network: NetworkId, - ) -> Result { - Ok(match self { - CheckpointSource::Meta(meta) => { - info!( - "using checkpoint from control plane with height {} and time {}", - meta.height, meta.timestamp - ); - let checkpoint_url = format!( - "{}/content/storage/{network}/{storage_id}/{}", - &state.endpoint, meta.filename - ); - let path = storage_path.join(&meta.filename); - info!("downloading {} from {checkpoint_url}...", meta.filename); - - api::check_file(checkpoint_url, &path, state.transfer_tx()) - .await - .map_err(|e| { - error!( - "failed to download {} from the control plane: {e}", - meta.filename - ); - ReconcileError::StorageAcquireError(meta.filename.clone()) - })?; - - path - } - CheckpointSource::Manager(header, path) => { - info!( - "using checkpoint from manager with height {} and time {}", - header.block_height, - header.time() - ); - path.clone() - } - }) - } -} - -fn find_checkpoint_by_height<'a>( - manager: &'a CheckpointManager, - checkpoints: &'a [CheckpointMeta], - height: u32, -) -> Option> { - let sorted: BTreeMap<_, _> = manager - .checkpoints() - .map(|(c, p)| (c.block_height, CheckpointSource::Manager(c, p))) - .chain( - checkpoints - .iter() - .map(|c| (c.height, CheckpointSource::Meta(c))), - ) - .collect(); - - sorted - .into_iter() - .rev() - .find_map(|(h, c)| if h <= height { Some(c) } else { None }) -} - -fn find_checkpoint_by_span<'a>( - manager: &'a CheckpointManager, - checkpoints: &'a [CheckpointMeta], - span: RetentionSpan, -) -> Option> { - let timestamp = span.as_timestamp()?; - - let sorted: BTreeMap<_, _> = manager - .checkpoints() - .map(|(c, p)| (c.timestamp, CheckpointSource::Manager(c, p))) - .chain( - checkpoints - .iter() - .map(|c| (c.timestamp, CheckpointSource::Meta(c))), - ) - .collect(); - - sorted - .into_iter() - .rev() - .find_map(|(t, c)| if t <= timestamp { Some(c) } else { None }) -} - -async fn get_version_from_path(path: &PathBuf) -> Result, ReconcileError> { - if !path.exists() { - return Ok(None); - } - - let data = tokio::fs::read_to_string(path).await.map_err(|e| { - error!("failed to read storage version: {e}"); - ReconcileError::StorageSetupError("failed to read storage version".to_string()) - })?; - - Ok(data.parse().ok()) -} diff --git a/crates/agent/src/reconcile/address.rs b/crates/agent/src/reconcile/address.rs new file mode 100644 index 00000000..9c73ceb6 --- /dev/null +++ b/crates/agent/src/reconcile/address.rs @@ -0,0 +1,93 @@ +use std::{collections::HashSet, sync::Arc}; + +use snops_common::{ + rpc::error::ReconcileError, + state::{AgentId, AgentPeer, NodeState}, +}; +use tarpc::context; +use tracing::{error, warn}; + +use super::{Reconcile, ReconcileStatus}; +use crate::state::GlobalState; + +/// Given a node state, resolve the addresses of the agent based peers and +/// validators. Non-agent based peers have their addresses within the state +/// already. 
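+/// Resolved addresses are cached in `GlobalState::resolved_addrs` and
+/// persisted with `Database::set_resolved_addrs`, so an agent that restarts
+/// can keep using peer addresses it resolved before losing the controlplane.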
+pub struct AddressResolveReconciler { + pub state: Arc, + pub node: Arc, +} + +impl Reconcile<(), ReconcileError> for AddressResolveReconciler { + async fn reconcile(&mut self) -> Result, ReconcileError> { + let AddressResolveReconciler { state, node } = self; + + // Find agents that do not have cached addresses + let unresolved_addrs: Vec = { + let resolved_addrs = state.resolved_addrs.read().await; + node.peers + .iter() + .chain(node.validators.iter()) + .filter_map(|p| { + if let AgentPeer::Internal(id, _) = p { + (!resolved_addrs.contains_key(id)).then_some(*id) + } else { + None + } + }) + // Ensure we only have unique agent ids (can use itertools down the line) + .collect::>() + .into_iter() + .collect() + }; + + // All addrs have been resolved. + if unresolved_addrs.is_empty() { + return Ok(ReconcileStatus::default()); + } + + let Some(client) = state.client.read().await.clone() else { + warn!("Agent state contains {} addresses that need to be resolved, but client is not connected", unresolved_addrs.len()); + + // Client is offline so new addrs cannot be requested + return Ok(ReconcileStatus::default()); + }; + + // Fetch all unresolved addresses and update the cache + tracing::trace!( + "Need to resolve addrs: {}", + unresolved_addrs + .iter() + .map(|id| id.to_string()) + .collect::>() + .join(",") + ); + + // Resolve the addresses + // TODO: turn this into a background process so the reconcile operation can run + // instantly + let new_addrs = client + .resolve_addrs(context::current(), unresolved_addrs) + .await + .map_err(|e| ReconcileError::RpcError(e.to_string()))? + .map_err(ReconcileError::AddressResolve)?; + + tracing::trace!( + "Resolved new addrs: {}", + new_addrs + .iter() + .map(|(id, addr)| format!("{}: {}", id, addr)) + .collect::>() + .join(", ") + ); + + // Extend the cache with the new addresses + let mut lock = state.resolved_addrs.write().await; + lock.extend(new_addrs); + if let Err(e) = state.db.set_resolved_addrs(Some(&lock)) { + error!("failed to save resolved addrs to db: {e}"); + } + + Ok(ReconcileStatus::default()) + } +} diff --git a/crates/agent/src/reconcile/agent.rs b/crates/agent/src/reconcile/agent.rs new file mode 100644 index 00000000..d645028d --- /dev/null +++ b/crates/agent/src/reconcile/agent.rs @@ -0,0 +1,535 @@ +use std::{ + sync::Arc, + time::{Duration, Instant}, +}; + +use snops_common::{ + api::AgentEnvInfo, + binaries::BinaryEntry, + rpc::error::ReconcileError, + state::{ + AgentState, HeightRequest, NodeState, ReconcileCondition, ReconcileOptions, TransferId, + }, +}; +use tarpc::context; +use tokio::{ + select, + sync::{mpsc::Receiver, Mutex}, + task::AbortHandle, + time::sleep_until, +}; +use tracing::{error, info, trace}; + +use super::{ + command::NodeCommand, + process::ProcessContext, + state::EnvState, + storage::{BinaryReconciler, GenesisReconciler, LedgerModifyResult, StorageVersionReconciler}, + Reconcile, ReconcileStatus, +}; +use crate::{ + db::Database, + reconcile::{ + address::AddressResolveReconciler, default_binary, process::EndProcessReconciler, + storage::LedgerReconciler, + }, + state::GlobalState, +}; + +/// Attempt to reconcile the agent's current state. 
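+/// The desired `AgentState` is compared against the on-disk files and the
+/// running process, and each pass closes part of that gap.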
+/// This will download files and start/stop the node
+pub struct AgentStateReconciler {
+    pub agent_state: Arc<AgentState>,
+    pub state: Arc<GlobalState>,
+    pub context: AgentStateReconcilerContext,
+}
+
+#[derive(Default)]
+pub struct AgentStateReconcilerContext {
+    /// Persisted values that determine if the storage has changed
+    pub env_state: Option<EnvState>,
+    /// The last ledger height that was successfully configured
+    pub ledger_last_height: Option<(usize, HeightRequest)>,
+    // TODO: allow transfers to be interrupted. potentially allow them to be resumed by using the
+    // file range feature.
+    /// Information about active transfers
+    transfers: Option<TransfersContext>,
+    /// Information about the node process
+    pub process: Option<ProcessContext>,
+    pub shutdown_pending: bool,
+}
+
+#[derive(Default)]
+struct TransfersContext {
+    /// Metadata about an active binary transfer
+    binary_transfer: Option<(TransferId, BinaryEntry)>,
+    /// Time the binary was marked as OK
+    binary_ok_at: Option<Instant>,
+
+    /// Metadata about an active genesis block transfer
+    genesis_transfer: Option<TransferId>,
+    /// Time the genesis block was marked as OK
+    genesis_ok_at: Option<Instant>,
+
+    /// The height that is currently being configured
+    ledger_pending_height: Option<(usize, HeightRequest)>,
+
+    /// A handle containing the task that modifies the ledger.
+    /// The mutex is held until the task is complete, and the bool is set to
+    /// true when the task is successful.
+    ledger_modify_handle: Option<(AbortHandle, Arc<Mutex<Option<LedgerModifyResult>>>)>,
+}
+
+impl AgentStateReconcilerContext {
+    pub fn hydrate(db: &Database) -> Self {
+        let ledger_last_height = db
+            .last_height()
+            .inspect_err(|e| error!("failed to restore last height from db: {e}"))
+            .unwrap_or_default();
+        let env_state = db
+            .env_state()
+            .inspect_err(|e| error!("failed to restore env state from db: {e}"))
+            .unwrap_or_default();
+
+        Self {
+            env_state,
+            ledger_last_height,
+            ..Default::default()
+        }
+    }
+}
+
+/// Run a reconciler and return early if a requeue is needed. A condition is
+/// added to the scope when a requeue is needed to provide more context when
+/// monitoring the agent.
+macro_rules! reconcile {
+    ($id:ident, $e:expr) => {
+        reconcile!($id, $e, res => {})
+    };
+    ($id:ident, $e:expr, $v:ident => $rest:expr) => {
+        let $v = $e.reconcile().await?;
+        if $v.is_requeue() {
+            trace!("Requeue needed for {} ({:?}) {:?}", stringify!($id), $v.scopes, $v.conditions);
+            return Ok($v.add_scope(concat!(stringify!($id), "/requeue")));
+        }
+        $rest
+    };
+}
+
+impl AgentStateReconciler {
+    pub async fn loop_forever(
+        &mut self,
+        mut reconcile_requests: Receiver<(Instant, ReconcileOptions)>,
+    ) {
+        let mut err_backoff = 0;
+
+        // The first reconcile is scheduled for 5 seconds after startup.
+        // Connecting to the controlplane will likely trigger a reconcile sooner.
+        let mut next_reconcile_at = Instant::now() + Duration::from_secs(5);
+        let mut next_opts = ReconcileOptions::default();
+
+        // Repeated reconcile loop
+        loop {
+            // Await the next reconcile, allowing for it to be moved up sooner
+            loop {
+                select!
{ + // Replace the next_reconcile_at with the soonest reconcile time + Some((new_reconcile_at, opts)) = reconcile_requests.recv() => { + next_reconcile_at = next_reconcile_at.min(new_reconcile_at); + next_opts = next_opts.union(opts); + }, + _ = sleep_until(next_reconcile_at.into()) => { + break + } + } + } + + // Drain the reconcile request queue + while reconcile_requests.try_recv().is_ok() {} + // Schedule the next reconcile for 1 minute (to periodically check if the node + // went offline) + next_reconcile_at = Instant::now() + Duration::from_secs(60); + + // Update the reconciler with the latest agent state + // This prevents the agent state from changing during reconciliation + self.agent_state = self.state.get_agent_state().await; + + // Clear the env info if refetch_info is set to force it to be fetched again + if next_opts.refetch_info { + self.state.set_env_info(None).await; + } + + // If the agent is forced to shutdown, set the shutdown_pending flag + if next_opts.force_shutdown && self.has_process() { + self.context.shutdown_pending = true; + } + + // If the agent is forced to clear the last height, clear it + if next_opts.clear_last_height { + self.context.ledger_last_height = None; + if let Err(e) = self.state.db.set_last_height(None) { + error!("failed to clear last height from db: {e}"); + } + } + + next_opts = Default::default(); + + trace!("Reconciling agent state..."); + let res = self.reconcile().await; + + // If this reconcile was triggered by a reconcile request, post the status + if let Some(client) = self.state.get_ws_client().await { + let node_is_started = self + .state + .get_node_status() + .await + .is_some_and(|s| s.is_started()); + let res = res + .clone() + .map(|s| s.replace_inner(self.is_node_running() && node_is_started)); + + // TODO: throttle this broadcast + tokio::spawn(async move { + if let Err(e) = client.post_reconcile_status(context::current(), res).await { + error!("failed to post reconcile status: {e}"); + } + }); + } + + match res { + Ok(status) => { + if status.inner.is_some() { + err_backoff = 0; + trace!("Reconcile completed"); + } + if !status.conditions.is_empty() { + trace!("Reconcile conditions: {:?}", status.conditions); + } + if let Some(requeue_after) = status.requeue_after { + trace!("Requeueing after {requeue_after:?}"); + next_reconcile_at = Instant::now() + requeue_after; + } + } + Err(e) => { + error!("failed to reconcile agent state: {e}"); + err_backoff = (err_backoff + 5).min(30); + next_reconcile_at = Instant::now() + Duration::from_secs(err_backoff); + } + } + } + } + + pub async fn reconcile_inventory(&mut self) -> Result, ReconcileError> { + // TODO: cleanup other things + + // End the process if it is running + if let Some(process) = self.context.process.as_mut() { + reconcile!(end_process, EndProcessReconciler(process), res => { + // If the process has exited, clear the process context + if res.inner.is_some() { + self.context.process = None; + self.state.set_node_status(None).await; + self.context.shutdown_pending = false; + } + }); + } + + if let Some(_transfers) = self.context.transfers.as_mut() { + // Clear the env state + self.context.env_state = None; + if let Err(e) = self.state.db.set_env_state(None) { + error!("failed to clear env state from db: {e}"); + } + // Clear the last height + self.context.ledger_last_height = None; + if let Err(e) = self.state.db.set_last_height(None) { + error!("failed to clear last height from db: {e}"); + } + + // TODO: interrupt/kill off pending downloads + + // Destroy the old 
transfers context + self.context.transfers = None; + } + + Ok(ReconcileStatus::default().add_scope("agent_state/inventory")) + } + + pub fn has_process(&self) -> bool { + self.context.process.is_some() + } + + pub fn is_node_running(&mut self) -> bool { + self.context + .process + .as_mut() + .is_some_and(|p| p.is_running()) + } + + pub fn is_shutdown_pending(&self, node: &NodeState, env_info: &AgentEnvInfo) -> bool { + // Ensure the process is running + if !self.has_process() { + return false; + } + + // Node was already marked for shutdown + if self.context.shutdown_pending { + return true; + } + + // Node is now configured to be offline + if !node.online { + info!("Node is marked offline"); + return true; + } + + // Check if the storage version, storage id, or network id has changed + if self + .context + .env_state + .as_ref() + .is_none_or(|e| e.changed(env_info)) + { + info!("Node storage version, storage id, or network id has changed"); + return true; + } + + // Check if the ledger height is not resolved + if self.context.ledger_last_height != Some(node.height) && !node.height.1.is_top() { + info!("Node ledger target height has changed"); + return true; + } + + let default_binary = default_binary(env_info); + let target_binary = env_info + .storage + .binaries + .get(&node.binary.unwrap_or_default()) + .unwrap_or(&default_binary); + + // Check if the binary this node is running is different from the one in storage + if self.context.process.as_ref().is_some_and(|p| { + target_binary + .sha256 + .as_ref() + .is_some_and(|sha256| !p.is_sha256_eq(sha256)) + }) { + info!("Node binary for the running process has changed"); + return true; + } + + // Check if the binary this node is running is different from the one in storage + if self + .context + .transfers + .as_ref() + .and_then(|t| t.binary_transfer.as_ref()) + .is_some_and(|(_, bin)| bin != target_binary) + { + info!("Node binary has changed"); + return true; + } + + false + } +} + +impl Reconcile<(), ReconcileError> for AgentStateReconciler { + async fn reconcile(&mut self) -> Result, ReconcileError> { + let (env_id, node) = match self.agent_state.as_ref() { + AgentState::Inventory => { + return self.reconcile_inventory().await; + } + AgentState::Node(env_id, node) => (env_id, node), + }; + + let env_info = self.state.get_env_info(*env_id).await?; + + // If the node should be torn down because a configuration changed, we need to + // gracefully shut down the node. + if self.is_shutdown_pending(node, &env_info) { + self.context.shutdown_pending = true; + // Unwrap safety - is_shutdown_pending ensures the process exists. + let process = self.context.process.as_mut().unwrap(); + + reconcile!(end_process, EndProcessReconciler(process), res => { + // If the process has exited, clear the process context + if res.inner.is_some() { + self.context.process = None; + self.state.set_node_status(None).await; + self.context.shutdown_pending = false; + } + }); + } + + // node is offline, no need to reconcile + if !node.online { + return Ok(ReconcileStatus::default().add_scope("agent_state/offline")); + } + + let node_arc = Arc::new(*node.clone()); + + // Resolve the addresses of the peers and validators + // This is run before the process is started, as the agent can sometimes have + // new addresses that need to be resolved. + reconcile!( + address_resolve, + AddressResolveReconciler { + node: Arc::clone(&node_arc), + state: Arc::clone(&self.state), + } + ); + + // Reconcile behavior while the node is running... 
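+        // Three cases follow: the process exited (requeue immediately so it
+        // can be restarted), the start command changed (schedule a graceful
+        // shutdown before restarting), or the node is running (report its
+        // startup status).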
+ if let Some(process) = self.context.process.as_mut() { + // If the process has exited, clear the process context + if !process.is_running() { + info!("Node process has exited..."); + self.context.process = None; + + return Ok(ReconcileStatus::empty() + .requeue_after(Duration::ZERO) + .add_scope("agent_state/exited")); + } + + // Accumulate all the fields that are used to derive the command that starts + // the node. + let command = NodeCommand::new( + Arc::clone(&self.state), + node_arc, + *env_id, + Arc::clone(&env_info), + ) + .await?; + + // If the command has changed, restart the process + if process.command != command { + info!("Node command has changed, restarting process..."); + self.context.shutdown_pending = true; + return Ok(ReconcileStatus::empty() + .add_scope("agent_state/command_changed") + .requeue_after(Duration::ZERO)); + } + + // Prevent other reconcilers from running while the node is running + if self.state.is_node_online() { + let Some(node_status) = self.state.get_node_status().await else { + return Ok(ReconcileStatus::empty().add_scope("agent_state/node/booting")); + }; + + let rec = if node_status.is_started() { + ReconcileStatus::default() + } else { + ReconcileStatus::empty() + }; + + return Ok(rec.add_scope(format!("agent_state/node/{}", node_status.label()))); + } + + // If the node is not online, the process is still running, but the node + // has not connected to the controlplane. + // This can happen if the node is still syncing, or if the controlplane + // is not reachable. + return Ok(ReconcileStatus::empty() + .requeue_after(Duration::from_secs(1)) + .add_condition(ReconcileCondition::PendingStartup) + .add_scope("agent_state/node/booting")); + } + + let storage_path = self + .state + .cli + .storage_path(env_info.network, env_info.storage.id); + + // Ensure the storage version is correct, deleting the storage path + // the version changes. + reconcile!( + storage_version, + StorageVersionReconciler(&storage_path, env_info.storage.version), + res => { + if res.inner.is_some() { + trace!("Transfers context cleared due to storage version change"); + self.context.transfers = None; + } + } + ); + + // Initialize the transfers context with the current status + // This happens after the StorageVersionReconciler as storage_version within + // env_state will be guaranteed to match the remote env after it succeeds. 
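+        // In other words, once this point is reached the on-disk storage is
+        // known to match env_info, so the EnvState snapshot written below
+        // should not go stale.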
+ if self.context.transfers.is_none() { + let env_state = EnvState::from(env_info.as_ref()); + if let Err(e) = self.state.db.set_env_state(Some(&env_state)) { + error!("failed to save env state to db: {e}"); + } + self.context.env_state = Some(env_state); + self.context.transfers = Some(Default::default()); + trace!("Cleared transfers state..."); + } + let transfers = self.context.transfers.as_mut().unwrap(); + + // Resolve the genesis block + reconcile!( + genesis, + GenesisReconciler { + state: Arc::clone(&self.state), + env_info: Arc::clone(&env_info), + transfer: &mut transfers.genesis_transfer, + ok_at: &mut transfers.genesis_ok_at, + } + ); + + // Resolve the node's binary + reconcile!( + binary, + BinaryReconciler { + state: Arc::clone(&self.state), + env_info: Arc::clone(&env_info), + node_binary: node.binary, + transfer: &mut transfers.binary_transfer, + ok_at: &mut transfers.binary_ok_at, + } + ); + + reconcile!( + ledger, + LedgerReconciler { + state: Arc::clone(&self.state), + env_info: Arc::clone(&env_info), + modify_handle: &mut transfers.ledger_modify_handle, + target_height: node.height, + last_height: &mut self.context.ledger_last_height, + pending_height: &mut transfers.ledger_pending_height, + } + ); + + // TODO: if possible, use the NodeCommand as configuration for a node service to + // allow running the node outside of the agent + + info!("Starting node process"); + let command = NodeCommand::new( + Arc::clone(&self.state), + node_arc, + *env_id, + Arc::clone(&env_info), + ) + .await?; + + let process = ProcessContext::new(command)?; + // Clear the last node running status (it was shut down) + self.state.set_node_status(None).await; + self.context.process = Some(process); + self.context.shutdown_pending = false; + Ok(ReconcileStatus::empty() + .add_scope("agent_state/node/booting") + .requeue_after(Duration::from_secs(1))) + } +} + +// TODO: large file download behavior (ledgers): +// same as above, except maybe chunk the downloads or + +// TODO: support ledger.aleo.network snapshots: +// https://ledger.aleo.network/mainnet/snapshot/latest.txt +// https://ledger.aleo.network/testnet/snapshot/latest.txt +// https://ledger.aleo.network/canarynet/snapshot/latest.txt diff --git a/crates/agent/src/reconcile/command.rs b/crates/agent/src/reconcile/command.rs new file mode 100644 index 00000000..9a5e376c --- /dev/null +++ b/crates/agent/src/reconcile/command.rs @@ -0,0 +1,197 @@ +use std::{net::IpAddr, ops::Deref, path::PathBuf, process::Stdio, sync::Arc}; + +use indexmap::IndexMap; +use snops_checkpoint::RetentionPolicy; +use snops_common::{ + api::AgentEnvInfo, + constant::{ + LEDGER_BASE_DIR, LEDGER_PERSIST_DIR, NODE_DATA_DIR, SNARKOS_FILE, SNARKOS_GENESIS_FILE, + SNARKOS_LOG_FILE, + }, + rpc::error::ReconcileError, + state::{EnvId, KeyState, NetworkId, NodeKey, NodeState, PortConfig}, +}; +use tokio::process::Command; +use url::Url; + +use crate::state::GlobalState; + +#[derive(Debug, Clone, Eq, PartialEq)] +pub struct NodeCommand { + /// Path to the snarkos binary + pub command_path: PathBuf, + /// If true, do not print stdout + quiet: bool, + /// Environment ID (used in loki) + env_id: EnvId, + /// Node key (drives NETWORK env) + network: NetworkId, + /// Node key (derives node type and loki) + node_key: NodeKey, + /// URL for sending logs to loki + loki: Option, + /// Path to the ledger directory + ledger_path: PathBuf, + /// Path to place the log file + log_path: PathBuf, + /// Path to genesis block. When absent, use the network's genesis block. 
+    genesis_path: Option<PathBuf>,
+    /// Env variables to pass to the node
+    env: IndexMap<String, String>,
+    /// Port to bind the agent's RPC server for node status
+    agent_rpc_port: u16,
+    /// Address to bind the node to
+    bind_addr: IpAddr,
+    /// Port configuration for the node
+    ports: PortConfig,
+    /// Private key to use for the node
+    private_key: Option<String>,
+    /// Path to a file containing the private key
+    private_key_file: Option<PathBuf>,
+    /// Retention policy for the node
+    retention_policy: Option<RetentionPolicy>,
+    /// Resolved peer addresses for the node
+    peers: Vec<String>,
+    /// Resolved validator addresses for the node
+    validators: Vec<String>,
+}
+
+impl NodeCommand {
+    pub async fn new(
+        state: Arc<GlobalState>,
+        node: Arc<NodeState>,
+        env_id: EnvId,
+        env_info: Arc<AgentEnvInfo>,
+    ) -> Result<Self, ReconcileError> {
+        let storage_path = state
+            .cli
+            .storage_path(env_info.network, env_info.storage.id);
+
+        let ledger_path = if env_info.storage.persist {
+            storage_path.join(LEDGER_PERSIST_DIR)
+        } else {
+            let mut dir = state.cli.path.join(NODE_DATA_DIR);
+            dir.push(LEDGER_BASE_DIR);
+            dir
+        };
+
+        Ok(NodeCommand {
+            command_path: state.cli.path.join(SNARKOS_FILE),
+            quiet: state.cli.quiet,
+            env_id,
+            node_key: node.node_key.clone(),
+            loki: state.loki.lock().ok().and_then(|l| l.deref().clone()),
+            ledger_path,
+            log_path: state.cli.path.join(SNARKOS_LOG_FILE),
+            genesis_path: (!env_info.storage.native_genesis)
+                .then(|| storage_path.join(SNARKOS_GENESIS_FILE)),
+            network: env_info.network,
+            env: node.env.clone(),
+            agent_rpc_port: state.agent_rpc_port,
+            bind_addr: state.cli.bind_addr,
+            ports: state.cli.ports,
+            private_key: if let KeyState::Literal(pk) = &node.private_key {
+                Some(pk.clone())
+            } else {
+                None
+            },
+            // Ensure the private key file can be resolved.
+            // This is only reachable when an agent is referred to by its
+            // id in an environment spec.
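+            // Any other key state (e.g. a literal key, which was handled
+            // above) resolves to no private key file here.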
+ private_key_file: if let KeyState::Local = &node.private_key { + Some( + state + .cli + .private_key_file + .clone() + .ok_or(ReconcileError::MissingLocalPrivateKey)?, + ) + } else { + None + }, + peers: state.agentpeers_to_cli(&node.peers).await, + validators: state.agentpeers_to_cli(&node.validators).await, + retention_policy: env_info.storage.retention_policy.clone(), + }) + } + + pub fn build(&self) -> Command { + let mut command = Command::new(&self.command_path); + + // set stdio + if self.quiet { + command.stdout(Stdio::null()); + } else { + command.stdout(std::io::stdout()); + } + command.stderr(std::io::stderr()); + + // add loki URL if one is set + if let Some(loki) = &self.loki { + command + .env( + "SNOPS_LOKI_LABELS", + format!("env_id={},node_key={}", self.env_id, self.node_key), + ) + .arg("--loki") + .arg(loki.as_str()); + } + + // setup the run command + command + .stderr(std::io::stderr()) + .envs(&self.env) + .env("NETWORK", self.network.to_string()) + .env("HOME", &self.ledger_path) + .arg("--log") + .arg(&self.log_path) + .arg("run") + .arg("--agent-rpc-port") + .arg(self.agent_rpc_port.to_string()) + .arg("--type") + .arg(self.node_key.ty.to_string()) + .arg("--ledger") + .arg(&self.ledger_path); + + if let Some(genesis) = &self.genesis_path { + command.arg("--genesis").arg(genesis); + } + + // storage configuration + command + // port configuration + .arg("--bind") + .arg(self.bind_addr.to_string()) + .arg("--bft") + .arg(self.ports.bft.to_string()) + .arg("--rest") + .arg(self.ports.rest.to_string()) + .arg("--metrics") + .arg(self.ports.metrics.to_string()) + .arg("--node") + .arg(self.ports.node.to_string()); + + if let Some(pk) = &self.private_key { + command.env("PRIVATE_KEY", pk); + } + + if let Some(pk_file) = &self.private_key_file { + command.env("PRIVATE_KEY_FILE", pk_file); + } + + // conditionally add retention policy + if let Some(policy) = &self.retention_policy { + command.arg("--retention-policy").arg(policy.to_string()); + } + + if !self.peers.is_empty() { + command.arg("--peers").arg(self.peers.join(",")); + } + + if !self.validators.is_empty() { + command.arg("--validators").arg(self.validators.join(",")); + } + + command + } +} diff --git a/crates/agent/src/reconcile/files.rs b/crates/agent/src/reconcile/files.rs new file mode 100644 index 00000000..d470b01a --- /dev/null +++ b/crates/agent/src/reconcile/files.rs @@ -0,0 +1,277 @@ +use std::{ + os::unix::fs::PermissionsExt, + path::{Path, PathBuf}, + sync::Arc, + time::Duration, +}; + +use chrono::{TimeDelta, Utc}; +use snops_common::{ + api::AgentEnvInfo, + binaries::{BinaryEntry, BinarySource}, + constant::SNARKOS_GENESIS_FILE, + rpc::error::ReconcileError, + state::{ + NetworkId, ReconcileCondition, ReconcileStatus, StorageId, TransferId, TransferStatusUpdate, + }, +}; +use tracing::{error, trace, warn}; +use url::Url; + +use super::Reconcile; +use crate::{ + api::{download_file, get_file_issues}, + state::GlobalState, + transfers, +}; + +pub fn default_binary(info: &AgentEnvInfo) -> BinaryEntry { + BinaryEntry { + source: BinarySource::Path(PathBuf::from(format!( + "/content/storage/{}/{}/binaries/default", + info.network, info.storage.id + ))), + sha256: None, + size: None, + } +} + +pub fn get_genesis_route(endpoint: &str, network: NetworkId, storage_id: StorageId) -> String { + format!("{endpoint}/content/storage/{network}/{storage_id}/{SNARKOS_GENESIS_FILE}") +} + +/// This reconciler creates a directory if it does not exist +pub struct DirectoryReconciler<'a>(pub &'a Path); +impl<'a> 
Reconcile<(), ReconcileError> for DirectoryReconciler<'a> {
+    async fn reconcile(&mut self) -> Result<ReconcileStatus<()>, ReconcileError> {
+        std::fs::create_dir_all(self.0)
+            .map(ReconcileStatus::with)
+            .map_err(|e| ReconcileError::CreateDirectory(self.0.to_path_buf(), e.to_string()))
+    }
+}
+
+/// The FileReconciler will download a file from a URL and place it in a local
+/// directory. It will also check the file's size and sha256 hash if provided,
+/// and set the file's permissions. If the file already exists, it will not be
+/// downloaded again.
+///
+/// The reconciler will return true when the file is ready, and false when the
+/// file cannot be obtained (offline controlplane).
+pub struct FileReconciler {
+    pub state: Arc<GlobalState>,
+    pub src: Url,
+    pub dst: PathBuf,
+    pub offline: bool,
+    pub tx_id: Option<TransferId>,
+    pub permissions: Option<u32>,
+    pub check_sha256: Option<String>,
+    pub check_size: Option<u64>,
+}
+impl FileReconciler {
+    pub fn new(state: Arc<GlobalState>, src: Url, dst: PathBuf) -> Self {
+        Self {
+            state,
+            src,
+            dst,
+            offline: false,
+            tx_id: None,
+            permissions: None,
+            check_sha256: None,
+            check_size: None,
+        }
+    }
+
+    pub fn with_offline(mut self, offline: bool) -> Self {
+        self.offline = offline;
+        self
+    }
+
+    pub fn with_tx_id(mut self, tx_id: Option<TransferId>) -> Self {
+        self.tx_id = tx_id;
+        self
+    }
+
+    pub fn with_binary(mut self, binary: &BinaryEntry) -> Self {
+        self.permissions = Some(0o755);
+        self.check_sha256 = binary.sha256.clone();
+        self.check_size = binary.size;
+        self
+    }
+
+    pub fn check_and_set_mode(&self) -> Result<(), ReconcileError> {
+        // ensure the file has the correct permissions
+        let Some(check_perms) = self.permissions else {
+            return Ok(());
+        };
+
+        let perms = self
+            .dst
+            .metadata()
+            .map_err(|e| ReconcileError::FileStatError(self.dst.clone(), e.to_string()))?
+            .permissions();
+
+        if perms.mode() != check_perms {
+            std::fs::set_permissions(&self.dst, std::fs::Permissions::from_mode(check_perms))
+                .map_err(|e| {
+                    ReconcileError::FilePermissionError(self.dst.clone(), e.to_string())
+                })?;
+        }
+
+        Ok(())
+    }
+}
+
+impl Reconcile<bool, ReconcileError> for FileReconciler {
+    async fn reconcile(&mut self) -> Result<ReconcileStatus<bool>, ReconcileError> {
+        let client = reqwest::Client::new();
+
+        // Create a transfer id if one is not provided
+        if self.tx_id.is_none() {
+            self.tx_id = Some(transfers::next_id());
+        }
+
+        let tx_id = self.tx_id.unwrap();
+
+        // transfer is pending
+        let is_complete = match self.state.transfers.entry(tx_id) {
+            dashmap::Entry::Occupied(occupied_entry) => {
+                let entry = occupied_entry.get();
+
+                if entry.is_pending() {
+                    return Ok(ReconcileStatus::empty()
+                        .add_condition(ReconcileCondition::PendingTransfer {
+                            source: self.src.to_string(),
+                            id: tx_id,
+                        })
+                        .requeue_after(Duration::from_secs(1)));
+                }
+
+                if entry.is_interrupted() {
+                    // if the failure is within the last 60 seconds, requeue
+                    if Utc::now().signed_duration_since(entry.updated_at).abs()
+                        < TimeDelta::seconds(60)
+                    {
+                        return Ok(ReconcileStatus::empty()
+                            .add_condition(ReconcileCondition::InterruptedTransfer {
+                                source: self.src.to_string(),
+                                id: tx_id,
+                                reason: entry.interruption.clone(),
+                            })
+                            .requeue_after(Duration::from_secs(60)));
+                    }
+
+                    // if the failure is older than 60 seconds, remove the pending transfer and
+                    // start over.
+                    occupied_entry.remove();
+                    return Ok(ReconcileStatus::empty()
+                        .add_scope("file/interrupt/restart")
+                        .requeue_after(Duration::from_secs(1)));
+                }
+
+                // entry is complete
+                true
+            }
+            dashmap::Entry::Vacant(_) => false,
+        };
+
+        let file_problems = get_file_issues(
+            &client,
+            self.src.as_str(),
+            self.dst.as_path(),
+            self.check_size,
+            self.check_sha256.as_deref(),
+            self.offline,
+        )
+        .await?;
+
+        // There is an issue with the file being complete and not existing
+        if is_complete && !self.dst.exists() {
+            // Clear the download
+            self.tx_id = None;
+            warn!(
+                "File is complete but does not exist: {} (Problem: {file_problems:?})",
+                self.dst.display()
+            );
+
+            return Ok(ReconcileStatus::empty()
+                .add_condition(ReconcileCondition::MissingFile {
+                    path: self.dst.display().to_string(),
+                })
+                .requeue_after(Duration::from_secs(1)));
+        }
+
+        if is_complete && file_problems.is_some() {
+            warn!(
+                "Complete file has {file_problems:?} problems: {}",
+                self.dst.display()
+            );
+
+            // if the file is complete, but there are issues, requeue
+            if self.dst.exists() {
+                // delete the file
+                tokio::fs::remove_file(&self.dst).await.map_err(|e| {
+                    ReconcileError::DeleteFileError(self.dst.clone(), e.to_string())
+                })?;
+            }
+
+            // Clear the download
+            self.tx_id = None;
+
+            return Ok(ReconcileStatus::empty()
+                .add_condition(ReconcileCondition::MissingFile {
+                    path: self.dst.display().to_string(),
+                })
+                .requeue_after(Duration::from_secs(1)));
+        }
+
+        // Everything is good. Ensure file permissions
+        if file_problems.is_none() {
+            self.check_and_set_mode()?;
+            trace!("File reconcile complete: {}", self.dst.display());
+            return Ok(ReconcileStatus::with(true));
+        }
+
+        // file does not exist and cannot be downloaded right now
+        if !self.dst.exists() && self.offline {
+            return Ok(
+                ReconcileStatus::with(false).add_condition(ReconcileCondition::PendingConnection)
+            );
+        }
+
+        let src = self.src.clone();
+        let dst = self.dst.clone();
+        let transfer_tx = self.state.transfer_tx.clone();
+
+        // download the file
+        let handle = tokio::spawn(async move {
+            download_file(tx_id, &client, src, &dst, transfer_tx)
+                .await
+                // Dropping the File from download_file should close the handle
+                .map(|res| res.is_some())
+        })
+        .abort_handle();
+
+        // update the transfer with the handle (so it can be canceled if necessary)
+        if let Err(e) = self
+            .state
+            .transfer_tx
+            .send((tx_id, TransferStatusUpdate::Handle(handle)))
+        {
+            error!("failed to send transfer handle: {e}");
+        }
+
+        trace!(
+            "Started download of {} to {} via tx_id {tx_id}",
+            self.src,
+            self.dst.display()
+        );
+
+        // transfer is pending - requeue after 1 second with the pending condition
+        Ok(ReconcileStatus::empty()
+            .add_condition(ReconcileCondition::PendingTransfer {
+                source: self.src.to_string(),
+                id: tx_id,
+            })
+            .requeue_after(Duration::from_secs(1)))
+    }
+}
diff --git a/crates/agent/src/reconcile/mod.rs b/crates/agent/src/reconcile/mod.rs
new file mode 100644
index 00000000..2f39407c
--- /dev/null
+++ b/crates/agent/src/reconcile/mod.rs
@@ -0,0 +1,13 @@
+pub mod agent;
+pub mod command;
+mod files;
+pub use files::*;
+use snops_common::state::ReconcileStatus;
+pub mod address;
+pub mod process;
+pub mod state;
+pub mod storage;
+
+pub trait Reconcile<T, E> {
+    async fn reconcile(&mut self) -> Result<ReconcileStatus<T>, E>;
+}
diff --git a/crates/agent/src/reconcile/process.rs b/crates/agent/src/reconcile/process.rs
new file mode 100644
index 00000000..b75bd15d
--- /dev/null
+++ b/crates/agent/src/reconcile/process.rs
@@ -0,0 +1,164 @@
+use std::time::{Duration, Instant};
diff --git a/crates/agent/src/reconcile/process.rs b/crates/agent/src/reconcile/process.rs
new file mode 100644
index 00000000..b75bd15d
--- /dev/null
+++ b/crates/agent/src/reconcile/process.rs
@@ -0,0 +1,164 @@
+use std::time::{Duration, Instant};
+
+use snops_common::{
+    rpc::error::ReconcileError,
+    state::{ReconcileCondition, ReconcileStatus},
+    util::sha256_file,
+};
+use tokio::{process::Child, select};
+use tracing::{error, info};
+
+use super::{command::NodeCommand, Reconcile};
+use crate::state::NODE_GRACEFUL_SHUTDOWN_TIMEOUT;
+
+/// Information about the current process
+pub struct ProcessContext {
+    /// The command used to start the node. If the next command is different,
+    /// the node should be restarted
+    pub command: NodeCommand,
+    /// The child process that is running the node
+    pub child: Child,
+    /// Time the child process was started
+    #[allow(dead_code)]
+    started_at: Instant,
+    /// Time a sigint was sent to the child process
+    sigint_at: Option<Instant>,
+    /// Time a sigkill was sent to the child process
+    sigkill_at: Option<Instant>,
+    /// The sha256 hash of the running binary
+    binary_sha256: String,
+}
+
+impl ProcessContext {
+    pub fn new(command: NodeCommand) -> Result<Self, ReconcileError> {
+        let binary_sha256 = sha256_file(&command.command_path).map_err(|e| {
+            ReconcileError::FileReadError(command.command_path.clone(), e.to_string())
+        })?;
+        command
+            .build()
+            .spawn()
+            .map(|child| Self {
+                command,
+                child,
+                started_at: Instant::now(),
+                sigint_at: None,
+                sigkill_at: None,
+                binary_sha256,
+            })
+            .map_err(|e| {
+                error!("failed to start node process: {e:?}");
+                ReconcileError::SpawnError(e.to_string())
+            })
+    }
+
+    /// Returns true when the child process has not exited
+    pub fn is_running(&mut self) -> bool {
+        // This code is mutable because try_wait modifies the Child. Without
+        // mutability, the current running status would never be updated.
+        self.child.try_wait().is_ok_and(|status| status.is_none())
+    }
+
+    /// Check if the running binary matches the provided sha256 hash
+    pub fn is_sha256_eq(&self, sha256: &str) -> bool {
+        self.binary_sha256 == sha256
+    }
+
+    /// A helper function to gracefully shutdown the node process without
+    /// a reconciler
+    pub async fn graceful_shutdown(&mut self) {
+        if !self.is_running() {
+            return;
+        }
+
+        self.send_sigint();
+
+        select! {
+            _ = tokio::time::sleep(NODE_GRACEFUL_SHUTDOWN_TIMEOUT) => {
+                info!("Sending SIGKILL to node process");
+                self.send_sigkill();
+            },
+            _ = tokio::signal::ctrl_c() => {
+                info!("Received SIGINT, sending SIGKILL to node process");
+                self.send_sigkill();
+            },
+            _ = self.child.wait() => {
+                info!("Node process has exited gracefully");
+                return;
+            }
+        }
+
+        let _ = self.child.wait().await;
+        info!("Node process has exited");
+    }
+
+    /// Send a SIGINT to the child process
+    pub fn send_sigint(&mut self) -> bool {
+        use nix::{
+            sys::signal::{self, Signal},
+            unistd::Pid,
+        };
+
+        // prevent multiple sigints
+        if self.sigint_at.is_some() {
+            return false;
+        }
+
+        // obtain the id, or return false if the child is not running
+        let Some(id) = self.child.id() else {
+            return false;
+        };
+
+        // send SIGINT to the child process
+        signal::kill(Pid::from_raw(id as i32), Signal::SIGINT)
+            .inspect(|_| {
+                // update the sigint time if the sigint was successful
+                self.sigint_at = Some(Instant::now());
+            })
+            .is_ok()
+    }
+
+    /// Send a SIGKILL to the child process
+    pub fn send_sigkill(&mut self) -> bool {
+        // start_kill returns Err if the process is already killed
+        self.child
+            .start_kill()
+            .inspect(|_| {
+                // update the kill time if the kill was successful
+                self.sigkill_at = Some(Instant::now());
+            })
+            .is_ok()
+    }
+}
+
+/// The EndProcessReconciler will return true when the child process has exited.
+/// It will wait NODE_GRACEFUL_SHUTDOWN_TIMEOUT seconds after sending a SIGINT
+/// before sending a SIGKILL (if the child process has not exited).
+pub struct EndProcessReconciler<'a>(pub &'a mut ProcessContext);
+
+impl<'a> Reconcile<(), ReconcileError> for EndProcessReconciler<'a> {
+    async fn reconcile(&mut self) -> Result<ReconcileStatus<()>, ReconcileError> {
+        if !self.0.is_running() {
+            return Ok(ReconcileStatus::default());
+        }
+
+        let Some(sigint_at) = self.0.sigint_at else {
+            if self.0.send_sigint() {
+                info!("Sent SIGINT to node process");
+            }
+            return Ok(ReconcileStatus::empty()
+                .add_condition(ReconcileCondition::PendingShutdown)
+                .requeue_after(Duration::from_secs(1)));
+        };
+
+        if sigint_at.elapsed() > NODE_GRACEFUL_SHUTDOWN_TIMEOUT
+            && self.0.sigkill_at.is_none()
+            && self.0.send_sigkill()
+        {
+            info!("Sent SIGKILL to node process");
+        }
+
+        Ok(ReconcileStatus::empty()
+            .add_condition(ReconcileCondition::PendingShutdown)
+            .requeue_after(Duration::from_secs(1)))
+    }
+}
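The escalation above (SIGINT, wait out the grace period, then SIGKILL) can be reproduced standalone with just tokio and nix. A minimal Unix-only sketch, using the 30-second value `NODE_GRACEFUL_SHUTDOWN_TIMEOUT` is set to elsewhere in this patch; the target command is a placeholder:

```rust
// Standalone sketch of the SIGINT -> timeout -> SIGKILL escalation performed
// by ProcessContext/EndProcessReconciler above. Illustrative only.
use std::time::Duration;

use nix::{
    sys::signal::{self, Signal},
    unistd::Pid,
};
use tokio::process::Command;

#[tokio::main]
async fn main() -> std::io::Result<()> {
    let mut child = Command::new("sleep").arg("120").spawn()?;

    // ask nicely first
    if let Some(id) = child.id() {
        let _ = signal::kill(Pid::from_raw(id as i32), Signal::SIGINT);
    }

    // escalate if the process ignores the signal for too long
    match tokio::time::timeout(Duration::from_secs(30), child.wait()).await {
        Ok(status) => println!("exited gracefully: {:?}", status?),
        Err(_) => {
            child.start_kill()?; // SIGKILL
            println!("killed after timeout: {:?}", child.wait().await?);
        }
    }
    Ok(())
}
```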
diff --git a/crates/agent/src/reconcile/state.rs b/crates/agent/src/reconcile/state.rs
new file mode 100644
index 00000000..cc2a69e0
--- /dev/null
+++ b/crates/agent/src/reconcile/state.rs
@@ -0,0 +1,73 @@
+use snops_common::{
+    api::AgentEnvInfo,
+    format::{DataFormat, DataHeaderOf},
+    state::{NetworkId, StorageId},
+};
+
+pub struct EnvState {
+    network_id: NetworkId,
+    storage_id: StorageId,
+    storage_version: u16,
+}
+
+impl EnvState {
+    pub fn changed(&self, env_info: &AgentEnvInfo) -> bool {
+        env_info.storage.version != self.storage_version
+            || env_info.storage.id != self.storage_id
+            || env_info.network != self.network_id
+    }
+}
+
+impl From<&AgentEnvInfo> for EnvState {
+    fn from(info: &AgentEnvInfo) -> Self {
+        Self {
+            network_id: info.network,
+            storage_id: info.storage.id,
+            storage_version: info.storage.version,
+        }
+    }
+}
+
+impl Default for EnvState {
+    fn default() -> Self {
+        Self {
+            network_id: NetworkId::Mainnet,
+            storage_id: StorageId::default(),
+            storage_version: 0,
+        }
+    }
+}
+
+impl DataFormat for EnvState {
+    type Header = (u8, DataHeaderOf<NetworkId>);
+
+    const LATEST_HEADER: Self::Header = (1u8, NetworkId::LATEST_HEADER);
+
+    fn write_data<W: std::io::Write>(
+        &self,
+        writer: &mut W,
+    ) -> Result<usize, snops_common::format::DataWriteError> {
+        Ok(self.network_id.write_data(writer)?
+            + self.storage_id.write_data(writer)?
+            + self.storage_version.write_data(writer)?)
+    }
+
+    fn read_data<R: std::io::Read>(
+        reader: &mut R,
+        header: &Self::Header,
+    ) -> Result<Self, snops_common::format::DataReadError> {
+        if header.0 != Self::LATEST_HEADER.0 {
+            return Err(snops_common::format::DataReadError::unsupported(
+                "EnvIdentifier",
+                Self::LATEST_HEADER.0,
+                header.0,
+            ));
+        }
+
+        Ok(Self {
+            network_id: NetworkId::read_data(reader, &header.1)?,
+            storage_id: StorageId::read_data(reader, &())?,
+            storage_version: u16::read_data(reader, &())?,
+        })
+    }
+}
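The versioned header is what lets an older on-disk `EnvState` be rejected rather than misread. A hedged round-trip sketch, assuming the `DataFormat` trait shape shown above and that its error types implement `std::error::Error`:

```rust
// Hypothetical round-trip for the EnvState format above: encode with the
// latest header, then decode against that same header. Illustrative only.
use snops_common::format::DataFormat;

use crate::reconcile::state::EnvState;

fn roundtrip(state: &EnvState) -> Result<EnvState, Box<dyn std::error::Error>> {
    let mut buf = Vec::new();
    // write_data returns the number of bytes written
    let written = state.write_data(&mut buf)?;
    assert_eq!(written, buf.len());

    // decoding with a stale header.0 (!= 1) would hit DataReadError::unsupported
    let decoded = EnvState::read_data(&mut buf.as_slice(), &EnvState::LATEST_HEADER)?;
    Ok(decoded)
}
```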
diff --git a/crates/agent/src/reconcile/storage.rs b/crates/agent/src/reconcile/storage.rs
new file mode 100644
index 00000000..90383671
--- /dev/null
+++ b/crates/agent/src/reconcile/storage.rs
@@ -0,0 +1,517 @@
+use std::{
+    path::{Path, PathBuf},
+    sync::Arc,
+    time::{Duration, Instant},
+};
+
+use snops_checkpoint::CheckpointManager;
+use snops_common::{
+    api::AgentEnvInfo,
+    binaries::{BinaryEntry, BinarySource},
+    constant::{
+        LEDGER_BASE_DIR, LEDGER_PERSIST_DIR, NODE_DATA_DIR, SNARKOS_FILE, SNARKOS_GENESIS_FILE,
+        VERSION_FILE,
+    },
+    rpc::error::ReconcileError,
+    state::{HeightRequest, InternedId, ReconcileCondition, ReconcileStatus, TransferId},
+};
+use tokio::{process::Command, sync::Mutex, task::AbortHandle};
+use tracing::{error, info, trace};
+use url::Url;
+
+use super::{default_binary, get_genesis_route, DirectoryReconciler, FileReconciler, Reconcile};
+use crate::state::GlobalState;
+
+/// Download a specific binary file needed to run the node
+pub struct BinaryReconciler<'a> {
+    pub state: Arc<GlobalState>,
+    pub env_info: Arc<AgentEnvInfo>,
+    pub node_binary: Option<InternedId>,
+    /// Metadata about an active binary transfer
+    pub transfer: &'a mut Option<(TransferId, BinaryEntry)>,
+    /// Time the binary was marked as OK
+    pub ok_at: &'a mut Option<Instant>,
+}
+
+impl<'a> Reconcile<(), ReconcileError> for BinaryReconciler<'a> {
+    async fn reconcile(&mut self) -> Result<ReconcileStatus<()>, ReconcileError> {
+        let BinaryReconciler {
+            state,
+            env_info,
+            node_binary,
+            transfer,
+            ok_at,
+        } = self;
+
+        // Binary entry for the node
+        let default_binary = default_binary(env_info);
+        let target_binary = env_info
+            .storage
+            .binaries
+            .get(&node_binary.unwrap_or_default())
+            .unwrap_or(&default_binary);
+
+        // Check if the binary has changed
+        let binary_has_changed = transfer.as_ref().is_none_or(|(_, b)| b != target_binary);
+
+        let dst = state.cli.path.join(SNARKOS_FILE);
+
+        // The binary does not exist and is marked as OK...
+        if ok_at.is_some() && (binary_has_changed || !dst.exists()) {
+            **ok_at = None;
+        }
+
+        // If the binary has not changed and has not expired, we can skip the binary
+        // reconciler
+        if !binary_has_changed && ok_at.is_some() {
+            return Ok(ReconcileStatus::default());
+        }
+        **ok_at = None;
+
+        let src = match &target_binary.source {
+            BinarySource::Url(url) => url.clone(),
+            BinarySource::Path(path) => {
+                let url = format!("{}{}", &state.endpoint, path.display());
+                url.parse::<Url>()
+                    .map_err(|e| ReconcileError::UrlParseError(url, e.to_string()))?
+            }
+        };
+
+        let mut file_rec = FileReconciler::new(Arc::clone(state), src, dst)
+            .with_offline(target_binary.is_api_file() && !state.is_ws_online())
+            .with_binary(target_binary)
+            .with_tx_id(transfer.as_ref().map(|(tx, _)| *tx));
+        let file_res = file_rec.reconcile().await?;
+
+        **transfer = file_rec.tx_id.map(|tx_id| (tx_id, target_binary.clone()));
+
+        // Transfer is pending or a failure occurred
+        if file_res.is_requeue() {
+            return Ok(file_res.emptied().add_scope("file/requeue"));
+        }
+
+        match file_res.inner {
+            // If the binary is OK, update the context
+            Some(true) => {
+                **ok_at = Some(Instant::now());
+                Ok(ReconcileStatus::default())
+            }
+            // If the binary is not OK, we will wait for the endpoint to come back
+            // online...
+            Some(false) => {
+                trace!("binary is not OK, waiting for the endpoint to come back online...");
+                Ok(ReconcileStatus::empty()
+                    .add_condition(ReconcileCondition::PendingConnection)
+                    .add_condition(ReconcileCondition::MissingFile {
+                        path: SNARKOS_FILE.to_string(),
+                    })
+                    .add_scope("binary/offline")
+                    .requeue_after(Duration::from_secs(5)))
+            }
+            None => unreachable!("file reconciler returns a result when not requeued"),
+        }
+    }
+}
+
+/// Download the genesis block needed to run the node
+pub struct GenesisReconciler<'a> {
+    pub state: Arc<GlobalState>,
+    pub env_info: Arc<AgentEnvInfo>,
+    /// Metadata about an active genesis transfer
+    pub transfer: &'a mut Option<TransferId>,
+    /// Time the genesis was marked as OK
+    pub ok_at: &'a mut Option<Instant>,
+}
+
+impl<'a> Reconcile<(), ReconcileError> for GenesisReconciler<'a> {
+    async fn reconcile(&mut self) -> Result<ReconcileStatus<()>, ReconcileError> {
+        let GenesisReconciler {
+            state,
+            env_info,
+            transfer,
+            ok_at,
+        } = self;
+
+        let storage_path = state
+            .cli
+            .storage_path(env_info.network, env_info.storage.id);
+
+        let genesis_file = storage_path.join(SNARKOS_GENESIS_FILE);
+
+        // If the genesis file doesn't exist, it's not okay...
+        if !genesis_file.exists() && ok_at.is_some() {
+            **ok_at = None;
+        }
+
+        // Genesis block file has been checked within 5 minutes
+        let genesis_file_ok = ok_at
+            .map(|ok| ok.elapsed().as_secs() < 300)
+            .unwrap_or(false);
+
+        if env_info.storage.native_genesis || genesis_file_ok {
+            return Ok(ReconcileStatus::default());
+        }
+        **ok_at = None;
+
+        let genesis_url = get_genesis_route(&state.endpoint, env_info.network, env_info.storage.id);
+        let mut file_rec = FileReconciler::new(
+            Arc::clone(&self.state),
+            genesis_url.parse::<Url>().map_err(|e| {
+                ReconcileError::UrlParseError(genesis_url.to_string(), e.to_string())
+            })?,
+            genesis_file,
+        )
+        .with_offline(!self.state.is_ws_online())
+        .with_tx_id(**transfer);
+        let file_res = file_rec.reconcile().await?;
+
+        **transfer = file_rec.tx_id;
+
+        if file_res.is_requeue() {
+            return Ok(file_res.emptied().add_scope("file/requeue"));
+        }
+
+        match file_res.inner {
+            // If the genesis block is OK, update the context
+            Some(true) => {
+                **ok_at = Some(Instant::now());
+                Ok(ReconcileStatus::default())
+            }
+            // If the genesis block is not OK, we will wait for the endpoint to come back
+            // online...
+            Some(false) => {
+                trace!("genesis is not OK, waiting for the endpoint to come back online...");
+                Ok(ReconcileStatus::empty()
+                    .add_condition(ReconcileCondition::PendingConnection)
+                    .add_condition(ReconcileCondition::MissingFile {
+                        path: SNARKOS_GENESIS_FILE.to_string(),
+                    })
+                    .add_scope("genesis/offline")
+                    .requeue_after(Duration::from_secs(5)))
+            }
+            None => unreachable!("file reconciler returns a result when not requeued"),
+        }
+    }
+}
+
+pub type LedgerModifyResult = Result<bool, ReconcileError>;
+
+pub struct LedgerReconciler<'a> {
+    pub state: Arc<GlobalState>,
+    pub env_info: Arc<AgentEnvInfo>,
+    pub target_height: (usize, HeightRequest),
+    pub last_height: &'a mut Option<(usize, HeightRequest)>,
+    pub pending_height: &'a mut Option<(usize, HeightRequest)>,
+    pub modify_handle: &'a mut Option<(AbortHandle, Arc<Mutex<Option<LedgerModifyResult>>>)>,
+}
+
+impl<'a> LedgerReconciler<'a> {
+    pub fn untar_paths(&self) -> (PathBuf, &'static str) {
+        if self.env_info.storage.persist {
+            (
+                self.state
+                    .cli
+                    .storage_path(self.env_info.network, self.env_info.storage.id),
+                LEDGER_PERSIST_DIR,
+            )
+        } else {
+            (self.state.cli.path.join(NODE_DATA_DIR), LEDGER_BASE_DIR)
+        }
+    }
+
+    pub fn ledger_path(&self) -> PathBuf {
+        let (path, dir) = self.untar_paths();
+        path.join(dir)
+    }
+
+    /// Find the checkpoint to apply to the ledger
+    /// Guaranteed error when target height is not the top, 0, or unlimited span
+    pub fn find_checkpoint(&self) -> Result<PathBuf, ReconcileError> {
+        let (untar_base, ledger_dir) = self.untar_paths();
+        let ledger_path = untar_base.join(ledger_dir);
+
+        // If there's a retention policy, load the checkpoint manager
+        // this is so we can wipe all leftover checkpoints for non-persisted storage
+        // after resets or new environments
+        let manager = self
+            .env_info
+            .storage
+            .retention_policy
+            .clone()
+            .map(|policy| {
+                trace!("loading checkpoints from {untar_base:?}...");
+                CheckpointManager::load(ledger_path.clone(), policy).map_err(|e| {
+                    error!("failed to load checkpoints: {e}");
+                    ReconcileError::CheckpointLoadError(e.to_string())
+                })
+            })
+            .transpose()?
+            .ok_or(ReconcileError::MissingRetentionPolicy(self.target_height.1))?;
+
+        // Determine which checkpoint to use by the next available height/time
+        match self.target_height.1 {
+            HeightRequest::Absolute(height) => manager.nearest_with_height(height),
+            HeightRequest::Checkpoint(span) => manager.nearest_with_span(span),
+            // top cannot be a target height
+            _ => None,
+        }
+        .map(|(_, path)| path)
+        .ok_or(ReconcileError::NoAvailableCheckpoints(self.target_height.1))
+        .cloned()
+    }
+
+    pub fn spawn_modify(
+        &self,
+        checkpoint: PathBuf,
+    ) -> (AbortHandle, Arc<Mutex<Option<LedgerModifyResult>>>) {
+        let result = Arc::new(Mutex::new(None));
+        let result2 = Arc::clone(&result);
+
+        let is_native_genesis = self.env_info.storage.native_genesis;
+        let snarkos_path = self.state.cli.path.join(SNARKOS_FILE);
+        let network = self.env_info.network;
+        let storage_path = self
+            .state
+            .cli
+            .storage_path(network, self.env_info.storage.id);
+        let ledger_path = self.ledger_path();
+
+        // apply the checkpoint to the ledger
+        let mut command = Command::new(snarkos_path);
+        command
+            .stdout(std::io::stdout())
+            .stderr(std::io::stderr())
+            .env("NETWORK", network.to_string())
+            .arg("ledger")
+            .arg("--ledger")
+            .arg(&ledger_path);
+
+        if !is_native_genesis {
+            command
+                .arg("--genesis")
+                .arg(storage_path.join(SNARKOS_GENESIS_FILE));
+        }
+
+        command.arg("checkpoint").arg("apply").arg(checkpoint);
+
+        let handle = tokio::spawn(async move {
+            let mut mutex = result.lock().await;
+
+            let res = command
+                .spawn()
+                .map_err(|e| {
+                    error!("failed to spawn checkpoint apply process: {e}");
+                    mutex.replace(Err(ReconcileError::CheckpointApplyError(String::from(
+                        "spawn checkpoint apply process",
+                    ))));
+                })?
+                .wait()
+                .await
+                .map_err(|e| {
+                    error!("failed to await checkpoint apply process: {e}");
+                    mutex.replace(Err(ReconcileError::CheckpointApplyError(String::from(
+                        "await checkpoint apply process",
+                    ))));
+                })?;
+
+            mutex.replace(Ok(res.success()));
+
+            Ok::<(), ()>(())
+        })
+        .abort_handle();
+
+        (handle, result2)
+    }
+}
+
+impl<'a> Reconcile<(), ReconcileError> for LedgerReconciler<'a> {
+    async fn reconcile(&mut self) -> Result<ReconcileStatus<()>, ReconcileError> {
+        let env_info = self.env_info.clone();
+        let target_height = self.target_height;
+
+        let ledger_path = self.ledger_path();
+
+        // Ledger reconcile behavior is different depending on whether the storage is
+        // persistent.
+        let is_persist = env_info.storage.persist;
+
+        // Defaulting the initial height allows the reconciler to treat
+        // a persisted env with non-top target heights as a request to delete
+        // the ledger
+        if self.last_height.is_none() {
+            // The default last height is top
+            *self.last_height = Some((0, HeightRequest::Top));
+
+            // delete ledger because no last_height indicates a fresh env
+            if !is_persist {
+                let _ = tokio::fs::remove_dir_all(&ledger_path).await;
+            }
+        }
+        let last_height = self.last_height.unwrap();
+
+        // TODO: only call this after unpacking the ledger
+        // create the ledger path if it doesn't exist
+        DirectoryReconciler(&ledger_path.join(".aleo/storage"))
+            .reconcile()
+            .await?;
+
+        // If there is no pending height, check if there should be a pending height
+        if self.pending_height.is_none() {
+            // target height has been realized
+            if last_height == target_height {
+                return Ok(ReconcileStatus::default());
+            }
+
+            // If the target height is the top, we can skip the ledger reconciler
+            if target_height.1.is_top() {
+                *self.last_height = Some(target_height);
+                if let Err(e) = self.state.db.set_last_height(Some(target_height)) {
+                    error!("failed to save last height to db: {e}");
+                }
+
+                // ledger operation is complete
+                return Ok(ReconcileStatus::default());
+            }
+
+            // If the target height is 0, we can delete the ledger
+            if target_height.1.reset() {
+                let _ = tokio::fs::remove_dir_all(&ledger_path).await;
+                *self.last_height = Some(target_height);
+                if let Err(e) = self.state.db.set_last_height(Some(target_height)) {
+                    error!("failed to save last height to db: {e}");
+                }
+
+                // Ledger operation is complete... immediately requeue because the ledger was
+                // wiped
+                return Ok(ReconcileStatus::default()
+                    .add_scope("ledger/wipe")
+                    .requeue_after(Duration::ZERO));
+            }
+
+            // Target height is guaranteed to be different, not top, and not 0, which means
+            // it's up to the retention policies
+
+            // TODO: implement a heightrequest that downloads a remote ledger
+            // TODO: ledger URL handling here instead of retention policy
+            // TODO: ledger downloading would enter a new code path that downloads a new one
+
+            // Find the checkpoint for the reconciler's target height
+            let checkpoint = self.find_checkpoint()?;
+            trace!("Applying checkpoint: {}", checkpoint.display());
+            // Start a task to modify the ledger with the checkpoint
+            *self.modify_handle = Some(self.spawn_modify(checkpoint));
+            // Now that a task is running, set the pending height
+            *self.pending_height = Some(target_height);
+            trace!("Pending ledger modification to height {}", target_height.1);
+
+            return Ok(ReconcileStatus::empty()
+                .add_condition(ReconcileCondition::PendingProcess {
+                    process: format!("ledger modification to height {}", target_height.1),
+                })
+                .requeue_after(Duration::from_secs(5)));
+        }
+
+        let Some(modify_handle) = self.modify_handle.as_ref() else {
+            // This should be an unreachable condition, but may not be unreachable
+            // when more complex ledger operations are implemented
+            error!("modify handle missing for pending height");
+            *self.pending_height = None;
+            return Ok(ReconcileStatus::empty()
+                .add_condition(ReconcileCondition::InterruptedModify {
+                    reason: String::from("modify handle missing"),
+                })
+                .requeue_after(Duration::from_secs(1)));
+        };
+
+        // If the modify handle is locked, requeue until it's unlocked
+        let Ok(Some(handle)) = modify_handle.1.try_lock().map(|r| r.clone()) else {
+            trace!("Waiting for modify handle to unlock...");
+            return Ok(ReconcileStatus::empty()
+                .add_condition(ReconcileCondition::PendingProcess {
+                    process: format!("ledger modification to height {}", target_height.1),
+                })
+                .requeue_after(Duration::from_secs(1)));
+        };
+
+        let pending = self.pending_height.unwrap();
+
+        match handle {
+            // If the ledger was modified successfully, update the last height
+            Ok(true) => {
+                info!(
+                    "Ledger modification to height {} succeeded",
+                    target_height.1
+                );
+                *self.last_height = Some(pending);
+                if let Err(e) = self.state.db.set_last_height(Some(pending)) {
+                    error!("failed to save last height to db: {e}");
+                }
+            }
+            // A failure in the ledger modification process is not handled at the
+            // moment...
+            Ok(false) => {
+                error!("ledger modification to height {} failed", target_height.1);
+                // TODO: handle this failure.. maybe even by deleting the ledger
+            }
+            // Bubble an actual error up to the caller
+            Err(err) => {
+                error!(
+                    "ledger modification to height {} errored: {err}",
+                    target_height.1
+                );
+                return Err(err.clone());
+            }
+        };
+
+        // Modification is complete. The last height is changed when the modification
+        // succeeds (above)
+        *self.pending_height = None;
+        *self.modify_handle = None;
+
+        Ok(ReconcileStatus::default())
+    }
+}
+
+pub struct StorageVersionReconciler<'a>(pub &'a Path, pub u16);
+
+impl<'a> Reconcile<(), ReconcileError> for StorageVersionReconciler<'a> {
+    async fn reconcile(&mut self) -> Result<ReconcileStatus<()>, ReconcileError> {
+        let StorageVersionReconciler(path, version) = self;
+
+        let version_file = path.join(VERSION_FILE);
+
+        let version_file_data = if !version_file.exists() {
+            None
+        } else {
+            tokio::fs::read_to_string(&version_file)
+                .await
+                .map_err(|e| ReconcileError::FileReadError(version_file.clone(), e.to_string()))?
+                .parse()
+                .ok()
+        };
+
+        if path.exists() {
+            // wipe old storage when the version changes
+            if version_file_data != Some(*version) {
+                info!("Removing storage directory for version mismatch: local {version_file_data:?} != remote {version:?}");
+                let _ = tokio::fs::remove_dir_all(&path).await;
+            } else {
+                // return an empty status if the version is the same
+                return Ok(ReconcileStatus::empty());
+            };
+        }
+
+        DirectoryReconciler(path).reconcile().await?;
+
+        if !version_file.exists() {
+            tokio::fs::write(&version_file, version.to_string())
+                .await
+                .map_err(|e| {
+                    error!("failed to write storage version: {e}");
+                    ReconcileError::CreateDirectory(version_file.to_path_buf(), e.to_string())
+                })?;
+        }
+
+        Ok(ReconcileStatus::default())
+    }
+}
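A distinctive design choice in this file is that reconcilers borrow caller-owned memo slots (`ok_at`, `transfer`) instead of owning state, so a fresh reconciler can be constructed on every pass. A hedged sketch of a caller holding those slots, using only types from the hunks above; the loop and the `BinarySlots` holder are hypothetical:

```rust
// Illustrative caller for BinaryReconciler: the caller owns the memo state
// and re-runs the reconciler until it settles. Not part of the patch.
use std::{sync::Arc, time::Instant};

use snops_common::{
    api::AgentEnvInfo, binaries::BinaryEntry, rpc::error::ReconcileError, state::TransferId,
};

use crate::{
    reconcile::{storage::BinaryReconciler, Reconcile},
    state::GlobalState,
};

pub struct BinarySlots {
    transfer: Option<(TransferId, BinaryEntry)>,
    ok_at: Option<Instant>,
}

async fn ensure_binary(
    state: Arc<GlobalState>,
    env_info: Arc<AgentEnvInfo>,
    slots: &mut BinarySlots,
) -> Result<(), ReconcileError> {
    loop {
        let status = BinaryReconciler {
            state: Arc::clone(&state),
            env_info: Arc::clone(&env_info),
            node_binary: None, // use the env's default binary
            transfer: &mut slots.transfer,
            ok_at: &mut slots.ok_at,
        }
        .reconcile()
        .await?;

        // a non-requeue status means the binary is present and hashed
        if !status.is_requeue() {
            return Ok(());
        }
        tokio::time::sleep(std::time::Duration::from_secs(1)).await;
    }
}
```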
diff --git a/crates/agent/src/rpc/agent.rs b/crates/agent/src/rpc/agent.rs
index 7b668654..e2ab05f8 100644
--- a/crates/agent/src/rpc/agent.rs
+++ b/crates/agent/src/rpc/agent.rs
@@ -34,8 +34,11 @@ impl AgentNodeService for AgentNodeRpcServer {
             block_timestamp,
         }: SnarkOSBlockInfo,
     ) -> Result<(), ()> {
-        self.state
-            .client
+        let Some(client) = self.state.client.read().await.clone() else {
+            return Ok(()); // ignore if client is not available
+        };
+
+        client
             .post_block_status(
                 context::current(),
                 height,
@@ -50,8 +53,14 @@ impl AgentNodeService for AgentNodeRpcServer {
     }

     async fn post_status(self, _: context::Context, status: SnarkOSStatus) -> Result<(), ()> {
-        self.state
-            .client
+        let Some(client) = self.state.client.read().await.clone() else {
+            return Ok(()); // ignore if client is not available
+        };
+
+        // Update the last node status
+        self.state.set_node_status(Some(status.clone())).await;
+
+        client
             .post_node_status(context::current(), status.into())
             .await
             .inspect_err(|err| tracing::error!("failed to post node status: {err}"))
diff --git a/crates/agent/src/rpc/control.rs b/crates/agent/src/rpc/control.rs
index 19f9f5a6..285a7a1c 100644
--- a/crates/agent/src/rpc/control.rs
+++ b/crates/agent/src/rpc/control.rs
@@ -1,15 +1,9 @@
 //! Control plane-to-agent RPC.

-use std::{
-    collections::HashSet, net::IpAddr, ops::Deref, path::PathBuf, process::Stdio, sync::Arc,
-};
+use std::net::IpAddr;

 use snops_common::{
     aot_cmds::AotCmd,
-    binaries::{BinaryEntry, BinarySource},
-    constant::{
-        LEDGER_BASE_DIR, LEDGER_PERSIST_DIR, SNARKOS_FILE, SNARKOS_GENESIS_FILE, SNARKOS_LOG_FILE,
-    },
     define_rpc_mux,
     prelude::snarkos_status::SnarkOSLiteBlock,
     rpc::{
@@ -18,21 +12,17 @@ use snops_common::{
             AgentMetric, AgentService, AgentServiceRequest, AgentServiceResponse, AgentStatus,
             Handshake,
         },
-        ControlServiceRequest, ControlServiceResponse,
+        ControlServiceClient, ControlServiceRequest, ControlServiceResponse,
     },
-    error::{AgentError, ReconcileError, SnarkosRequestError},
+    error::{AgentError, SnarkosRequestError},
 },
-    state::{AgentId, AgentPeer, AgentState, EnvId, InternedId, KeyState, NetworkId, PortConfig},
+    state::{AgentId, AgentState, EnvId, InternedId, NetworkId, PortConfig, ReconcileOptions},
 };
-use tarpc::context;
-use tokio::process::Command;
-use tracing::{debug, error, info, trace, warn};
+use tarpc::context::Context;
+use tracing::{error, info, trace};

 use crate::{
-    api, make_env_filter,
-    metrics::MetricComputer,
-    reconcile::{self, ensure_correct_binary},
-    state::AppState,
+    api, log::make_env_filter, metrics::MetricComputer, reconcile::default_binary, state::AppState,
 };

 define_rpc_mux!(child;
@@ -42,44 +32,46 @@ define_rpc_mux!(child;

 #[derive(Clone)]
 pub struct AgentRpcServer {
+    pub client: ControlServiceClient,
     pub state: AppState,
     pub version: &'static str,
 }

 impl AgentService for AgentRpcServer {
-    async fn kill(self, _: context::Context) {
-        self.state.node_graceful_shutdown().await;
-        std::thread::spawn(|| {
-            std::thread::sleep(std::time::Duration::from_secs(1));
-            std::process::exit(0)
-        });
+    async fn kill(self, _: Context) {
+        info!("Kill RPC invoked...");
+        self.state.shutdown().await;
     }

-    async fn handshake(
-        self,
-        context: context::Context,
-        handshake: Handshake,
-    ) -> Result<(), ReconcileError> {
+    async fn handshake(self, context: Context, handshake: Handshake) {
         if let Some(token) = handshake.jwt {
             // cache the JWT in the state JWT mutex
-            self.state
-                .db
-                .set_jwt(Some(token))
-                .map_err(|_| ReconcileError::Database)?;
+            if let Err(e) = self.state.db.set_jwt(Some(token)) {
+                error!("failed to save JWT to db: {e}");
+            }
         }

         // store loki server URL
-        if let Some(loki) = handshake.loki.and_then(|l| l.parse::<Url>().ok()) {
-            self.state
-                .loki
-                .lock()
-                .expect("failed to acquire loki URL lock")
-                .replace(loki);
+        let loki_url = handshake.loki.and_then(|l| l.parse::<Url>().ok());
+
+        if let Err(e) = self
+            .state
+            .db
+            .set_loki_url(loki_url.as_ref().map(|u| u.to_string()))
+        {
+            error!("failed to save loki URL to db: {e}");
+        }
+        match self.state.loki.lock() {
+            Ok(mut guard) => {
+                *guard = loki_url;
+            }
+            Err(e) => {
+                error!("failed to acquire loki URL lock: {e}");
+            }
         }

         // emit the transfer statuses
         if let Err(err) = self
-            .state
             .client
             .post_transfer_statuses(
                 context,
@@ -94,330 +86,51 @@ impl AgentService for AgentRpcServer {
             error!("failed to send transfer statuses: {err}");
         }

-        // reconcile if state has changed
-        let needs_reconcile = *self.state.agent_state.read().await != handshake.state;
-        if needs_reconcile {
-            Self::reconcile(self, context, handshake.state).await?;
-        }
-
-        Ok(())
-    }
-
-    async fn reconcile(
-        self,
-        _: context::Context,
-        target: AgentState,
-    ) -> Result<(), ReconcileError> {
-        info!("beginning reconcilation...");
-
-        // acquire the handle lock
-        let mut handle_container = self.state.reconcilation_handle.lock().await;
-
-        // abort if we are already reconciling
-        if let Some(handle) = handle_container.take() {
-            info!("aborting previous reconcilation task...");
-            handle.abort();
-        }
-
-        // perform the reconcilation
-        let state = Arc::clone(&self.state);
-        let handle = tokio::spawn(async move {
-            // previous state cleanup
-            let old_state = {
-                let agent_state_lock = state.agent_state.read().await;
-                match agent_state_lock.deref() {
-                    // kill existing child if running
-                    AgentState::Node(_, node) if node.online => {
-                        info!("cleaning up snarkos process...");
-                        state.node_graceful_shutdown().await;
-                    }
-
-                    _ => (),
-                }
-
-                agent_state_lock.deref().clone()
-            };
-
-            // download new storage if storage_id changed
-            'storage: {
-                let (is_same_env, is_same_index) = match (&old_state, &target) {
-                    (AgentState::Node(old_env, old_node), AgentState::Node(new_env, new_node)) => {
-                        (old_env == new_env, old_node.height.0 == new_node.height.0)
-                    }
-                    _ => (false, false),
-                };
-
-                // skip if we don't need storage
-                let AgentState::Node(env_id, node) = &target else {
-                    break 'storage;
-                };
-
-                // get the storage info for this environment if we don't have it cached
-                let info = state
-                    .get_env_info(*env_id)
-                    .await
-                    .map_err(|_| ReconcileError::StorageAcquireError("storage info".to_owned()))?;
-
-                // ensure the binary is correct every reconcile (or restart)
-                ensure_correct_binary(node.binary, &state, &info).await?;
-
-                if is_same_env && is_same_index {
-                    debug!("skipping storage download");
-                    break 'storage;
-                }
-
-                // TODO: download storage to a cache directory (~/config/.snops) to prevent
-                // multiple agents from having to redownload
-                // can be configurable to also work from a network drive
-
-                // download and decompress the storage
-                let height = &node.height.1;
-
-                trace!("checking storage files...");
-
-                // only download storage if it's a new environment
-                // if a node starts at height: 0, the node will never
-                // download the ledger
-                if !is_same_env {
-                    reconcile::check_files(&state, &info, height).await?;
-                }
-                reconcile::load_ledger(&state, &info, height, !is_same_env).await?;
-                // TODO: checkpoint/absolute height request handling
-            }
-
-            // reconcile towards new state
-            match target.clone() {
-                // inventory state is waiting for a node to be started
-                AgentState::Inventory => {
-                    // wipe the env info cache. don't want to have stale storage info
-                    state.env_info.write().await.take();
-                }
-
-                // start snarkOS node when node
-                AgentState::Node(env_id, node) => {
-                    let mut child_lock = state.child.write().await;
-                    let mut command = Command::new(state.cli.path.join(SNARKOS_FILE));
-
-                    // get the storage info for this environment if we don't have it cached
-                    let info = state.get_env_info(env_id).await.map_err(|_| {
-                        ReconcileError::StorageAcquireError("storage info".to_owned())
-                    })?;
-
-                    let storage_id = &info.storage.id;
-                    let storage_path = state
-                        .cli
-                        .path
-                        .join("storage")
-                        .join(info.network.to_string())
-                        .join(storage_id.to_string());
-                    let ledger_path = if info.storage.persist {
-                        storage_path.join(LEDGER_PERSIST_DIR)
-                    } else {
-                        state.cli.path.join(LEDGER_BASE_DIR)
-                    };
-
-                    // add loki URL if one is set
-                    if let Some(loki) = &*state.loki.lock().unwrap() {
-                        command
-                            .env(
-                                "SNOPS_LOKI_LABELS",
-                                format!("env_id={},node_key={}", env_id, node.node_key),
-                            )
-                            .arg("--loki")
-                            .arg(loki.as_str());
-                    }
-
-                    if state.cli.quiet {
-                        command.stdout(Stdio::null());
-                    } else {
-                        command.stdout(std::io::stdout());
-                    }
-
-                    command
-                        .stderr(std::io::stderr())
-                        .envs(&node.env)
-                        .env("NETWORK", info.network.to_string())
-                        .env("HOME", &ledger_path)
-                        .arg("--log")
-                        .arg(state.cli.path.join(SNARKOS_LOG_FILE))
-                        .arg("run")
-                        .arg("--agent-rpc-port")
-                        .arg(state.agent_rpc_port.to_string())
-                        .arg("--type")
-                        .arg(node.node_key.ty.to_string())
-                        .arg("--ledger")
-                        .arg(ledger_path);
-
-                    if !info.storage.native_genesis {
-                        command
-                            .arg("--genesis")
-                            .arg(storage_path.join(SNARKOS_GENESIS_FILE));
-                    }
-
-                    // storage configuration
-                    command
-                        // port configuration
-                        .arg("--bind")
-                        .arg(state.cli.bind_addr.to_string())
-                        .arg("--bft")
-                        .arg(state.cli.ports.bft.to_string())
-                        .arg("--rest")
-                        .arg(state.cli.ports.rest.to_string())
-                        .arg("--metrics")
-                        .arg(state.cli.ports.metrics.to_string())
-                        .arg("--node")
-                        .arg(state.cli.ports.node.to_string());
-
-                    match node.private_key {
-                        KeyState::None => {}
-                        KeyState::Local => {
-                            command.arg("--private-key-file").arg(
-                                state
-                                    .cli
-                                    .private_key_file
-                                    .as_ref()
-                                    .ok_or(ReconcileError::NoLocalPrivateKey)?,
-                            );
-                        }
-                        KeyState::Literal(pk) => {
-                            command.arg("--private-key").arg(pk);
-                        }
-                    }
-
-                    // conditionally add retention policy
-                    if let Some(policy) = &info.storage.retention_policy {
-                        command.arg("--retention-policy").arg(policy.to_string());
-                    }
-
-                    // Find agents that do not have cached addresses
-                    let unresolved_addrs: HashSet<AgentId> = {
-                        let resolved_addrs = state.resolved_addrs.read().await;
-                        node.peers
-                            .iter()
-                            .chain(node.validators.iter())
-                            .filter_map(|p| {
-                                if let AgentPeer::Internal(id, _) = p {
-                                    (!resolved_addrs.contains_key(id)).then_some(*id)
-                                } else {
-                                    None
-                                }
-                            })
-                            .collect()
-                    };
-
-                    // Fetch all unresolved addresses and update the cache
-                    if !unresolved_addrs.is_empty() {
-                        tracing::debug!(
-                            "need to resolve addrs: {}",
-                            unresolved_addrs
-                                .iter()
-                                .map(|id| id.to_string())
-                                .collect::<Vec<_>>()
-                                .join(",")
-                        );
-                        let new_addrs = state
-                            .client
-                            .resolve_addrs(context::current(), unresolved_addrs)
-                            .await
-                            .map_err(|err| {
-                                error!("rpc error while resolving addresses: {err}");
-                                ReconcileError::Unknown
-                            })?
-                            .map_err(ReconcileError::ResolveAddrError)?;
-                        tracing::debug!(
-                            "resolved new addrs: {}",
-                            new_addrs
-                                .iter()
-                                .map(|(id, addr)| format!("{}: {}", id, addr))
-                                .collect::<Vec<_>>()
-                                .join(", ")
-                        );
-                        state.resolved_addrs.write().await.extend(new_addrs);
-                    }
-
-                    if !node.peers.is_empty() {
-                        command
-                            .arg("--peers")
-                            .arg(state.agentpeers_to_cli(&node.peers).await.join(","));
-                    }
-
-                    if !node.validators.is_empty() {
-                        command
-                            .arg("--validators")
-                            .arg(state.agentpeers_to_cli(&node.validators).await.join(","));
-                    }
-
-                    if node.online {
-                        tracing::trace!("spawning node process...");
-                        tracing::debug!("node command: {command:?}");
-                        let child = command.spawn().expect("failed to start child");
-
-                        *child_lock = Some(child);
-
-                        // todo: check to ensure the node actually comes online
-                        // by hitting the REST latest block
-                    } else {
-                        tracing::debug!("skipping node spawn");
-                    }
-                }
-            }
-
-            // After completing the reconcilation, update the agent state
-            let mut agent_state = state.agent_state.write().await;
-            *agent_state = target;
-
-            Ok(())
-        });
-
-        // update the mutex with our new handle and drop the lock
-        *handle_container = Some(handle.abort_handle());
-        drop(handle_container);
-
-        // await reconcilation completion
-        let res = match handle.await {
-            Err(e) if e.is_cancelled() => {
-                warn!("reconcilation was aborted by a newer reconcilation request");
+        info!("Received control-plane handshake");

-                // early return (don't clean up the handle lock)
-                return Err(ReconcileError::Aborted);
-            }
+        // Re-fetch peer addresses to ensure no addresses changed while offline
+        self.state.re_fetch_peer_addrs().await;

-            Ok(inner) => inner,
-            Err(e) => {
-                warn!("reconcilation task panicked: {e}");
-                Err(ReconcileError::Unknown)
-            }
-        };
+        // Queue a reconcile immediately as we have received new state.
+        // The reconciler will decide if anything has actually changed
+        self.state
+            .update_agent_state(handshake.state, handshake.reconcile_opts)
+            .await;
+    }

-        // clean up the abort handle
-        // we can't be here if we were cancelled (see early return above)
-        self.state.reconcilation_handle.lock().await.take();
+    async fn set_agent_state(self, _: Context, target: AgentState, opts: ReconcileOptions) {
+        info!("Received new agent state, queuing reconcile...");
+        self.state.update_agent_state(target, opts).await;
+    }

-        res
+    async fn clear_peer_addr(self, _: Context, agent_id: AgentId) {
+        self.state
+            .resolved_addrs
+            .write()
+            .await
+            .swap_remove(&agent_id);
     }

-    async fn get_addrs(self, _: context::Context) -> (PortConfig, Option<IpAddr>, Vec<IpAddr>) {
+    async fn get_addrs(self, _: Context) -> (PortConfig, Option<IpAddr>, Vec<IpAddr>) {
         (
-            self.state.cli.ports.clone(),
+            self.state.cli.ports,
             self.state.external_addr,
             self.state.internal_addrs.clone(),
         )
     }

-    async fn snarkos_get(
-        self,
-        _: context::Context,
-        route: String,
-    ) -> Result<String, SnarkosRequestError> {
-        let env_id =
-            if let AgentState::Node(env_id, state) = self.state.agent_state.read().await.deref() {
-                if !state.online {
-                    return Err(SnarkosRequestError::OfflineNode);
-                }
-                *env_id
-            } else {
-                return Err(SnarkosRequestError::InvalidState);
-            };
+    async fn snarkos_get(self, _: Context, route: String) -> Result<String, SnarkosRequestError> {
+        self.state
+            .get_node_client()
+            .await
+            .ok_or(SnarkosRequestError::OfflineNode)?;
+
+        let env_id = self
+            .state
+            .get_agent_state()
+            .await
+            .env()
+            .ok_or(SnarkosRequestError::InvalidState)?;

         let network = self
             .state
@@ -447,13 +160,18 @@ impl AgentService for AgentRpcServer {
             .map_err(|err| SnarkosRequestError::JsonSerializeError(err.to_string()))
     }

-    async fn broadcast_tx(self, _: context::Context, tx: String) -> Result<(), AgentError> {
-        let env_id =
-            if let AgentState::Node(env_id, _) = self.state.agent_state.read().await.deref() {
-                *env_id
-            } else {
-                return Err(AgentError::InvalidState);
-            };
+    async fn broadcast_tx(self, _: Context, tx: String) -> Result<(), AgentError> {
+        self.state
+            .get_node_client()
+            .await
+            .ok_or(AgentError::NodeClientNotReady)?;
+
+        let env_id = self
+            .state
+            .get_agent_state()
+            .await
+            .env()
+            .ok_or(AgentError::InvalidState)?;

         let network = self
             .state
@@ -491,7 +209,7 @@ impl AgentService for AgentRpcServer {
         }
     }

-    async fn get_metric(self, _: context::Context, metric: AgentMetric) -> f64 {
+    async fn get_metric(self, _: Context, metric: AgentMetric) -> f64 {
         let metrics = self.state.metrics.read().await;

         match metric {
@@ -501,17 +219,18 @@ impl AgentService for AgentRpcServer {

     async fn execute_authorization(
         self,
-        _: context::Context,
+        _: Context,
         env_id: EnvId,
         network: NetworkId,
         query: String,
         auth: String,
     ) -> Result<String, AgentError> {
-        info!("executing authorization...");
+        info!("Executing authorization for {env_id}...");

         // TODO: maybe in the env config store a branch label for the binary so it won't
         // be put in storage and won't overwrite itself
+        // TODO: compute agents wiping out env info when alternating environments
         let info = self
             .state
             .get_env_info(env_id)
@@ -524,14 +243,7 @@ impl AgentService for AgentRpcServer {
             .path
             .join(format!("snarkos-aot-{env_id}-compute"));

-        let default_entry = BinaryEntry {
-            source: BinarySource::Path(PathBuf::from(format!(
-                "/content/storage/{}/{}/binaries/default",
-                info.network, info.storage.id,
-            ))),
-            sha256: None,
-            size: None,
-        };
+        let default_entry = default_binary(&info);

         // download the snarkOS binary
         api::check_binary(
@@ -546,10 +258,10 @@ impl AgentService for AgentRpcServer {
             &self.state.endpoint,
             &aot_bin,
             self.state.transfer_tx(),
-        ) // TODO: http(s)?
+        )
         .await
         .map_err(|e| {
-            error!("failed obtain runner binary: {e}");
+            error!("failed to obtain compute binary: {e}");
             AgentError::ProcessFailed
         })?;

@@ -561,9 +273,17 @@ impl AgentService for AgentRpcServer {
         )
         .await
         {
-            Ok(exec) => {
+            Ok(mut exec) => {
                 let elapsed = start.elapsed().as_millis();
-                info!("authorization executed in {elapsed}ms");
+
+                // Truncate the output to the first {
+                // because Aleo decided to print parameters.aleo.org download
+                // status to stdout...
+                if let Some(index) = exec.find("{") {
+                    exec = exec.split_off(index);
+                }
+
+                info!("Authorization executed in {elapsed}ms");
                 trace!("authorization output: {exec}");
                 Ok(exec)
             }
@@ -574,7 +294,7 @@ impl AgentService for AgentRpcServer {
         }
     }

-    async fn set_log_level(self, _: context::Context, level: String) -> Result<(), AgentError> {
+    async fn set_log_level(self, _: Context, level: String) -> Result<(), AgentError> {
         tracing::debug!("setting log level to {level}");
         let level: tracing_subscriber::filter::LevelFilter = level
             .parse()
@@ -587,15 +307,12 @@ impl AgentService for AgentRpcServer {
         Ok(())
     }

-    async fn set_aot_log_level(
-        self,
-        ctx: context::Context,
-        verbosity: u8,
-    ) -> Result<(), AgentError> {
+    async fn set_aot_log_level(self, ctx: Context, verbosity: u8) -> Result<(), AgentError> {
         tracing::debug!("agent setting aot log verbosity to {verbosity:?}");
-        let lock = self.state.node_client.lock().await;
-        let node_client = lock.as_ref().ok_or(AgentError::NodeClientNotSet)?;
-        node_client
+        self.state
+            .get_node_client()
+            .await
+            .ok_or(AgentError::NodeClientNotSet)?
             .set_log_level(ctx, verbosity)
             .await
             .map_err(|_| AgentError::FailedToChangeLogLevel)?
@@ -603,12 +320,13 @@ impl AgentService for AgentRpcServer {

     async fn get_snarkos_block_lite(
         self,
-        ctx: context::Context,
+        ctx: Context,
         block_hash: String,
     ) -> Result<Option<SnarkOSLiteBlock>, AgentError> {
-        let lock = self.state.node_client.lock().await;
-        let node_client = lock.as_ref().ok_or(AgentError::NodeClientNotSet)?;
-        node_client
+        self.state
+            .get_node_client()
+            .await
+            .ok_or(AgentError::NodeClientNotSet)?
             .get_block_lite(ctx, block_hash)
             .await
             .map_err(|_| AgentError::FailedToMakeRequest)?
@@ -616,23 +334,27 @@ impl AgentService for AgentRpcServer {

     async fn find_transaction(
         self,
-        context: context::Context,
+        context: Context,
         tx_id: String,
     ) -> Result<Option<String>, AgentError> {
-        let lock = self.state.node_client.lock().await;
-        let node_client = lock.as_ref().ok_or(AgentError::NodeClientNotSet)?;
-        node_client
+        self.state
+            .get_node_client()
+            .await
+            .ok_or(AgentError::NodeClientNotSet)?
            .find_transaction(context, tx_id)
            .await
            .map_err(|_| AgentError::FailedToMakeRequest)?
     }

-    async fn get_status(self, ctx: context::Context) -> Result<AgentStatus, AgentError> {
-        let lock = self.state.node_client.lock().await;
-        let node_client = lock.as_ref().ok_or(AgentError::NodeClientNotSet)?;
+    async fn get_status(self, ctx: Context) -> Result<AgentStatus, AgentError> {
+        let aot_online = if let Some(c) = self.state.get_node_client().await {
+            c.status(ctx).await.is_ok()
+        } else {
+            false
+        };

         Ok(AgentStatus {
-            aot_online: node_client.status(ctx).await.is_ok(),
+            aot_online,
             version: self.version.to_string(),
         })
     }
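The stdout-trimming added to `execute_authorization` is easy to verify in isolation: everything before the first `{` is discarded so stray progress lines don't corrupt the JSON payload. A standalone illustration of that `find`/`split_off` trick (the sample string is invented):

```rust
// Keep everything from the first '{' onward, dropping any noise printed
// before the JSON payload. Mirrors the logic added in rpc/control.rs above.
fn trim_to_json(mut output: String) -> String {
    if let Some(index) = output.find('{') {
        // split_off(index) returns the tail starting at `index`,
        // leaving the prefix (the noise) behind in `output`
        output = output.split_off(index);
    }
    output
}

fn main() {
    let raw = "downloading parameters...\ndone\n{\"execution\":\"...\"}".to_string();
    assert_eq!(trim_to_json(raw), "{\"execution\":\"...\"}");
}
```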
diff --git a/crates/agent/src/server.rs b/crates/agent/src/server.rs
index 3ac220b4..f10ab506 100644
--- a/crates/agent/src/server.rs
+++ b/crates/agent/src/server.rs
@@ -28,7 +28,10 @@ pub async fn start(listener: tokio::net::TcpListener, state: AppState) -> Result
     let app = Router::new()
         .route("/node", get(node_ws_handler))
         .with_state(Arc::clone(&state));
-    info!("axum router listening on: {}", listener.local_addr()?);
+    info!(
+        "Starting internal node RPC server on: {}",
+        listener.local_addr()?
+    );

     axum::serve(listener, app).await?;
@@ -41,9 +44,9 @@ async fn node_ws_handler(ws: WebSocketUpgrade, State(state): State<AppState>) ->
 }

 async fn handle_socket(mut socket: WebSocket, state: AppState) {
-    let mut node_client = state.node_client.lock().await;
+    let mut node_client = state.node_client.write().await;
     if node_client.is_some() {
-        warn!("a new node RPC connection tried to establish when one was already established");
+        warn!("A new node RPC connection tried to establish when one was already established");
         let _ = socket.send(Message::Close(None)).await;
         return;
     }
@@ -56,7 +59,7 @@ async fn handle_socket(mut socket: WebSocket, state: AppState) {
     let client = NodeServiceClient::new(tarpc::client::Config::default(), client_transport).spawn();

     // store the client in state
-    tracing::info!("node client connected");
+    tracing::info!("Connection established with the node");
     *node_client = Some(client);
     drop(node_client);
@@ -82,7 +85,7 @@ async fn handle_socket(mut socket: WebSocket, state: AppState) {
             match msg {
                 Some(Err(_)) | None => break,
                 Some(Ok(Message::Binary(bin))) => {
-                    let msg = match bincode::deserialize(&bin) {
+                    let msg = match snops_common::rpc::codec::decode(&bin) {
                         Ok(msg) => msg,
                         Err(e) => {
                             error!("failed to deserialize a message from node: {e}");
@@ -91,8 +94,18 @@ async fn handle_socket(mut socket: WebSocket, state: AppState) {
                     };

                     match msg {
-                        MuxedMessageIncoming::Parent(msg) => server_request_in.send(msg).expect("internal RPC channel closed"),
-                        MuxedMessageIncoming::Child(msg) => client_response_in.send(msg).expect("internal RPC channel closed"),
+                        MuxedMessageIncoming::Parent(msg) => {
+                            if let Err(e) = server_request_in.send(msg) {
+                                error!("internal node RPC channel closed: {e}");
+                                break;
+                            }
+                        },
+                        MuxedMessageIncoming::Child(msg) => {
+                            if let Err(e) = client_response_in.send(msg) {
+                                error!("internal node RPC channel closed: {e}");
+                                break;
+                            }
+                        }
                     }
                 }
                 _ => (),
@@ -101,8 +114,14 @@ async fn handle_socket(mut socket: WebSocket, state: AppState) {

             // handle outgoing requests
             msg = client_request_out.recv() => {
-                let msg = msg.expect("internal RPC channel closed");
-                let bin = bincode::serialize(&MuxedMessageOutgoing::Child(msg)).expect("failed to serialize request");
+                let Some(msg) = msg else { error!("internal node RPC channel closed"); break; };
+                let bin = match snops_common::rpc::codec::encode(&MuxedMessageOutgoing::Child(msg)) {
+                    Ok(bin) => bin,
+                    Err(e) => {
+                        error!("failed to serialize a request to node: {e}");
+                        continue;
+                    }
+                };
                 if socket.send(Message::Binary(bin)).await.is_err() {
                     break;
                 }
@@ -110,8 +129,14 @@ async fn handle_socket(mut socket: WebSocket, state: AppState) {

             // handle outgoing response
             msg = server_response_out.recv() => {
-                let msg = msg.expect("internal RPC channel closed");
-                let bin = bincode::serialize(&MuxedMessageOutgoing::Parent(msg)).expect("failed to serialize response");
+                let Some(msg) = msg else { error!("internal node RPC channel closed"); break; };
+                let bin = match snops_common::rpc::codec::encode(&MuxedMessageOutgoing::Parent(msg)) {
+                    Ok(bin) => bin,
+                    Err(e) => {
+                        error!("failed to serialize a response to node: {e}");
+                        continue;
+                    }
+                };
                 if socket.send(Message::Binary(bin)).await.is_err() {
                     break;
                 }
@@ -121,5 +146,5 @@ async fn handle_socket(mut socket: WebSocket, state: AppState) {

     // abort the RPC server handle
     server_handle.abort();
-    state.node_client.lock().await.take();
+    state.node_client.write().await.take();
 }
diff --git a/crates/agent/src/state.rs b/crates/agent/src/state.rs
index 49897aaa..06361a80 100644
--- a/crates/agent/src/state.rs
+++ b/crates/agent/src/state.rs
@@ -1,40 +1,38 @@
 use std::{
-    collections::HashMap,
+    collections::HashSet,
     net::IpAddr,
     sync::{Arc, Mutex},
     time::{Duration, Instant},
 };

-use anyhow::bail;
 use dashmap::DashMap;
+use indexmap::IndexMap;
 use reqwest::Url;
 use snops_common::{
-    api::EnvInfo,
-    rpc::{agent::node::NodeServiceClient, control::ControlServiceClient},
-    state::{AgentId, AgentPeer, AgentState, EnvId, TransferId, TransferStatus},
+    api::AgentEnvInfo,
+    rpc::{agent::node::NodeServiceClient, control::ControlServiceClient, error::ReconcileError},
+    state::{
+        snarkos_status::SnarkOSStatus, AgentId, AgentPeer, AgentState, EnvId, ReconcileOptions,
+        TransferId, TransferStatus,
+    },
     util::OpaqueDebug,
 };
 use tarpc::context;
-use tokio::{
-    process::Child,
-    select,
-    sync::{Mutex as AsyncMutex, RwLock},
-    task::AbortHandle,
-};
-use tracing::info;
+use tokio::sync::{mpsc::Sender, oneshot, RwLock};
+use tracing::{error, info};

-use crate::{cli::Cli, db::Database, metrics::Metrics, transfers::TransferTx, ReloadHandler};
+use crate::{cli::Cli, db::Database, log::ReloadHandler, metrics::Metrics, transfers::TransferTx};

-pub const NODE_GRACEFUL_SHUTDOWN_TIMEOUT: Duration = Duration::from_secs(10);
+pub const NODE_GRACEFUL_SHUTDOWN_TIMEOUT: Duration = Duration::from_secs(30);

 pub type AppState = Arc<GlobalState>;
+pub type ClientLock = Arc<RwLock<Option<ControlServiceClient>>>;

 /// Global state for this agent runner.
 pub struct GlobalState {
-    pub client: ControlServiceClient,
+    pub client: ClientLock,
     pub db: OpaqueDebug<Database>,
     pub _started: Instant,
-    pub connected: Mutex<Instant>,

     pub external_addr: Option<IpAddr>,
     pub internal_addrs: Vec<IpAddr>,
@@ -42,24 +40,40 @@ pub struct GlobalState {
     pub cli: Cli,
     pub endpoint: String,
     pub loki: Mutex<Option<Url>>,
-    pub agent_state: RwLock<AgentState>,
-    pub env_info: RwLock<Option<(EnvId, EnvInfo)>>,
-    pub reconcilation_handle: AsyncMutex<Option<AbortHandle>>,
-    pub child: RwLock<Option<Child>>, /* TODO: this may need to be handled by an owning thread,
-                                       * not sure yet */
+    /// Desired state the agent should be in. After each reconciliation, the
+    /// agent will attempt to transition to this state.
+    pub agent_state: RwLock<Arc<AgentState>>,
+    /// A sender for emitting the next time to reconcile the agent.
+    /// Helpful for scheduling the next reconciliation.
+    pub queue_reconcile_tx: Sender<(Instant, ReconcileOptions)>,
+    pub env_info: RwLock<Option<(EnvId, Arc<AgentEnvInfo>)>>,
     // Map of agent IDs to their resolved addresses.
-    pub resolved_addrs: RwLock<HashMap<AgentId, IpAddr>>,
+    pub resolved_addrs: RwLock<IndexMap<AgentId, IpAddr>>,
     pub metrics: RwLock<Metrics>,

     pub transfer_tx: TransferTx,
     pub transfers: Arc<DashMap<TransferId, TransferStatus>>,

-    pub node_client: AsyncMutex<Option<NodeServiceClient>>,
-
+    pub node_client: RwLock<Option<NodeServiceClient>>,
+    pub last_node_status: RwLock<Option<(Instant, SnarkOSStatus)>>,
     pub log_level_handler: ReloadHandler,
+    /// A oneshot sender to shutdown the agent.
+    pub shutdown: RwLock<Option<oneshot::Sender<()>>>,
 }

 impl GlobalState {
+    pub fn is_ws_online(&self) -> bool {
+        self.client.try_read().is_ok_and(|c| c.is_some())
+    }
+
+    pub async fn get_ws_client(&self) -> Option<ControlServiceClient> {
+        self.client.read().await.clone()
+    }
+
+    pub async fn get_agent_state(&self) -> Arc<AgentState> {
+        self.agent_state.read().await.clone()
+    }
+
     // Resolve the addresses of the given agents.
     // Locks resolve_addrs
     pub async fn agentpeers_to_cli(&self, peers: &[AgentPeer]) -> Vec<String> {
@@ -76,49 +90,148 @@ impl GlobalState {
             .collect::<Vec<_>>()
     }

-    pub async fn get_env_info(&self, env_id: EnvId) -> anyhow::Result<AgentEnvInfo> {
+    pub async fn queue_reconcile(&self, duration: Duration, opts: ReconcileOptions) -> bool {
+        self.queue_reconcile_tx
+            .try_send((Instant::now() + duration, opts))
+            .is_ok()
+    }
+
+    pub async fn set_env_info(&self, info: Option<(EnvId, Arc<AgentEnvInfo>)>) {
+        if let Err(e) = self.db.set_env_info(info.clone()) {
+            error!("failed to save env info to db: {e}");
+        }
+        *self.env_info.write().await = info;
+    }
+
+    /// Fetch the environment info for the given env_id, caching the result.
+    pub async fn get_env_info(&self, env_id: EnvId) -> Result<Arc<AgentEnvInfo>, ReconcileError> {
         match self.env_info.read().await.as_ref() {
             Some((id, info)) if *id == env_id => return Ok(info.clone()),
             _ => {}
         }

-        let Some(info) = self.client.get_env_info(context::current(), env_id).await? else {
-            bail!("failed to get env info: env not found {env_id}");
-        };
+        let client = self
+            .client
+            .read()
+            .await
+            .clone()
+            .ok_or(ReconcileError::Offline)?;
+
+        let info = client
+            .get_env_info(context::current(), env_id)
+            .await
+            .map_err(|e| ReconcileError::RpcError(e.to_string()))?
+            .ok_or(ReconcileError::MissingEnv(env_id))?;
+
+        let env_info = (env_id, Arc::new(info));
+        if let Err(e) = self.db.set_env_info(Some(env_info.clone())) {
+            error!("failed to save env info to db: {e}");
+        }
+        *self.env_info.write().await = Some(env_info.clone());
+
+        // clear the resolved addrs cache when the env info changes
+        self.resolved_addrs.write().await.clear();
+        if let Err(e) = self.db.set_resolved_addrs(None) {
+            error!("failed to save resolved addrs to db: {e}");
+        }
+
+        Ok(env_info.1)
+    }
+
+    pub fn transfer_tx(&self) -> TransferTx {
+        self.transfer_tx.clone()
+    }
+
+    pub async fn shutdown(&self) {
+        if let Some(tx) = self.shutdown.write().await.take() {
+            let _ = tx.send(());
+        }
+    }

-        *self.env_info.write().await = Some((env_id, info.clone()));
+    pub fn is_node_online(&self) -> bool {
+        self.node_client.try_read().is_ok_and(|c| c.is_some())
+    }

-        Ok(info)
+    pub async fn get_node_client(&self) -> Option<NodeServiceClient> {
+        self.node_client.read().await.clone()
     }

-    /// Attempt to gracefully shutdown the node if one is running.
-    pub async fn node_graceful_shutdown(&self) {
-        if let Some((mut child, id)) = self.child.write().await.take().and_then(|ch| {
-            let id = ch.id()?;
-            Some((ch, id))
-        }) {
-            use nix::{
-                sys::signal::{self, Signal},
-                unistd::Pid,
-            };
-
-            // send SIGINT to the child process
-            signal::kill(Pid::from_raw(id as i32), Signal::SIGINT).unwrap();
-
-            // wait for graceful shutdown or kill process after 10 seconds
-            let timeout = tokio::time::sleep(NODE_GRACEFUL_SHUTDOWN_TIMEOUT);
-
-            select! {
-                _ = child.wait() => (),
-                _ = timeout => {
-                    info!("snarkos process did not gracefully shut down, killing...");
-                    child.kill().await.unwrap();
+    pub async fn update_agent_state(&self, state: AgentState, opts: ReconcileOptions) {
+        if let Err(e) = self.db.set_agent_state(&state) {
+            error!("failed to save agent state to db: {e}");
+        }
+        let state = Arc::new(state);
+        *self.agent_state.write().await = state;
+
+        // Queue a reconcile to apply the new state
+        self.queue_reconcile(Duration::ZERO, opts).await;
+    }
+
+    pub async fn re_fetch_peer_addrs(&self) {
+        let agent_state = self.get_agent_state().await;
+        let AgentState::Node(_, node) = agent_state.as_ref() else {
+            return;
+        };
+
+        let Some(client) = self.get_ws_client().await else {
+            return;
+        };
+
+        let peer_ids = node
+            .peers
+            .iter()
+            .chain(node.validators.iter())
+            .filter_map(|p| {
+                if let snops_common::state::AgentPeer::Internal(id, _) = p {
+                    Some(*id)
+                } else {
+                    None
                 }
+            })
+            // Ensure we only have unique agent ids (can use itertools down the line)
+            .collect::<HashSet<_>>()
+            .into_iter()
+            .collect::<Vec<_>>();
+
+        if peer_ids.is_empty() {
+            return;
+        }
+
+        let new_addrs = match client.resolve_addrs(context::current(), peer_ids).await {
+            Ok(Ok(new_addrs)) => new_addrs,
+            Ok(Err(e)) => {
+                error!("Control plane failed to resolve addresses: {e}");
+                return;
+            }
+            Err(e) => {
+                error!("RPC failed to resolve addresses: {e}");
+                return;
             }
+        };
+
+        // Extend the cache with the updated addrs
+        let mut lock = self.resolved_addrs.write().await;
+        let has_new_addr = new_addrs
+            .iter()
+            .any(|(id, addr)| lock.get(id) != Some(addr));
+
+        if !has_new_addr {
+            return;
+        }
+
+        info!("Resolved updated addrs from handshake");
+
+        lock.extend(new_addrs);
+        if let Err(e) = self.db.set_resolved_addrs(Some(&lock)) {
+            error!("failed to save resolved addrs to db: {e}");
         }
     }

-    pub fn transfer_tx(&self) -> TransferTx {
-        self.transfer_tx.clone()
+    pub async fn set_node_status(&self, status: Option<SnarkOSStatus>) {
+        *self.last_node_status.write().await = status.map(|s| (Instant::now(), s));
+    }
+
+    pub async fn get_node_status(&self) -> Option<SnarkOSStatus> {
+        self.last_node_status.read().await.clone().map(|(_, s)| s)
     }
 }
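The `queue_reconcile_tx` channel above decouples "something changed" from "reconcile now": senders enqueue a deadline with `try_send` and never block, and the consumer always waits on the earliest one. A toy model of that scheduling pattern with the snops types replaced by a bare `Instant` (everything here is illustrative):

```rust
// Minimal model of the deadline-queue reconcile scheduler. Not part of the
// patch; only the try_send + earliest-deadline pattern is taken from it.
use std::time::{Duration, Instant};

use tokio::{sync::mpsc, time::sleep_until};

#[tokio::main]
async fn main() {
    let (tx, mut rx) = mpsc::channel::<Instant>(5);

    // queue_reconcile(duration): enqueueing never blocks the caller
    let _ = tx.try_send(Instant::now() + Duration::from_secs(1));

    let mut next = Instant::now() + Duration::from_secs(60);
    loop {
        tokio::select! {
            // a new request may move the deadline earlier
            Some(when) = rx.recv() => next = next.min(when),
            _ = sleep_until(next.into()) => {
                println!("reconciling now");
                break;
            }
        }
    }
}
```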
diff --git a/crates/agent/src/transfers.rs b/crates/agent/src/transfers.rs
index 2d95cc28..77d35cc1 100644
--- a/crates/agent/src/transfers.rs
+++ b/crates/agent/src/transfers.rs
@@ -5,13 +5,12 @@ use std::sync::{

 use chrono::{TimeDelta, Utc};
 use dashmap::{mapref::entry::Entry, DashMap};
-use snops_common::{
-    rpc::control::ControlServiceClient,
-    state::{TransferId, TransferStatus, TransferStatusUpdate},
-};
+use snops_common::state::{TransferId, TransferStatus, TransferStatusUpdate};
 use tarpc::context;
 use tokio::{select, sync::mpsc};

+use crate::state::ClientLock;
+
 pub type TransferTx = mpsc::UnboundedSender<(TransferId, TransferStatusUpdate)>;

 // how long to wait before cleaning up a transfer that has ended
@@ -23,9 +22,7 @@ pub fn next_id() -> TransferId {
     TRANSFER_ID_CTR.fetch_add(1, Ordering::AcqRel)
 }

-pub fn start_monitor(
-    client: ControlServiceClient,
-) -> (TransferTx, Arc<DashMap<TransferId, TransferStatus>>) {
+pub fn start_monitor(client: ClientLock) -> (TransferTx, Arc<DashMap<TransferId, TransferStatus>>) {
     let (tx, mut rx) = mpsc::unbounded_channel::<(TransferId, TransferStatusUpdate)>();
     let state_transfers = Arc::new(DashMap::new());
@@ -39,7 +36,7 @@ pub fn start_monitor(
             // cleanup transfers that have ended
             _ = interval.tick() => {
                 let now = Utc::now();
-                let client = client.clone();
+                let client = Arc::clone(&client);
                 transfers.retain(|&id, transfer: &mut TransferStatus| {
                     let is_done = transfer.total_bytes == transfer.downloaded_bytes;
                     let is_error = transfer.interruption.is_some();
@@ -57,6 +54,10 @@ pub fn start_monitor(
                         // send the update to the control plane
                         let client = client.clone();
                         tokio::spawn(async move {
+                            let Some(client) = client.read().await.clone() else {
+                                return
+                            };
+
                             if let Err(e) = client.post_transfer_status(context::current(), id, TransferStatusUpdate::Cleanup).await {
                                 tracing::error!("failed to send transfer cleanup update: {e}");
                             }
@@ -79,6 +80,7 @@ pub fn start_monitor(
                                 total_bytes: total,
                                 downloaded_bytes: 0,
                                 interruption: None,
+                                handle: None,
                             });
                         },
@@ -99,12 +101,24 @@ pub fn start_monitor(
                             transfer.updated_at = Utc::now();
                         },

+                        (Handle(handle), Entry::Occupied(mut ent)) => {
+                            let transfer = ent.get_mut();
+                            transfer.handle = Some(handle);
+
+                            // prevent broadcasting the handle to the control plane
+                            continue;
+                        },
+
                         _ => continue,
                     }

                     // send the update to the control plane
                     let client = client.clone();
                     tokio::spawn(async move {
+                        let Some(client) = client.read().await.clone() else {
+                            return
+                        };
+
                         if let Err(e) = client.post_transfer_status(context::current(), id, message).await {
                             tracing::error!("failed to send transfer status update: {e}");
                         }
diff --git a/crates/aot/Cargo.toml b/crates/aot/Cargo.toml
index cb9d75a4..4cdc3d28 100644
--- a/crates/aot/Cargo.toml
+++ b/crates/aot/Cargo.toml
@@ -1,7 +1,7 @@
 [package]
 edition = "2021"
 name = "snarkos-aot"
-version = "0.1.0"
+version = "0.1.1"
 license = "MIT"
 description = "Ahead of time utilities for SnarkVM, and a wrapper around the SnarkOS node for more options"
diff --git a/crates/aot/src/auth/auth_deploy.rs b/crates/aot/src/auth/auth_deploy.rs
index bb0e3592..f8e836aa 100644
--- a/crates/aot/src/auth/auth_deploy.rs
+++ b/crates/aot/src/auth/auth_deploy.rs
@@ -35,7 +35,7 @@ impl AuthorizeDeploy {
     pub fn parse(self) -> Result<Deployment<N>> {
         // get the program from the file (or stdin)
         let program = self.options.program.clone().contents()?;
-        let mut process = Process::load_no_storage()?;
+        let mut process = Process::load()?;
         query::get_process_imports(&mut process, &program, self.options.query.as_deref())?;

         let deployment =
diff --git a/crates/aot/src/auth/auth_fee.rs b/crates/aot/src/auth/auth_fee.rs
index 8e01255c..a05edfa9 100644
--- a/crates/aot/src/auth/auth_fee.rs
+++ b/crates/aot/src/auth/auth_fee.rs
@@ -1,12 +1,12 @@
-use anyhow::{bail, Ok, Result};
+use anyhow::{anyhow, bail, Ok, Result};
 use clap::Args;
 use clap_stdin::MaybeStdin;
 use rand::{CryptoRng, Rng};
 use snarkvm::{
     ledger::Deployment,
-    prelude::Field,
+    prelude::{cost_in_microcredits_v1, Field},
     synthesizer::{
-        process::{cost_in_microcredits, deployment_cost},
+        process::{cost_in_microcredits_v2, deployment_cost},
         Process,
     },
     utilities::ToBytes,
@@ -52,6 +52,9 @@ pub struct AuthorizeFee {
     /// The seed to use for the authorization generation
     #[clap(long)]
     pub seed: Option<u64>,
+    /// Enable cost v1 for the transaction cost estimation (v2 by default)
+    #[clap(long, default_value_t = false)]
+    pub cost_v1: bool,
 }

 impl AuthorizeFee {
@@ -59,13 +62,16 @@ impl AuthorizeFee {
         let (id, base_fee) = match (self.auth, self.deployment, self.id, self.cost) {
             (Some(auth), None, None, None) => {
                 let auth = auth.into_inner();
-                let mut process = Process::load_no_storage()?;
+                let mut process = Process::load()?;
                 if let Some(query) = self.query.as_deref() {
                     let programs = query::get_programs_from_auth(&auth);
                     query::add_many_programs_to_process(&mut process, programs, query)?;
                 }
-                (auth.to_execution_id()?, estimate_cost(&process, &auth)?)
+                (
+                    auth.to_execution_id()?,
+                    estimate_cost(&process, &auth, !self.cost_v1)?,
+                )
             }
             (None, Some(deployment), None, None) => {
                 let deployment = deployment.into_inner();
@@ -128,7 +134,11 @@ pub fn fee_auth(
     Ok(Some(fee))
 }

-pub fn estimate_cost(process: &Process<N>, func: &Authorization<N>) -> Result<u64> {
+pub fn estimate_cost(
+    process: &Process<N>,
+    func: &Authorization<N>,
+    use_cost_v2: bool,
+) -> Result<u64> {
     let transitions = func.transitions();

     let storage_cost = {
@@ -174,21 +184,35 @@
     };
     //execution.size_in_bytes().map_err(|e| e.to_string())?;

-    // Compute the finalize cost in microcredits.
-    let mut finalize_cost = 0u64;
-    // Iterate over the transitions to accumulate the finalize cost.
-    for (_key, transition) in transitions {
-        // Retrieve the function name, program id, and program.
-        let function_name = *transition.function_name();
+    let finalize_cost = if use_cost_v2 {
+        // cost v2 uses the finalize cost of the first transition
+        let transition = transitions
+            .values()
+            .next()
+            .ok_or(anyhow!("No transitions"))?;
         let stack = process.get_stack(transition.program_id())?;
-        let cost = cost_in_microcredits(&stack, &function_name)?;
+        cost_in_microcredits_v2(stack, transition.function_name())?
+    } else {
+        // Compute the finalize cost in microcredits.
+        let mut finalize_cost = 0u64;
+
+        // Iterate over the transitions to accumulate the finalize cost.
+        for (_key, transition) in transitions {
+            // Retrieve the function name, program id, and program.
+            let function_name = *transition.function_name();
+            let stack = process.get_stack(transition.program_id())?;
+            let cost = cost_in_microcredits_v1(stack, &function_name)?;
+
+            // Accumulate the finalize cost.
+            if let Some(cost) = finalize_cost.checked_add(cost) {
+                finalize_cost = cost;
+            } else {
+                bail!("The finalize cost computation overflowed for an execution")
+            };
+        }
+
+        finalize_cost
+    };

-        // Accumulate the finalize cost.
-        if let Some(cost) = finalize_cost.checked_add(cost) {
-            finalize_cost = cost;
-        } else {
-            bail!("The finalize cost computation overflowed for an execution")
-        };
-    }
     Ok(storage_cost + finalize_cost)
 }
- if let Some(cost) = finalize_cost.checked_add(cost) { - finalize_cost = cost; - } else { - bail!("The finalize cost computation overflowed for an execution") - }; - } Ok(storage_cost + finalize_cost) } diff --git a/crates/aot/src/auth/auth_program.rs b/crates/aot/src/auth/auth_program.rs index 014787ee..d15477eb 100644 --- a/crates/aot/src/auth/auth_program.rs +++ b/crates/aot/src/auth/auth_program.rs @@ -26,6 +26,9 @@ pub struct AuthorizeProgram { /// The seed to use for the authorization generation #[clap(long)] pub seed: Option, + /// Enable cost v1 for the transaction cost estimation (v2 by default) + #[clap(long, default_value_t = false)] + pub cost_v1: bool, } impl AuthorizeProgram { @@ -33,7 +36,7 @@ impl AuthorizeProgram { pub fn parse(self) -> Result<(Authorization, u64)> { let private_key = self.key.try_get()?; - let mut process = Process::load_no_storage()?; + let mut process = Process::load()?; match (self.options.query, self.options.locator.program_id()) { (_, id) if *id == N::credits() => {} (None, id) => { @@ -51,7 +54,7 @@ impl AuthorizeProgram { &mut super::rng_from_seed(self.seed), )?; - let cost = estimate_cost(&process, &auth)?; + let cost = estimate_cost(&process, &auth, !self.cost_v1)?; Ok((auth, cost)) } diff --git a/crates/aot/src/auth/mod.rs b/crates/aot/src/auth/mod.rs index e9e8a7bd..ee270d3a 100644 --- a/crates/aot/src/auth/mod.rs +++ b/crates/aot/src/auth/mod.rs @@ -46,6 +46,9 @@ pub struct CostCommand { query: Option, #[clap(flatten)] auth: AuthArgs, + /// Enable cost v1 for the transaction cost estimation (v2 by default) + #[clap(long, default_value_t = false)] + pub cost_v1: bool, } /// Authorize a program execution. @@ -65,6 +68,9 @@ pub struct AuthProgramCommand { /// The seed to use for the authorization generation #[clap(long)] pub seed: Option, + /// Enable cost v1 for the transaction cost estimation (v2 by default) + #[clap(long, default_value_t = false)] + pub cost_v1: bool, } /// Deploy a program to the network. @@ -84,6 +90,9 @@ pub struct AuthDeployCommand { /// The seed to use for the authorization generation #[clap(long)] pub seed: Option, + /// Enable cost v1 for the transaction cost estimation (v2 by default) + #[clap(long, default_value_t = false)] + pub cost_v1: bool, } impl AuthCommand { @@ -118,7 +127,11 @@ impl AuthCommand { println!("{id}"); Ok(()) } - AuthCommand::Cost(CostCommand { query, auth }) => { + AuthCommand::Cost(CostCommand { + query, + auth, + cost_v1, + }) => { let cost = match auth.pick()? { AuthBlob::Program { auth, .. } => { let auth = auth.into(); @@ -126,13 +139,13 @@ impl AuthCommand { // load the programs the auth references into the process // as cost estimation measures the size of values from within the auth's // transitions - let mut process = Process::load_no_storage()?; + let mut process = Process::load()?; if let Some(query) = query.as_deref() { let programs = query::get_programs_from_auth(&auth); query::add_many_programs_to_process(&mut process, programs, query)?; } - estimate_cost(&process, &auth)? + estimate_cost(&process, &auth, !cost_v1)? } AuthBlob::Deploy { deployment, .. 
} => deployment_cost(&deployment)?.0, }; @@ -146,6 +159,7 @@ impl AuthCommand { program_opts, fee_opts, seed, + cost_v1, }) => { let query = program_opts.query.clone(); @@ -154,6 +168,7 @@ impl AuthCommand { key: key.clone(), options: program_opts, seed, + cost_v1, } .parse()?; @@ -172,6 +187,7 @@ impl AuthCommand { id: Some(auth.to_execution_id()?), cost: Some(cost), seed, + cost_v1, } .parse()?; @@ -191,6 +207,7 @@ impl AuthCommand { deploy_opts, fee_opts, seed, + cost_v1, }) => { // authorize the deployment without a fee let AuthBlob::Deploy { @@ -227,6 +244,7 @@ impl AuthCommand { id: Some(deployment.to_deployment_id()?), cost: Some(deployment_cost(&deployment)?.0), seed, + cost_v1, } .parse()? .map(Into::into); diff --git a/crates/aot/src/genesis.rs b/crates/aot/src/genesis.rs index 065e6469..bd4b5c4b 100644 --- a/crates/aot/src/genesis.rs +++ b/crates/aot/src/genesis.rs @@ -158,6 +158,7 @@ pub fn genesis_quorum( let (ratifications, transactions, aborted_transaction_ids, ratified_finalize_operations) = vm .speculate( state, + 0, None, ratifications, &solutions, diff --git a/crates/aot/src/ledger/query.rs b/crates/aot/src/ledger/query.rs index e38db79d..bdf9cc6c 100644 --- a/crates/aot/src/ledger/query.rs +++ b/crates/aot/src/ledger/query.rs @@ -91,6 +91,10 @@ impl LedgerQuery { &format!("/{network}/latest/stateRoot"), get(Self::latest_state_root), ) + .route( + &format!("/{network}/stateRoot/latest"), + get(Self::latest_state_root), + ) .route( &format!("/{network}/block/height/latest"), get(Self::latest_height), diff --git a/crates/aot/src/ledger/truncate.rs b/crates/aot/src/ledger/truncate.rs index a04cf735..e243568f 100644 --- a/crates/aot/src/ledger/truncate.rs +++ b/crates/aot/src/ledger/truncate.rs @@ -32,8 +32,6 @@ pub struct Replay { /// When checkpoint is enabled, checkpoints. #[arg(short, long, default_value_t = false)] checkpoint: bool, - // TODO: duration based truncation (blocks within a duration before now) - // TODO: timestamp based truncation (blocks after a certain date) } /// A command to truncate the ledger to a specific height. diff --git a/crates/aot/src/ledger/util.rs b/crates/aot/src/ledger/util.rs index bfb535d5..1529d725 100644 --- a/crates/aot/src/ledger/util.rs +++ b/crates/aot/src/ledger/util.rs @@ -10,7 +10,7 @@ use snarkvm::{ types::{Address, Field, U64}, }, ledger::{query::Query, store::ConsensusStorage, Block, Execution, Fee, Ledger, Transaction}, - prelude::{execution_cost, Network}, + prelude::{execution_cost_v2, Network}, synthesizer::VM, }; @@ -90,7 +90,7 @@ pub fn public_transaction, A: Aleo(vm, &private_key_fee, min_fee, execution.to_execution_id()?)?; @@ -170,7 +170,7 @@ pub fn _make_transaction_proof_private, A: Al )?; // compute fee for the execution - let (min_fee, _) = execution_cost(&vm.process().read(), &execution)?; + let (min_fee, _) = execution_cost_v2(&vm.process().read(), &execution)?; // proof for the fee, authorizing the execution let fee = diff --git a/crates/aot/src/lib.rs b/crates/aot/src/lib.rs index 9dfee0a3..e8fc71b9 100644 --- a/crates/aot/src/lib.rs +++ b/crates/aot/src/lib.rs @@ -92,7 +92,7 @@ macro_rules! 
network_to_circuit { fn process<'a>() -> &'a Process<$net_name> { static PROCESS: OnceLock> = OnceLock::new(); - PROCESS.get_or_init(|| Process::load_no_storage().unwrap()) + PROCESS.get_or_init(|| Process::load().unwrap()) } fn credits() -> ProgramID<$net_name> { diff --git a/crates/aot/src/program/cost.rs b/crates/aot/src/program/cost.rs index bb20e416..c372a33f 100644 --- a/crates/aot/src/program/cost.rs +++ b/crates/aot/src/program/cost.rs @@ -25,6 +25,9 @@ pub struct CostCommand { /// Program inputs (eg. 1u64 5field) #[clap(num_args = 1, value_delimiter = ' ')] inputs: Vec>, + /// Enable cost v1 for the transaction cost estimation (v2 by default) + #[clap(long, default_value_t = false)] + pub cost_v1: bool, } impl CostCommand { @@ -34,10 +37,11 @@ impl CostCommand { program, function, inputs, + cost_v1, } = self; let program = program.contents()?; - let mut process = Process::load_no_storage()?; + let mut process = Process::load()?; query::get_process_imports(&mut process, &program, query.as_deref())?; if let Some(function) = function { @@ -57,7 +61,7 @@ impl CostCommand { &mut rand::thread_rng(), )?; - estimate_cost(&process, &auth) + estimate_cost(&process, &auth, !cost_v1) } else { let deployment = process.deploy::(&program, &mut rand::thread_rng())?; Ok(deployment_cost(&deployment)?.0) diff --git a/crates/aot/src/runner/mod.rs b/crates/aot/src/runner/mod.rs index 60e95914..fd3dd59d 100644 --- a/crates/aot/src/runner/mod.rs +++ b/crates/aot/src/runner/mod.rs @@ -5,10 +5,13 @@ use std::{ }; use aleo_std::StorageMode; -use anyhow::Result; +use anyhow::{anyhow, Result}; use clap::Args; use rpc::RpcClient; -use snarkos_node::Node; +use snarkos_node::{ + bft::helpers::{proposal_cache_path, ProposalCache}, + Node, +}; use snarkvm::{ ledger::store::{ helpers::rocksdb::{BlockDB, CommitteeDB}, @@ -20,7 +23,7 @@ use snarkvm::{ use snops_checkpoint::{CheckpointManager, RetentionPolicy}; use snops_common::state::{snarkos_status::SnarkOSStatus, NodeType}; -use crate::{cli::ReloadHandler, Account, DbLedger, Key, Network}; +use crate::{cli::ReloadHandler, Account, Address, DbLedger, Key, Network}; mod metrics; mod rpc; @@ -116,13 +119,21 @@ impl Runner { let bft_ip = SocketAddr::new(bind_addr, self.bft); let metrics_ip = SocketAddr::new(bind_addr, self.metrics); - let account = Account::try_from(self.key.try_get()?)?; - - let genesis = if let Some(path) = self.genesis.as_ref() { - Block::read_le(std::fs::File::open(path)?)? - } else { - Block::read_le(N::genesis_bytes())? - }; + let account = Account::try_from( + self.key + .try_get() + .map_err(|e| e.context("obtain private key"))?, + )?; + + let genesis = + if let Some(path) = self.genesis.as_ref() { + Block::read_le(std::fs::File::open(path).map_err(|e| { + anyhow!(e).context(format!("open genesis file {}", path.display())) + })?) + .map_err(|e| anyhow!(e).context("parse genesis block from file"))? + } else { + Block::read_le(N::genesis_bytes())? + }; // conditionally create a checkpoint manager based on the presence // of a retention policy @@ -170,6 +181,7 @@ impl Runner { let _node = match self.node_type { NodeType::Validator => { + Self::check_proposal_cache(account.address()); Node::new_validator( node_ip, Some(bft_ip), @@ -185,36 +197,37 @@ impl Runner { false, shutdown, ) - .await? - } - NodeType::Prover => { - Node::new_prover( - node_ip, - account, - &self.peers, - genesis, - storage_mode.clone(), - shutdown, - ) - .await? 
- } - NodeType::Client => { - Node::new_client( - node_ip, - Some(rest_ip), - self.rest_rps, - account, - &self.peers, - genesis, - None, - storage_mode.clone(), - false, - shutdown, - ) - .await? + .await + .map_err(|e| e.context("create validator"))? } + NodeType::Prover => Node::new_prover( + node_ip, + account, + &self.peers, + genesis, + storage_mode.clone(), + shutdown, + ) + .await + .map_err(|e| e.context("create prover"))?, + NodeType::Client => Node::new_client( + node_ip, + Some(rest_ip), + self.rest_rps, + account, + &self.peers, + genesis, + None, + storage_mode.clone(), + false, + shutdown, + ) + .await + .map_err(|e| e.context("create client"))?, }; + agent.status(SnarkOSStatus::Started); + // only monitor block updates if we have a checkpoint manager or agent status // API if manager.is_some() || agent.is_enabled() { @@ -267,6 +280,24 @@ impl Runner { Ok(()) } + /// Check the proposal cache for this address and remove it if it is + /// invalid. + fn check_proposal_cache(addr: Address) { + let proposal_cache_path = proposal_cache_path(N::ID, None); + if !proposal_cache_path.exists() { + return; + } + + let Err(e) = ProposalCache::::load(addr, None) else { + return; + }; + + tracing::error!("failed to load proposal cache: {e}"); + if let Err(e) = std::fs::remove_file(&proposal_cache_path) { + tracing::error!("failed to remove proposal cache: {e}"); + } + } + /// Returns a runtime for the node. pub fn runtime() -> tokio::runtime::Runtime { // Retrieve the number of cores. diff --git a/crates/aot/src/runner/rpc/mod.rs b/crates/aot/src/runner/rpc/mod.rs index e43936a6..3722caf7 100644 --- a/crates/aot/src/runner/rpc/mod.rs +++ b/crates/aot/src/runner/rpc/mod.rs @@ -81,115 +81,119 @@ impl RpcClient { // ws connection loop tokio::spawn(async move { loop { - 'connection: { - let (mut ws_stream, _) = match connect_async(ws_req.to_owned()).await { - Ok(r) => r, - Err(e) => { - error!("An error occurred establishing the connection: {e}"); - break 'connection; + let (mut ws_stream, _) = match connect_async(ws_req.to_owned()).await { + Ok(r) => r, + Err(e) => { + error!("An error occurred establishing the connection: {e}"); + tokio::time::sleep(Duration::from_secs(1)).await; + continue; + } + }; + + let mut interval = tokio::time::interval(Duration::from_secs(PING_INTERVAL_SEC)); + let mut num_pings: u32 = 0; + + 'event: loop { + select! { + // ping + _ = interval.tick() => { + let mut payload = Vec::from(PING_HEADER); + payload.extend_from_slice(&num_pings.to_le_bytes()); + payload.extend_from_slice(&start_time.elapsed().as_micros().to_le_bytes()); + + let send = ws_stream.send(tungstenite::Message::Ping(payload)); + if tokio::time::timeout(Duration::from_secs(10), send).await.is_err() { + error!("The connection to the control plane was interrupted while sending ping"); + break 'event; + } } - }; - - let mut interval = - tokio::time::interval(Duration::from_secs(PING_INTERVAL_SEC)); - let mut num_pings: u32 = 0; - - 'event: loop { - select! 
{ - // ping - _ = interval.tick() => { - let mut payload = Vec::from(PING_HEADER); - payload.extend_from_slice(&num_pings.to_le_bytes()); - payload.extend_from_slice(&start_time.elapsed().as_micros().to_le_bytes()); - - let send = ws_stream.send(tungstenite::Message::Ping(payload)); - if tokio::time::timeout(Duration::from_secs(10), send).await.is_err() { - error!("The connection to the control plane was interrupted while sending ping"); - break 'event; - } + + // handle outgoing responses + msg = server_response_out.recv() => { + let Some(msg) = msg else { + error!("internal RPC channel closed"); + break 'event; + }; + let bin = snops_common::rpc::codec::encode(&MuxedMessageOutgoing::Child(msg)).expect("failed to serialize response"); + let send = ws_stream.send(tungstenite::Message::Binary(bin)); + if tokio::time::timeout(Duration::from_secs(10), send).await.is_err() { + error!("The connection to the agent was interrupted while sending node message"); + break 'event; } + } - // handle outgoing responses - msg = server_response_out.recv() => { - let msg = msg.expect("internal RPC channel closed"); - let bin = bincode::serialize(&MuxedMessageOutgoing::Child(msg)).expect("failed to serialize response"); - let send = ws_stream.send(tungstenite::Message::Binary(bin)); - if tokio::time::timeout(Duration::from_secs(10), send).await.is_err() { - error!("The connection to the agent was interrupted while sending node message"); - break 'event; - } + // handle outgoing requests + msg = client_request_out.recv() => { + let Some(msg) = msg else { + error!("internal RPC channel closed"); + break 'event; + }; + let bin = snops_common::rpc::codec::encode(&MuxedMessageOutgoing::Parent(msg)).expect("failed to serialize request"); + let send = ws_stream.send(tungstenite::Message::Binary(bin)); + if tokio::time::timeout(Duration::from_secs(10), send).await.is_err() { + error!("The connection to the agent was interrupted while sending node message"); + break 'event; } + } - // handle outgoing requests - msg = client_request_out.recv() => { - let msg = msg.expect("internal RPC channel closed"); - let bin = bincode::serialize(&MuxedMessageOutgoing::Parent(msg)).expect("failed to serialize request"); - let send = ws_stream.send(tungstenite::Message::Binary(bin)); - if tokio::time::timeout(Duration::from_secs(10), send).await.is_err() { - error!("The connection to the agent was interrupted while sending node message"); - break 'event; + // handle incoming messages + msg = ws_stream.next() => match msg { + Some(Ok(tungstenite::Message::Close(frame))) => { + match frame { + Some(frame) => info!("The agent closed the connection: {frame}"), + None => info!("The agent closed the connection"), } + break 'event; } - // handle incoming messages - msg = ws_stream.next() => match msg { - Some(Ok(tungstenite::Message::Close(frame))) => { - match frame { - Some(frame) => info!("The agent closed the connection: {frame}"), - None => info!("The agent closed the connection"), - } - break 'event; + Some(Ok(tungstenite::Message::Pong(payload))) => { + let mut payload = payload.as_slice(); + + // check the header + if !payload.starts_with(PING_HEADER) { + warn!("Received a pong payload with an invalid header prefix"); + continue; } - Some(Ok(tungstenite::Message::Pong(payload))) => { - let mut payload = payload.as_slice(); + payload = &payload[PING_HEADER.len()..]; + if payload.len() != PING_LENGTH { + warn!("Received a pong payload with an invalid length {}, expected {PING_LENGTH}", payload.len()); + continue; + } - // check the 
header - if !payload.starts_with(PING_HEADER) { - warn!("Received a pong payload with an invalid header prefix"); - continue; - } + let (left, right) = payload.split_at(size_of::()); + let ping_index = u32::from_le_bytes(left.try_into().unwrap()); + let _uptime_start = u128::from_le_bytes(right.try_into().unwrap()); - payload = &payload[PING_HEADER.len()..]; - if payload.len() != PING_LENGTH { - warn!("Received a pong payload with an invalid length {}, expected {PING_LENGTH}", payload.len()); - continue; - } + if ping_index != num_pings { + warn!("Received a pong payload with an invalid index {ping_index}, expected {num_pings}"); + continue; + } - let (left, right) = payload.split_at(size_of::()); - let ping_index = u32::from_le_bytes(left.try_into().unwrap()); - let _uptime_start = u128::from_le_bytes(right.try_into().unwrap()); + num_pings += 1; + } - if ping_index != num_pings { - warn!("Received a pong payload with an invalid index {ping_index}, expected {num_pings}"); + Some(Ok(tungstenite::Message::Binary(bin))) => { + let msg = match snops_common::rpc::codec::decode(&bin) { + Ok(msg) => msg, + Err(e) => { + error!("failed to deserialize a message from the agent: {e}"); continue; } + }; - num_pings += 1; - } - - Some(Ok(tungstenite::Message::Binary(bin))) => { - let msg = match bincode::deserialize(&bin) { - Ok(msg) => msg, - Err(e) => { - error!("failed to deserialize a message from the agent: {e}"); - continue; - } - }; - - match msg { - MuxedMessageIncoming::Child(msg) => server_request_in.send(msg).expect("internal RPC channel closed"), - MuxedMessageIncoming::Parent(msg) => client_response_in.send(msg).expect("internal RPC channel closed"), - } - } - - None | Some(Err(_)) => { - error!("The connection to the agent was interrupted"); - break 'event; + match msg { + MuxedMessageIncoming::Child(msg) => server_request_in.send(msg).expect("internal RPC channel closed"), + MuxedMessageIncoming::Parent(msg) => client_response_in.send(msg).expect("internal RPC channel closed"), } + } - Some(Ok(o)) => println!("{o:#?}"), + None | Some(Err(_)) => { + error!("The connection to the agent was interrupted"); + break 'event; } + + Some(Ok(o)) => println!("{o:#?}"), } } } diff --git a/crates/checkpoint/Cargo.toml b/crates/checkpoint/Cargo.toml index 244d2035..1526b4a3 100644 --- a/crates/checkpoint/Cargo.toml +++ b/crates/checkpoint/Cargo.toml @@ -15,6 +15,7 @@ aleo-std = { workspace = true, optional = true } anyhow = { workspace = true, optional = true } chrono.workspace = true glob.workspace = true +lazysort.workspace = true rayon.workspace = true serde = { workspace = true, optional = true } snarkvm = { workspace = true, optional = true } diff --git a/crates/checkpoint/src/manager.rs b/crates/checkpoint/src/manager.rs index 8492c232..d06d32c7 100644 --- a/crates/checkpoint/src/manager.rs +++ b/crates/checkpoint/src/manager.rs @@ -1,12 +1,15 @@ use std::{collections::BTreeMap, fs, path::PathBuf}; use chrono::{DateTime, TimeDelta, Utc}; +use lazysort::SortedBy; use rayon::iter::{IntoParallelIterator, ParallelIterator}; use tracing::{error, trace}; #[cfg(feature = "write")] use crate::errors::{ManagerCullError, ManagerInsertError, ManagerPollError}; -use crate::{errors::ManagerLoadError, path_from_height, CheckpointHeader, RetentionPolicy}; +use crate::{ + errors::ManagerLoadError, path_from_height, CheckpointHeader, RetentionPolicy, RetentionSpan, +}; #[derive(Debug, Clone)] pub struct CheckpointManager { @@ -215,6 +218,28 @@ impl CheckpointManager { pub fn checkpoints(&self) -> impl Iterator 
{ self.checkpoints.values() } + + /// Find the nearest checkpoint with a height less than or equal to the + /// given height + pub fn nearest_with_height(&self, height: u32) -> Option<&(CheckpointHeader, PathBuf)> { + self.checkpoints() + .sorted_by(|(a, _), (b, _)| b.block_height.cmp(&a.block_height)) + .find(|(c, _)| (c.block_height <= height)) + } + + /// Find the nearest checkpoint with a timestamp less than or equal to the + /// given span + pub fn nearest_with_span(&self, span: RetentionSpan) -> Option<&(CheckpointHeader, PathBuf)> { + self.nearest_with_timestamp(span.as_timestamp()?) + } + + /// Find the nearest checkpoint with a timestamp less than or equal to the + /// given timestamp + pub fn nearest_with_timestamp(&self, timestamp: i64) -> Option<&(CheckpointHeader, PathBuf)> { + self.checkpoints() + .sorted_by(|(a, _), (b, _)| b.timestamp.cmp(&a.timestamp)) + .find(|(c, _)| (c.timestamp <= timestamp)) + } } impl std::fmt::Display for CheckpointManager { diff --git a/crates/cli/Cargo.toml b/crates/cli/Cargo.toml index fb71f540..ce3bf359 100644 --- a/crates/cli/Cargo.toml +++ b/crates/cli/Cargo.toml @@ -16,6 +16,13 @@ anyhow.workspace = true clap.workspace = true clap_complete.workspace = true clap-stdin.workspace = true -reqwest = { workspace = true, features = ["blocking", "json"] } +futures-util.workspace = true +http.workspace = true +reqwest = { workspace = true, features = ["json"] } +rustls.workspace = true +serde.workspace = true serde_json.workspace = true snops-common = { workspace = true, features = ["aot_cmds"] } +tokio = { workspace = true, features = ["macros", "signal", "rt-multi-thread"] } +tokio-tungstenite.workspace = true +urlencoding = "2.1.3" diff --git a/crates/cli/src/cli.rs b/crates/cli/src/cli.rs index 7378db00..639d4807 100644 --- a/crates/cli/src/cli.rs +++ b/crates/cli/src/cli.rs @@ -14,7 +14,7 @@ pub struct Cli { impl Cli { /// Runs the subcommand. - pub fn run(self) -> Result<()> { - self.subcommand.run(&self.url) + pub async fn run(self) -> Result<()> { + self.subcommand.run(&self.url).await } } diff --git a/crates/cli/src/commands/agent.rs b/crates/cli/src/commands/agent.rs index 2d4c69c2..5eb5fa84 100644 --- a/crates/cli/src/commands/agent.rs +++ b/crates/cli/src/commands/agent.rs @@ -2,7 +2,7 @@ use std::str::FromStr; use anyhow::Result; use clap::{error::ErrorKind, ArgGroup, CommandFactory, Parser, ValueHint}; -use reqwest::blocking::{Client, Response}; +use reqwest::{Client, Response}; use serde_json::json; use snops_common::state::AgentId; @@ -73,11 +73,13 @@ enum AgentCommands { /// Get the specific agent's status. Status, + /// Set the log level of the agent. SetLogLevel { /// The log level to set. level: String, }, + /// Set the log level of the node running on an agent. SetSnarkosLogLevel { /// The log verbosity to set. verbosity: u8, @@ -85,7 +87,7 @@ enum AgentCommands { } impl Agent { - pub fn run(self, url: &str, client: Client) -> Result { + pub async fn run(self, url: &str, client: Client) -> Result { use AgentCommands::*; Ok(match self.command { Find { @@ -116,12 +118,13 @@ impl Agent { "include_offline": include_offline, "local_pk": local_pk, })) - .send()? + .send() + .await? } List => { let ep = format!("{url}/api/v1/agents"); - client.get(ep).send()? + client.get(ep).send().await? } _ if self.id == AgentId::from_str(DUMMY_ID).unwrap() => { let mut cmd = Cli::command(); @@ -134,32 +137,32 @@ impl Agent { Info => { let ep = format!("{url}/api/v1/agents/{}", self.id); - client.get(ep).send()? + client.get(ep).send().await? 
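Editor's note on the nearest_with_* helpers added to CheckpointManager above: lazysort::SortedBy (added to the checkpoint crate's dependencies in this diff) sorts lazily, so sorted_by(..).find(..) can stop as soon as the first checkpoint at or below the target is produced, rather than paying for a full sort. A standalone sketch of the same pattern, with (height, name) tuples standing in for the manager's (CheckpointHeader, PathBuf) entries:

    use lazysort::SortedBy;

    // Nearest checkpoint at or below `height`: sort descending by height
    // (lazily), then take the first entry that does not exceed the target.
    fn nearest_at_or_below(
        checkpoints: &[(u32, &'static str)],
        height: u32,
    ) -> Option<(u32, &'static str)> {
        checkpoints
            .iter()
            .copied()
            .sorted_by(|(a, _), (b, _)| b.cmp(a))
            .find(|(h, _)| *h <= height)
    }

    // nearest_at_or_below(&[(10, "a"), (30, "c"), (20, "b")], 25) == Some((20, "b"))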
} Kill => { let ep = format!("{url}/api/v1/agents/{}/kill", self.id); - client.post(ep).send()? + client.post(ep).send().await? } Status => { let ep = format!("{url}/api/v1/agents/{}/status", self.id); - client.get(ep).send()? + client.get(ep).send().await? } Tps => { let ep = format!("{url}/api/v1/agents/{}/tps", self.id); - client.get(ep).send()? + client.get(ep).send().await? } SetLogLevel { level } => { let ep = format!("{url}/api/v1/agents/{}/log/{level}", self.id); - client.post(ep).send()? + client.post(ep).send().await? } SetSnarkosLogLevel { verbosity } => { let ep = format!("{url}/api/v1/agents/{}/aot/log/{verbosity}", self.id); - client.post(ep).send()? + client.post(ep).send().await? } }) } diff --git a/crates/cli/src/commands/env/action/mod.rs b/crates/cli/src/commands/env/action/mod.rs index 6bfe6efb..8fe22c47 100644 --- a/crates/cli/src/commands/env/action/mod.rs +++ b/crates/cli/src/commands/env/action/mod.rs @@ -1,17 +1,21 @@ -use std::{collections::HashMap, str::FromStr}; +use std::{collections::HashMap, str::FromStr, sync::Arc}; use anyhow::Result; use clap::Parser; use clap_stdin::FileOrStdin; -use reqwest::blocking::{Client, Response}; +use reqwest::{Client, RequestBuilder, Response}; use serde_json::json; +use snops_cli::events::EventsClient; use snops_common::{ action_models::{AleoValue, WithTargets}, + events::{Event, EventKind, TransactionEvent}, key_source::KeySource, node_targets::{NodeTarget, NodeTargetError, NodeTargets}, - state::{CannonId, DocHeightRequest, EnvId, InternedId}, + state::{CannonId, EnvId, HeightRequest, InternedId}, }; +use crate::commands::env::post_and_wait; + //scli env canary action online client/* //scli env canary action offline client/* @@ -58,18 +62,39 @@ impl From for NodeTargets { pub enum Action { /// Turn the specified agents(and nodes) offline. #[clap(alias = "off")] - Offline(Nodes), + Offline { + /// The nodes to take offline. (eg. `validator/any`) + #[clap(num_args = 1, value_delimiter = ' ')] + nodes: Vec, + /// When present, don't wait for reconciles to finish before returning + #[clap(long = "async")] + async_mode: bool, + }, /// Turn the specified agents(and nodes) online. #[clap(alias = "on")] - Online(Nodes), + Online { + /// The nodes to turn online (eg. `validator/any`) + #[clap(num_args = 1, value_delimiter = ' ')] + nodes: Vec, + /// When present, don't wait for reconciles to finish before returning + #[clap(long = "async")] + async_mode: bool, + }, /// Reboot the specified agents(and nodes). - Reboot(Nodes), + Reboot { + /// The nodes to reboot (eg. `validator/any`) + #[clap(num_args = 1, value_delimiter = ' ')] + nodes: Vec, + /// When present, don't wait for reconciles to finish before returning + #[clap(long = "async")] + async_mode: bool, + }, /// Execute an aleo program function on the environment. i.e. /// credits.aleo/transfer_public Execute { /// Private key to use, can be `committee.0` to use committee member 0's /// key - #[clap(long, short)] + #[clap(long)] private_key: Option, /// Private key to use for the fee. Defaults to the same as /// --private-key @@ -125,7 +150,7 @@ pub enum Action { online: Option, /// Configure the height of the target nodes. #[clap(long)] - height: Option, + height: Option, /// Configure the peers of the target nodes, or `none`. 
#[clap(long, short)] peers: Option, @@ -138,15 +163,17 @@ pub enum Action { // Remove environment variables from a node: `--del-env FOO,BAR` #[clap(long, short, value_delimiter = ',', allow_hyphen_values = true)] del_env: Option>, - /// The nodes to configure. - #[clap(num_args = 1, value_delimiter = ' ')] - nodes: Vec, /// Configure the binary for a node. #[clap(long, short)] binary: Option, /// Configure the private key for a node. - #[clap(long, short)] + #[clap(long)] private_key: Option, + #[clap(long = "async")] + async_mode: bool, + /// The nodes to configure. (eg. `validator/any`) + #[clap(num_args = 1, value_delimiter = ' ')] + nodes: Vec, }, } @@ -169,23 +196,38 @@ impl KeyEqValue { } impl Action { - pub fn execute(self, url: &str, env_id: EnvId, client: Client) -> Result { + pub async fn execute(self, url: &str, env_id: EnvId, client: Client) -> Result { use Action::*; Ok(match self { - Offline(Nodes { nodes }) => { + Offline { nodes, async_mode } => { let ep = format!("{url}/api/v1/env/{env_id}/action/offline"); - - client.post(ep).json(&WithTargets::from(nodes)).send()? + let req = client.post(ep).json(&WithTargets::from(nodes)); + if async_mode { + req.send().await? + } else { + post_and_wait(url, req, env_id).await?; + std::process::exit(0); + } } - Online(Nodes { nodes }) => { + Online { nodes, async_mode } => { let ep = format!("{url}/api/v1/env/{env_id}/action/online"); - - client.post(ep).json(&WithTargets::from(nodes)).send()? + let req = client.post(ep).json(&WithTargets::from(nodes)); + if async_mode { + req.send().await? + } else { + post_and_wait(url, req, env_id).await?; + std::process::exit(0); + } } - Reboot(Nodes { nodes }) => { + Reboot { nodes, async_mode } => { let ep = format!("{url}/api/v1/env/{env_id}/action/reboot"); - - client.post(ep).json(&WithTargets::from(nodes)).send()? + let req = client.post(ep).json(&WithTargets::from(nodes)); + if async_mode { + req.send().await? + } else { + post_and_wait(url, req, env_id).await?; + std::process::exit(0); + } } Execute { @@ -230,12 +272,13 @@ impl Action { json["program"] = program.into(); } - let mut builder = client.post(ep); + let req = client.post(ep).query(&[("async", "true")]).json(&json); if async_mode { - let query = [("async", "true")]; - builder = builder.query(&query); + req.send().await? + } else { + post_and_wait_tx(url, req).await?; + std::process::exit(0); } - builder.json(&json).send()? } Deploy { private_key, @@ -268,12 +311,13 @@ impl Action { json["fee_record"] = fee_record.into(); } - let mut builder = client.post(ep); + let req = client.post(ep).query(&[("async", "true")]).json(&json); if async_mode { - let query = [("async", "true")]; - builder = builder.query(&query); + req.send().await? + } else { + post_and_wait_tx(url, req).await?; + std::process::exit(0); } - builder.json(&json).send()? } Config { online, @@ -285,6 +329,7 @@ impl Action { del_env, binary, private_key, + async_mode, } => { let ep = format!("{url}/api/v1/env/{env_id}/action/config"); @@ -321,8 +366,102 @@ impl Action { } // this api accepts a list of json objects - client.post(ep).json(&json!(vec![json])).send()? + let req = client.post(ep).json(&json!(vec![json])); + + if async_mode { + req.send().await? 
+ } else { + post_and_wait(url, req, env_id).await?; + std::process::exit(0); + } } }) } } + +pub async fn post_and_wait_tx(url: &str, req: RequestBuilder) -> Result<()> { + use snops_common::events::EventFilter::*; + + let tx_id: String = req.send().await?.json().await?; + + let mut events = EventsClient::open_with_filter(url, TransactionIs(Arc::new(tx_id))).await?; + + let mut tx = None; + let mut block_hash = None; + let mut broadcast_height = None; + let mut broadcast_time = None; + + while let Some(event) = events.next().await? { + let Event { + content: EventKind::Transaction(e), + agent, + .. + } = event + else { + continue; + }; + + match e { + TransactionEvent::AuthorizationReceived { .. } => { + // ignore output of this event + } + TransactionEvent::Executing => { + eprintln!( + "executing on {}", + agent + .map(|a| a.to_string()) + .unwrap_or_else(|| "unknown".to_string()) + ); + } + TransactionEvent::ExecuteAwaitingCompute => { + eprintln!("waiting for compute...",); + } + TransactionEvent::ExecuteExceeded { attempts } => { + eprintln!("execution failed after {attempts} attempts"); + break; + } + TransactionEvent::ExecuteFailed(reason) => { + eprintln!("execution failed: {reason}"); + } + TransactionEvent::ExecuteAborted(reason) => { + eprintln!( + "execution aborted: {}", + serde_json::to_string_pretty(&reason)? + ); + } + TransactionEvent::ExecuteComplete { transaction } => { + eprintln!("execution complete"); + tx = Some(transaction); + } + TransactionEvent::BroadcastExceeded { attempts } => { + eprintln!("broadcast failed after {attempts} attempts"); + break; + } + TransactionEvent::Broadcasted { height, timestamp } => { + eprintln!( + "broadcasted at height {} at {timestamp}", + height + .map(|h| h.to_string()) + .unwrap_or_else(|| "unknown".to_string()), + ); + broadcast_height = height; + broadcast_time = Some(timestamp); + } + TransactionEvent::Confirmed { hash } => { + eprintln!("confirmed with hash {hash}"); + block_hash = Some(hash); + break; + } + } + } + println!( + "{}", + serde_json::to_string_pretty(&json!({ + "transaction": tx, + "broadcast_height": broadcast_height, + "broadcast_time": broadcast_time, + "block_hash": block_hash, + }))? + ); + events.close().await +} diff --git a/crates/cli/src/commands/env/mod.rs b/crates/cli/src/commands/env/mod.rs index d3b443f6..275971eb 100644 --- a/crates/cli/src/commands/env/mod.rs +++ b/crates/cli/src/commands/env/mod.rs @@ -1,14 +1,16 @@ -use std::path::PathBuf; +use std::collections::HashMap; +use action::post_and_wait_tx; use anyhow::Result; use clap::{Parser, ValueHint}; use clap_stdin::FileOrStdin; -use reqwest::blocking::{Client, Response}; +use reqwest::{Client, RequestBuilder, Response}; +use snops_cli::events::EventsClient; use snops_common::{ action_models::AleoValue, - aot_cmds::Authorization, + events::{AgentEvent, Event, EventKind}, key_source::KeySource, - state::{CannonId, InternedId, NodeKey}, + state::{AgentId, Authorization, CannonId, EnvId, InternedId, NodeKey, ReconcileStatus}, }; mod action; @@ -26,6 +28,7 @@ pub struct Env { /// Env commands. #[derive(Debug, Parser)] enum EnvCommands { + /// Run an action on an environment. #[clap(subcommand)] Action(action::Action), /// Get an env's specific agent by. @@ -76,9 +79,9 @@ enum EnvCommands { #[clap(alias = "tx-details")] TransactionDetails { id: String }, - /// Clean a specific environment. - #[clap(alias = "c")] - Clean, + /// Delete a specific environment. + #[clap(alias = "d")] + Delete, /// Get an env's latest block/state root info. 
Info, @@ -97,12 +100,15 @@ enum EnvCommands { #[clap(alias = "top-res")] TopologyResolved, - /// Prepare a (test) environment. + /// Apply an environment spec. #[clap(alias = "p")] - Prepare { - /// The test spec file. + Apply { + /// The environment spec file. #[clap(value_hint = ValueHint::AnyPath)] - spec: PathBuf, + spec: FileOrStdin, + /// When present, don't wait for reconciles to finish before returning + #[clap(long = "async")] + async_mode: bool, }, /// Lookup a mapping by program id and mapping name. @@ -128,20 +134,20 @@ enum EnvCommands { } impl Env { - pub fn run(self, url: &str, client: Client) -> Result { + pub async fn run(self, url: &str, client: Client) -> Result { let id = self.id; use EnvCommands::*; Ok(match self.command { - Action(action) => action.execute(url, id, client)?, + Action(action) => action.execute(url, id, client).await?, Agent { key } => { let ep = format!("{url}/api/v1/env/{id}/agents/{key}"); - client.get(ep).send()? + client.get(ep).send().await? } Agents => { let ep = format!("{url}/api/v1/env/{id}/agents"); - client.get(ep).send()? + client.get(ep).send().await? } Auth { async_mode, @@ -156,48 +162,57 @@ impl Env { req = req.query(&[("async", "true")]); } - req.send()? + if async_mode { + req.send().await? + } else { + post_and_wait_tx(url, req).await?; + std::process::exit(0); + } } Balance { address: key } => { let ep = format!("{url}/api/v1/env/{id}/balance/{key}"); - client.get(ep).json(&key).send()? + client.get(ep).json(&key).send().await? } Block { height_or_hash } => { let ep = format!("{url}/api/v1/env/{id}/block/{height_or_hash}"); - client.get(ep).send()? + client.get(ep).send().await? } - Clean => { + Delete => { let ep = format!("{url}/api/v1/env/{id}"); - client.delete(ep).send()? + client.delete(ep).send().await? } Info => { let ep = format!("{url}/api/v1/env/{id}/info"); - client.get(ep).send()? + client.get(ep).send().await? } List => { let ep = format!("{url}/api/v1/env/list"); - client.get(ep).send()? + client.get(ep).send().await? } Topology => { let ep = format!("{url}/api/v1/env/{id}/topology"); - client.get(ep).send()? + client.get(ep).send().await? } TopologyResolved => { let ep = format!("{url}/api/v1/env/{id}/topology/resolved"); - client.get(ep).send()? + client.get(ep).send().await? } - Prepare { spec } => { - let ep = format!("{url}/api/v1/env/{id}/prepare"); - let file: String = std::fs::read_to_string(spec)?; - - client.post(ep).body(file).send()? + Apply { spec, async_mode } => { + let ep = format!("{url}/api/v1/env/{id}/apply"); + let req = client.post(ep).body(spec.contents()?); + if async_mode { + req.send().await? + } else { + post_and_wait(url, req, id).await?; + std::process::exit(0); + } } Mapping { program, @@ -217,39 +232,121 @@ impl Env { } }; - client.get(ep).send()? + client.get(ep).send().await? } Mappings { program } => { let ep = format!("{url}/api/v1/env/{id}/program/{program}/mappings"); - client.get(ep).send()? + client.get(ep).send().await? } Program { id: prog } => { let ep = format!("{url}/api/v1/env/{id}/program/{prog}"); - println!("{}", client.get(ep).send()?.text()?); + println!("{}", client.get(ep).send().await?.text().await?); std::process::exit(0); } Storage => { let ep = format!("{url}/api/v1/env/{id}/storage"); - client.get(ep).send()? + client.get(ep).send().await? } Transaction { id: hash } => { let ep = format!("{url}/api/v1/env/{id}/transaction_block/{hash}"); - client.get(ep).send()? + client.get(ep).send().await? 
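Editor's note on the Auth arm above: it follows the same shape as the action arms, submitting the request and then handing the RequestBuilder to post_and_wait_tx, which follows the transaction's event stream until Confirmed, ExecuteExceeded, or BroadcastExceeded. A hypothetical wrapper showing that call shape end to end; the endpoint path and JSON field name here are illustrative only (they are not shown in this diff), and post_and_wait_tx is the helper defined in action/mod.rs above:

    use anyhow::Result;
    use reqwest::Client;
    use serde_json::json;

    // Hypothetical: submit an authorization to a cannon and block until the
    // resulting transaction settles, mirroring the Auth arm above.
    async fn auth_and_follow(
        url: &str,
        env_id: &str,
        cannon: &str,
        auth: serde_json::Value,
    ) -> Result<()> {
        // illustrative endpoint and body shape, not taken from this diff
        let ep = format!("{url}/api/v1/env/{env_id}/cannons/{cannon}/auth");
        let req = Client::new().post(ep).json(&json!({ "auth": auth }));
        // post_and_wait_tx prints progress to stderr and a final JSON
        // summary (transaction, broadcast height/time, block hash) to stdout.
        post_and_wait_tx(url, req).await
    }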
} TransactionDetails { id: hash } => { let ep = format!("{url}/api/v1/env/{id}/transaction/{hash}"); - client.get(ep).send()? + client.get(ep).send().await? } Height => { let ep = format!("{url}/api/v1/env/{id}/height"); - client.get(ep).send()? + client.get(ep).send().await? } }) } } + +pub async fn post_and_wait(url: &str, req: RequestBuilder, env_id: EnvId) -> Result<()> { + use snops_common::events::EventFilter::*; + use snops_common::events::EventKindFilter::*; + + let mut events = EventsClient::open_with_filter( + url, + EnvIs(env_id) + & (AgentConnected + | AgentDisconnected + | AgentReconcile + | AgentReconcileComplete + | AgentReconcileError), + ) + .await?; + + let res = req.send().await?; + + if !res.status().is_success() { + println!( + "{}", + serde_json::to_string_pretty(&res.json::<serde_json::Value>().await?)? + ); + std::process::exit(1); + } + + let mut node_map: HashMap<NodeKey, AgentId> = res.json().await?; + println!("{}", serde_json::to_string_pretty(&node_map)?); + + let filter = node_map + .values() + .copied() + .fold(!Unfiltered, |filter, id| filter | AgentIs(id)); + + while let Some(event) = events.next().await? { + // Only consider events for agents that came back in the response + if !event.matches(&filter) { + continue; + } + + if let Event { + node_key: Some(node), + content: EventKind::Agent(e), + .. + } = &event + { + match &e { + AgentEvent::Reconcile(ReconcileStatus { + scopes, conditions, .. + }) => { + println!( + "{node}: {} {}", + scopes.join(";"), + conditions + .iter() + // unwrap safety - it was literally just serialized + .map(|s| serde_json::to_string(s).unwrap()) + .collect::<Vec<_>>() + .join(",") + ); + } + AgentEvent::ReconcileError(err) => { + println!("{node}: error: {err}"); + } + AgentEvent::ReconcileComplete => { + println!("{node}: done"); + } + _ => {} + } + } + if let (Some(node_key), true) = ( + event.node_key.as_ref(), + event.matches(&AgentReconcileComplete.into()), + ) { + node_map.remove(node_key); + if node_map.is_empty() { + break; + } + } + } + events.close().await +} diff --git a/crates/cli/src/commands/mod.rs b/crates/cli/src/commands/mod.rs index a1f9edf7..4f5eaab1 100644 --- a/crates/cli/src/commands/mod.rs +++ b/crates/cli/src/commands/mod.rs @@ -1,8 +1,9 @@ use anyhow::Result; use clap::{CommandFactory, Parser}; use serde_json::Value; +use snops_common::events::EventFilter; -use crate::Cli; +use crate::{events::EventsClient, Cli}; /// The dummy value for the ids to hack around the missing required argument. pub(crate) static DUMMY_ID: &str = "dummy_value___"; @@ -25,6 +26,13 @@ pub enum Commands { SetLogLevel { level: String, }, + /// Listen to events from the control plane, optionally filtered.
+ Events { + /// The event filter to apply, such as `agent-connected` or + /// `all-of(env-is(default),node-target-is(validator/any))` + #[clap(default_value = "unfiltered")] + filter: EventFilter, + }, #[cfg(feature = "mangen")] Man(snops_common::mangen::Mangen), #[cfg(feature = "clipages")] @@ -32,8 +40,8 @@ } impl Commands { - pub fn run(self, url: &str) -> Result<()> { - let client = reqwest::blocking::Client::new(); + pub async fn run(self, url: &str) -> Result<()> { + let client = reqwest::Client::new(); let response = match self { Commands::Autocomplete { shell } => { @@ -43,10 +51,21 @@ clap_complete::generate(shell, &mut cmd, cmd_name, &mut std::io::stdout()); return Ok(()); } - Commands::Agent(agent) => agent.run(url, client), - Commands::Env(env) => env.run(url, client), + Commands::Agent(agent) => agent.run(url, client).await, + Commands::Env(env) => env.run(url, client).await, Commands::SetLogLevel { level } => { - client.post(format!("{url}/api/v1/log/{level}")).send()?; + client + .post(format!("{url}/api/v1/log/{level}")) + .send() + .await?; + return Ok(()); + } + Commands::Events { filter } => { + let mut client = EventsClient::open_with_filter(url, filter).await?; + while let Some(event) = client.next().await? { + println!("{}", serde_json::to_string_pretty(&event)?); + } + client.close().await?; return Ok(()); } #[cfg(feature = "mangen")] @@ -71,7 +90,7 @@ let value = match response.content_length() { Some(0) | None => None, - _ => response.json::<Value>().map(Some)?, + _ => response.json::<Value>().await.map(Some)?, }; println!("{}", serde_json::to_string_pretty(&value)?); diff --git a/crates/cli/src/events.rs b/crates/cli/src/events.rs new file mode 100644 index 00000000..a99bd797 --- /dev/null +++ b/crates/cli/src/events.rs @@ -0,0 +1,132 @@ +// subscription code is not in use yet +#![allow(dead_code)] + +use std::{collections::HashSet, str::FromStr, time::Duration}; + +use anyhow::{bail, Context, Result}; +use futures_util::{SinkExt, StreamExt}; +use http::Uri; +use snops_common::events::{Event, EventFilter, EventWsRequest}; +use tokio::{net::TcpStream, select}; +use tokio_tungstenite::{ + connect_async, + tungstenite::{self, client::IntoClientRequest}, + MaybeTlsStream, WebSocketStream, +}; + +pub struct EventsClient { + counter: u32, + stream: WebSocketStream<MaybeTlsStream<TcpStream>>, + subscriptions: HashSet<u32>, + ping_interval: tokio::time::Interval, +} + +impl EventsClient { + pub async fn open(url: &str) -> Result<Self> { + Self::new(url, None).await + } + + pub async fn open_with_filter(url: &str, filter: EventFilter) -> Result<Self> { + Self::new(url, Some(filter)).await + } + + pub async fn new(url: &str, filter: Option<EventFilter>) -> Result<Self> { + let (proto, hostname) = url.split_once("://").unwrap_or(("http", url)); + let proto = match proto { + "wss" | "https" => "wss", + _ => "ws", + }; + + let req = Uri::from_str(&match filter { + Some(filter) => format!( + "{proto}://{hostname}/api/v1/events?filter={}", + urlencoding::encode(&filter.to_string()) + ), + None => format!("{proto}://{hostname}/api/v1/events"), + }) + .context("Invalid URI")?
+ .into_client_request() + .context("Invalid websocket request")?; + + let stream = match connect_async(req).await { + Ok((stream, _)) => stream, + Err(tungstenite::Error::Io(e)) if e.kind() == std::io::ErrorKind::ConnectionRefused => { + bail!("Failed to connect to websocket: Connection refused") + } + Err(e) => bail!("Failed to connect to websocket: {}", e), + }; + + Ok(Self { + counter: 0, + stream, + subscriptions: Default::default(), + ping_interval: tokio::time::interval(Duration::from_secs(10)), + }) + } + + async fn send_json(&mut self, msg: impl serde::Serialize) -> Result<()> { + self.stream + .send(tungstenite::Message::Text( + serde_json::to_string(&msg).context("Failed to serialize message")?, + )) + .await + .context("Failed to send message") + } + + /// Add an additional filter to the current subscription + pub async fn subscribe(&mut self, filter: EventFilter) -> Result { + let id = self.counter; + self.send_json(EventWsRequest::Subscribe { id, filter }) + .await?; + self.counter = self.counter.saturating_add(1); + self.subscriptions.insert(id); + Ok(id) + } + + /// Remove a filter from the current subscription + pub async fn unsubscribe(&mut self, id: u32) -> Result<()> { + if !self.subscriptions.remove(&id) { + bail!("Subscription not found: {}", id); + } + self.send_json(EventWsRequest::Unsubscribe { id }).await?; + Ok(()) + } + + /// Remove all filters from the current subscription + pub async fn unsubscribe_all(&mut self) -> Result<()> { + // Collect the ids to avoid borrowing issues + for id in self.subscriptions.drain().collect::>() { + self.send_json(EventWsRequest::Unsubscribe { id }).await?; + } + Ok(()) + } + + /// Get the next event from the stream + pub async fn next(&mut self) -> Result> { + loop { + select! { + _ = tokio::signal::ctrl_c() => return Ok(None), + _ = self.ping_interval.tick() => { + self.stream.send(tungstenite::Message::Ping(vec![b'p', b'i', b'n', b'g'])).await.context("Failed to send ping")?; + } + msg = self.stream.next() => { + match msg { + Some(Ok(tungstenite::Message::Text(text))) => + return serde_json::from_str(&text).map(Some).with_context(|| format!("Failed to parse event: {text}")), + Some(Ok(tungstenite::Message::Binary(bin))) => + return serde_json::from_slice(&bin).map(Some).with_context(|| format!("Failed to parse event: {}", String::from_utf8_lossy(&bin))), + None | Some(Err(_)) => bail!("Websocket closed"), + Some(Ok(_)) => continue, + + } + } + } + } + } + + /// Close the websocket connection + pub async fn close(mut self) -> Result<()> { + self.stream.close(None).await?; + Ok(()) + } +} diff --git a/crates/cli/src/lib.rs b/crates/cli/src/lib.rs new file mode 100644 index 00000000..a9970c28 --- /dev/null +++ b/crates/cli/src/lib.rs @@ -0,0 +1 @@ +pub mod events; diff --git a/crates/cli/src/main.rs b/crates/cli/src/main.rs index 5025c0f0..6dc459f2 100644 --- a/crates/cli/src/main.rs +++ b/crates/cli/src/main.rs @@ -6,13 +6,20 @@ use clap::Parser; mod cli; pub(crate) use cli::*; +mod events; + mod commands; pub(crate) use commands::*; -fn main() -> Result<()> { +#[tokio::main] +async fn main() -> Result<()> { + rustls::crypto::ring::default_provider() + .install_default() + .expect("Failed to install rustls crypto provider"); + let cli = cli::Cli::parse(); - if let Err(err) = cli.run() { + if let Err(err) = cli.run().await { eprintln!("⚠️ {err:?}"); exit(1); } diff --git a/crates/common/Cargo.toml b/crates/common/Cargo.toml index 8713d105..3d5268da 100644 --- a/crates/common/Cargo.toml +++ b/crates/common/Cargo.toml @@ -1,6 +1,6 
@@ [package] name = "snops-common" -version = "0.1.0" +version = "0.2.0" edition = "2021" license = "MIT" description = "Common types and utilities for snops" @@ -13,7 +13,6 @@ mangen = ["anyhow", "clap_mangen"] [dependencies] anyhow = { workspace = true, optional = true } -bincode.workspace = true bytes.workspace = true chrono = { workspace = true, features = ["serde"] } clap.workspace = true @@ -25,8 +24,8 @@ indexmap = { workspace = true, features = ["std", "serde"] } lasso.workspace = true lazy_static.workspace = true paste.workspace = true -regex.workspace = true rand.workspace = true +regex.workspace = true serde.workspace = true serde_json.workspace = true sha2.workspace = true diff --git a/crates/common/src/action_models.rs b/crates/common/src/action_models.rs index 5e8e950b..af4b2005 100644 --- a/crates/common/src/action_models.rs +++ b/crates/common/src/action_models.rs @@ -6,7 +6,7 @@ use serde::{Deserialize, Serialize}; use crate::{ key_source::KeySource, node_targets::{NodeTarget, NodeTargets}, - state::{CannonId, DocHeightRequest, InternedId}, + state::{CannonId, HeightRequest, InternedId}, }; #[derive(Deserialize, Serialize, Clone)] @@ -116,7 +116,7 @@ pub struct Reconfig { #[serde(default, skip_serializing_if = "Option::is_none")] pub online: Option, #[serde(default, skip_serializing_if = "Option::is_none")] - pub height: Option, + pub height: Option, #[serde(default, skip_serializing_if = "Option::is_none")] pub peers: Option, #[serde(default, skip_serializing_if = "Option::is_none")] diff --git a/crates/common/src/aot_cmds/mod.rs b/crates/common/src/aot_cmds/mod.rs index d03f1191..d12dfcb7 100644 --- a/crates/common/src/aot_cmds/mod.rs +++ b/crates/common/src/aot_cmds/mod.rs @@ -5,15 +5,13 @@ use tokio::{ process::{Child, Command}, }; -mod authorization; pub mod error; -pub use authorization::*; pub use error::AotCmdError; use self::error::CommandError; use crate::{ constant::{LEDGER_BASE_DIR, SNARKOS_GENESIS_FILE}, - state::NetworkId, + state::{Authorization, NetworkId}, }; pub struct AotCmd { @@ -74,6 +72,7 @@ impl AotCmd { query: Option<&String>, priority_fee: Option, fee_record: Option<&String>, + cost_v1: bool, ) -> Result { let mut command = Command::new(&self.bin); command @@ -101,6 +100,10 @@ impl AotCmd { command.arg("--record").arg(fee_record); } + if cost_v1 { + command.arg("--cost-v1"); + } + command .arg(format!("{program_id}/{function_name}")) .args(inputs); @@ -122,6 +125,7 @@ impl AotCmd { query: Option<&String>, priority_fee: Option, fee_record: Option<&String>, + cost_v1: bool, ) -> Result { let mut command = Command::new(&self.bin); command @@ -150,6 +154,10 @@ impl AotCmd { command.arg("--record").arg(fee_record); } + if cost_v1 { + command.arg("--cost-v1"); + } + command.arg("-"); let mut child = command @@ -204,6 +212,7 @@ impl AotCmd { authorization: &str, priority_fee: Option, fee_record: Option<&String>, + cost_v1: bool, ) -> Result { let mut command = Command::new(&self.bin); command @@ -223,6 +232,10 @@ impl AotCmd { command.arg("--record").arg(fee_record); } + if cost_v1 { + command.arg("--cost-v1"); + } + Self::handle_output( command.output().await, "output", diff --git a/crates/common/src/api.rs b/crates/common/src/api.rs index f7f3f2e5..fec625e0 100644 --- a/crates/common/src/api.rs +++ b/crates/common/src/api.rs @@ -4,6 +4,7 @@ use snops_checkpoint::RetentionPolicy; use crate::{ binaries::BinaryEntry, + format::{DataFormat, DataHeaderOf}, prelude::StorageId, state::{InternedId, LatestBlockInfo, NetworkId}, }; @@ -23,14 +24,19 @@ pub struct 
EnvInfo { pub block: Option, } +/// Lighter-weight version of EnvInfo for the agent #[derive(Debug, Serialize, Deserialize, Clone)] +pub struct AgentEnvInfo { + pub network: NetworkId, + pub storage: StorageInfo, +} + +#[derive(Debug, Serialize, Deserialize, Clone, PartialEq, Eq)] pub struct StorageInfo { /// String id of this storage pub id: StorageId, /// The retention policy used for this storage pub retention_policy: Option, - /// The available checkpoints in this storage - pub checkpoints: Vec, /// Whether to persist the ledger pub persist: bool, /// Version identifier for this ledger @@ -41,3 +47,258 @@ pub struct StorageInfo { /// download from the control plane) pub binaries: IndexMap, } + +#[derive(Debug, Clone)] +pub struct EnvInfoHeader { + pub version: u8, + pub network: DataHeaderOf, + pub storage: DataHeaderOf, + pub block: DataHeaderOf, +} + +impl DataFormat for EnvInfoHeader { + type Header = (u8, DataHeaderOf>); + const LATEST_HEADER: Self::Header = (1, DataHeaderOf::::LATEST_HEADER); + + fn write_data( + &self, + writer: &mut W, + ) -> Result { + let mut written = self.version.write_data(writer)?; + written += self.network.write_data(writer)?; + written += self.storage.write_data(writer)?; + written += self.block.write_data(writer)?; + Ok(written) + } + + fn read_data( + reader: &mut R, + header: &Self::Header, + ) -> Result { + if header.0 != Self::LATEST_HEADER.0 { + return Err(crate::format::DataReadError::unsupported( + "EnvInfoHeader", + Self::LATEST_HEADER.0, + header.0, + )); + } + Ok(Self { + version: u8::read_data(reader, &())?, + network: DataHeaderOf::::read_data(reader, &())?, + storage: DataHeaderOf::::read_data(reader, &header.1)?, + block: DataHeaderOf::::read_data(reader, &())?, + }) + } +} + +impl DataFormat for EnvInfo { + type Header = EnvInfoHeader; + const LATEST_HEADER: Self::Header = EnvInfoHeader { + version: 1, + network: NetworkId::LATEST_HEADER, + storage: StorageInfo::LATEST_HEADER, + block: LatestBlockInfo::LATEST_HEADER, + }; + + fn write_data( + &self, + writer: &mut W, + ) -> Result { + let mut written = self.network.write_data(writer)?; + written += self.storage.write_data(writer)?; + written += self.block.write_data(writer)?; + Ok(written) + } + + fn read_data( + reader: &mut R, + header: &Self::Header, + ) -> Result { + if header.version != 1 { + return Err(crate::format::DataReadError::unsupported( + "EnvInfo", + 1, + header.version, + )); + } + Ok(Self { + network: NetworkId::read_data(reader, &header.network)?, + storage: StorageInfo::read_data(reader, &header.storage)?, + block: Option::::read_data(reader, &header.block)?, + }) + } +} + +#[derive(Debug, Clone)] +pub struct AgentEnvInfoHeader { + pub version: u8, + pub network: DataHeaderOf, + pub storage: DataHeaderOf, +} + +impl DataFormat for AgentEnvInfoHeader { + type Header = (u8, DataHeaderOf>); + const LATEST_HEADER: Self::Header = (1, DataHeaderOf::::LATEST_HEADER); + + fn write_data( + &self, + writer: &mut W, + ) -> Result { + let mut written = self.version.write_data(writer)?; + written += self.network.write_data(writer)?; + written += self.storage.write_data(writer)?; + Ok(written) + } + + fn read_data( + reader: &mut R, + header: &Self::Header, + ) -> Result { + if header.0 != Self::LATEST_HEADER.0 { + return Err(crate::format::DataReadError::unsupported( + "EnvInfoHeader", + Self::LATEST_HEADER.0, + header.0, + )); + } + Ok(Self { + version: u8::read_data(reader, &())?, + network: DataHeaderOf::::read_data(reader, &())?, + storage: DataHeaderOf::::read_data(reader, 
&header.1)?, + }) + } +} + +impl DataFormat for AgentEnvInfo { + type Header = AgentEnvInfoHeader; + const LATEST_HEADER: Self::Header = AgentEnvInfoHeader { + version: 1, + network: NetworkId::LATEST_HEADER, + storage: StorageInfo::LATEST_HEADER, + }; + + fn write_data( + &self, + writer: &mut W, + ) -> Result { + let mut written = self.network.write_data(writer)?; + written += self.storage.write_data(writer)?; + Ok(written) + } + + fn read_data( + reader: &mut R, + header: &Self::Header, + ) -> Result { + if header.version != 1 { + return Err(crate::format::DataReadError::unsupported( + "EnvInfo", + 1, + header.version, + )); + } + Ok(Self { + network: NetworkId::read_data(reader, &header.network)?, + storage: StorageInfo::read_data(reader, &header.storage)?, + }) + } +} + +#[derive(Debug, Clone)] +pub struct StorageInfoHeader { + pub version: u8, + pub retention_policy: DataHeaderOf, + pub binaries: DataHeaderOf, +} + +impl DataFormat for StorageInfoHeader { + type Header = u8; + const LATEST_HEADER: Self::Header = 1; + + fn write_data( + &self, + writer: &mut W, + ) -> Result { + let mut written = self.version.write_data(writer)?; + written += self.retention_policy.write_data(writer)?; + written += self.binaries.write_data(writer)?; + Ok(written) + } + + fn read_data( + reader: &mut R, + header: &Self::Header, + ) -> Result { + if *header != Self::LATEST_HEADER { + return Err(crate::format::DataReadError::unsupported( + "StorageInfoHeader", + Self::LATEST_HEADER, + header, + )); + } + Ok(Self { + version: u8::read_data(reader, &())?, + retention_policy: DataHeaderOf::::read_data(reader, &((), ()))?, + binaries: DataHeaderOf::::read_data(reader, &())?, + }) + } +} + +impl DataFormat for StorageInfo { + type Header = StorageInfoHeader; + + const LATEST_HEADER: Self::Header = StorageInfoHeader { + version: 2, + retention_policy: RetentionPolicy::LATEST_HEADER, + binaries: BinaryEntry::LATEST_HEADER, + }; + + fn write_data( + &self, + writer: &mut W, + ) -> Result { + let mut written = self.id.write_data(writer)?; + written += self.retention_policy.write_data(writer)?; + written += self.persist.write_data(writer)?; + written += self.version.write_data(writer)?; + written += self.native_genesis.write_data(writer)?; + written += self.binaries.write_data(writer)?; + Ok(written) + } + + fn read_data( + reader: &mut R, + header: &Self::Header, + ) -> Result { + if header.version == 0 || header.version > Self::LATEST_HEADER.version { + return Err(crate::format::DataReadError::unsupported( + "StorageInfo", + Self::LATEST_HEADER.version, + header.version, + )); + } + + let id = StorageId::read_data(reader, &())?; + let retention_policy = + Option::::read_data(reader, &header.retention_policy)?; + + // Omit checkpoints from a previous version + if header.version == 1 { + Vec::<(u32, i64, String)>::read_data(reader, &((), (), ()))?; + }; + + let persist = bool::read_data(reader, &())?; + let version = u16::read_data(reader, &())?; + let native_genesis = bool::read_data(reader, &())?; + let binaries = + IndexMap::::read_data(reader, &((), header.binaries))?; + Ok(Self { + id, + retention_policy, + persist, + version, + native_genesis, + binaries, + }) + } +} diff --git a/crates/common/src/binaries.rs b/crates/common/src/binaries.rs index 4813eb46..9543d3d3 100644 --- a/crates/common/src/binaries.rs +++ b/crates/common/src/binaries.rs @@ -15,7 +15,7 @@ use crate::{ }; /// A BinaryEntry is the location to a binary with an optional shasum -#[derive(Serialize, Deserialize, Debug, Clone)] 
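Editor's note on the StorageInfo::read_data impl above: it shows the versioning idiom these DataFormat types use throughout; bump version in LATEST_HEADER, keep accepting older payloads, and consume-then-discard fields that were removed (here, the v1 checkpoint list). A reduced sketch of the same idiom over std::io, with a hypothetical two-field record; the length-prefixed skip is an assumption standing in for the real codec's Vec<(u32, i64, String)> read:

    use std::io::{self, Read};

    // Reduced model of the version-gated decode used by StorageInfo:
    // v1 payloads carried an extra list that v2 removed; the reader still
    // consumes it so the remaining fields stay aligned.
    fn read_record(r: &mut impl Read, version: u8) -> io::Result<(bool, u16)> {
        if version == 1 {
            // consume and discard the legacy field (assumed length-prefixed
            // here; the real codec reads and drops a checkpoint list)
            let mut len = [0u8; 4];
            r.read_exact(&mut len)?;
            let mut skip = vec![0u8; u32::from_le_bytes(len) as usize];
            r.read_exact(&mut skip)?;
        }
        let mut persist = [0u8; 1];
        r.read_exact(&mut persist)?;
        let mut ver = [0u8; 2];
        r.read_exact(&mut ver)?;
        Ok((persist[0] != 0, u16::from_le_bytes(ver)))
    }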
+#[derive(Serialize, Deserialize, Debug, Clone, Eq, PartialEq)] pub struct BinaryEntry { pub source: BinarySource, #[serde(default)] @@ -43,6 +43,11 @@ impl BinaryEntry { } } + /// Determines if the file is fetched from the control plane + pub fn is_api_file(&self) -> bool { + matches!(self.source, BinarySource::Path(_)) + } + /// Check if the sha256 is a valid sha256 hash pub fn check_sha256(&self) -> bool { self.sha256 @@ -92,7 +97,7 @@ impl Display for BinaryEntry { } } -#[derive(Debug, Clone)] +#[derive(Debug, Clone, Eq, PartialEq)] pub enum BinarySource { Url(url::Url), Path(PathBuf), @@ -132,7 +137,7 @@ impl Serialize for BinarySource { } impl<'de> Deserialize<'de> for BinarySource { - fn deserialize(deserializer: D) -> Result + fn deserialize(deserializer: D) -> Result where D: Deserializer<'de>, { diff --git a/crates/common/src/constant.rs b/crates/common/src/constant.rs index deffaa84..4f50f0f6 100644 --- a/crates/common/src/constant.rs +++ b/crates/common/src/constant.rs @@ -13,7 +13,7 @@ pub const SNARKOS_GENESIS_FILE: &str = "genesis.block"; pub const LEDGER_BASE_DIR: &str = "ledger"; /// The directory name for persisted ledgers within the storage dir. pub const LEDGER_PERSIST_DIR: &str = "persist"; -/// Temporary storage archive file name. -pub const LEDGER_STORAGE_FILE: &str = "ledger.tar.gz"; /// File containing a version counter for a ledger pub const VERSION_FILE: &str = "version"; +/// Directory name for the node's data. +pub const NODE_DATA_DIR: &str = "node"; diff --git a/crates/common/src/db/error.rs b/crates/common/src/db/error.rs index 7760ba39..cce3ffbc 100644 --- a/crates/common/src/db/error.rs +++ b/crates/common/src/db/error.rs @@ -8,10 +8,6 @@ pub enum DatabaseError { DeleteError(String, String, sled::Error), #[error("save error key {0} in {1}: {2}")] SaveError(String, String, sled::Error), - #[error("deserialize value {0} in {1}: {2}")] - DeserializeError(String, String, bincode::Error), - #[error("serialize value {0} in {1}: {2}")] - SerializeError(String, String, bincode::Error), #[error("missing key {0} in {1}")] MissingKey(String, String), #[error("unknown document version {2} for: {1} in {0}")] diff --git a/crates/common/src/db/tree.rs b/crates/common/src/db/tree.rs index 00df8184..113742b2 100644 --- a/crates/common/src/db/tree.rs +++ b/crates/common/src/db/tree.rs @@ -143,3 +143,48 @@ impl DbTree { .sum()) } } + +pub struct DbRecords { + tree: sled::Tree, + _phantom: std::marker::PhantomData, +} + +impl DbRecords { + pub fn new(tree: sled::Tree) -> Self { + Self { + tree, + _phantom: std::marker::PhantomData, + } + } + + pub fn restore(&self, key: &K) -> Result, DatabaseError> { + Ok(self + .tree + .get(key.to_byte_vec()?)? + .map(|value_bytes| read_dataformat(&mut value_bytes.reader())) + .transpose()?) 
+ } + + pub fn save(&self, key: &K, value: &V) -> Result<(), DatabaseError> { + let key_bytes = key.to_byte_vec()?; + let mut value_bytes = Vec::new(); + write_dataformat(&mut value_bytes, value)?; + self.tree.insert(key_bytes, value_bytes)?; + Ok(()) + } + + pub fn save_option( + &self, + key: &K, + value: Option<&V>, + ) -> Result<(), DatabaseError> { + match value { + Some(value) => self.save(key, value), + None => self.delete(key).map(|_| ()), + } + } + + pub fn delete(&self, key: &K) -> Result { + Ok(self.tree.remove(key.to_byte_vec()?)?.is_some()) + } +} diff --git a/crates/common/src/events/filter.rs b/crates/common/src/events/filter.rs new file mode 100644 index 00000000..ba506cad --- /dev/null +++ b/crates/common/src/events/filter.rs @@ -0,0 +1,109 @@ +use std::{fmt::Display, sync::Arc}; + +use super::{Event, EventKindFilter}; +use crate::{ + node_targets::NodeTargets, + state::{AgentId, EnvId, InternedId, NodeKey}, +}; + +#[derive(Clone, Debug, PartialEq)] + +pub enum EventFilter { + /// No filter + Unfiltered, + + /// Logical AND of filters + AllOf(Vec), + /// Logical OR of filters + AnyOf(Vec), + /// Logical XOR of filters + OneOf(Vec), + /// Logical NOT of filter + Not(Box), + + /// Filter by agent ID + AgentIs(AgentId), + /// Filter by events that have any agent + HasAgent, + /// Filter by environment ID + EnvIs(EnvId), + /// Filter by events that have any environment + HasEnv, + /// Filter by transaction ID + TransactionIs(Arc), + /// Filter by events that have any transaction + HasTransaction, + /// Filter by cannon ID + CannonIs(InternedId), + /// Filter by events that have any cannon + HasCannon, + /// Filter by event kind + EventIs(EventKindFilter), + /// Filter by node key + HasNodeKey, + /// Filter by node key + NodeKeyIs(NodeKey), + /// Filter by node target + NodeTargetIs(NodeTargets), +} + +impl Event { + pub fn matches(&self, filter: &EventFilter) -> bool { + match filter { + EventFilter::Unfiltered => true, + EventFilter::AllOf(filters) => filters.iter().all(|f| self.matches(f)), + EventFilter::AnyOf(filters) => filters.iter().any(|f| self.matches(f)), + EventFilter::OneOf(filters) => filters.iter().filter(|f| self.matches(f)).count() == 1, + EventFilter::Not(f) => !self.matches(f), + EventFilter::AgentIs(agent) => self.agent == Some(*agent), + EventFilter::HasAgent => self.agent.is_some(), + EventFilter::EnvIs(env) => self.env == Some(*env), + EventFilter::HasEnv => self.env.is_some(), + EventFilter::TransactionIs(transaction) => { + self.transaction.as_ref() == Some(transaction) + } + EventFilter::HasTransaction => self.transaction.is_some(), + EventFilter::CannonIs(cannon) => self.cannon == Some(*cannon), + EventFilter::HasCannon => self.cannon.is_some(), + EventFilter::EventIs(kind) => self.content.filter() == *kind, + EventFilter::NodeKeyIs(node_key) => self.node_key.as_ref() == Some(node_key), + EventFilter::HasNodeKey => self.node_key.is_some(), + EventFilter::NodeTargetIs(node_targets) => self + .node_key + .as_ref() + .is_some_and(|key| node_targets.matches(key)), + } + } +} + +fn event_filter_vec(filters: &[EventFilter]) -> String { + filters + .iter() + .map(|f| f.to_string()) + .collect::>() + .join(", ") +} + +impl Display for EventFilter { + fn fmt(&self, f: &mut std::fmt::Formatter<'_>) -> std::fmt::Result { + match self { + EventFilter::Unfiltered => write!(f, "unfiltered"), + EventFilter::AllOf(vec) => write!(f, "all-of({})", event_filter_vec(vec)), + EventFilter::AnyOf(vec) => write!(f, "any-of({})", event_filter_vec(vec)), + 
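`DbRecords` above is a typed view over a sled tree: keys go through `to_byte_vec`, values through `write_dataformat`/`read_dataformat`, so each stored value carries its own format header and survives version bumps. A usage sketch, assuming `DbRecords<K, V>` with `K, V: DataFormat` bounds (which is what the method bodies imply), that `u32` and `String` implement `DataFormat`, and a `snops_common` crate path guessed from the file layout:

```rust
use snops_common::db::tree::DbRecords; // assumed module path

fn demo(db: &sled::Db) -> Result<(), Box<dyn std::error::Error>> {
    // assumed: DbRecords<K, V> where K, V: DataFormat
    let records: DbRecords<u32, String> = DbRecords::new(db.open_tree("demo")?);

    records.save(&1, &"hello".to_string())?;
    assert_eq!(records.restore(&1)?, Some("hello".to_string()));

    // save_option(.., None) doubles as a delete
    records.save_option(&1, None)?;
    assert!(!records.delete(&1)?); // key was already removed

    Ok(())
}
```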
EventFilter::OneOf(vec) => write!(f, "one-of({})", event_filter_vec(vec)), + EventFilter::Not(event_filter) => write!(f, "not({})", event_filter), + EventFilter::AgentIs(id) => write!(f, "agent-is({id})"), + EventFilter::HasAgent => write!(f, "has-agent"), + EventFilter::EnvIs(id) => write!(f, "env-is({id})"), + EventFilter::HasEnv => write!(f, "has-env"), + EventFilter::TransactionIs(str) => write!(f, "transaction-is({str})"), + EventFilter::HasTransaction => write!(f, "has-transaction"), + EventFilter::CannonIs(id) => write!(f, "cannon-is({id})"), + EventFilter::HasCannon => write!(f, "has-cannon"), + EventFilter::EventIs(event) => write!(f, "event-is({event})"), + EventFilter::NodeKeyIs(node_key) => write!(f, "node-key-is({node_key})"), + EventFilter::HasNodeKey => write!(f, "has-node-key"), + EventFilter::NodeTargetIs(node_targets) => write!(f, "node-target-is({node_targets})"), + } + } +} diff --git a/crates/common/src/events/filter_ops.rs b/crates/common/src/events/filter_ops.rs new file mode 100644 index 00000000..572c954d --- /dev/null +++ b/crates/common/src/events/filter_ops.rs @@ -0,0 +1,161 @@ +use super::{EventFilter, EventKindFilter}; + +impl std::ops::BitAnd for EventFilter { + type Output = Self; + + fn bitand(self, rhs: Self) -> Self::Output { + match (self, rhs) { + (EventFilter::Unfiltered, rhs) => rhs, + (lhs, EventFilter::Unfiltered) => lhs, + (EventFilter::AllOf(mut filters), EventFilter::AllOf(rhs_filters)) => { + filters.extend(rhs_filters); + EventFilter::AllOf(filters) + } + (EventFilter::AllOf(mut filters), rhs) => { + filters.push(rhs); + EventFilter::AllOf(filters) + } + (lhs, EventFilter::AllOf(mut rhs_filters)) => { + rhs_filters.push(lhs); + EventFilter::AllOf(rhs_filters) + } + (lhs, rhs) => EventFilter::AllOf(vec![lhs, rhs]), + } + } +} + +impl std::ops::BitOr for EventFilter { + type Output = Self; + + fn bitor(self, rhs: Self) -> Self::Output { + match (self, rhs) { + (EventFilter::Unfiltered, _) => EventFilter::Unfiltered, + (_, EventFilter::Unfiltered) => EventFilter::Unfiltered, + (EventFilter::AnyOf(mut filters), EventFilter::AnyOf(rhs_filters)) => { + filters.extend(rhs_filters); + EventFilter::AnyOf(filters) + } + (EventFilter::AnyOf(mut filters), rhs) => { + filters.push(rhs); + EventFilter::AnyOf(filters) + } + (lhs, EventFilter::AnyOf(mut rhs_filters)) => { + rhs_filters.push(lhs); + EventFilter::AnyOf(rhs_filters) + } + (lhs, rhs) => EventFilter::AnyOf(vec![lhs, rhs]), + } + } +} + +impl std::ops::BitXor for EventFilter { + type Output = Self; + + fn bitxor(self, rhs: Self) -> Self::Output { + match (self, rhs) { + (EventFilter::Unfiltered, rhs) => rhs, + (lhs, EventFilter::Unfiltered) => lhs, + (EventFilter::OneOf(mut filters), EventFilter::OneOf(rhs_filters)) => { + filters.extend(rhs_filters); + EventFilter::OneOf(filters) + } + (EventFilter::OneOf(mut filters), rhs) => { + filters.push(rhs); + EventFilter::OneOf(filters) + } + (lhs, EventFilter::OneOf(mut rhs_filters)) => { + rhs_filters.push(lhs); + EventFilter::OneOf(rhs_filters) + } + (lhs, rhs) => EventFilter::OneOf(vec![lhs, rhs]), + } + } +} + +impl std::ops::Not for EventFilter { + type Output = Self; + + fn not(self) -> Self::Output { + EventFilter::Not(Box::new(self)) + } +} + +impl std::ops::Not for EventKindFilter { + type Output = EventFilter; + + fn not(self) -> Self::Output { + !EventFilter::EventIs(self) + } +} + +impl std::ops::BitOr for EventKindFilter { + type Output = EventFilter; + + fn bitor(self, rhs: EventFilter) -> Self::Output { + EventFilter::EventIs(self) | rhs + 
} +} + +impl std::ops::BitAnd for EventKindFilter { + type Output = EventFilter; + + fn bitand(self, rhs: EventFilter) -> Self::Output { + EventFilter::EventIs(self) & rhs + } +} + +impl std::ops::BitXor for EventKindFilter { + type Output = EventFilter; + + fn bitxor(self, rhs: EventFilter) -> Self::Output { + EventFilter::EventIs(self) ^ rhs + } +} + +impl std::ops::BitOr for EventFilter { + type Output = EventFilter; + + fn bitor(self, rhs: EventKindFilter) -> Self::Output { + self | EventFilter::EventIs(rhs) + } +} + +impl std::ops::BitAnd for EventFilter { + type Output = EventFilter; + + fn bitand(self, rhs: EventKindFilter) -> Self::Output { + self & EventFilter::EventIs(rhs) + } +} + +impl std::ops::BitXor for EventFilter { + type Output = EventFilter; + + fn bitxor(self, rhs: EventKindFilter) -> Self::Output { + self ^ EventFilter::EventIs(rhs) + } +} + +impl std::ops::BitOr for EventKindFilter { + type Output = EventFilter; + + fn bitor(self, rhs: Self) -> Self::Output { + EventFilter::EventIs(self) | EventFilter::EventIs(rhs) + } +} + +impl std::ops::BitAnd for EventKindFilter { + type Output = EventFilter; + + fn bitand(self, rhs: Self) -> Self::Output { + EventFilter::EventIs(self) & EventFilter::EventIs(rhs) + } +} + +impl std::ops::BitXor for EventKindFilter { + type Output = EventFilter; + + fn bitxor(self, rhs: Self) -> Self::Output { + EventFilter::EventIs(self) ^ EventFilter::EventIs(rhs) + } +} diff --git a/crates/common/src/events/filter_parse.rs b/crates/common/src/events/filter_parse.rs new file mode 100644 index 00000000..252926e3 --- /dev/null +++ b/crates/common/src/events/filter_parse.rs @@ -0,0 +1,332 @@ +use std::{fmt::Display, str::FromStr, sync::Arc}; + +use serde::{Deserialize, Serialize, Serializer}; + +use super::EventFilter; +use crate::events::EventKindFilter; +use crate::node_targets::{NodeTarget, NodeTargets}; + +/* Example EventFilter string representation: + + unfiltered + any-of(agent-connected, agent-disconnected) + all-of(not(agent-is(foo-bar)), env-is(default)) + node-key-is(client/foo) + node-target-is(client/test-*@*) + node-target-is(client/any) + not(unfiltered) +*/ + +#[derive(Debug, Copy, Clone)] +enum Token<'a> { + OpenParen, + CloseParen, + Comma, + Whitespace, + Text(&'a str), +} + +impl<'a> Token<'a> { + fn label(self) -> &'static str { + match self { + Token::OpenParen => "open paren", + Token::CloseParen => "close paren", + Token::Comma => "comma", + Token::Whitespace => "whitespace", + Token::Text(_) => "text", + } + } + + fn text(self) -> Option<&'a str> { + match self { + Token::Text(s) => Some(s), + _ => None, + } + } + + fn parsed_text(self) -> Option> { + self.text().map(|s| s.trim().parse()) + } + + fn open_paren(self) -> Option<()> { + matches!(self, Token::OpenParen).then(|| ()) + } + + fn close_paren(self) -> Option<()> { + matches!(self, Token::CloseParen).then(|| ()) + } +} + +struct Lexer<'a> { + string: &'a str, + chars: std::iter::Peekable>>, +} + +impl<'a> Lexer<'a> { + fn new(string: &'a str) -> Lexer<'a> { + Lexer { + string, + chars: string.chars().enumerate().peekable(), + } + } +} + +impl<'a> Iterator for Lexer<'a> { + type Item = Token<'a>; + + fn next(&mut self) -> Option { + let (index, c) = self.chars.next()?; + Some(match c { + '(' => Token::OpenParen, + ')' => Token::CloseParen, + ',' => Token::Comma, + c if c.is_whitespace() => { + while let Some((_, c)) = self.chars.peek() { + if !c.is_whitespace() { + break; + } + self.chars.next(); + } + // In the future, we might want to return the whitespace + + // let 
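A detail worth calling out in the operator impls above: `&`, `|`, and `^` do not blindly nest. An existing `AllOf`/`AnyOf`/`OneOf` on either side is flattened into one vector, and `Unfiltered` short-circuits (identity for `&` and `^`, absorbing for `|`). The same shape on a toy two-variant enum, as a standalone sketch:

```rust
// Toy model of the flattening strategy used by the EventFilter operators.
#[derive(Debug, PartialEq)]
enum Expr {
    True,           // matches everything, like EventFilter::Unfiltered
    All(Vec<Expr>), // logical AND, like EventFilter::AllOf
    Leaf(u32),      // stand-in for a concrete filter
}

impl std::ops::BitAnd for Expr {
    type Output = Expr;

    fn bitand(self, rhs: Expr) -> Expr {
        match (self, rhs) {
            // `True` is the identity element: True & x == x
            (Expr::True, x) | (x, Expr::True) => x,
            // merge two All lists instead of nesting them
            (Expr::All(mut a), Expr::All(b)) => {
                a.extend(b);
                Expr::All(a)
            }
            // fold a plain expression into an existing All list
            (Expr::All(mut a), x) | (x, Expr::All(mut a)) => {
                a.push(x);
                Expr::All(a)
            }
            (a, b) => Expr::All(vec![a, b]),
        }
    }
}

fn main() {
    // chained `&` yields one flat All, not All(All(..)) nesting
    let e = Expr::Leaf(1) & Expr::Leaf(2) & Expr::Leaf(3);
    assert_eq!(e, Expr::All(vec![Expr::Leaf(1), Expr::Leaf(2), Expr::Leaf(3)]));
    assert_eq!(Expr::True & Expr::Leaf(4), Expr::Leaf(4));
}
```

Flattening is what makes `AgentIs(a) & AgentIs(b) & AgentIs(c)` render as a single `all-of(...)`, which is exactly what `test_filter_ops` asserts further down.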
end = self + // .chars + // .peek() + // .map_or_else(|| self.string.len(), |(i, _)| *i); + // Token::Whitespace(&self.string[index..end]) + + Token::Whitespace + } + _ => { + while let Some((_, c)) = self.chars.peek() { + if c == &'(' || c == &')' || c == &',' { + break; + } + self.chars.next(); + } + let end = self + .chars + .peek() + .map_or_else(|| self.string.len(), |(i, _)| *i); + Token::Text(&self.string[index..end]) + } + }) + } +} + +#[derive(Debug, thiserror::Error)] +pub enum EventFilterParseError { + #[error("invalid filter: {0}")] + InvalidFilter(String), + #[error("expected token {0:?}, received {1}")] + ExpectedToken(EventFilterParsable, String), + #[error("error parsing {0:?}: {1}")] + ParseError(EventFilterParsable, String), + #[error("unexpected trailing tokens")] + TrailingTokens, +} + +#[derive(Debug, Clone, Copy, Eq, PartialEq)] +pub enum EventFilterParsable { + OpenParen, + CloseParen, + CommaOrCloseParen, + FilterName, + AgentId, + EnvId, + TransactionId, + CannonId, + EventKind, + NodeKey, + NodeTarget, +} + +struct FilterParser<'a> { + tokens: std::iter::Peekable>, +} + +fn expect_token<'a, T>( + token: Option>, + label: EventFilterParsable, + matcher: impl Fn(Token<'a>) -> Option, +) -> Result { + use EventFilterParseError::*; + let token = token.ok_or_else(|| ExpectedToken(label, "EOF".to_string()))?; + matcher(token).ok_or_else(|| ExpectedToken(label, token.label().to_string())) +} + +fn expect_parsed( + token: Option, + label: EventFilterParsable, +) -> Result +where + ::Err: Display, +{ + expect_token(token, label, |token| token.parsed_text::())? + .map_err(|e| EventFilterParseError::ParseError(label, e.to_string())) +} + +fn expect_open_paren(token: Option) -> Result<(), EventFilterParseError> { + expect_token(token, EventFilterParsable::OpenParen, |token| { + token.open_paren() + }) +} + +fn expect_close_paren(token: Option) -> Result<(), EventFilterParseError> { + expect_token(token, EventFilterParsable::CloseParen, |token| { + token.close_paren() + }) +} + +impl<'a> FilterParser<'a> { + fn new(str: &'a str) -> Self { + Self { + tokens: Lexer::new(str).peekable(), + } + } + + fn next(&mut self) -> Option> { + self.tokens.next() + } + + fn parens( + &mut self, + filter: impl Fn(&mut Self) -> Result, + ) -> Result { + self.trim_whitespace(); + expect_open_paren(self.next())?; + self.trim_whitespace(); + let filter = filter(self)?; + expect_close_paren(self.next())?; + Ok(filter) + } + + fn expect_filter(&mut self) -> Result { + use EventFilter::*; + use EventFilterParsable as P; + use EventFilterParseError::InvalidFilter; + + self.trim_whitespace(); + + let filter_name = expect_token(self.next(), P::FilterName, |token| token.text())?; + + match filter_name.trim() { + "unfiltered" => Ok(Unfiltered), + "any-of" => self.parens(|t| t.vec_of(|s| s.expect_filter()).map(AnyOf)), + "all-of" => self.parens(|t| t.vec_of(|s| s.expect_filter()).map(AllOf)), + "one-of" => self.parens(|t| t.vec_of(|s| s.expect_filter()).map(OneOf)), + "not" => self.parens(|t| Ok(Not(Box::new(t.expect_filter()?)))), + + "agent-is" => self.parens(|t| expect_parsed(t.next(), P::AgentId).map(AgentIs)), + "has-agent" => Ok(HasAgent), + "env-is" => self.parens(|t| expect_parsed(t.next(), P::EnvId).map(EnvIs)), + "has-env" => Ok(HasEnv), + "transaction-is" => self.parens(|t| { + expect_token(t.next(), P::TransactionId, |token| token.text()) + .map(|t| TransactionIs(Arc::new(t.to_string()))) + }), + "has-transaction" => Ok(HasTransaction), + "cannon-is" => self.parens(|t| expect_parsed(t.next(), 
P::CannonId).map(CannonIs)), + "has-cannon" => Ok(HasCannon), + "event-is" => self.parens(|t| expect_parsed(t.next(), P::EventKind).map(EventIs)), + "node-key-is" => self.parens(|t| expect_parsed(t.next(), P::NodeKey).map(NodeKeyIs)), + "has-node-key" => Ok(HasNodeKey), + "node-target-is" => self.parens(|t| { + t.vec_of(|t| expect_parsed::(t.next(), P::NodeTarget)) + .map(|v| NodeTargetIs(NodeTargets::from(v))) + }), + + // Try to parse as an event kind filter as a fallback + unknown => unknown + .parse::() + .map(EventIs) + .map_err(|_| InvalidFilter(unknown.to_string())), + } + } + + fn vec_of( + &mut self, + matcher: impl Fn(&mut Self) -> Result, + ) -> Result, EventFilterParseError> { + use EventFilterParsable::*; + use EventFilterParseError::ExpectedToken; + + self.trim_whitespace(); + let mut filters = Vec::new(); + loop { + match self.tokens.peek() { + Some(Token::CloseParen) => break, + None => return Err(ExpectedToken(CloseParen, "EOF".to_string())), + Some(_) => {} + } + + filters.push(matcher(self)?); + self.trim_whitespace(); + + // Expect either a comma or a close paren + match self.tokens.peek() { + // This also supports trailing commas + Some(Token::Comma) => { + self.tokens.next(); + self.trim_whitespace(); + } + Some(Token::CloseParen) => break, + Some(_) => { + return Err(ExpectedToken( + CommaOrCloseParen, + self.tokens.peek().unwrap().label().to_string(), + )) + } + None => return Err(ExpectedToken(CommaOrCloseParen, "EOF".to_string())), + } + } + Ok(filters) + } + + /// Remove leading whitespace tokens from the token stream. + fn trim_whitespace(&mut self) { + while let Some(Token::Whitespace) = self.tokens.peek() { + self.tokens.next(); + } + } + + fn trailing_tokens(&mut self) -> Result<(), EventFilterParseError> { + self.trim_whitespace(); + if self.tokens.next().is_some() { + Err(EventFilterParseError::TrailingTokens) + } else { + Ok(()) + } + } +} + +impl FromStr for EventFilter { + type Err = EventFilterParseError; + + fn from_str(s: &str) -> Result { + let mut parser = FilterParser::new(s); + let filter = parser.expect_filter()?; + parser.trailing_tokens()?; + Ok(filter) + } +} + +impl<'de> Deserialize<'de> for EventFilter { + fn deserialize(deserializer: D) -> Result + where + D: serde::Deserializer<'de>, + { + String::deserialize(deserializer)? 
+ .parse() + .map_err(serde::de::Error::custom) + } +} + +impl Serialize for EventFilter { + fn serialize(&self, serializer: S) -> Result + where + S: Serializer, + { + serializer.serialize_str(&self.to_string()) + } +} diff --git a/crates/common/src/events/mod.rs b/crates/common/src/events/mod.rs new file mode 100644 index 00000000..f0cfa860 --- /dev/null +++ b/crates/common/src/events/mod.rs @@ -0,0 +1,21 @@ +mod models; +pub use models::*; +mod filter_parse; +mod traits; +pub use traits::*; +mod filter; +pub use filter::*; +mod filter_ops; + +pub mod prelude { + pub use super::filter::EventFilter::*; + pub use super::models::EventKindFilter::*; + pub use super::models::*; +} + +#[cfg(test)] +mod test_filter; +#[cfg(test)] +mod test_filter_ops; +#[cfg(test)] +mod test_filter_parse; diff --git a/crates/common/src/events/models.rs b/crates/common/src/events/models.rs new file mode 100644 index 00000000..b030a6b5 --- /dev/null +++ b/crates/common/src/events/models.rs @@ -0,0 +1,243 @@ +use std::{fmt::Display, str::FromStr, sync::Arc}; + +use chrono::{DateTime, Utc}; +use serde::{Deserialize, Serialize}; + +use super::EventFilter; +use crate::{ + rpc::error::ReconcileError, + state::{ + AgentId, Authorization, EnvId, InternedId, LatestBlockInfo, NodeKey, NodeStatus, + ReconcileStatus, TransactionSendState, + }, +}; + +#[derive(Clone, Debug, Serialize, Deserialize)] +#[serde(tag = "action", rename_all = "snake_case")] +pub enum EventWsRequest { + Subscribe { id: u32, filter: EventFilter }, + Unsubscribe { id: u32 }, +} + +#[derive(Clone, Debug, Serialize, Deserialize)] +pub struct Event { + pub created_at: DateTime, + #[serde(default, skip_serializing_if = "Option::is_none")] + pub agent: Option, + #[serde(default, skip_serializing_if = "Option::is_none")] + pub node_key: Option, + #[serde(default, skip_serializing_if = "Option::is_none")] + pub env: Option, + #[serde(default, skip_serializing_if = "Option::is_none")] + pub transaction: Option>, + #[serde(default, skip_serializing_if = "Option::is_none")] + pub cannon: Option, + #[serde(flatten)] + pub content: EventKind, +} + +#[derive(Clone, Debug, Serialize, Deserialize)] +#[serde(tag = "event_kind", rename_all = "snake_case")] +pub enum EventKind { + Agent(AgentEvent), + Transaction(TransactionEvent), +} + +#[derive(Clone, Debug, Serialize, Deserialize)] +#[serde(tag = "event_name", content = "data", rename_all = "snake_case")] +pub enum AgentEvent { + /// An agent connects to the control plane + Connected { version: String }, + /// An agent completes a handshake with the control plane + HandshakeComplete, + /// An agent disconnects from the control plane + Disconnected, + /// An agent finishes a reconcile + ReconcileComplete, + /// An agent updates its reconcile status + Reconcile(ReconcileStatus), + /// An error occurs during reconcile + ReconcileError(ReconcileError), + /// An agent emits a node status + NodeStatus(NodeStatus), + /// An agent emits a block update + BlockInfo(LatestBlockInfo), +} + +#[derive(Clone, Debug, Serialize, Deserialize)] +#[serde(tag = "event_name", content = "data", rename_all = "snake_case")] +pub enum TransactionEvent { + /// The authorization was inserted into the cannon + AuthorizationReceived { authorization: Arc }, + /// The transaction execution was aborted + ExecuteAborted(TransactionAbortReason), + /// The transaction is awaiting compute resources + ExecuteAwaitingCompute, + /// An execution failed to complete after multiple attempts + ExecuteExceeded { attempts: u32 }, + /// The transaction execution 
failed + ExecuteFailed(String), + /// The transaction is currently executing + Executing, + /// The transaction execution is complete + ExecuteComplete { transaction: Arc }, + /// The transaction has been broadcasted + Broadcasted { + height: Option, + timestamp: DateTime, + }, + /// The transaction broadcast has exceeded the maximum number of attempts + BroadcastExceeded { attempts: u32 }, + /// The transaction has been confirmed by the network + Confirmed { hash: String }, +} + +#[derive(Clone, Debug, Serialize, Deserialize)] +#[serde(tag = "reason", rename_all = "snake_case")] +pub enum TransactionAbortReason { + MissingTracker, + UnexpectedStatus { + transaction_status: TransactionSendState, + }, + MissingAuthorization, +} + +#[derive(Clone, Copy, Debug, PartialEq)] +pub enum EventKindFilter { + AgentConnected, + AgentHandshakeComplete, + AgentDisconnected, + AgentReconcileComplete, + AgentReconcile, + AgentReconcileError, + AgentNodeStatus, + AgentBlockInfo, + TransactionAuthorizationReceived, + TransactionExecuteAborted, + TransactionExecuteAwaitingCompute, + TransactionExecuteExceeded, + TransactionExecuteFailed, + TransactionExecuting, + TransactionExecuteComplete, + TransactionBroadcasted, + TransactionBroadcastExceeded, + TransactionConfirmed, +} + +impl EventKind { + pub fn filter(&self) -> EventKindFilter { + use AgentEvent::*; + use EventKind::*; + use EventKindFilter::*; + use TransactionEvent::*; + + match self { + Agent(Connected { .. }) => AgentConnected, + Agent(HandshakeComplete) => AgentHandshakeComplete, + Agent(Disconnected) => AgentDisconnected, + Agent(ReconcileComplete) => AgentReconcileComplete, + Agent(Reconcile(_)) => AgentReconcile, + Agent(ReconcileError(_)) => AgentReconcileError, + Agent(NodeStatus(_)) => AgentNodeStatus, + Agent(BlockInfo(_)) => AgentBlockInfo, + Transaction(AuthorizationReceived { .. }) => TransactionAuthorizationReceived, + Transaction(ExecuteAborted(_)) => TransactionExecuteAborted, + Transaction(ExecuteAwaitingCompute) => TransactionExecuteAwaitingCompute, + Transaction(ExecuteExceeded { .. }) => TransactionExecuteExceeded, + Transaction(ExecuteFailed(_)) => TransactionExecuteFailed, + Transaction(Executing) => TransactionExecuting, + Transaction(ExecuteComplete { .. }) => TransactionExecuteComplete, + Transaction(Broadcasted { .. }) => TransactionBroadcasted, + Transaction(BroadcastExceeded { .. }) => TransactionBroadcastExceeded, + Transaction(Confirmed { .. 
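Given the serde attributes above (`created_at` plus a flattened `content`, an internal `event_kind` tag on `EventKind`, and adjacent `event_name`/`data` tagging on the inner enums), an agent `connected` event should serialize roughly as sketched below. The import path is assumed, and the printed timestamp will be the real `Utc::now()` rather than the illustrative value in the comment:

```rust
use snops_common::events::{AgentEvent, Event, EventKind}; // assumed path

fn main() -> serde_json::Result<()> {
    let event = Event::new(EventKind::Agent(AgentEvent::Connected {
        version: "0.1.0".into(),
    }));
    println!("{}", serde_json::to_string_pretty(&event)?);
    // Expected shape (the None metadata fields are skipped):
    // {
    //   "created_at": "2024-01-01T00:00:00Z",
    //   "event_kind": "agent",
    //   "event_name": "connected",
    //   "data": { "version": "0.1.0" }
    // }
    Ok(())
}
```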
}) => TransactionConfirmed, + } + } +} + +impl FromStr for EventKindFilter { + type Err = String; + + fn from_str(s: &str) -> Result { + match s { + // kebab-case + "agent-connected" => Ok(Self::AgentConnected), + "agent-handshake-complete" => Ok(Self::AgentHandshakeComplete), + "agent-disconnected" => Ok(Self::AgentDisconnected), + "agent-reconcile-complete" => Ok(Self::AgentReconcileComplete), + "agent-reconcile" => Ok(Self::AgentReconcile), + "agent-reconcile-error" => Ok(Self::AgentReconcileError), + "agent-node-status" => Ok(Self::AgentNodeStatus), + "agent-block-info" => Ok(Self::AgentBlockInfo), + "transaction-authorization-received" => Ok(Self::TransactionAuthorizationReceived), + "transaction-execute-aborted" => Ok(Self::TransactionExecuteAborted), + "transaction-execute-awaiting-compute" => Ok(Self::TransactionExecuteAwaitingCompute), + "transaction-execute-exceeded" => Ok(Self::TransactionExecuteExceeded), + "transaction-execute-failed" => Ok(Self::TransactionExecuteFailed), + "transaction-executing" => Ok(Self::TransactionExecuting), + "transaction-execute-complete" => Ok(Self::TransactionExecuteComplete), + "transaction-broadcasted" => Ok(Self::TransactionBroadcasted), + "transaction-broadcast-exceeded" => Ok(Self::TransactionBroadcastExceeded), + "transaction-confirmed" => Ok(Self::TransactionConfirmed), + _ => Err(format!("invalid event kind: {s}")), + } + } +} + +impl Display for EventKindFilter { + fn fmt(&self, f: &mut std::fmt::Formatter<'_>) -> std::fmt::Result { + use EventKindFilter::*; + + let s = match self { + AgentConnected => "agent-connected", + AgentHandshakeComplete => "agent-handshake-complete", + AgentDisconnected => "agent-disconnected", + AgentReconcileComplete => "agent-reconcile-complete", + AgentReconcile => "agent-reconcile", + AgentReconcileError => "agent-reconcile-error", + AgentNodeStatus => "agent-node-status", + AgentBlockInfo => "agent-block-info", + TransactionAuthorizationReceived => "transaction-authorization-received", + TransactionExecuteAborted => "transaction-execute-aborted", + TransactionExecuteAwaitingCompute => "transaction-execute-awaiting-compute", + TransactionExecuteExceeded => "transaction-execute-exceeded", + TransactionExecuteFailed => "transaction-execute-failed", + TransactionExecuting => "transaction-executing", + TransactionExecuteComplete => "transaction-execute-complete", + TransactionBroadcasted => "transaction-broadcasted", + TransactionBroadcastExceeded => "transaction-broadcast-exceeded", + TransactionConfirmed => "transaction-confirmed", + }; + + write!(f, "{}", s) + } +} + +impl Event { + pub fn new(content: EventKind) -> Self { + Self { + created_at: Utc::now(), + agent: None, + node_key: None, + env: None, + transaction: None, + cannon: None, + content, + } + } + + pub fn kind(&self) -> EventKindFilter { + self.content.filter() + } + + pub fn replace_content(&self, content: impl Into) -> Self { + Self { + created_at: Utc::now(), + agent: self.agent, + node_key: self.node_key.clone(), + env: self.env, + transaction: self.transaction.clone(), + cannon: self.cannon, + content: content.into().content, + } + } +} diff --git a/crates/common/src/events/test_filter.rs b/crates/common/src/events/test_filter.rs new file mode 100644 index 00000000..9ec2e81c --- /dev/null +++ b/crates/common/src/events/test_filter.rs @@ -0,0 +1,134 @@ +use std::str::FromStr; + +use chrono::Utc; +use lazy_static::lazy_static; + +use super::{AgentEvent::*, EventFilter::*, EventKind::*, EventKindFilter::*}; +use crate::events::{Event, 
EventHelpers}; +use crate::{ + node_targets::NodeTargets, + rpc::error::ReconcileError, + state::{InternedId, LatestBlockInfo, NodeKey, NodeStatus, ReconcileStatus}, +}; + +lazy_static! { + static ref A: InternedId = InternedId::from_str("a").unwrap(); + static ref B: InternedId = InternedId::from_str("b").unwrap(); + static ref C: InternedId = InternedId::from_str("c").unwrap(); + static ref D: InternedId = InternedId::from_str("d").unwrap(); +} + +#[test] +fn test_unfiltered() { + assert!(Connected { + version: "0.0.0".to_string() + } + .event() + .matches(&Unfiltered)); + assert!(HandshakeComplete.event().matches(&Unfiltered)); + assert!(Disconnected.event().matches(&Unfiltered)); + assert!(ReconcileComplete.event().matches(&Unfiltered)); + assert!(Reconcile(ReconcileStatus::empty()) + .event() + .matches(&Unfiltered)); + assert!(ReconcileError(ReconcileError::Offline) + .event() + .matches(&Unfiltered)); + assert!(NodeStatus(NodeStatus::Unknown).event().matches(&Unfiltered)); + assert!(BlockInfo(LatestBlockInfo::default()) + .event() + .matches(&Unfiltered)); +} + +#[test] +fn test_all_of() { + assert!(Connected { + version: "0.0.0".to_string() + } + .event() + .matches(&AllOf(vec![EventIs(AgentConnected)]))); + + let e = Event { + created_at: Utc::now(), + agent: Some(*A), + node_key: Some(NodeKey::from_str("client/foo").unwrap()), + env: Some(*B), + transaction: None, + cannon: None, + content: Agent(Connected { + version: "0.0.0".to_string(), + }), + }; + + assert!(e.matches(&(AgentConnected & AgentIs(*A)))); + assert!(e.matches(&(AgentConnected & NodeKeyIs(NodeKey::from_str("client/foo").unwrap())))); + assert!(e.matches(&(AgentConnected & EnvIs(*B)))); + assert!(e.matches(&(AgentIs(*A) & NodeTargetIs(NodeTargets::ALL) & EnvIs(*B)))); + + assert!(!e.matches(&(AgentConnected & AgentIs(*B)))); + assert!(!e.matches(&(AgentConnected & NodeKeyIs(NodeKey::from_str("client/bar").unwrap())))); + assert!(!e.matches(&(AgentConnected & EnvIs(*A)))); + assert!(!e.matches(&(AgentIs(*B) & NodeTargetIs(NodeTargets::ALL) & EnvIs(*B)))); +} + +#[test] +fn test_any_of() { + assert!(Connected { + version: "0.0.0".to_string() + } + .event() + .matches(&AnyOf(vec![EventIs(AgentConnected)]))); + + let e = Event { + created_at: Utc::now(), + agent: Some(*A), + node_key: Some(NodeKey::from_str("client/foo").unwrap()), + env: Some(*B), + transaction: None, + cannon: None, + content: Agent(Connected { + version: "0.0.0".to_string(), + }), + }; + + assert!(e.matches(&(AgentConnected | AgentIs(*A)))); + assert!(e.matches(&(AgentConnected | NodeKeyIs(NodeKey::from_str("client/foo").unwrap())))); + assert!(e.matches(&(AgentConnected | EnvIs(*B)))); + assert!(e.matches(&(AgentIs(*A) | NodeTargetIs(NodeTargets::ALL) | EnvIs(*B)))); + + assert!(e.matches(&(AgentConnected | AgentIs(*B)))); + assert!(e.matches(&(AgentConnected | NodeKeyIs(NodeKey::from_str("client/bar").unwrap())))); + assert!(e.matches(&(AgentConnected | EnvIs(*A)))); + + assert!(e.matches(&(AgentIs(*B) | NodeTargetIs(NodeTargets::ALL) | EnvIs(*B)))); + + assert!(!e.matches(&(AgentDisconnected | AgentIs(*C)))); + assert!(!e.matches(&(AgentDisconnected | NodeKeyIs(NodeKey::from_str("client/bar").unwrap())))); +} + +#[test] +fn test_one_of() { + assert!(Connected { + version: "0.0.0".to_string() + } + .event() + .matches(&OneOf(vec![EventIs(AgentConnected)]))); + + let e = Event { + created_at: Utc::now(), + agent: Some(*A), + node_key: Some(NodeKey::from_str("client/foo").unwrap()), + env: Some(*B), + transaction: None, + cannon: None, + content: 
Agent(Connected { + version: "0.0.0".to_string(), + }), + }; + + assert!(e.matches(&(AgentConnected ^ AgentIs(*B)))); + assert!(e.matches(&(AgentConnected & (AgentIs(*A) ^ AgentIs(*B) ^ AgentIs(*C))))); + + assert!(!e.matches(&(AgentConnected ^ AgentIs(*A)))); + assert!(e.matches(&(!(AgentConnected ^ AgentIs(*A))))); +} diff --git a/crates/common/src/events/test_filter_ops.rs b/crates/common/src/events/test_filter_ops.rs new file mode 100644 index 00000000..42ab1a25 --- /dev/null +++ b/crates/common/src/events/test_filter_ops.rs @@ -0,0 +1,79 @@ +use std::str::FromStr; + +use lazy_static::lazy_static; + +use super::EventFilter::*; +use super::EventKindFilter::*; +use crate::state::InternedId; + +lazy_static! { + static ref A: InternedId = InternedId::from_str("a").unwrap(); + static ref B: InternedId = InternedId::from_str("b").unwrap(); + static ref C: InternedId = InternedId::from_str("c").unwrap(); + static ref D: InternedId = InternedId::from_str("d").unwrap(); +} + +#[test] +fn test_filter_bitand() { + assert_eq!(Unfiltered & Unfiltered, Unfiltered); + assert_eq!(AgentBlockInfo & Unfiltered, EventIs(AgentBlockInfo)); + assert_eq!( + AgentBlockInfo & AgentIs(*A), + AllOf(vec![EventIs(AgentBlockInfo), AgentIs(*A)]) + ); + assert_eq!( + AgentIs(*A) & AgentIs(*B), + AllOf(vec![AgentIs(*A), AgentIs(*B)]) + ); + assert_eq!( + AgentIs(*A) & AgentIs(*B) & AgentIs(*C), + AllOf(vec![AgentIs(*A), AgentIs(*B), AgentIs(*C)]) + ); +} + +#[test] +fn test_filter_bitor() { + assert_eq!(Unfiltered | Unfiltered, Unfiltered); + assert_eq!(AgentBlockInfo | Unfiltered, Unfiltered); + assert_eq!( + AgentBlockInfo | AgentIs(*A), + AnyOf(vec![EventIs(AgentBlockInfo), AgentIs(*A)]) + ); + assert_eq!( + AgentIs(*A) | AgentIs(*B), + AnyOf(vec![AgentIs(*A), AgentIs(*B)]) + ); + assert_eq!( + AgentIs(*A) | AgentIs(*B) | AgentIs(*C), + AnyOf(vec![AgentIs(*A), AgentIs(*B), AgentIs(*C)]) + ); +} + +#[test] +fn test_filter_bitxor() { + assert_eq!(Unfiltered ^ Unfiltered, Unfiltered); + assert_eq!(AgentBlockInfo ^ Unfiltered, EventIs(AgentBlockInfo)); + assert_eq!( + AgentBlockInfo ^ AgentIs(*A), + OneOf(vec![EventIs(AgentBlockInfo), AgentIs(*A)]) + ); + assert_eq!( + AgentIs(*A) ^ AgentIs(*B), + OneOf(vec![AgentIs(*A), AgentIs(*B)]) + ); + assert_eq!( + AgentIs(*A) ^ AgentIs(*B) ^ AgentIs(*C), + OneOf(vec![AgentIs(*A), AgentIs(*B), AgentIs(*C)]) + ); +} + +#[test] +fn test_filter_not() { + assert_eq!(!Unfiltered, Not(Box::new(Unfiltered))); + assert_eq!(!AgentBlockInfo, Not(Box::new(EventIs(AgentBlockInfo)))); + assert_eq!(!AgentIs(*A), Not(Box::new(AgentIs(*A)))); + assert_eq!( + !AgentIs(*A) & AgentIs(*B), + AllOf(vec![Not(Box::new(AgentIs(*A))), AgentIs(*B)]) + ); +} diff --git a/crates/common/src/events/test_filter_parse.rs b/crates/common/src/events/test_filter_parse.rs new file mode 100644 index 00000000..e5069fab --- /dev/null +++ b/crates/common/src/events/test_filter_parse.rs @@ -0,0 +1,202 @@ +use std::sync::Arc; + +use super::{ + filter_parse::EventFilterParseError, + EventFilter::{self, *}, + EventKindFilter::*, +}; +use crate::events::filter_parse::EventFilterParsable; +use crate::{node_targets::NodeTargets, state::InternedId}; + +macro_rules! eq { + ($s:expr, $f:expr) => { + assert_eq!($s.parse::()?, $f); + }; +} + +macro_rules! err { + ($s:expr, $pattern:pat $(if $guard:expr)?) => { + assert!(match $s.parse::() { + $pattern $(if $guard)? 
=> true, + other => { + eprintln!("Received {other:?}"); + false + } + }) + }; +} + +#[test] +fn test_each_filter() -> Result<(), EventFilterParseError> { + eq!("unfiltered", Unfiltered); + eq!("all-of(unfiltered)", AllOf(vec![Unfiltered])); + eq!("any-of(unfiltered)", AnyOf(vec![Unfiltered])); + eq!("one-of(unfiltered)", OneOf(vec![Unfiltered])); + eq!("not(unfiltered)", Not(Box::new(Unfiltered))); + eq!("agent-is(default)", AgentIs(InternedId::default())); + eq!("env-is(default)", EnvIs(InternedId::default())); + eq!( + "transaction-is(foo)", + TransactionIs(Arc::new(String::from("foo"))) + ); + eq!("cannon-is(default)", CannonIs(InternedId::default())); + eq!("event-is(agent-connected)", EventIs(AgentConnected)); + eq!( + "node-key-is(client/foo)", + NodeKeyIs("client/foo".parse().unwrap()) + ); + eq!( + "node-target-is(client/any)", + NodeTargetIs(NodeTargets::One("client/any".parse().unwrap())) + ); + + Ok(()) +} + +#[test] +fn test_array() -> Result<(), EventFilterParseError> { + eq!( + "all-of(unfiltered, unfiltered)", + AllOf(vec![Unfiltered, Unfiltered]) + ); + eq!( + "any-of(unfiltered, unfiltered)", + AnyOf(vec![Unfiltered, Unfiltered]) + ); + eq!( + "one-of(unfiltered, unfiltered)", + OneOf(vec![Unfiltered, Unfiltered]) + ); + + eq!( + "any-of( + unfiltered, + all-of(unfiltered), + any-of(unfiltered), + one-of(unfiltered), + not(unfiltered), + agent-is(default), + env-is(default), + transaction-is(foo), + cannon-is(default), + event-is(agent-connected), + node-key-is(client/foo), + node-target-is(client/any) + )", + AnyOf(vec![ + Unfiltered, + AllOf(vec![Unfiltered]), + AnyOf(vec![Unfiltered]), + OneOf(vec![Unfiltered]), + Not(Box::new(Unfiltered)), + AgentIs(InternedId::default()), + EnvIs(InternedId::default()), + TransactionIs(Arc::new(String::from("foo"))), + CannonIs(InternedId::default()), + EventIs(AgentConnected), + NodeKeyIs("client/foo".parse().unwrap()), + NodeTargetIs(NodeTargets::One("client/any".parse().unwrap())), + ]) + ); + + eq!( + "node-target-is(client/any,validator/any)", + NodeTargetIs(NodeTargets::Many(vec![ + "client/any".parse().unwrap(), + "validator/any".parse().unwrap(), + ])) + ); + + Ok(()) +} + +#[test] +fn test_whitespace_ignore() -> Result<(), EventFilterParseError> { + eq!( + " all-of ( unfiltered , unfiltered ) ", + AllOf(vec![Unfiltered, Unfiltered]) + ); + Ok(()) +} + +#[test] +fn test_trailing_commas() -> Result<(), EventFilterParseError> { + eq!("all-of(unfiltered,)", AllOf(vec![Unfiltered])); + Ok(()) +} + +#[test] +fn test_deep_nesting() -> Result<(), EventFilterParseError> { + eq!( + "all-of(all-of(all-of(all-of(all-of(all-of(unfiltered))))))", + AllOf(vec![AllOf(vec![AllOf(vec![AllOf(vec![AllOf(vec![ + AllOf(vec![Unfiltered]) + ])])])])]) + ); + + // not + eq!("not(not(not(not(not(not(unfiltered))))))", !!!!!!Unfiltered); + + Ok(()) +} + +#[test] +fn test_invalid() { + err!( + "invalid", + Err(EventFilterParseError::InvalidFilter(e)) if e == "invalid" + ); +} + +#[test] +fn test_expected_parens() { + use EventFilterParsable::*; + + err!( + "all-of", + Err(EventFilterParseError::ExpectedToken(a, b)) if a == OpenParen && b == "EOF" + ); + err!( + "all-of(", + Err(EventFilterParseError::ExpectedToken(a, b)) if a == CloseParen && b == "EOF" + ); + err!( + "all-of(unfiltered", + Err(EventFilterParseError::ExpectedToken(a, b)) if a == CommaOrCloseParen && b == "EOF" + ); +} + +#[test] +fn test_failed_agent_parse() { + err!( + "agent-is(|)", + Err(EventFilterParseError::ParseError(EventFilterParsable::AgentId, e)) + if e.starts_with("invalid 
InternedId expected pattern") + ); +} + +#[test] +fn test_str() { + macro_rules! test { + ($s:expr) => { + assert_eq!($s.parse::().unwrap().to_string(), $s); + }; + } + + test!("unfiltered"); + test!("any-of(unfiltered)"); + test!("all-of(unfiltered)"); + test!("one-of(unfiltered)"); + test!("not(unfiltered)"); + test!("agent-is(default)"); + test!("env-is(default)"); + test!("transaction-is(foo)"); + test!("cannon-is(default)"); + test!("event-is(agent-connected)"); + test!("node-key-is(client/foo)"); + test!("node-target-is(client/any)"); + test!("node-target-is(client/any, validator/any)"); + + test!("any-of(unfiltered, unfiltered)"); + test!("any-of(agent-is(foo), cannon-is(bar))"); +} diff --git a/crates/common/src/events/traits.rs b/crates/common/src/events/traits.rs new file mode 100644 index 00000000..82042820 --- /dev/null +++ b/crates/common/src/events/traits.rs @@ -0,0 +1,73 @@ +use std::sync::Arc; + +use super::{AgentEvent, Event, EventFilter, EventKind, EventKindFilter, TransactionEvent}; +use crate::state::{AgentId, EnvId, InternedId, NodeKey}; + +impl From for EventFilter { + fn from(kind: EventKindFilter) -> Self { + EventFilter::EventIs(kind) + } +} + +pub trait EventHelpers { + fn event(self) -> Event; + fn with_agent_id(self, agent_id: AgentId) -> Event; + fn with_node_key(self, node_key: NodeKey) -> Event; + fn with_env_id(self, env_id: EnvId) -> Event; + fn with_transaction(self, transaction: Arc) -> Event; + fn with_cannon(self, cannon: InternedId) -> Event; +} + +impl> EventHelpers for T { + fn event(self) -> Event { + self.into() + } + + fn with_agent_id(self, agent_id: AgentId) -> Event { + let mut event = self.into(); + event.agent = Some(agent_id); + event + } + + fn with_node_key(self, node_key: NodeKey) -> Event { + let mut event = self.into(); + event.node_key = Some(node_key); + event + } + + fn with_env_id(self, env_id: EnvId) -> Event { + let mut event = self.into(); + event.env = Some(env_id); + event + } + + fn with_transaction(self, transaction: Arc) -> Event { + let mut event = self.into(); + event.transaction = Some(transaction); + event + } + + fn with_cannon(self, cannon: InternedId) -> Event { + let mut event = self.into(); + event.cannon = Some(cannon); + event + } +} + +impl From for Event { + fn from(kind: EventKind) -> Self { + Self::new(kind) + } +} + +impl From for Event { + fn from(kind: AgentEvent) -> Self { + Self::new(EventKind::Agent(kind)) + } +} + +impl From for Event { + fn from(kind: TransactionEvent) -> Self { + Self::new(EventKind::Transaction(kind)) + } +} diff --git a/crates/common/src/format/impl_collections.rs b/crates/common/src/format/impl_collections.rs index 9939cdf9..0ba87e34 100644 --- a/crates/common/src/format/impl_collections.rs +++ b/crates/common/src/format/impl_collections.rs @@ -10,6 +10,62 @@ use super::{ DataWriteError, }; +/// BytesFormat is a simple wrapper around a Vec that implements DataFormat +#[derive(Debug, Clone, Eq, PartialEq)] +pub struct BytesFormat(pub Vec); +impl From> for BytesFormat { + fn from(data: Vec) -> Self { + Self(data) + } +} +impl From for Vec { + fn from(data: BytesFormat) -> Self { + data.0 + } +} + +impl DataFormat for BytesFormat { + type Header = (); + const LATEST_HEADER: Self::Header = (); + + fn write_data(&self, writer: &mut W) -> Result { + Ok(PackedUint::from(self.0.len()).write_data(writer)? + writer.write(&self.0)?) 
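`EventHelpers` above is the ergonomic entry point: any `AgentEvent`/`TransactionEvent` converts straight into an `Event`, and the `with_*` methods fill in the metadata that filters like `agent-is`, `cannon-is`, and `has-transaction` match on. A usage sketch, with ids passed in and the crate path assumed:

```rust
use std::sync::Arc;

use snops_common::events::{EventHelpers, TransactionEvent}; // assumed path
use snops_common::state::InternedId;

fn emit(cannon: InternedId, tx: Arc<String>) {
    let event = TransactionEvent::ExecuteAwaitingCompute
        .event() // created_at = now, all metadata None
        .with_cannon(cannon)
        .with_transaction(tx);

    // the attached metadata is what the filters see
    assert!(event.matches(&"has-cannon".parse().unwrap()));
    assert!(event.matches(&"transaction-execute-awaiting-compute".parse().unwrap()));
}
```

The second assertion leans on the parser's fallback: a bare token that is not a known filter name is tried as an `EventKindFilter` before being rejected.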
+ } + + fn read_data<R: Read>(reader: &mut R, _header: &Self::Header) -> Result<Self, DataReadError> { + let mut data = vec![0; usize::from(PackedUint::read_data(reader, &())?)]; + reader.read_exact(&mut data)?; + Ok(Self(data)) + } +} + +/// EncodedFormat is a simple wrapper around a DataFormat to encode header data +/// with the data +#[derive(Debug, Clone)] +pub struct EncodedFormat<F: DataFormat>(pub F); + +impl<F: DataFormat + PartialEq> PartialEq for EncodedFormat<F> { + fn eq(&self, other: &Self) -> bool { + self.0 == other.0 + } +} + +impl<F: DataFormat + Eq> Eq for EncodedFormat<F> {} + +impl<F: DataFormat> DataFormat for EncodedFormat<F> { + type Header = (); + const LATEST_HEADER: Self::Header = (); + + fn write_data<W: Write>(&self, writer: &mut W) -> Result<usize, DataWriteError> { + Ok(self.0.write_header(writer)? + self.0.write_data(writer)?) + } + + fn read_data<R: Read>(reader: &mut R, _header: &Self::Header) -> Result<Self, DataReadError> { + let header = F::read_header(reader)?; + Ok(Self(F::read_data(reader, &header)?)) + } +} + impl<T: DataFormat, const N: usize> DataFormat for [T; N] { type Header = T::Header; const LATEST_HEADER: Self::Header = T::LATEST_HEADER; @@ -131,7 +187,7 @@ impl_map!(IndexMap); #[cfg(test)] #[rustfmt::skip] mod test { - use crate::format::DataFormat; + use crate::format::{BytesFormat, DataFormat}; macro_rules! case { ($name:ident, $ty:ty, $a:expr, $b:expr) => { @@ -182,4 +238,12 @@ mod test { 3, 0, 4, 0 ]); + + // binary data test + case!(test_binary_data, BytesFormat, BytesFormat(vec![1, 2, 3]), [ + 1, 3, + 1, + 2, + 3 + ]); } diff --git a/crates/common/src/format/impl_containers.rs b/crates/common/src/format/impl_containers.rs index 3707ce05..de567668 100644 --- a/crates/common/src/format/impl_containers.rs +++ b/crates/common/src/format/impl_containers.rs @@ -1,4 +1,7 @@ -use std::io::{Read, Write}; +use std::{ + io::{Read, Write}, + sync::Arc, +}; use super::{DataFormat, DataFormatReader, DataFormatWriter, DataReadError, DataWriteError}; @@ -37,6 +40,19 @@ impl<T: DataFormat> DataFormat for Box<T> { } } +impl<T: DataFormat> DataFormat for Arc<T> { + type Header = T::Header; + const LATEST_HEADER: Self::Header = T::LATEST_HEADER; + + fn write_data<W: Write>(&self, writer: &mut W) -> Result<usize, DataWriteError> { + self.as_ref().write_data(writer) + } + + fn read_data<R: Read>(reader: &mut R, header: &Self::Header) -> Result<Self, DataReadError> { + Ok(Arc::new(reader.read_data(header)?)) + } +} + #[cfg(test)] #[rustfmt::skip] mod test { diff --git a/crates/common/src/format/mod.rs b/crates/common/src/format/mod.rs index 6f698154..ed079221 100644 --- a/crates/common/src/format/mod.rs +++ b/crates/common/src/format/mod.rs @@ -14,6 +14,7 @@ mod impl_strings; mod impl_tuples; mod packed_int; +pub use impl_collections::{BytesFormat, EncodedFormat}; pub use packed_int::*; use thiserror::Error; @@ -108,6 +109,14 @@ pub trait DataFormat: Sized { self.write_data(&mut buf)?; Ok(buf) } + + /// Convert the data to a bytevec and include the header + fn to_byte_vec_headered(&self) -> Result<Vec<u8>, DataWriteError> { + let mut buf = Vec::new(); + self.write_header(&mut buf)?; + self.write_data(&mut buf)?; + Ok(buf) + } } pub trait DataFormatWriter { diff --git a/crates/common/src/lib.rs b/crates/common/src/lib.rs index b433e967..948881e8 100644 --- a/crates/common/src/lib.rs +++ b/crates/common/src/lib.rs @@ -9,6 +9,7 @@ pub mod api; pub mod binaries; pub mod constant; pub mod db; +pub mod events; pub mod format; pub mod key_source; pub mod node_targets; diff --git a/crates/common/src/node_targets.rs b/crates/common/src/node_targets.rs index 7f336378..4191137d 100644 --- a/crates/common/src/node_targets.rs +++ b/crates/common/src/node_targets.rs @@ -108,7 +108,7 @@ impl<'de> Deserialize<'de> for NodeTargets { lazy_static!
{ static ref NODE_TARGET_REGEX: Regex = - Regex::new(r"^(?P<ty>\*|client|validator|prover)\/(?P<id>[A-Za-z0-9\-*]+)(?:@(?P<ns>[A-Za-z0-9\-*]+))?$") + Regex::new(r"^(?P<ty>\*|any|client|validator|prover)\/(?P<id>[A-Za-z0-9\-*]+)(?:@(?P<ns>[A-Za-z0-9\-*]+))?$") .unwrap(); } @@ -135,13 +135,13 @@ impl fmt::Display for NodeTargets { fn fmt(&self, f: &mut fmt::Formatter<'_>) -> fmt::Result { match self { NodeTargets::None => write!(f, ""), - NodeTargets::One(target) => write!(f, "{}", target), + NodeTargets::One(target) => write!(f, "{target}"), NodeTargets::Many(targets) => { let mut iter = targets.iter(); if let Some(target) = iter.next() { - write!(f, "{}", target)?; + write!(f, "{target}")?; for target in iter { - write!(f, ", {}", target)?; + write!(f, ", {target}")?; } } Ok(()) @@ -184,6 +184,7 @@ impl FromStr for NodeTarget { // match the type let ty = match &captures["ty"] { "*" => NodeTargetType::All, + "any" => NodeTargetType::All, "client" => NodeTargetType::One(NodeType::Client), "validator" => NodeTargetType::One(NodeType::Validator), "prover" => NodeTargetType::One(NodeType::Prover), @@ -194,6 +195,7 @@ impl FromStr for NodeTarget { let id = match &captures["id"] { // full wildcard "*" => NodeTargetId::All, + "any" => NodeTargetId::All, // partial wildcard id if id.contains('*') => NodeTargetId::WildcardPattern(WildMatch::new(id)), @@ -206,6 +208,7 @@ impl FromStr for NodeTarget { let ns = match captures.name("ns") { // full wildcard Some(id) if id.as_str() == "*" => NodeTargetNamespace::All, + Some(id) if id.as_str() == "any" => NodeTargetNamespace::All, // local; either explicitly stated, or empty Some(id) if id.as_str() == "local" => NodeTargetNamespace::Local, @@ -225,16 +228,16 @@ impl fmt::Display for NodeTarget { f, "{}/{}{}", match self.ty { - NodeTargetType::All => "*".to_owned(), + NodeTargetType::All => "any".to_owned(), NodeTargetType::One(ty) => ty.to_string(), }, match &self.id { - NodeTargetId::All => "*".to_owned(), + NodeTargetId::All => "any".to_owned(), NodeTargetId::WildcardPattern(pattern) => pattern.to_string(), NodeTargetId::Literal(id) => id.to_owned(), }, match &self.ns { - NodeTargetNamespace::All => "@*".to_owned(), + NodeTargetNamespace::All => "@any".to_owned(), NodeTargetNamespace::Local => "".to_owned(), NodeTargetNamespace::Literal(ns) => format!("@{}", ns), } diff --git a/crates/common/src/rpc/codec.rs b/crates/common/src/rpc/codec.rs new file mode 100644 index 00000000..957a341f --- /dev/null +++ b/crates/common/src/rpc/codec.rs @@ -0,0 +1,20 @@ +// rmp_serde and bincode have various limitations and are troublesome to debug. +// the overhead of JSON for messages is not a concern for the RPC layer.
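The node-target changes above make `any` a first-class synonym for `*` in all three positions (type, id, namespace), and `Display` now prints the keyword form, so a rendered target like `any/foo` can be pasted into a shell without glob expansion. A quick sketch, with the crate path assumed:

```rust
use snops_common::node_targets::NodeTarget; // assumed path

fn main() {
    // "*" and "any" now parse to the same target...
    let star: NodeTarget = "*/foo".parse().unwrap();
    let any: NodeTarget = "any/foo".parse().unwrap();

    // ...and Display canonicalizes both to the "any" spelling
    assert_eq!(star.to_string(), "any/foo");
    assert_eq!(any.to_string(), "any/foo");
}
```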
+ +pub fn encode<T: serde::Serialize>(msg: &T) -> serde_json::Result<Vec<u8>> { + serde_json::to_vec(msg) +} + +pub fn decode<'de, T: serde::Deserialize<'de>>(msg: &'de [u8]) -> serde_json::Result<T> { + serde_json::from_slice(msg) +} + +// pub fn encode<T: serde::Serialize>(msg: &T) -> Result<Vec<u8>, +// rmp_serde::encode::Error> { rmp_serde::to_vec(msg) +// } + +// pub fn decode<'de, T: serde::Deserialize<'de>>( +// msg: &'de [u8], +// ) -> Result<T, rmp_serde::decode::Error> { +// rmp_serde::from_slice(msg) +// } diff --git a/crates/common/src/rpc/control/agent.rs b/crates/common/src/rpc/control/agent.rs index 34403458..9c902b73 100644 --- a/crates/common/src/rpc/control/agent.rs +++ b/crates/common/src/rpc/control/agent.rs @@ -4,6 +4,7 @@ use serde::{Deserialize, Serialize}; use crate::rpc::error::*; use crate::state::snarkos_status::SnarkOSLiteBlock; +use crate::state::{AgentId, ReconcileOptions}; use crate::{ prelude::EnvId, state::{AgentState, NetworkId, PortConfig}, }; @@ -14,21 +15,25 @@ pub struct Handshake { pub jwt: Option<String>, pub loki: Option<Url>, pub state: AgentState, + pub reconcile_opts: ReconcileOptions, } /// The RPC service that agents implement as a server. #[tarpc::service] pub trait AgentService { /// Handshake with some initial connection details. - async fn handshake(handshake: Handshake) -> Result<(), ReconcileError>; + async fn handshake(handshake: Handshake); /// Control plane asks the agent for its external network address, along /// with local addrs. async fn get_addrs() -> (PortConfig, Option<IpAddr>, Vec<IpAddr>); + /// An agent is instructed to clear the address of a peer. + async fn clear_peer_addr(agent_id: AgentId); + /// Control plane instructs the agent to reconcile towards a particular /// state. - async fn reconcile(to: AgentState) -> Result<(), ReconcileError>; + async fn set_agent_state(to: AgentState, opts: ReconcileOptions); /// Broadcast a transaction locally async fn broadcast_tx(tx: String) -> Result<(), AgentError>; diff --git a/crates/common/src/rpc/control/mod.rs b/crates/common/src/rpc/control/mod.rs index a99360a7..c8e17536 100644 --- a/crates/common/src/rpc/control/mod.rs +++ b/crates/common/src/rpc/control/mod.rs @@ -1,14 +1,11 @@ pub mod agent; -use std::{ - collections::{HashMap, HashSet}, - net::IpAddr, -}; +use std::{collections::HashMap, net::IpAddr}; -use super::error::ResolveError; +use super::error::{ReconcileError, ResolveError}; use crate::{ - api::EnvInfo, - state::{AgentId, EnvId, NodeStatus, TransferStatus, TransferStatusUpdate}, + api::AgentEnvInfo, + state::{AgentId, EnvId, NodeStatus, ReconcileStatus, TransferStatus, TransferStatusUpdate}, }; pub const PING_HEADER: &[u8] = b"snops-agent"; @@ -16,12 +13,10 @@ #[tarpc::service] pub trait ControlService { /// Resolve the addresses of the given agents. - async fn resolve_addrs( - peers: HashSet<AgentId>, - ) -> Result<HashMap<AgentId, IpAddr>, ResolveError>; + async fn resolve_addrs(peers: Vec<AgentId>) -> Result<HashMap<AgentId, IpAddr>, ResolveError>; /// Get the environment info for the given environment. - async fn get_env_info(env_id: EnvId) -> Option<EnvInfo>; + async fn get_env_info(env_id: EnvId) -> Option<AgentEnvInfo>; /// Emit an agent transfer status update. async fn post_transfer_status(id: u32, status: TransferStatusUpdate); @@ -40,4 +35,7 @@ pub trait ControlService { /// Emit an agent node status update. async fn post_node_status(update: NodeStatus); + + /// Emit an agent reconcile status update.
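The new `codec` module above trades wire size for debuggability: tarpc payloads become plain JSON, so a failed decode can be read by eye instead of reverse-engineering rmp or bincode bytes. A round-trip sketch, where the `Ping` type is illustrative and the crate path assumed:

```rust
use serde::{Deserialize, Serialize};
use snops_common::rpc::codec; // assumed path

#[derive(Debug, PartialEq, Serialize, Deserialize)]
struct Ping {
    seq: u32,
}

fn main() -> serde_json::Result<()> {
    let bytes = codec::encode(&Ping { seq: 1 })?; // b"{\"seq\":1}"
    let back: Ping = codec::decode(&bytes)?;
    assert_eq!(back, Ping { seq: 1 });
    Ok(())
}
```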
+ async fn post_reconcile_status(status: Result, ReconcileError>); } diff --git a/crates/common/src/rpc/error.rs b/crates/common/src/rpc/error.rs index cc7c856a..3a6ecb24 100644 --- a/crates/common/src/rpc/error.rs +++ b/crates/common/src/rpc/error.rs @@ -1,7 +1,11 @@ +use std::path::PathBuf; + use serde::{Deserialize, Serialize}; use strum_macros::AsRefStr; use thiserror::Error; +use crate::state::{EnvId, HeightRequest}; + #[macro_export] macro_rules! impl_into_type_str { ($name:path) => { @@ -111,7 +115,7 @@ pub enum SnarkosRequestError { TimedOut, } -#[derive(Debug, Error, Serialize, Deserialize, AsRefStr)] +#[derive(Debug, Clone, Error, Serialize, Deserialize, AsRefStr)] pub enum ResolveError { #[error("source agent not found")] SourceAgentNotFound, @@ -119,30 +123,47 @@ pub enum ResolveError { AgentHasNoAddresses, } -#[derive(Debug, Error, Serialize, Deserialize, AsRefStr)] +#[derive(Debug, Clone, Error, Serialize, Deserialize, AsRefStr)] +#[serde(tag = "error", content = "message")] pub enum ReconcileError { - #[error("aborted by a more recent reconcilation request")] - Aborted, - #[error("failed setup storage: {0}")] - StorageSetupError(String), - #[error("failed to download {0} from the control plane")] - StorageAcquireError(String), - #[error("failed to get the binary from the control plane: {0}")] - BinaryAcquireError(String), - #[error("failed to find a checkpoint for the requested height/span")] - CheckpointAcquireError, - #[error("failed to apply checkpoint: {0}")] - CheckpointApplyError(String), - #[error("failed to resolve addresses of stated peers")] - ResolveAddrError(ResolveError), - #[error("a rention policy is required to rewind the ledger")] - MissingRetentionPolicy, - #[error("failed to load checkpoints for storage")] - CheckpointLoadError, - #[error("agent did not provide a local private key")] - NoLocalPrivateKey, - #[error("generic database error")] - Database, + #[error("node is not connected to the controlplane")] + Offline, + #[error("env {0} not found")] + MissingEnv(EnvId), #[error("unknown error")] Unknown, + #[error("rpc error: {0}")] + RpcError(String), + #[error(transparent)] + AddressResolve(#[from] ResolveError), + #[error("missing local private key")] + MissingLocalPrivateKey, + #[error("failed to create directory {0}: {1}")] + CreateDirectory(PathBuf, String), + #[error("failed to delete file {0}: {1}")] + DeleteFileError(PathBuf, String), + #[error("failed to get metadata for {0}: {1}")] + FileStatError(PathBuf, String), + #[error("failed to read file {0}: {1}")] + FileReadError(PathBuf, String), + #[error("failed to make {method} request {url}: {error}")] + HttpError { + method: String, + url: String, + error: String, + }, + #[error("failed to spawn process: {0}")] + SpawnError(String), + #[error("failed to set file permissions {0}: {1}")] + FilePermissionError(PathBuf, String), + #[error("failed to parse {0} as a url: {1}")] + UrlParseError(String, String), + #[error("error loading checkpoints: {0}")] + CheckpointLoadError(String), + #[error("missing retention policy for request: {0}")] + MissingRetentionPolicy(HeightRequest), + #[error("no available checkpoints for request: {0}")] + NoAvailableCheckpoints(HeightRequest), + #[error("failed to apply checkpoint: {0}")] + CheckpointApplyError(String), } diff --git a/crates/common/src/rpc/mod.rs b/crates/common/src/rpc/mod.rs index c724dabb..14108486 100644 --- a/crates/common/src/rpc/mod.rs +++ b/crates/common/src/rpc/mod.rs @@ -22,6 +22,7 @@ use tarpc::transport::channel::ChannelError; use 
tokio::sync::mpsc; pub mod agent; +pub mod codec; pub mod control; pub mod error; diff --git a/crates/common/src/state/agent_state.rs b/crates/common/src/state/agent_state.rs index 8a443ce6..cc88d6f5 100644 --- a/crates/common/src/state/agent_state.rs +++ b/crates/common/src/state/agent_state.rs @@ -1,4 +1,5 @@ use super::{EnvId, NodeState}; +use crate::format::{DataFormat, DataHeaderOf}; #[derive(Debug, Default, Clone, PartialEq, Eq, serde::Serialize, serde::Deserialize)] pub enum AgentState { @@ -19,4 +20,66 @@ impl AgentState { Self::Node(id, state) => Self::Node(id, Box::new(f(*state))), } } + + pub fn env(&self) -> Option { + match self { + Self::Inventory => None, + Self::Node(id, _) => Some(*id), + } + } + + pub fn map_env_id(&self, f: F) -> Option + where + F: Fn(EnvId) -> Option, + { + match self { + Self::Inventory => None, + Self::Node(id, _) => f(*id), + } + } +} + +impl DataFormat for AgentState { + type Header = (u8, DataHeaderOf); + const LATEST_HEADER: Self::Header = (1, NodeState::LATEST_HEADER); + + fn write_data( + &self, + writer: &mut W, + ) -> Result { + match self { + Self::Inventory => Ok(0u8.write_data(writer)?), + Self::Node(id, state) => { + let mut written = 1u8.write_data(writer)?; + written += id.write_data(writer)?; + written += state.write_data(writer)?; + Ok(written) + } + } + } + + fn read_data( + reader: &mut R, + header: &Self::Header, + ) -> Result { + if header.0 != Self::LATEST_HEADER.0 { + return Err(crate::format::DataReadError::unsupported( + "AgentState", + Self::LATEST_HEADER.0, + header.0, + )); + } + + match u8::read_data(reader, &())? { + 0 => Ok(Self::Inventory), + 1 => { + let id = EnvId::read_data(reader, &())?; + let state = NodeState::read_data(reader, &header.1)?; + Ok(Self::Node(id, Box::new(state))) + } + n => Err(crate::format::DataReadError::custom(format!( + "Invalid AgentState variant {n}", + ))), + } + } } diff --git a/crates/common/src/state/agent_status.rs b/crates/common/src/state/agent_status.rs index 1916991e..a882b66f 100644 --- a/crates/common/src/state/agent_status.rs +++ b/crates/common/src/state/agent_status.rs @@ -1,10 +1,15 @@ +use std::time::Instant; + use chrono::{DateTime, Utc}; use indexmap::IndexMap; use serde::{Deserialize, Serialize}; +use tokio::task::AbortHandle; -use super::snarkos_status::SnarkOSStatus; +use super::{snarkos_status::SnarkOSStatus, ReconcileStatus}; +use crate::{format::DataFormat, rpc::error::ReconcileError}; #[derive(Debug, Default, Clone, PartialEq, Eq, Serialize, Deserialize)] +#[serde(rename_all = "snake_case", tag = "status")] pub enum NodeStatus { /// The last known status of the node is unknown #[default] @@ -14,7 +19,7 @@ pub enum NodeStatus { /// The node waiting on other tasks to complete before starting PendingStart, /// The node is running - Running(SnarkOSStatus), + Running { running_status: SnarkOSStatus }, /// The node has exited with a status code Exited(u8), /// The node was online and is in the process of shutting down @@ -26,7 +31,9 @@ pub enum NodeStatus { impl From for NodeStatus { fn from(status: SnarkOSStatus) -> Self { - NodeStatus::Running(status) + NodeStatus::Running { + running_status: status, + } } } @@ -99,6 +106,7 @@ pub enum TransferStatusUpdate { total: u64, /// The time the transfer started. time: DateTime, + // The transfer's abort handle, if any. }, /// The transfer has made progress. Progress { @@ -112,6 +120,9 @@ pub enum TransferStatusUpdate { }, /// The transfer has been cleaned up. 
Cleanup, + // Client only - specifies a handle to abort the transfer task + #[serde(skip)] + Handle(AbortHandle), } #[derive(Debug, Default, Clone, Serialize, Deserialize)] @@ -128,9 +139,24 @@ pub struct TransferStatus { pub total_bytes: u64, /// A transfer interruption reason, if any. pub interruption: Option, + /// The transfer's abort handle, if any. + #[serde(skip)] + pub handle: Option, } -#[derive(Debug, Default, Clone, Serialize, Deserialize)] +impl TransferStatus { + pub fn is_pending(&self) -> bool { + self.interruption.is_none() && self.downloaded_bytes < self.total_bytes + } + pub fn is_interrupted(&self) -> bool { + self.interruption.is_some() + } + pub fn is_complete(&self) -> bool { + self.downloaded_bytes >= self.total_bytes + } +} + +#[derive(Debug, Default, Clone)] pub struct AgentStatus { /// Version of the agent binary pub agent_version: Option, @@ -144,4 +170,47 @@ pub struct AgentStatus { pub connected_time: Option>, /// A map of transfers in progress pub transfers: IndexMap, + /// Latest reconcile status of the agent + pub reconcile: Option<(Instant, Result, ReconcileError>)>, +} + +impl DataFormat for LatestBlockInfo { + type Header = u8; + + const LATEST_HEADER: Self::Header = 1; + + fn write_data( + &self, + writer: &mut W, + ) -> Result { + let mut written = self.height.write_data(writer)?; + written += self.state_root.write_data(writer)?; + written += self.block_hash.write_data(writer)?; + written += self.previous_hash.write_data(writer)?; + written += self.block_timestamp.write_data(writer)?; + written += self.update_time.write_data(writer)?; + Ok(written) + } + + fn read_data( + reader: &mut R, + header: &Self::Header, + ) -> Result { + if *header != Self::LATEST_HEADER { + return Err(crate::format::DataReadError::unsupported( + "LatestBlockInfo", + Self::LATEST_HEADER, + *header, + )); + } + + Ok(LatestBlockInfo { + height: u32::read_data(reader, &())?, + state_root: String::read_data(reader, &())?, + block_hash: String::read_data(reader, &())?, + previous_hash: String::read_data(reader, &())?, + block_timestamp: i64::read_data(reader, &())?, + update_time: DateTime::read_data(reader, &())?, + }) + } } diff --git a/crates/common/src/aot_cmds/authorization.rs b/crates/common/src/state/authorization.rs similarity index 98% rename from crates/common/src/aot_cmds/authorization.rs rename to crates/common/src/state/authorization.rs index c5d1a2e8..9fb0e2ee 100644 --- a/crates/common/src/aot_cmds/authorization.rs +++ b/crates/common/src/state/authorization.rs @@ -15,7 +15,7 @@ pub enum Authorization { Deploy { owner: Value, deployment: Value, - #[serde(skip_serializing_if = "Option::is_none", default)] + #[serde(default, skip_serializing_if = "Option::is_none")] fee_auth: Option, }, } diff --git a/crates/common/src/state/height_request.rs b/crates/common/src/state/height_request.rs index c66ec7d5..162b1c9e 100644 --- a/crates/common/src/state/height_request.rs +++ b/crates/common/src/state/height_request.rs @@ -1,96 +1,44 @@ -use std::str::FromStr; +use std::{fmt::Display, str::FromStr}; use snops_checkpoint::RetentionSpan; use crate::format::{DataFormat, DataFormatReader, DataHeaderOf, DataReadError}; -/// for some reason bincode does not allow deserialize_any so if i want to allow -/// end users to type "top", 42, or "persist" i need to do have to copies of -/// this where one is not untagged. -/// -/// bincode. please. 
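The deleted comment above is about serde's `deserialize_any`, which self-describing formats support but bincode does not; that limitation is what forced the duplicate tagged/untagged types this change collapses into a single untagged `HeightRequest`. A minimal sketch of the untagged behavior being relied on (simplified two-variant enum, assuming `serde_json`):

    use serde::Deserialize;

    #[derive(Debug, Deserialize, PartialEq)]
    #[serde(untagged)]
    enum Height {
        Absolute(u32),
        // Stands in for the "top"/retention-span string handling.
        Named(String),
    }

    fn main() {
        // Untagged enums try each variant in order until one deserializes.
        assert_eq!(serde_json::from_str::<Height>("42").unwrap(), Height::Absolute(42));
        assert_eq!(serde_json::from_str::<Height>("\"top\"").unwrap(), Height::Named("top".into()));
    }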
-#[derive(Debug, Copy, Default, Clone, serde::Serialize, serde::Deserialize, PartialEq, Eq)] -#[serde(rename_all = "lowercase", untagged)] -pub enum DocHeightRequest { - #[default] - /// Use the latest height for the ledger - #[serde(with = "super::strings::top")] - Top, - /// Set the height to the given block (there must be a checkpoint at this - /// height) Setting to 0 will reset the height to the genesis block - Absolute(u32), - /// Use the next checkpoint that matches this checkpoint span - Checkpoint(snops_checkpoint::RetentionSpan), - // the control plane doesn't know the heights the nodes are at - // TruncateHeight(u32), - // TruncateTime(i64), -} - -impl FromStr for DocHeightRequest { +impl FromStr for HeightRequest { type Err = String; fn from_str(s: &str) -> Result { match s { - "top" => Ok(DocHeightRequest::Top), + "top" => Ok(HeightRequest::Top), s => { if let Ok(height) = s.parse() { - Ok(DocHeightRequest::Absolute(height)) + Ok(HeightRequest::Absolute(height)) } else if let Ok(span) = s.parse() { - Ok(DocHeightRequest::Checkpoint(span)) + Ok(HeightRequest::Checkpoint(span)) } else { - Err(format!("invalid DocHeightRequest: {}", s)) + Err(format!("invalid HeightRequest: {}", s)) } } } } } -impl DataFormat for DocHeightRequest { - type Header = (u8, DataHeaderOf); - const LATEST_HEADER: Self::Header = (1, RetentionSpan::LATEST_HEADER); - - fn write_data( - &self, - writer: &mut W, - ) -> Result { +impl Display for HeightRequest { + fn fmt(&self, f: &mut std::fmt::Formatter<'_>) -> std::fmt::Result { match self { - DocHeightRequest::Top => 0u8.write_data(writer), - DocHeightRequest::Absolute(height) => { - Ok(1u8.write_data(writer)? + height.write_data(writer)?) - } - DocHeightRequest::Checkpoint(retention) => { - Ok(2u8.write_data(writer)? + retention.write_data(writer)?) - } - } - } - - fn read_data( - reader: &mut R, - header: &Self::Header, - ) -> Result { - if header.0 != Self::LATEST_HEADER.0 { - return Err(DataReadError::unsupported( - "DocHeightRequest", - Self::LATEST_HEADER.0, - header.0, - )); - } - match reader.read_data(&())? 
{ - 0u8 => Ok(DocHeightRequest::Top), - 1u8 => Ok(DocHeightRequest::Absolute(reader.read_data(&())?)), - 2u8 => Ok(DocHeightRequest::Checkpoint(reader.read_data(&header.1)?)), - n => Err(DataReadError::Custom(format!( - "invalid DocHeightRequest discrminant: {n}" - ))), + HeightRequest::Top => write!(f, "top"), + HeightRequest::Absolute(h) => write!(f, "{h}"), + HeightRequest::Checkpoint(c) => write!(f, "{c}"), } } } #[derive(Debug, Default, Copy, Clone, serde::Serialize, serde::Deserialize, PartialEq, Eq)] -#[serde(rename_all = "lowercase")] +#[serde(rename_all = "lowercase", untagged)] pub enum HeightRequest { #[default] /// Use the latest height for the ledger + #[serde(with = "super::strings::top")] Top, /// Set the height to the given block (there must be a checkpoint at this /// height) Setting to 0 will reset the height to the genesis block @@ -102,8 +50,6 @@ pub enum HeightRequest { // TruncateTime(i64), } -// TODO: now that we don't use bincode for storage format, we should be able to -// make remove HeightRequest and rename DocHeightRequest to HeightRequest impl DataFormat for HeightRequest { type Header = (u8, DataHeaderOf); const LATEST_HEADER: Self::Header = (1, RetentionSpan::LATEST_HEADER); @@ -139,7 +85,7 @@ impl DataFormat for HeightRequest { 1u8 => Ok(HeightRequest::Absolute(reader.read_data(&())?)), 2u8 => Ok(HeightRequest::Checkpoint(reader.read_data(&header.1)?)), n => Err(DataReadError::Custom(format!( - "invalid HeightRequest discrminant: {n}" + "invalid HeightRequest discriminant: {n}" ))), } } @@ -151,16 +97,8 @@ impl HeightRequest { } pub fn reset(&self) -> bool { - *self == Self::Absolute(0) - } -} - -impl From for HeightRequest { - fn from(req: DocHeightRequest) -> Self { - match req { - DocHeightRequest::Top => Self::Top, - DocHeightRequest::Absolute(h) => Self::Absolute(h), - DocHeightRequest::Checkpoint(c) => Self::Checkpoint(c), - } + // height 0 = genesis block + // checkpoint an unlimited time in the past is also a reset + *self == Self::Absolute(0) || *self == Self::Checkpoint(RetentionSpan::Unlimited) } } diff --git a/crates/common/src/state/mod.rs b/crates/common/src/state/mod.rs index a2198e6b..f228edfe 100644 --- a/crates/common/src/state/mod.rs +++ b/crates/common/src/state/mod.rs @@ -4,6 +4,7 @@ use regex::Regex; mod agent_mode; mod agent_state; mod agent_status; +mod authorization; mod height_request; mod id; mod network; @@ -11,12 +12,15 @@ mod node_key; mod node_state; mod node_type; mod port_config; +mod reconcile; pub mod snarkos_status; pub mod strings; +mod transaction_status; pub use agent_mode::*; pub use agent_state::*; pub use agent_status::*; +pub use authorization::*; pub use height_request::*; pub use id::*; pub use network::*; @@ -24,6 +28,8 @@ pub use node_key::*; pub use node_state::*; pub use node_type::*; pub use port_config::*; +pub use reconcile::*; +pub use transaction_status::*; lazy_static! { static ref NODE_KEY_REGEX: Regex = Regex::new( diff --git a/crates/common/src/state/node_state.rs b/crates/common/src/state/node_state.rs index 81d2a10b..424e280f 100644 --- a/crates/common/src/state/node_state.rs +++ b/crates/common/src/state/node_state.rs @@ -176,24 +176,6 @@ pub enum AgentPeer { External(SocketAddr), } -impl AgentPeer { - /// Get the port from the peer - pub fn port(&self) -> u16 { - match self { - Self::Internal(_, port) => *port, - Self::External(addr) => addr.port(), - } - } - - /// Return a new peer with the given port. 
- pub fn with_port(&self, port: u16) -> Self { - match self { - Self::Internal(ip, _) => Self::Internal(*ip, port), - Self::External(addr) => Self::External(SocketAddr::new(addr.ip(), port)), - } - } -} - impl DataFormat for KeyState { type Header = u8; const LATEST_HEADER: Self::Header = 1; diff --git a/crates/common/src/state/port_config.rs b/crates/common/src/state/port_config.rs index e47c4ecf..675dbd8c 100644 --- a/crates/common/src/state/port_config.rs +++ b/crates/common/src/state/port_config.rs @@ -1,6 +1,6 @@ use crate::format::{DataFormat, DataFormatReader}; -#[derive(Debug, Clone, serde::Serialize, serde::Deserialize, clap::Parser)] +#[derive(Debug, Copy, Clone, serde::Serialize, serde::Deserialize, clap::Parser, Eq, PartialEq)] pub struct PortConfig { /// Specify the IP address and port for the node server #[clap(long = "node", default_value_t = 4130)] diff --git a/crates/common/src/state/reconcile.rs b/crates/common/src/state/reconcile.rs new file mode 100644 index 00000000..d62568bc --- /dev/null +++ b/crates/common/src/state/reconcile.rs @@ -0,0 +1,160 @@ +use std::{fmt::Display, time::Duration}; + +use indexmap::IndexSet; +use serde::{Deserialize, Serialize}; + +use super::TransferId; + +#[derive(Debug, Copy, Clone, Default, Eq, PartialEq, Serialize, Deserialize)] +pub struct ReconcileOptions { + /// When true, the reconciler will fetch the latest env info + pub refetch_info: bool, + /// When true, the reconciler will force the node to shut down + pub force_shutdown: bool, + /// When true, the reconciler will clear the last height + pub clear_last_height: bool, +} + +impl ReconcileOptions { + pub fn union(self, other: Self) -> Self { + Self { + refetch_info: self.refetch_info || other.refetch_info, + force_shutdown: self.force_shutdown || other.force_shutdown, + clear_last_height: self.clear_last_height || other.clear_last_height, + } + } +} + +#[derive(Debug, Clone, PartialEq, Eq, PartialOrd, Ord, Hash, Serialize, Deserialize)] +#[serde(tag = "name", rename_all = "snake_case")] +pub enum ReconcileCondition { + /// A file is being transferred. + PendingTransfer { source: String, id: TransferId }, + /// A process is being spawned / confirmed. Could be starting the node or + /// manipulating the ledger + PendingProcess { process: String }, + /// A transfer was started and interrupted. + InterruptedTransfer { + source: String, + id: TransferId, + #[serde(default, skip_serializing_if = "Option::is_none")] + reason: Option<String>, + }, + /// A modify operation was started and interrupted. + InterruptedModify { reason: String }, + /// A file is missing and cannot be downloaded at the moment.
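Since `ReconcileOptions::union` above simply ORs each flag, queued reconcile requests can be coalesced without dropping any requested behavior. A quick check of that property (a sketch assuming the struct above is in scope):

    #[test]
    fn union_ors_every_flag() {
        let refetch = ReconcileOptions { refetch_info: true, ..Default::default() };
        let shutdown = ReconcileOptions { force_shutdown: true, ..Default::default() };
        let merged = refetch.union(shutdown);
        // Each flag survives from whichever side requested it.
        assert!(merged.refetch_info && merged.force_shutdown);
        assert!(!merged.clear_last_height);
    }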
+ MissingFile { path: String }, + /// Waiting to reconnect to the controlplane + PendingConnection, + /// Waiting for the node to be shut down + PendingShutdown, + /// Waiting for the node to start up + PendingStartup, +} + +#[derive(Clone, Debug, Serialize, Deserialize)] +pub struct ReconcileStatus { + #[serde(default, skip_serializing_if = "Vec::is_empty")] + pub scopes: Vec, + #[serde(default, skip_serializing_if = "Option::is_none")] + pub inner: Option, + #[serde( + default, + skip_serializing_if = "Option::is_none", + serialize_with = "ser_duration_as_secs", + deserialize_with = "deser_duration_from_secs" + )] + pub requeue_after: Option, + #[serde(default, skip_serializing_if = "IndexSet::is_empty")] + pub conditions: IndexSet, +} + +fn ser_duration_as_secs(duration: &Option, serializer: S) -> Result +where + S: serde::Serializer, +{ + match duration { + Some(duration) => serializer.serialize_some(&duration.as_secs()), + None => serializer.serialize_none(), + } +} + +fn deser_duration_from_secs<'de, D>(deserializer: D) -> Result, D::Error> +where + D: serde::Deserializer<'de>, +{ + let secs = Option::deserialize(deserializer)?; + Ok(secs.map(Duration::from_secs)) +} + +impl Eq for ReconcileStatus {} +impl PartialEq for ReconcileStatus { + fn eq(&self, other: &Self) -> bool { + self.inner == other.inner + && self.conditions == other.conditions + && self.scopes == other.scopes + && self.requeue_after == other.requeue_after + } +} + +impl Default for ReconcileStatus { + fn default() -> Self { + Self::new(Some(Default::default())) + } +} + +impl ReconcileStatus { + pub fn new(inner: Option) -> Self { + Self { + scopes: Vec::new(), + inner, + requeue_after: None, + conditions: IndexSet::new(), + } + } + + pub fn with(inner: T) -> Self { + Self::new(Some(inner)) + } + + pub fn empty() -> Self { + Self::new(None) + } + + pub fn is_requeue(&self) -> bool { + self.requeue_after.is_some() + } + + pub fn replace_inner(self, inner: U) -> ReconcileStatus { + ReconcileStatus { + inner: Some(inner), + scopes: self.scopes, + requeue_after: self.requeue_after, + conditions: self.conditions, + } + } + + pub fn emptied(&self) -> ReconcileStatus { + ReconcileStatus { + inner: None, + scopes: self.scopes.clone(), + requeue_after: self.requeue_after, + conditions: self.conditions.clone(), + } + } + + pub fn requeue_after(mut self, duration: Duration) -> Self { + self.requeue_after = Some(duration); + self + } + + pub fn add_scope(mut self, scope: impl Display) -> Self { + self.scopes.push(scope.to_string()); + self + } + + pub fn add_condition(mut self, condition: ReconcileCondition) -> Self { + self.conditions.insert(condition); + self + } +} diff --git a/crates/common/src/state/snarkos_status.rs b/crates/common/src/state/snarkos_status.rs index cbfd72c0..52903fc8 100644 --- a/crates/common/src/state/snarkos_status.rs +++ b/crates/common/src/state/snarkos_status.rs @@ -20,6 +20,22 @@ pub enum SnarkOSStatus { Halted(Option), } +impl SnarkOSStatus { + pub fn is_started(&self) -> bool { + matches!(self, SnarkOSStatus::Started) + } + + pub fn label(&self) -> &'static str { + match self { + SnarkOSStatus::Starting => "starting", + SnarkOSStatus::LedgerLoading => "loading", + SnarkOSStatus::LedgerFailure(_) => "failure", + SnarkOSStatus::Started => "started", + SnarkOSStatus::Halted(_) => "halted", + } + } +} + /// Messages from snarkos to the agent, containing information about the status /// of the node #[derive(Debug, Clone, PartialEq, Eq, Serialize, Deserialize)] diff --git 
a/crates/controlplane/src/cannon/status.rs b/crates/common/src/state/transaction_status.rs similarity index 70% rename from crates/controlplane/src/cannon/status.rs rename to crates/common/src/state/transaction_status.rs index f319ed87..8e9e5ec0 100644 --- a/crates/controlplane/src/cannon/status.rs +++ b/crates/common/src/state/transaction_status.rs @@ -1,55 +1,11 @@ -use std::sync::Arc; - use chrono::{DateTime, Utc}; -use snops_common::{format::DataFormat, state::AgentId}; -use tokio::sync::mpsc::Sender; - -pub struct TransactionStatusSender(Option>); - -impl TransactionStatusSender { - pub fn new(sender: Sender) -> Self { - Self(Some(sender)) - } - - pub fn empty() -> Self { - Self(None) - } +use serde::{Deserialize, Serialize}; - pub fn send(&self, status: TransactionStatusEvent) { - if let Some(sender) = &self.0 { - let _ = sender.try_send(status); - } - } -} - -/// An event that represents the latest status of a transaction. -pub enum TransactionStatusEvent { - /// Authorization has been aborted - ExecuteAborted, - /// Authorization has been queued for execution. - ExecuteQueued, - /// No agents are available for the execution - ExecuteAwaitingCompute, - /// An agent was found and the authorization is being executed - Executing(AgentId), - /// Execute RPC failed - ExecuteFailed(String), - /// Agent has completed the execution - ExecuteComplete(Arc), - // TODO: Implement the following statuses - // /// API has received the transaction broadcast - // BroadcastReceived, - // /// Control plane has forwarded the transaction to a peer - // BroadcastForwarded, - // /// An error occurred while broadcasting the transaction - // BroadcastFailed, - // /// Transaction was found in the network, return the block hash - // TransactionConfirmed(String), -} +use crate::format::DataFormat; /// Status of a transaction as presented internally for tracking and /// preventing data loss. -#[derive(Debug, Clone, Copy, Eq, PartialEq)] +#[derive(Debug, Clone, Copy, Eq, PartialEq, Serialize, Deserialize)] pub enum TransactionSendState { /// Authorization has been received. This step is skipped if a /// transaction is created/broadcasted directly. @@ -91,7 +47,7 @@ impl DataFormat for TransactionSendState { fn write_data( &self, writer: &mut W, - ) -> Result { + ) -> Result { Ok(match self { TransactionSendState::Authorized => 0u8.write_data(writer)?, TransactionSendState::Executing(timestamp) => { @@ -109,9 +65,9 @@ impl DataFormat for TransactionSendState { fn read_data( reader: &mut R, header: &Self::Header, - ) -> Result { + ) -> Result { if *header != Self::LATEST_HEADER { - return Err(snops_common::format::DataReadError::unsupported( + return Err(crate::format::DataReadError::unsupported( "CannonTransactionStatus", Self::LATEST_HEADER, *header, @@ -128,7 +84,7 @@ impl DataFormat for TransactionSendState { DateTime::::read_data(reader, &())?, ), _ => { - return Err(snops_common::format::DataReadError::Custom( + return Err(crate::format::DataReadError::Custom( "Invalid CannonTransactionStatus tag".to_string(), )) } @@ -139,9 +95,9 @@ impl DataFormat for TransactionSendState { #[cfg(test)] mod test { use chrono::DateTime; - use snops_common::format::DataFormat; - use crate::cannon::status::TransactionSendState; + use super::TransactionSendState; + use crate::format::DataFormat; macro_rules! 
case { ($name:ident, $ty:ty, $a:expr, $b:expr) => { diff --git a/crates/controlplane/Cargo.toml b/crates/controlplane/Cargo.toml index 99960d6e..f32a3481 100644 --- a/crates/controlplane/Cargo.toml +++ b/crates/controlplane/Cargo.toml @@ -1,6 +1,6 @@ [package] name = "snops" -version = "0.1.0" +version = "0.2.0" edition = "2021" license = "MIT" description = "The snarkops control plane responsible for managing environments and agents" @@ -21,7 +21,6 @@ axum = { workspace = true, features = [ "macros", ] } bimap = { workspace = true, features = ["serde"] } -bincode.workspace = true chrono = { workspace = true, features = ["serde"] } clap = { workspace = true, features = ["env"] } dashmap = { workspace = true, features = ["serde"] } @@ -39,6 +38,7 @@ rand.workspace = true rand_chacha.workspace = true rayon.workspace = true reqwest = { workspace = true, features = ["json", "stream"] } +semver.workspace = true serde.workspace = true serde_json.workspace = true serde_yaml.workspace = true diff --git a/crates/controlplane/src/agent_version.rs b/crates/controlplane/src/agent_version.rs new file mode 100644 index 00000000..6b2331ca --- /dev/null +++ b/crates/controlplane/src/agent_version.rs @@ -0,0 +1,37 @@ +use std::sync::OnceLock; + +use semver::{Comparator, Prerelease, Version, VersionReq}; + +/// A version requirement that matches the current controlplane version against +/// an agent version +fn cp_version() -> &'static VersionReq { + static CP_VERSION: OnceLock = OnceLock::new(); + + CP_VERSION.get_or_init(|| { + let version = Version::parse(env!("CARGO_PKG_VERSION")) + .expect("Failed to parse controlplane version"); + + VersionReq { + comparators: vec![ + Comparator { + op: semver::Op::GreaterEq, + major: version.major, + minor: Some(version.minor), + patch: Some(0), + pre: Prerelease::EMPTY, + }, + Comparator { + op: semver::Op::Less, + major: version.major, + minor: Some(version.minor + 1), + patch: None, + pre: Prerelease::EMPTY, + }, + ], + } + }) +} + +pub fn agent_version_ok(agent_version: &Version) -> bool { + cp_version().matches(agent_version) +} diff --git a/crates/controlplane/src/cannon/context.rs b/crates/controlplane/src/cannon/context.rs index 0949090e..13833bc8 100644 --- a/crates/controlplane/src/cannon/context.rs +++ b/crates/controlplane/src/cannon/context.rs @@ -5,8 +5,8 @@ use dashmap::DashMap; use futures_util::{stream::FuturesUnordered, StreamExt}; use lazysort::SortedBy; use snops_common::{ - aot_cmds::Authorization, - state::{CannonId, EnvId, NetworkId}, + events::{Event, TransactionAbortReason, TransactionEvent}, + state::{AgentId, Authorization, CannonId, EnvId, NetworkId, TransactionSendState}, }; use tracing::{error, trace, warn}; @@ -15,13 +15,12 @@ use super::{ file::TransactionSink, sink::TxSink, source::TxSource, - status::{TransactionSendState, TransactionStatusEvent, TransactionStatusSender}, tracker::TransactionTracker, CannonReceivers, }; use crate::{ cannon::source::ComputeTarget, - state::{GlobalState, REST_CLIENT}, + state::{EmitEvent, GetGlobalState, GlobalState, REST_CLIENT}, }; /// Information a transaction cannon needs for execution via spawned task @@ -35,7 +34,7 @@ pub struct ExecutionContext { pub(crate) source: TxSource, pub(crate) sink: TxSink, pub(crate) fired_txs: Arc, - pub(crate) transactions: Arc>, + pub(crate) transactions: Arc, TransactionTracker>>, } impl ExecutionContext { @@ -93,29 +92,29 @@ impl ExecutionContext { // ------------------------ // receive authorizations and forward the executions to the compute target - Some((tx_id, 
events)) = rx.authorizations.recv() => { + Some(tx_id) = rx.authorizations.recv() => { // ensure the transaction tracker exists let Some(tracker) = self.transactions.get(&tx_id) else { error!("cannon {env_id}.{cannon_id} missing transaction tracker for {tx_id}"); - events.send(TransactionStatusEvent::ExecuteAborted); + TransactionEvent::ExecuteAborted(TransactionAbortReason::MissingTracker).with_cannon_ctx(&self, tx_id).emit(&self); continue; }; // ensure the transaction is in the correct state if tracker.status != TransactionSendState::Authorized { error!("cannon {env_id}.{cannon_id} unexpected status for {tx_id}: {:?}", tracker.status); // TODO: remove this auth and log it somewhere - events.send(TransactionStatusEvent::ExecuteAborted); + TransactionEvent::ExecuteAborted(TransactionAbortReason::UnexpectedStatus{ transaction_status: tracker.status}).with_cannon_ctx(&self, tx_id).emit(&self); continue; } // ensure the transaction has an authorization (more than likely unreachable) let Some(auth) = &tracker.authorization else { error!("cannon {env_id}.{cannon_id} missing authorization for {tx_id}"); // TODO: remove the auth anyway - events.send(TransactionStatusEvent::ExecuteAborted); + TransactionEvent::ExecuteAborted(TransactionAbortReason::MissingAuthorization).with_cannon_ctx(&self, tx_id).emit(&self); continue; }; - auth_execs.push(self.execute_auth(tx_id, Arc::clone(auth), &query_path, events)); + auth_execs.push(self.execute_auth(tx_id, Arc::clone(auth), &query_path)); } // receive transaction ids and forward them to the sink target Some(tx) = rx.transactions.recv() => { @@ -147,7 +146,7 @@ impl ExecutionContext { } // write the transaction status to the store and update the transaction tracker - pub fn write_tx_status(&self, tx_id: &str, status: TransactionSendState) { + pub fn write_tx_status(&self, tx_id: &Arc, status: TransactionSendState) { let key = (self.env_id, self.id, tx_id.to_owned()); if let Some(mut tx) = self.transactions.get_mut(tx_id) { if let Err(e) = TransactionTracker::write_status(&self.state, &key, status) { @@ -160,10 +159,10 @@ impl ExecutionContext { } } - pub fn remove_tx_tracker(&self, tx_id: String) { + pub fn remove_tx_tracker(&self, tx_id: Arc) { let _ = self.transactions.remove(&tx_id); if let Err(e) = - TransactionTracker::delete(&self.state, &(self.env_id, self.id, tx_id.to_owned())) + TransactionTracker::delete(&self.state, &(self.env_id, self.id, tx_id.clone())) { error!( "cannon {}.{} failed to delete transaction {tx_id}: {e:?}", @@ -175,28 +174,35 @@ impl ExecutionContext { /// Execute an authorization on the source's compute target async fn execute_auth( &self, - tx_id: String, + tx_id: Arc, auth: Arc, query_path: &str, - events: TransactionStatusSender, - ) -> Result<(), (String, CannonError)> { - events.send(TransactionStatusEvent::ExecuteQueued); + ) -> Result<(), (Arc, CannonError)> { + TransactionEvent::AuthorizationReceived { + authorization: Arc::clone(&auth), + } + .with_cannon_ctx(self, tx_id.clone()) + .emit(self); match self .source .compute - .execute(self, query_path, &tx_id, &auth, &events) + .execute(self, query_path, &tx_id, &auth) .await { // Can't execute the auth if no agents are available. // The transaction task will handle re-appending the auth. 
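The comparator pair built in `cp_version` in the agent_version.rs hunk above pins agents to the controlplane's `major.minor` series. For a hypothetical 0.2.x controlplane, the programmatically built requirement behaves like the parsed form below (a sketch using the `semver` crate that this changeset adds as a workspace dependency):

    use semver::{Version, VersionReq};

    fn main() {
        // Equivalent of the generated comparators for a 0.2.x build: >=0.2.0, <0.3
        let req = VersionReq::parse(">=0.2.0, <0.3").unwrap();
        assert!(req.matches(&Version::parse("0.2.0").unwrap()));
        assert!(req.matches(&Version::parse("0.2.9").unwrap()));
        assert!(!req.matches(&Version::parse("0.3.0").unwrap())); // next minor is rejected
        assert!(!req.matches(&Version::parse("0.1.9").unwrap())); // older minor is rejected
    }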
Err(CannonError::Source(SourceError::NoAvailableAgents(_))) => { - events.send(TransactionStatusEvent::ExecuteAwaitingCompute); + TransactionEvent::ExecuteAwaitingCompute + .with_cannon_ctx(self, tx_id.clone()) + .emit(self); Ok(()) } Err(e) => { // reset the transaction status to authorized so it can be re-executed self.write_tx_status(&tx_id, TransactionSendState::Authorized); - events.send(TransactionStatusEvent::ExecuteFailed(e.to_string())); + TransactionEvent::ExecuteFailed(e.to_string()) + .with_cannon_ctx(self, tx_id.clone()) + .emit(self); Err((tx_id, e)) } res => res.map_err(|e| (tx_id, e)), @@ -207,8 +213,8 @@ impl ExecutionContext { async fn fire_tx( &self, sink_pipe: Option>, - tx_id: String, - ) -> Result { + tx_id: Arc, + ) -> Result, CannonError> { let latest_height = self .state .get_env_block_info(self.env_id) @@ -216,7 +222,7 @@ impl ExecutionContext { // ensure transaction is being tracked let Some(tracker) = self.transactions.get(&tx_id).map(|v| v.value().clone()) else { - return Err(CannonError::TransactionLost(self.id, tx_id)); + return Err(CannonError::TransactionLost(self.id, tx_id.to_string())); }; // ensure transaction is ready to be broadcasted if !matches!( @@ -225,7 +231,7 @@ impl ExecutionContext { ) { return Err(CannonError::InvalidTransactionState( self.id, - tx_id, + tx_id.to_string(), format!( "expected unsent or broadcasted, got {}", tracker.status.label() @@ -235,7 +241,7 @@ impl ExecutionContext { // ensure transaction blob exists let Some(tx_blob) = tracker.transaction else { - return Err(CannonError::TransactionLost(self.id, tx_id)); + return Err(CannonError::TransactionLost(self.id, tx_id.to_string())); }; let tx_str = match serde_json::to_string(&tx_blob) { @@ -270,11 +276,19 @@ impl ExecutionContext { let network = self.network; // update the transaction status and increment the broadcast attempts - let update_status = || { + let update_status = |agent: Option| { self.write_tx_status( &tx_id, TransactionSendState::Broadcasted(latest_height, Utc::now()), ); + let mut ev = TransactionEvent::Broadcasted { + height: latest_height, + timestamp: Utc::now(), + } + .with_cannon_ctx(self, Arc::clone(&tx_id)); + ev.agent = agent; + ev.emit(self); + if let Err(e) = TransactionTracker::inc_attempts( &self.state, &(env_id, cannon_id, tx_id.to_owned()), @@ -298,7 +312,7 @@ impl ExecutionContext { continue; } - update_status(); + update_status(agent); return Ok(tx_id); } @@ -342,7 +356,7 @@ impl ExecutionContext { } } - update_status(); + update_status(None); return Ok(tx_id); } } @@ -360,3 +374,23 @@ impl ExecutionContext { Ok(tx_id) } } + +impl<'a> GetGlobalState<'a> for &'a ExecutionContext { + fn global_state(self) -> &'a GlobalState { + &self.state + } +} + +pub trait CtxEventHelper { + fn with_cannon_ctx(self, ctx: &ExecutionContext, transaction: Arc) -> Event; +} + +impl> CtxEventHelper for T { + fn with_cannon_ctx(self, ctx: &ExecutionContext, transaction: Arc) -> Event { + let mut event = self.into(); + event.cannon = Some(ctx.id); + event.env = Some(ctx.env_id); + event.transaction = Some(transaction); + event + } +} diff --git a/crates/controlplane/src/cannon/error.rs b/crates/controlplane/src/cannon/error.rs index b80fcc6f..a86f3297 100644 --- a/crates/controlplane/src/cannon/error.rs +++ b/crates/controlplane/src/cannon/error.rs @@ -1,4 +1,4 @@ -use std::path::PathBuf; +use std::{path::PathBuf, sync::Arc}; use axum::http::StatusCode; use serde::{ser::SerializeStruct, Serialize, Serializer}; @@ -12,7 +12,6 @@ use snops_common::{ use 
strum_macros::AsRefStr; use thiserror::Error; -use super::status::TransactionStatusSender; use crate::{env::error::EnvRequestError, error::StateError}; #[derive(Debug, Error, AsRefStr)] @@ -170,12 +169,12 @@ pub enum CannonError { #[error("send `auth` error for cannon `{0}`: {1}")] SendAuthError( CannonId, - #[source] tokio::sync::mpsc::error::SendError<(String, TransactionStatusSender)>, + #[source] tokio::sync::mpsc::error::SendError>, ), #[error("send `tx` error for cannon `{0}`: {1}")] SendTxError( CannonId, - #[source] tokio::sync::mpsc::error::SendError, + #[source] tokio::sync::mpsc::error::SendError>, ), #[error(transparent)] DatabaseWriteError(#[from] DatabaseError), diff --git a/crates/controlplane/src/cannon/mod.rs b/crates/controlplane/src/cannon/mod.rs index 951bbc20..f70b04ee 100644 --- a/crates/controlplane/src/cannon/mod.rs +++ b/crates/controlplane/src/cannon/mod.rs @@ -5,7 +5,6 @@ mod net; pub mod router; pub mod sink; pub mod source; -pub mod status; pub mod tracker; use std::{ @@ -19,11 +18,10 @@ use std::{ use context::ExecutionContext; use dashmap::DashMap; use snops_common::{ - aot_cmds::{AotCmd, Authorization}, + aot_cmds::AotCmd, format::PackedUint, - state::{CannonId, EnvId, NetworkId, StorageId}, + state::{Authorization, CannonId, EnvId, NetworkId, StorageId, TransactionSendState}, }; -use status::{TransactionSendState, TransactionStatusSender}; use tokio::{ sync::{ mpsc::{UnboundedReceiver, UnboundedSender}, @@ -101,19 +99,19 @@ pub struct CannonInstance { child: Option, /// channel to send transaction ids to the the task - pub(crate) tx_sender: UnboundedSender, + pub(crate) tx_sender: UnboundedSender>, /// channel to send authorizations (by transaction id) to the the task - pub(crate) auth_sender: UnboundedSender<(String, TransactionStatusSender)>, + pub(crate) auth_sender: UnboundedSender>, /// transaction ids that are currently being processed - pub(crate) transactions: Arc>, + pub(crate) transactions: Arc, TransactionTracker>>, pub(crate) received_txs: Arc, pub(crate) fired_txs: Arc, } pub struct CannonReceivers { - transactions: UnboundedReceiver, - authorizations: UnboundedReceiver<(String, TransactionStatusSender)>, + transactions: UnboundedReceiver>, + authorizations: UnboundedReceiver>, } pub type CannonInstanceMeta = (EnvId, NetworkId, StorageId, PathBuf); @@ -127,11 +125,10 @@ impl CannonInstance { txs: &AtomicU64, ) -> u64 { let index = txs.fetch_add(1, std::sync::atomic::Ordering::Relaxed); - if let Err(e) = state - .db - .tx_index - .save(&(env_id, cannon_id, String::new()), &PackedUint(index)) - { + if let Err(e) = state.db.tx_index.save( + &(env_id, cannon_id, Arc::new(String::new())), + &PackedUint(index), + ) { error!("cannon {env_id}.{cannon_id} failed to save received tx count: {e}"); } index @@ -142,22 +139,23 @@ impl CannonInstance { state: &GlobalState, env_id: EnvId, cannon_id: CannonId, - ) -> (DashMap, AtomicU64) { + ) -> (DashMap, TransactionTracker>, AtomicU64) { let transactions = DashMap::new(); // Restore the received transaction count (empty string key for tx_index) - let received_txs = match state - .db - .tx_index - .restore(&(env_id, cannon_id, String::new())) - { - Ok(Some(index)) => AtomicU64::new(index.0), - Ok(None) => AtomicU64::new(0), - Err(e) => { - error!("cannon {env_id}.{cannon_id} failed to parse received tx count: {e}"); - AtomicU64::new(0) - } - }; + let received_txs = + match state + .db + .tx_index + .restore(&(env_id, cannon_id, Arc::new(String::new()))) + { + Ok(Some(index)) => AtomicU64::new(index.0), + 
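Throughout these hunks, transaction ids become `Arc<String>` in channels, trackers, and database keys, so an id is allocated once and then reference-counted instead of deep-cloned at every hop. A small sketch of the pattern (hypothetical map, not the real tracker; assumes the `dashmap` crate):

    use std::sync::Arc;
    use dashmap::DashMap;

    fn main() {
        let txs: DashMap<Arc<String>, u32> = DashMap::new();
        let id = Arc::new(String::from("at1example"));

        // Cloning an Arc bumps a refcount; the string itself is never copied.
        txs.insert(Arc::clone(&id), 1);
        let for_channel = Arc::clone(&id); // the same cheap clone used for channel sends
        assert_eq!(*txs.get(&for_channel).unwrap(), 1);
    }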
Ok(None) => AtomicU64::new(0), + Err(e) => { + error!("cannon {env_id}.{cannon_id} failed to parse received tx count: {e}"); + AtomicU64::new(0) + } + }; let statuses = match state.db.tx_status.read_with_prefix(&(env_id, cannon_id)) { Ok(statuses) => statuses, @@ -369,10 +367,10 @@ impl CannonInstance { /// to the desired sink pub fn proxy_broadcast( &self, - tx_id: String, + tx_id: Arc, body: serde_json::Value, ) -> Result<(), CannonError> { - let key = (self.env_id, self.id, tx_id.to_owned()); + let key = (self.env_id, self.id, Arc::clone(&tx_id)); // if the transaction is in the cache, it has already been broadcasted if let Some(cache) = self.global_state.env_network_cache.get(&self.env_id) { @@ -383,7 +381,10 @@ impl CannonInstance { self.env_id, self.id ); } - return Err(CannonError::TransactionAlreadyExists(self.id, tx_id)); + return Err(CannonError::TransactionAlreadyExists( + self.id, + tx_id.to_string(), + )); } } @@ -391,7 +392,10 @@ impl CannonInstance { let tracker = if let Some(mut tx) = self.transactions.get(&tx_id).as_deref().cloned() { // if we receive a transaction that is not executing, it is a duplicate if !matches!(tx.status, TransactionSendState::Executing(_)) { - return Err(CannonError::TransactionAlreadyExists(self.id, tx_id)); + return Err(CannonError::TransactionAlreadyExists( + self.id, + tx_id.to_string(), + )); } // clear attempts (as this was a successful execute) @@ -438,11 +442,7 @@ impl CannonInstance { } /// Called by axum to forward /cannon//auth to a listen source - pub async fn proxy_auth( - &self, - body: Authorization, - events: TransactionStatusSender, - ) -> Result { + pub async fn proxy_auth(&self, body: Authorization) -> Result, CannonError> { let Some(storage) = self .global_state .get_env(self.env_id) @@ -484,16 +484,19 @@ impl CannonInstance { transaction: None, status: TransactionSendState::Authorized, }; + + let tx_id = Arc::new(tx_id); + // write the transaction to the store to prevent data loss tracker.write( &self.global_state, - &(self.env_id, self.id, tx_id.to_owned()), + &(self.env_id, self.id, Arc::clone(&tx_id)), )?; - self.transactions.insert(tx_id.to_owned(), tracker); + self.transactions.insert(Arc::clone(&tx_id), tracker); trace!("cannon {}.{} received auth {tx_id}", self.env_id, self.id); self.auth_sender - .send((tx_id.to_owned(), events)) + .send(Arc::clone(&tx_id)) .map_err(|e| CannonError::SendAuthError(self.id, e))?; Ok(tx_id) diff --git a/crates/controlplane/src/cannon/router.rs b/crates/controlplane/src/cannon/router.rs index 20c3576e..14cf8e96 100644 --- a/crates/controlplane/src/cannon/router.rs +++ b/crates/controlplane/src/cannon/router.rs @@ -1,4 +1,4 @@ -use std::{str::FromStr, time::Duration}; +use std::{str::FromStr, sync::Arc, time::Duration}; use axum::{ extract::{Path, Query, State}, @@ -11,11 +11,10 @@ use serde::Deserialize; use serde_json::json; use snops_common::{ key_source::KeySource, - state::{id_or_none, KeyState, NetworkId}, + state::{id_or_none, Authorization, KeyState, NetworkId}, }; -use tokio::sync::mpsc; -use super::{source::QueryTarget, status::TransactionStatusSender, Authorization}; +use super::source::QueryTarget; use crate::{ server::{actions::execute::execute_status, error::ServerError}, state::AppState, @@ -324,7 +323,7 @@ async fn transaction( return ServerError::BadRequest("body missing transaction ID".to_owned()).into_response(); }; - match cannon.proxy_broadcast(tx_id, body.take()) { + match cannon.proxy_broadcast(Arc::new(tx_id), body.take()) { Ok(_) => StatusCode::OK.into_response(), 
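The authorization route in the router hunk below replaces the per-request mpsc channel with `subscribe_on(TransactionIs(..) & EnvIs(..) & CannonIs(..))`, composing filters with `&`. A minimal sketch of that combinator style (illustrative names, not the real snops-common `EventFilter` API):

    use std::ops::BitAnd;

    enum Filter {
        EnvIs(u32),
        CannonIs(u32),
        And(Box<Filter>, Box<Filter>),
    }

    impl BitAnd for Filter {
        type Output = Filter;
        fn bitand(self, rhs: Filter) -> Filter {
            Filter::And(Box::new(self), Box::new(rhs))
        }
    }

    struct Event { env: u32, cannon: u32 }

    impl Filter {
        fn matches(&self, e: &Event) -> bool {
            match self {
                Filter::EnvIs(id) => e.env == *id,
                Filter::CannonIs(id) => e.cannon == *id,
                // An And node matches only when both sides match.
                Filter::And(a, b) => a.matches(e) && b.matches(e),
            }
        }
    }

    fn main() {
        let f = Filter::EnvIs(1) & Filter::CannonIs(2);
        assert!(f.matches(&Event { env: 1, cannon: 2 }));
        assert!(!f.matches(&Event { env: 1, cannon: 3 }));
    }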
Err(e) => ServerError::from(e).into_response(), } @@ -362,22 +361,20 @@ async fn authorization( }; if query.is_async() { - return match cannon - .proxy_auth(body, TransactionStatusSender::empty()) - .await - { + return match cannon.proxy_auth(body).await { Ok(tx_id) => (StatusCode::ACCEPTED, Json(tx_id)).into_response(), Err(e) => ServerError::from(e).into_response(), }; } - let (tx, rx) = mpsc::channel(10); - - match cannon - .proxy_auth(body, TransactionStatusSender::new(tx)) - .await - { - Ok(tx_id) => execute_status(tx_id, rx).await.into_response(), + match cannon.proxy_auth(body).await { + Ok(tx_id) => { + use snops_common::events::EventFilter::*; + let subscriber = state + .events + .subscribe_on(TransactionIs(tx_id.clone()) & EnvIs(env_id) & CannonIs(cannon_id)); + execute_status(tx_id, subscriber).await.into_response() + } Err(e) => ServerError::from(e).into_response(), } } diff --git a/crates/controlplane/src/cannon/source.rs b/crates/controlplane/src/cannon/source.rs index 8bf46d1e..21878b0a 100644 --- a/crates/controlplane/src/cannon/source.rs +++ b/crates/controlplane/src/cannon/source.rs @@ -3,19 +3,20 @@ use std::sync::Arc; use chrono::Utc; use serde::{Deserialize, Serialize}; use serde_json::{json, Value}; -use snops_common::{ - aot_cmds::Authorization, lasso::Spur, node_targets::NodeTargets, state::NetworkId, INTERN, -}; +use snops_common::events::{EventHelpers, TransactionEvent}; +use snops_common::state::{Authorization, TransactionSendState}; +use snops_common::{lasso::Spur, node_targets::NodeTargets, state::NetworkId, INTERN}; use tracing::error; +use super::context::CtxEventHelper; use super::{ error::{CannonError, SourceError}, net::get_available_port, - status::{TransactionSendState, TransactionStatusEvent, TransactionStatusSender}, tracker::TransactionTracker, ExecutionContext, }; use crate::env::set::find_compute_agent; +use crate::state::EmitEvent; /// Represents an instance of a local query service. #[derive(Clone, Debug, Serialize, Deserialize)] @@ -29,7 +30,7 @@ pub struct LocalService { /// if the node is out of sync, it will corrupt the ledger... 
/// /// requires cannon to have an associated env_id - #[serde(skip_serializing_if = "Option::is_none")] + #[serde(default, skip_serializing_if = "Option::is_none")] pub sync_from: Option, } @@ -153,9 +154,8 @@ impl ComputeTarget { &self, ctx: &ExecutionContext, query_path: &str, - tx_id: &str, + tx_id: &Arc, auth: &Authorization, - events: &TransactionStatusSender, ) -> Result<(), CannonError> { match self { ComputeTarget::Agent { labels } => { @@ -165,7 +165,10 @@ impl ComputeTarget { .ok_or(SourceError::NoAvailableAgents("authorization"))?; // emit status updates & increment attempts - events.send(TransactionStatusEvent::Executing(agent_id)); + TransactionEvent::Executing + .with_cannon_ctx(ctx, Arc::clone(tx_id)) + .with_agent_id(agent_id) + .emit(ctx); ctx.write_tx_status(tx_id, TransactionSendState::Executing(Utc::now())); if let Err(e) = TransactionTracker::inc_attempts( &ctx.state, @@ -191,9 +194,12 @@ impl ComputeTarget { let transaction = match serde_json::from_str::>(&transaction_json) { Ok(transaction) => transaction, Err(e) => { - events.send(TransactionStatusEvent::ExecuteFailed(format!( - "failed to parse transaction JSON: {transaction_json}", - ))); + TransactionEvent::ExecuteFailed(format!( + "failed to parse transaction JSON: {e}\n{transaction_json}" + )) + .with_cannon_ctx(ctx, Arc::clone(tx_id)) + .with_agent_id(agent_id) + .emit(ctx); return Err(CannonError::Source(SourceError::Json( "parse compute tx", e, @@ -235,7 +241,12 @@ impl ComputeTarget { tx.status = TransactionSendState::Unsent; tx.transaction = Some(Arc::clone(&transaction)); } - events.send(TransactionStatusEvent::ExecuteComplete(transaction)); + TransactionEvent::ExecuteComplete { + transaction: Arc::clone(&transaction), + } + .with_cannon_ctx(ctx, Arc::clone(tx_id)) + .with_agent_id(agent_id) + .emit(ctx); Ok(()) } @@ -266,39 +277,3 @@ impl ComputeTarget { } } } - -// I use this to generate example yaml... 
-/* #[cfg(test)] -mod test { - use super::*; - use crate::{ - cannon::source::{ComputeTarget, CreditsTxMode, LocalService, TxMode}, - schema::nodes::KeySource, - }; - use std::str::FromStr; - - #[test] - fn what_does_it_look_like() { - println!( - "{}", - serde_yaml::to_string(&TxSource::Playback { - file_name: "test".to_string(), - }) - .unwrap() - ); - println!( - "{}", - serde_yaml::to_string(&TxSource::RealTime { - query: QueryTarget::Local(LocalService { sync_from: None }), - compute: ComputeTarget::Agent { labels: None }, - tx_modes: [TxMode::Credits(CreditsTxMode::TransferPublic)] - .into_iter() - .collect(), - private_keys: vec![KeySource::from_str("committee.$").unwrap()], - addresses: vec![KeySource::from_str("committee.$").unwrap()], - }) - .unwrap() - ); - } -} - */ diff --git a/crates/controlplane/src/cannon/tracker.rs b/crates/controlplane/src/cannon/tracker.rs index f954de2f..04a89305 100644 --- a/crates/controlplane/src/cannon/tracker.rs +++ b/crates/controlplane/src/cannon/tracker.rs @@ -1,8 +1,11 @@ use std::sync::Arc; -use snops_common::{aot_cmds::Authorization, format::PackedUint}; +use snops_common::{ + format::PackedUint, + state::{Authorization, TransactionSendState}, +}; -use super::{error::CannonError, status::TransactionSendState}; +use super::error::CannonError; use crate::{db::TxEntry, state::GlobalState}; #[derive(Debug, Clone)] diff --git a/crates/controlplane/src/db.rs b/crates/controlplane/src/db.rs index 1ea705e4..b9600c17 100644 --- a/crates/controlplane/src/db.rs +++ b/crates/controlplane/src/db.rs @@ -1,19 +1,17 @@ -use std::path::Path; +use std::{path::Path, sync::Arc}; use snops_common::{ - aot_cmds::Authorization, db::{error::DatabaseError, tree::DbTree, Database as DatabaseTrait}, format::PackedUint, - state::{AgentId, CannonId, EnvId, NetworkId, StorageId}, + state::{AgentId, Authorization, CannonId, EnvId, NetworkId, StorageId, TransactionSendState}, }; use crate::{ - cannon::status::TransactionSendState, persist::{PersistEnv, PersistStorage}, state::Agent, }; -pub type TxEntry = (EnvId, CannonId, String); +pub type TxEntry = (EnvId, CannonId, Arc); pub struct Database { #[allow(unused)] @@ -40,7 +38,6 @@ pub struct Database { pub(crate) tx_index: DbTree, /// Number of attempts for the transaction's current state pub(crate) tx_attempts: DbTree, - // TODO: tx_attempts for tracking retries (of broadcast and execution) } impl DatabaseTrait for Database { diff --git a/crates/controlplane/src/env/cache.rs b/crates/controlplane/src/env/cache.rs index 0272dcbc..3a954837 100644 --- a/crates/controlplane/src/env/cache.rs +++ b/crates/controlplane/src/env/cache.rs @@ -177,6 +177,11 @@ impl NetworkCache { self.transaction_to_block_hash.contains_key(tx_id) } + /// Find a block hash given a transaction id + pub fn find_transaction(&self, tx_id: &str) -> Option<&ABlockHash> { + self.transaction_to_block_hash.get(tx_id) + } + /// Check if the latest stored info is within the range of the provided /// height pub fn is_recent_block(&self, height: u32) -> bool { diff --git a/crates/controlplane/src/env/error.rs b/crates/controlplane/src/env/error.rs index fdc9ee84..a97dfa98 100644 --- a/crates/controlplane/src/env/error.rs +++ b/crates/controlplane/src/env/error.rs @@ -13,7 +13,6 @@ use tokio::task::JoinError; use crate::{ cannon::error::{AuthorizeError, CannonError}, schema::error::{SchemaError, StorageError}, - state::error::BatchReconcileError, }; #[derive(Debug, Error, AsRefStr)] @@ -63,8 +62,6 @@ pub enum ExecutionError { Cannon(#[from] CannonError), 
#[error(transparent)] Join(#[from] JoinError), - #[error(transparent)] - Reconcile(#[from] BatchReconcileError), #[error("env `{0}` timeline `{1}` not found")] TimelineNotFound(EnvId, TimelineId), #[error("env timeline is already being executed")] @@ -79,7 +76,6 @@ pub enum ExecutionError { impl_into_status_code!(ExecutionError, |value| match value { Cannon(e) => e.into(), - Reconcile(e) => e.into(), Storage(e) => e.into(), _ => StatusCode::INTERNAL_SERVER_ERROR, }); @@ -190,8 +186,6 @@ impl_into_status_code!(CleanupError, |_| StatusCode::NOT_FOUND); #[derive(Debug, Error, AsRefStr)] pub enum ReconcileError { - #[error(transparent)] - Batch(#[from] BatchReconcileError), #[error("env `{0}` not found")] EnvNotFound(EnvId), #[error("expected internal agent peer for node with key {key}")] @@ -199,7 +193,6 @@ pub enum ReconcileError { } impl_into_status_code!(ReconcileError, |value| match value { - Batch(e) => e.into(), EnvNotFound(_) | ExpectedInternalAgentPeer { .. } => StatusCode::NOT_FOUND, }); diff --git a/crates/controlplane/src/env/mod.rs b/crates/controlplane/src/env/mod.rs index 0f9775c1..3da59028 100644 --- a/crates/controlplane/src/env/mod.rs +++ b/crates/controlplane/src/env/mod.rs @@ -6,13 +6,15 @@ use std::{ use bimap::BiMap; use dashmap::DashMap; +use futures_util::future::join_all; use indexmap::{map::Entry, IndexMap, IndexSet}; use serde::{Deserialize, Serialize}; use snops_common::{ - api::EnvInfo, + api::{AgentEnvInfo, EnvInfo}, node_targets::NodeTargets, state::{ - AgentId, AgentPeer, AgentState, CannonId, EnvId, NetworkId, NodeKey, NodeState, TxPipeId, + AgentId, AgentPeer, AgentState, CannonId, EnvId, NetworkId, NodeKey, NodeState, + ReconcileOptions, TxPipeId, }, }; use tokio::sync::Semaphore; @@ -37,11 +39,9 @@ use crate::{ state::{Agent, GlobalState}, }; +pub mod cache; pub mod error; -mod reconcile; pub mod set; -pub use reconcile::*; -pub mod cache; #[derive(Debug)] pub struct Environment { @@ -108,17 +108,17 @@ impl Environment { .collect() } - /// Prepare a test. This will set the current test on the GlobalState. + /// Apply an environment spec. This will attempt to delegate the given node + /// configurations to available agents, or update existing agents with new + /// configurations. /// /// **This will error if the current env is not unset before calling to /// ensure tests are properly cleaned up.** - pub async fn prepare( + pub async fn apply( env_id: EnvId, documents: Vec, state: Arc, - ) -> Result { - state.prom_httpsd.lock().await.set_dirty(); - + ) -> Result, EnvError> { let prev_env = state.get_env(env_id); let mut storage_doc = None; @@ -178,6 +178,7 @@ impl Environment { // maps of states and peers that are new to this environment let mut incoming_states = IndexMap::default(); + let mut updated_states = IndexMap::::default(); let mut incoming_peers = BiMap::default(); // set of resolved keys that will be present (new and old) @@ -205,13 +206,18 @@ impl Environment { // nodes in flattened_nodes have replicas unset doc_node.replicas.take(); - // TODO: compare existing agent state with old node state - // where the agent state is the same, insert the new state - // otherwise keep the old state + // replace the key with a new one + let mut node = doc_node.to_owned(); + if let Some(key) = node.key.as_mut() { + *key = key.with_index(i); + } // Skip delegating nodes that are already present in the node map + // Agents are able to determine what updates need to be applied + // based on their resolved node states. 
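In `update_all_agents` below, the only piece of old node state deliberately carried forward is the height counter, and only when the agent stays in the same env with an unchanged height request; every other transition resets it (and may trigger a ledger wipe). A distilled sketch of that rule (simplified stand-in types, not the real state structs):

    type EnvId = u32;

    #[derive(Clone, Copy, PartialEq)]
    enum HeightRequest { Top, Absolute(u32) }

    // (counter, request): the counter survives only if env and request both match.
    fn carry_height(
        env: EnvId,
        old: Option<(EnvId, (u32, HeightRequest))>,
        next: (u32, HeightRequest),
    ) -> (u32, HeightRequest) {
        match old {
            Some((old_env, (counter, req))) if old_env == env && req == next.1 => (counter, next.1),
            _ => next, // new env, changed request, or fresh from inventory
        }
    }

    fn main() {
        let kept = carry_height(1, Some((1, (5, HeightRequest::Top))), (0, HeightRequest::Top));
        assert_eq!(kept.0, 5); // same env + same request: counter survives
        let reset = carry_height(1, Some((1, (5, HeightRequest::Top))), (0, HeightRequest::Absolute(10)));
        assert_eq!(reset.0, 0); // request changed: counter resets
    }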
if node_peers.contains_left(&node_key) { - info!("{env_id}: skipping node {node_key} - already configured"); + info!("{env_id}: updating node {node_key}"); + updated_states.insert(node_key, EnvNodeState::Internal(node)); continue; } @@ -219,14 +225,7 @@ impl Environment { Entry::Occupied(ent) => { Err(PrepareError::DuplicateNodeKey(ent.key().clone()))? } - Entry::Vacant(ent) => { - // replace the key with a new one - let mut node = doc_node.to_owned(); - if let Some(key) = node.key.as_mut() { - *key = key.with_index(i); - } - ent.insert(EnvNodeState::Internal(node)) - } + Entry::Vacant(ent) => ent.insert(EnvNodeState::Internal(node)), }; } } @@ -339,6 +338,7 @@ impl Environment { node_peers.extend(incoming_peers.into_iter()); node_states.extend(incoming_states.into_iter()); + node_states.extend(updated_states.into_iter()); } _ => warn!("ignored unimplemented document type"), @@ -378,6 +378,12 @@ impl Environment { .collect(), )?; + let storage_changed = prev_env + .as_ref() + .is_some_and(|prev| prev.storage.info() != storage.info()); + + let clear_last_height = prev_env.is_none() && !storage.persist; + let env = Arc::new(Environment { id: env_id, storage, @@ -400,64 +406,112 @@ impl Environment { agents_to_inventory.len() ); // reconcile agents that are freed up from the delta between environments - if let Err(e) = state - .reconcile_agents( + state + .update_agent_states( agents_to_inventory .into_iter() - .map(|id| (id, state.get_client(id), AgentState::Inventory)), + .map(|id| (id, AgentState::Inventory)), ) - .await - { - error!("an error occurred while attempting to inventory newly freed agents: {e}"); - } + .await; } - // reconcile the nodes - initial_reconcile(env_id, &state, prev_env.is_none()).await?; + // Emit state changes to all agents within this environment + env.update_all_agents( + &state, + ReconcileOptions { + refetch_info: storage_changed, + clear_last_height, + ..Default::default() + }, + ) + .await + } + + async fn update_all_agents( + &self, + state: &GlobalState, + opts: ReconcileOptions, + ) -> Result, EnvError> { + let mut pending_changes = vec![]; + let mut node_map = HashMap::new(); + + for entry in self.node_states.iter() { + let key = entry.key(); + let node = entry.value(); + let EnvNodeState::Internal(node) = node else { + continue; + }; + let Some(agent_id) = self.get_agent_by_key(key) else { + continue; + }; + let Some(agent) = state.pool.get(&agent_id) else { + continue; + }; + + let mut next_state = self.resolve_node_state(state, agent_id, key, node); + + // determine if this reconcile will reset the agent's height (and potentially + // trigger a ledger wipe) + match agent.state() { + // new environment -> reset height + AgentState::Node(old_env, _) if *old_env != self.id => {} + // height request is the same -> keep the height + AgentState::Node(_, prev_state) if prev_state.height.1 == next_state.height.1 => { + next_state.height.0 = prev_state.height.0; + } + // otherwise, reset height + AgentState::Node(_, _) => {} + // moving from inventory -> reset height + AgentState::Inventory => {} + } - Ok(env_id) + node_map.insert(next_state.node_key.clone(), agent_id); + + let agent_state = AgentState::Node(self.id, Box::new(next_state)); + pending_changes.push((agent_id, agent_state)); + } + + state.update_agent_states_opts(pending_changes, opts).await; + Ok(node_map) } pub async fn cleanup(id: EnvId, state: &GlobalState) -> Result<(), EnvError> { // clear the env state - info!("[env {id}] deleting persistence..."); + info!("{id}: Deleting persistence..."); 
let env = state.remove_env(id).ok_or(CleanupError::EnvNotFound(id))?; if let Err(e) = state.db.envs.delete(&id) { - error!("[env {id}] failed to delete env persistence: {e}"); + error!("{id}: Failed to delete env persistence: {e}"); } // TODO: write all of these values to a file before deleting them // cleanup cannon transaction trackers if let Err(e) = state.db.tx_attempts.delete_with_prefix(&id) { - error!("[env {id}] failed to delete env tx_attempts persistence: {e}"); + error!("{id}: Failed to delete env tx_attempts persistence: {e}"); } if let Err(e) = state.db.tx_auths.delete_with_prefix(&id) { - error!("[env {id}] failed to delete env tx_auths persistence: {e}"); + error!("{id}: Failed to delete env tx_auths persistence: {e}"); } if let Err(e) = state.db.tx_blobs.delete_with_prefix(&id) { - error!("[env {id}] failed to delete env tx_blobs persistence: {e}"); + error!("{id}: Failed to delete env tx_blobs persistence: {e}"); } if let Err(e) = state.db.tx_index.delete_with_prefix(&id) { - error!("[env {id}] failed to delete env tx_index persistence: {e}"); + error!("{id}: Failed to delete env tx_index persistence: {e}"); } if let Err(e) = state.db.tx_status.delete_with_prefix(&id) { - error!("[env {id}] failed to delete env tx_status persistence: {e}"); + error!("{id}: Failed to delete env tx_status persistence: {e}"); } if let Some(storage) = state.try_unload_storage(env.network, env.storage.id) { - info!("[env {id}] unloaded storage {}", storage.id); + info!("{id}: Unloaded storage {}", storage.id); } - trace!("[env {id}] marking prom as dirty"); - state.prom_httpsd.lock().await.set_dirty(); - - trace!("[env {id}] inventorying agents..."); + trace!("{id}: Inventorying agents..."); - if let Err(e) = state - .reconcile_agents( + state + .update_agent_states( env.node_peers .right_values() // find all agents associated with the env @@ -465,16 +519,13 @@ impl Environment { EnvPeer::Internal(id) => Some(*id), _ => None, }) - .map(|id| (id, state.get_client(id), AgentState::Inventory)) + .map(|id| (id, AgentState::Inventory)) // this collect is necessary because the iter sent to reconcile_agents // must be owned by this thread. 
Without this, the iter would hold a reference // to the env.node_peers.right_values(), which is NOT Send .collect::>(), ) - .await - { - error!("an error occurred while attempting to inventory newly freed agents: {e}"); - } + .await; Ok(()) } @@ -558,6 +609,105 @@ impl Environment { }) } + fn nodes_with_peer<'a>( + &'a self, + key: &'a NodeKey, + ) -> impl Iterator> { + self.node_states.iter().filter(move |s| { + // Only internal nodes can be agents + let EnvNodeState::Internal(node) = s.value() else { + return false; + }; + + // Ignore self-reference + if s.key() == key { + return false; + } + + // Only agents that reference the node are relevant + node.peers.matches(key) || node.validators.matches(key) + }) + } + + pub async fn update_peer_addr( + &self, + state: &GlobalState, + agent_id: AgentId, + is_port_change: bool, + is_ip_change: bool, + ) { + let Some(key) = self.get_node_key_by_agent(agent_id) else { + return; + }; + let pending_reconciles = self + .nodes_with_peer(key) + .filter_map(|ent| { + let EnvNodeState::Internal(env_node) = ent.value() else { + return None; + }; + + // Lookup agent and get current state + let agent_id = self.get_agent_by_key(ent.key())?; + + // If the port didn't change, we're not updating the agents' states + if !is_port_change { + return Some((agent_id, None)); + } + + let agent = state.pool.get(&agent_id)?; + + let AgentState::Node(env_id, node_state) = agent.state() else { + return None; + }; + + // Determine if the node's peers and validators have changed + let (peers, validators) = self.resolve_node_peers(&state.pool, agent_id, env_node); + if peers == node_state.peers && validators == node_state.validators { + return None; + } + + // Update the node's peers and validators + let mut new_state = node_state.clone(); + new_state.peers = peers; + new_state.validators = validators; + + Some((agent_id, Some(AgentState::Node(*env_id, new_state)))) + }) + .collect::>(); + + // Call the clear peer addr RPC for all agents that reference the node + if is_ip_change { + join_all(pending_reconciles.iter().filter_map(|(id, _)| { + let client = state.get_client(*id)?; + + Some(tokio::spawn(async move { + client.clear_peer_addr(agent_id).await + })) + })) + .await; + } + + // Update the agent states if there's a port change + if is_port_change { + state + .update_agent_states( + pending_reconciles + .into_iter() + .filter_map(|(id, state)| state.map(|s| (id, s))), + ) + .await; + + // Otherwise do a normal reconcile + } else { + state + .queue_many_reconciles( + pending_reconciles.into_iter().map(|(id, _)| id), + Default::default(), + ) + .await; + } + } + pub fn get_cannon(&self, id: CannonId) -> Option> { self.cannons.get(&id).cloned() } @@ -570,6 +720,13 @@ impl Environment { } } + pub fn agent_info(&self) -> AgentEnvInfo { + AgentEnvInfo { + network: self.network, + storage: self.storage.info(), + } + } + /// Resolve node's agent configuration given the context of the environment. 
pub fn resolve_node_state( &self, @@ -588,23 +745,34 @@ impl Environment { .map(|key| self.storage.lookup_keysource_pk(key)) .unwrap_or_default(); + (node_state.peers, node_state.validators) = self.resolve_node_peers(&state.pool, id, node); + + node_state + } + + pub fn resolve_node_peers( + &self, + pool: &DashMap, + id: AgentId, + node: &Node, + ) -> (Vec, Vec) { // a filter to exclude the current node from the list of peers let not_me = |agent: &AgentPeer| !matches!(agent, AgentPeer::Internal(candidate_id, _) if *candidate_id == id); // resolve the peers and validators from node targets - node_state.peers = self - .matching_nodes(&node.peers, &state.pool, PortType::Node) + let mut peers: Vec<_> = self + .matching_nodes(&node.peers, pool, PortType::Node) .filter(not_me) .collect(); - node_state.peers.sort(); + peers.sort(); - node_state.validators = self - .matching_nodes(&node.validators, &state.pool, PortType::Bft) + let mut validators: Vec<_> = self + .matching_nodes(&node.validators, pool, PortType::Bft) .filter(not_me) .collect(); - node_state.validators.sort(); + validators.sort(); - node_state + (peers, validators) } } diff --git a/crates/controlplane/src/env/reconcile.rs b/crates/controlplane/src/env/reconcile.rs deleted file mode 100644 index d5cb2132..00000000 --- a/crates/controlplane/src/env/reconcile.rs +++ /dev/null @@ -1,72 +0,0 @@ -use snops_common::state::{AgentState, EnvId}; -use tracing::error; - -use super::{error::*, EnvNodeState}; -use crate::{env::Environment, state::GlobalState}; - -/// Reconcile all associated nodes with their initial state. -pub async fn initial_reconcile( - env_id: EnvId, - state: &GlobalState, - is_new_env: bool, -) -> Result<(), EnvError> { - let mut pending_reconciliations = vec![]; - { - let env = state - .get_env(env_id) - .ok_or(ReconcileError::EnvNotFound(env_id))? 
- .clone(); - - for entry in env.node_states.iter() { - let key = entry.key(); - let node = entry.value(); - let EnvNodeState::Internal(node) = node else { - continue; - }; - - // get the internal agent ID from the node key - let id = env - .get_agent_by_key(key) - .ok_or_else(|| ReconcileError::ExpectedInternalAgentPeer { key: key.clone() })?; - - let mut node_state = env.resolve_node_state(state, id, key, node); - - // determine if this reconcile will reset the agent's height (and potentially - // trigger a ledger wipe) - if let Some(agent) = state.pool.get(&id) { - match agent.state() { - // new environment -> reset height - AgentState::Node(old_env, _) if *old_env != env_id => {} - // height request is the same -> keep the height - AgentState::Node(_, state) if state.height.1 == node_state.height.1 => { - node_state.height.0 = state.height.0; - } - // otherwise, reset height - AgentState::Node(_, _) => {} - // moving from inventory -> reset height - AgentState::Inventory => {} - } - } - - let agent_state = AgentState::Node(env_id, Box::new(node_state)); - - pending_reconciliations.push((id, state.get_client(id), agent_state)); - } - } - - if let Err(e) = state.reconcile_agents(pending_reconciliations).await { - // if this is a patch to an existing environment, avoid inventorying the agents - if !is_new_env { - return Err(ReconcileError::Batch(e).into()); - } - - error!("an error occurred on initial reconciliation, inventorying all agents: {e}"); - if let Err(e) = Environment::cleanup(env_id, state).await { - error!("an error occurred inventorying agents: {e}"); - } - - Err(ReconcileError::Batch(e).into()) - } else { - Ok(()) - } -} diff --git a/crates/controlplane/src/env/set.rs b/crates/controlplane/src/env/set.rs index 8259fcac..8f8ffe3e 100644 --- a/crates/controlplane/src/env/set.rs +++ b/crates/controlplane/src/env/set.rs @@ -207,15 +207,6 @@ pub fn pair_with_nodes( )]); } - // another optimization that could be made is to sort nodes based on the number - // of agents with the specific labels. this would be useful for when some - // agents have unique labels as well as other common labels and - // there are nodes asking for agents with either. - - // TODO: potential performance improvement by splitting this agent map up - // available modes eg. 
client map, prover map, validator map, then pick by
-    // the key.ty
-
     // handle the nodes that want specific agents first
     let agent_map = agents.iter().map(|a| (a.id, a)).collect::<HashMap<_, _>>();
diff --git a/crates/controlplane/src/error.rs b/crates/controlplane/src/error.rs
index 17dd6bb3..89ad73f2 100644
--- a/crates/controlplane/src/error.rs
+++ b/crates/controlplane/src/error.rs
@@ -20,8 +20,6 @@ pub enum StateError {
     Agent(#[from] snops_common::prelude::error::AgentError),
     #[error("source agent has no addr id: `{0}`")]
     NoAddress(AgentId),
-    #[error(transparent)]
-    Reconcile(#[from] snops_common::prelude::error::ReconcileError),
     #[error("{0}")]
     Rpc(#[from] tarpc::client::RpcError),
     #[error("source agent not found id: `{0}`")]
@@ -32,7 +30,6 @@ impl_into_status_code!(StateError);
 
 impl_into_type_str!(StateError, |value| match value {
     Agent(e) => format!("{}.{}", value.as_ref(), e.as_ref()),
-    Reconcile(e) => format!("{}.{}", value.as_ref(), e.as_ref()),
     _ => value.as_ref().to_string(),
 });
diff --git a/crates/controlplane/src/events/mod.rs b/crates/controlplane/src/events/mod.rs
new file mode 100644
index 00000000..70f059fd
--- /dev/null
+++ b/crates/controlplane/src/events/mod.rs
@@ -0,0 +1,5 @@
+mod stream;
+pub use stream::*;
+
+#[cfg(test)]
+mod test_stream;
diff --git a/crates/controlplane/src/events/stream.rs b/crates/controlplane/src/events/stream.rs
new file mode 100644
index 00000000..f0d285ef
--- /dev/null
+++ b/crates/controlplane/src/events/stream.rs
@@ -0,0 +1,108 @@
+use std::{sync::Arc, task::Poll};
+
+use futures_util::Stream;
+use snops_common::events::{Event, EventFilter};
+use tokio::sync::broadcast::{self, error::TryRecvError};
+
+#[derive(Debug)]
+pub struct Events {
+    tx: broadcast::Sender<Arc<Event>>,
+}
+
+impl Events {
+    pub fn new() -> Self {
+        Self {
+            tx: broadcast::channel(1024).0,
+        }
+    }
+
+    pub fn emit(&self, event: Event) {
+        if self.tx.receiver_count() == 0 {
+            return;
+        }
+        // The only way this can fail is a receiver was dropped between the above check
+        // and this call...
+        let _ = self.tx.send(Arc::new(event));
+    }
+
+    pub fn subscribe(&self) -> EventSubscriber {
+        EventSubscriber {
+            rx: self.tx.subscribe(),
+            filter: EventFilter::Unfiltered,
+        }
+    }
+
+    pub fn subscribe_on(&self, filter: impl Into<EventFilter>) -> EventSubscriber {
+        EventSubscriber {
+            rx: self.tx.subscribe(),
+            filter: filter.into(),
+        }
+    }
+}
+
+impl Default for Events {
+    fn default() -> Self {
+        Self::new()
+    }
+}
+
+pub struct EventSubscriber {
+    rx: broadcast::Receiver<Arc<Event>>,
+    filter: EventFilter,
+}
+
+impl EventSubscriber {
+    pub async fn next(&mut self) -> Result<Arc<Event>, broadcast::error::RecvError> {
+        loop {
+            match self.rx.recv().await {
+                Ok(event) if event.matches(&self.filter) => break Ok(event),
+                // skip events that don't match the filter
+                Ok(_) => continue,
+                Err(e) => break Err(e),
+            }
+        }
+    }
+
+    pub fn collect_many(&mut self) -> Vec<Arc<Event>> {
+        let mut events = Vec::new();
+        loop {
+            match self.rx.try_recv() {
+                Ok(event) if event.matches(&self.filter) => events.push(event),
+                // skip events that don't match the filter
+                Ok(_) => continue,
+                Err(TryRecvError::Closed) => break,
+                Err(TryRecvError::Empty) => break,
+                Err(TryRecvError::Lagged(n)) => {
+                    tracing::warn!("{n} events dropped by a subscriber");
+                }
+            }
+        }
+        events
+    }
+
+    pub fn set_filter(&mut self, filter: impl Into<EventFilter>) {
+        self.filter = filter.into();
+    }
+}
+
+impl Stream for EventSubscriber {
+    type Item = Arc<Event>;
+
+    fn poll_next(
+        mut self: std::pin::Pin<&mut Self>,
+        _cx: &mut std::task::Context<'_>,
+    ) -> Poll<Option<Self::Item>> {
+        loop {
+            match self.rx.try_recv() {
+                Ok(event) if event.matches(&self.filter) => break Poll::Ready(Some(event)),
+                // skip events that don't match the filter
+                Ok(_) => continue,
+                Err(TryRecvError::Closed) => break Poll::Ready(None),
+                Err(TryRecvError::Empty) => break Poll::Pending,
+                Err(TryRecvError::Lagged(n)) => {
+                    tracing::warn!("{n} events dropped by a subscriber");
+                }
+            }
+        }
+    }
+}
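For orientation, a minimal usage sketch (editorial, not part of the diff): subscribers created via `subscribe_on` only receive events emitted after subscription, and `collect_many` drains whatever is buffered without awaiting. The filter variant and helper used here (`AgentIs`, `with_agent_id`) are the ones exercised by the test that follows; everything else is from `events/stream.rs` above.

```rust
use snops_common::events::{AgentEvent::*, EventFilter::*, EventHelpers};
use snops_common::state::AgentId;

fn sketch(events: &crate::events::Events, id: AgentId) {
    // Filtered subscriber: only events tagged with this agent id will match.
    let mut sub = events.subscribe_on(AgentIs(id));

    // `emit` is a cheap no-op when there are no subscribers (see above).
    events.emit(Disconnected.with_agent_id(id));

    // Drain without awaiting; lagged slots are logged and skipped.
    assert_eq!(sub.collect_many().len(), 1);
}
```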
diff --git a/crates/controlplane/src/events/test_stream.rs b/crates/controlplane/src/events/test_stream.rs
new file mode 100644
index 00000000..f19a8324
--- /dev/null
+++ b/crates/controlplane/src/events/test_stream.rs
@@ -0,0 +1,43 @@
+use std::str::FromStr;
+
+use lazy_static::lazy_static;
+use snops_common::events::{AgentEvent::*, EventFilter::*, EventHelpers, EventKindFilter::*};
+use snops_common::state::InternedId;
+
+use crate::events::Events;
+
+lazy_static! {
+    static ref A: InternedId = InternedId::from_str("a").unwrap();
+    static ref B: InternedId = InternedId::from_str("b").unwrap();
+    static ref C: InternedId = InternedId::from_str("c").unwrap();
+    static ref D: InternedId = InternedId::from_str("d").unwrap();
+}
+
+#[test]
+fn test_stream_filtering() {
+    let events = Events::new();
+
+    let mut sub_all = events.subscribe();
+    let mut sub_a = events.subscribe_on(AgentIs(*A));
+    let mut sub_b = events.subscribe_on(AgentIs(*B));
+    let mut sub_connected = events.subscribe_on(AgentConnected);
+
+    assert_eq!(sub_all.collect_many().len(), 0);
+    assert_eq!(sub_a.collect_many().len(), 0);
+    assert_eq!(sub_b.collect_many().len(), 0);
+    assert_eq!(sub_connected.collect_many().len(), 0);
+
+    events.emit(
+        Connected {
+            version: "0.0.0".to_string(),
+        }
+        .with_agent_id(*A),
+    );
+    events.emit(Disconnected.with_agent_id(*A));
+    events.emit(BlockInfo(Default::default()).with_agent_id(*B));
+
+    assert_eq!(sub_all.collect_many().len(), 3);
+    assert_eq!(sub_a.collect_many().len(), 2);
+    assert_eq!(sub_b.collect_many().len(), 1);
+    assert_eq!(sub_connected.collect_many().len(), 1);
+}
diff --git a/crates/controlplane/src/main.rs b/crates/controlplane/src/main.rs
index f3c99c03..6f4a65ba 100644
--- a/crates/controlplane/src/main.rs
+++ b/crates/controlplane/src/main.rs
@@ -10,11 +10,13 @@ use tokio::select;
 use tracing::{error, info, level_filters::LevelFilter, trace};
 use tracing_subscriber::{prelude::*, reload, EnvFilter};
 
+pub mod agent_version;
 pub mod cannon;
 pub mod cli;
 pub mod db;
 pub mod env;
 pub mod error;
+pub mod events;
 pub mod logging;
 pub mod persist;
 pub mod schema;
diff --git a/crates/controlplane/src/persist/env.rs b/crates/controlplane/src/persist/env.rs
index f48f8e2b..4c85889c 100644
--- a/crates/controlplane/src/persist/env.rs
+++ b/crates/controlplane/src/persist/env.rs
@@ -2,15 +2,14 @@ use std::sync::Arc;
 
 use bimap::BiMap;
 use dashmap::DashMap;
+use snops_common::state::TransactionSendState;
 use snops_common::state::{CannonId, EnvId, NetworkId, NodeKey, StorageId};
 use tokio::sync::Semaphore;
 
 use super::prelude::*;
 use super::PersistNode;
 use crate::{
-    cannon::{
-        sink::TxSink, source::TxSource, status::TransactionSendState, tracker::TransactionTracker,
-    },
+    cannon::{sink::TxSink, source::TxSource, tracker::TransactionTracker},
     env::{
         error::{EnvError, PrepareError},
         prepare_cannons, EnvNodeState, EnvPeer, Environment,
@@ -34,7 +33,6 @@ pub struct PersistEnv {
     /// List of nodes and their states or external node info
     pub nodes: Vec<(NodeKey, PersistNode)>,
     /// Loaded cannon configs in this env
-    /// TODO: persist cannon
    pub cannons: Vec<(CannonId, TxSource, TxSink)>,
 }
 
@@ -197,7 +195,7 @@ impl DataFormat for PersistEnv {
     type Header = PersistEnvFormatHeader;
     const LATEST_HEADER: Self::Header = PersistEnvFormatHeader {
         version: 1,
-        nodes: PersistNode::LATEST_HEADER, // TODO: use PersistNode::LATEST_HEADER
+        nodes: PersistNode::LATEST_HEADER,
         tx_source: TxSource::LATEST_HEADER,
         tx_sink: TxSink::LATEST_HEADER,
         network: NetworkId::LATEST_HEADER,
diff --git a/crates/controlplane/src/persist/node.rs b/crates/controlplane/src/persist/node.rs
index 5feff317..5cd638bc 100644
--- a/crates/controlplane/src/persist/node.rs
+++ b/crates/controlplane/src/persist/node.rs
@@ -90,7 +90,7 @@ mod tests {
     use snops_common::{
         format::DataFormat,
         node_targets::NodeTargets,
-        state::{DocHeightRequest, InternedId},
+        state::{HeightRequest, InternedId},
     };
 
     use crate::{
@@ -140,7 +140,7 @@
             online: true,
             replicas: None,
             key: None,
-            height:
DocHeightRequest::Top, + height: HeightRequest::Top, labels: Default::default(), agent: None, validators: NodeTargets::None, @@ -156,7 +156,7 @@ mod tests { online: true, replicas: None, key: None, - height: DocHeightRequest::Top, + height: HeightRequest::Top, labels: Default::default(), agent: None, validators: NodeTargets::None, diff --git a/crates/controlplane/src/persist/storage.rs b/crates/controlplane/src/persist/storage.rs index 030657d9..a3df3f37 100644 --- a/crates/controlplane/src/persist/storage.rs +++ b/crates/controlplane/src/persist/storage.rs @@ -1,12 +1,11 @@ use indexmap::IndexMap; -use snops_checkpoint::{CheckpointManager, RetentionPolicy}; +use snops_checkpoint::RetentionPolicy; use snops_common::{ binaries::BinaryEntry, - constant::LEDGER_BASE_DIR, key_source::ACCOUNTS_KEY_ID, state::{InternedId, NetworkId, StorageId}, }; -use tracing::{info, warn}; +use tracing::warn; use super::prelude::*; use crate::{ @@ -85,7 +84,7 @@ impl From<&LoadedStorage> for PersistStorage { version: storage.version, persist: storage.persist, accounts: storage.accounts.keys().cloned().collect(), - retention_policy: storage.checkpoints.as_ref().map(|c| c.policy().clone()), + retention_policy: storage.retention_policy.clone(), native_genesis: storage.native_genesis, binaries: storage.binaries.clone(), } @@ -100,20 +99,6 @@ impl PersistStorage { storage_path.push(id.to_string()); let committee_file = storage_path.join("committee.json"); - let checkpoints = self - .retention_policy - .map(|policy| { - CheckpointManager::load(storage_path.join(LEDGER_BASE_DIR), policy) - .map_err(StorageError::CheckpointManager) - }) - .transpose()?; - - if let Some(checkpoints) = &checkpoints { - info!("storage {id} checkpoint manager loaded {checkpoints}"); - } else { - info!("storage {id} loaded without a checkpoint manager"); - } - let mut accounts = IndexMap::new(); // load accounts json @@ -142,7 +127,7 @@ impl PersistStorage { version: self.version, persist: self.persist, committee: read_to_addrs(pick_commitee_addr, &committee_file).await?, - checkpoints, + retention_policy: self.retention_policy, native_genesis: self.native_genesis, accounts, binaries: self.binaries, diff --git a/crates/controlplane/src/schema/nodes.rs b/crates/controlplane/src/schema/nodes.rs index e94e1d31..da16b984 100644 --- a/crates/controlplane/src/schema/nodes.rs +++ b/crates/controlplane/src/schema/nodes.rs @@ -8,7 +8,7 @@ use snops_common::{ lasso::Spur, node_targets::NodeTargets, set::{MaskBit, MASK_PREFIX_LEN}, - state::{AgentId, DocHeightRequest, InternedId, NetworkId, NodeState}, + state::{AgentId, HeightRequest, InternedId, NetworkId, NodeState}, INTERN, }; @@ -181,7 +181,7 @@ pub struct Node { /// * When zero, the ledger is empty and only the genesis block is /// inherited. 
#[serde(default)] - pub height: DocHeightRequest, + pub height: HeightRequest, /// When specified, agents must have these labels #[serde( @@ -217,7 +217,7 @@ impl Node { NodeState { node_key, private_key: Default::default(), - height: (0, self.height.into()), + height: (0, self.height), online: self.online, env: self.env.clone(), binary: self.binary, @@ -252,7 +252,7 @@ impl Node { #[derive(Debug, Clone)] pub struct NodeFormatHeader { pub(crate) key_source: DataHeaderOf, - pub(crate) height_request: DataHeaderOf, + pub(crate) height_request: DataHeaderOf, pub(crate) node_targets: DataHeaderOf, pub has_binaries: bool, } @@ -285,7 +285,7 @@ impl DataFormat for NodeFormatHeader { } let key_source = KeySource::read_header(reader)?; - let height_request = DocHeightRequest::read_header(reader)?; + let height_request = HeightRequest::read_header(reader)?; let node_targets = NodeTargets::read_header(reader)?; Ok(NodeFormatHeader { key_source, @@ -300,7 +300,7 @@ impl DataFormat for Node { type Header = NodeFormatHeader; const LATEST_HEADER: Self::Header = NodeFormatHeader { key_source: KeySource::LATEST_HEADER, - height_request: DocHeightRequest::LATEST_HEADER, + height_request: HeightRequest::LATEST_HEADER, node_targets: NodeTargets::LATEST_HEADER, has_binaries: true, }; diff --git a/crates/controlplane/src/schema/storage/loaded.rs b/crates/controlplane/src/schema/storage/loaded.rs index 721e193e..8a2b3991 100644 --- a/crates/controlplane/src/schema/storage/loaded.rs +++ b/crates/controlplane/src/schema/storage/loaded.rs @@ -4,9 +4,9 @@ use futures_util::StreamExt; use indexmap::IndexMap; use rand::seq::IteratorRandom; use sha2::{Digest, Sha256}; -use snops_checkpoint::CheckpointManager; +use snops_checkpoint::RetentionPolicy; use snops_common::{ - api::{CheckpointMeta, StorageInfo}, + api::StorageInfo, binaries::{BinaryEntry, BinarySource}, key_source::KeySource, state::{InternedId, KeyState, NetworkId, StorageId}, @@ -33,7 +33,7 @@ pub struct LoadedStorage { /// other accounts files lookup pub accounts: IndexMap, /// storage of checkpoints - pub checkpoints: Option, + pub retention_policy: Option, /// whether agents using this storage should persist it pub persist: bool, /// whether to use the network's native genesis block @@ -146,23 +146,6 @@ impl LoadedStorage { } pub fn info(&self) -> StorageInfo { - let checkpoints = self - .checkpoints - .as_ref() - .map(|c| { - c.checkpoints() - .filter_map(|(c, path)| { - path.file_name() - .and_then(|s| s.to_str()) - .map(|filename| CheckpointMeta { - filename: filename.to_string(), - height: c.block_height, - timestamp: c.timestamp, - }) - }) - .collect() - }) - .unwrap_or_default(); let mut binaries: IndexMap<_, _> = self .binaries .iter() @@ -182,8 +165,7 @@ impl LoadedStorage { StorageInfo { id: self.id, version: self.version, - retention_policy: self.checkpoints.as_ref().map(|c| c.policy().clone()), - checkpoints, + retention_policy: self.retention_policy.clone(), persist: self.persist, native_genesis: self.native_genesis, binaries, diff --git a/crates/controlplane/src/schema/storage/mod.rs b/crates/controlplane/src/schema/storage/mod.rs index 0e7d4424..eb06cdd0 100644 --- a/crates/controlplane/src/schema/storage/mod.rs +++ b/crates/controlplane/src/schema/storage/mod.rs @@ -1,17 +1,12 @@ -use std::{ - ops::Deref, - path::PathBuf, - process::{ExitStatus, Stdio}, - sync::Arc, -}; +use std::{ops::Deref, path::PathBuf, process::Stdio, sync::Arc}; use indexmap::IndexMap; use serde::{Deserialize, Serialize}; -use snops_checkpoint::{CheckpointManager, 
RetentionPolicy}; +use snops_checkpoint::RetentionPolicy; use snops_common::{ aot_cmds::error::CommandError, binaries::{BinaryEntry, BinarySource}, - constant::{LEDGER_BASE_DIR, LEDGER_STORAGE_FILE, SNARKOS_GENESIS_FILE, VERSION_FILE}, + constant::{SNARKOS_GENESIS_FILE, VERSION_FILE}, key_source::ACCOUNTS_KEY_ID, state::{InternedId, NetworkId, StorageId}, }; @@ -74,7 +69,8 @@ pub struct StorageGeneration { pub transactions: Vec, } -// TODO: I don't know what this type should look like +// TODO: Convert this into a struct similar to the execute action, then use +// compute agents to assemble these on the fly #[derive(Deserialize, Debug, Clone, Serialize)] pub struct Transaction { pub file: PathBuf, @@ -87,7 +83,6 @@ pub struct Transaction { #[derive(Deserialize, Debug, Clone, Serialize)] #[serde(rename_all = "kebab-case")] pub struct GenesisGeneration { - // TODO: bonded balances mode, seed, genesis_key pub private_key: Option, pub seed: Option, pub additional_accounts: Option, @@ -151,10 +146,6 @@ impl Document { ) -> Result, SchemaError> { let id = self.id; - // todo: maybe update the loaded storage in global state if the hash - // of the storage document is different I guess... - // that might interfere with running tests, so I don't know - // add the prepared storage to the storage map if state.storage.contains_key(&(network, id)) { @@ -174,19 +165,19 @@ impl Document { // warn if an existing block/ledger already exists if exists { - warn!("the specified storage ID {id} already exists"); + warn!("The specified storage ID {id} already exists"); } let old_version = get_version_from_path(&version_file).await?; info!( - "storage {id} has version {old_version:?}. incoming version is {}", + "Storage {id} has version {old_version:?}. incoming version is {}", self.regen ); // wipe old storage when the version changes if old_version != Some(self.regen) && exists { - info!("storage {id} version changed, removing old storage"); + info!("Storage {id} version changed, removing old storage"); tokio::fs::remove_dir_all(&base) .await .map_err(|e| StorageError::RemoveStorage(version_file.clone(), e))?; @@ -212,7 +203,7 @@ impl Document { *p = canon } } - info!("resolved binary {id}: {entry}"); + info!("Resolved binary {id}: {entry}"); binaries.insert(id, entry); } @@ -232,7 +223,7 @@ impl Document { // generate the block and ledger if we have generation params if let (Some(generation), false) = (self.generate.as_ref(), exists) { - tracing::debug!("generating storage for {id}"); + tracing::debug!("Generating storage for {id}"); // generate the genesis block using the aot cli let output = base.join(SNARKOS_GENESIS_FILE); @@ -270,9 +261,7 @@ impl Document { .env("NETWORK", network.to_string()) .arg("genesis") .arg("--output") - .arg(&output) - .arg("--ledger") - .arg(base.join(LEDGER_BASE_DIR)); + .arg(&output); // conditional seed flag if let Some(seed) = genesis.seed { @@ -349,7 +338,7 @@ impl Document { .arg(balance.to_string()); } - info!("{command:?}"); + info!("Generating genesis for {id} with command: {command:?}"); let res = command .spawn() @@ -380,45 +369,6 @@ impl Document { } } - // tar the ledger so that it can be served to agents - // the genesis block is not compressed because it is already binary and might - // not be served independently - let ledger_exists = matches!( - tokio::fs::try_exists(base.join(LEDGER_BASE_DIR)).await, - Ok(true) - ); - let ledger_tar_exists = matches!( - tokio::fs::try_exists(base.join(LEDGER_STORAGE_FILE)).await, - Ok(true) - ); - - if ledger_exists && 
!ledger_tar_exists { - let mut child = Command::new("tar") - .current_dir(&base) - .arg("czf") - .arg(LEDGER_STORAGE_FILE) - .arg(LEDGER_BASE_DIR) - .kill_on_drop(true) - .spawn() - .map_err(|e| { - StorageError::Command(CommandError::action("spawning", "tar ledger", e), id) - })?; - - if !child - .wait() - .await - .as_ref() - .map(ExitStatus::success) - .unwrap_or(false) - { - error!("failed to compress ledger"); - } - - tokio::fs::try_exists(&base.join(LEDGER_STORAGE_FILE)) - .await - .map_err(|e| StorageError::FailedToTarLedger(id, e))?; - } - let mut accounts = IndexMap::new(); accounts.insert( *ACCOUNTS_KEY_ID, @@ -476,20 +426,6 @@ impl Document { .await .map_err(|e| StorageError::WriteVersion(version_file.clone(), e))?; - let checkpoints = self - .retention_policy - .map(|policy| { - CheckpointManager::load(base.join(LEDGER_BASE_DIR), policy) - .map_err(StorageError::CheckpointManager) - }) - .transpose()?; - - if let Some(checkpoints) = &checkpoints { - info!("storage {id} checkpoint manager loaded {checkpoints}"); - } else { - info!("storage {id} loaded without a checkpoint manager"); - } - let committee_file = base.join("committee.json"); // if the committee was specified in the generation params, use that @@ -531,7 +467,7 @@ impl Document { network, committee, accounts, - checkpoints, + retention_policy: self.retention_policy, persist: self.persist, native_genesis, binaries, diff --git a/crates/controlplane/src/server/actions/config.rs b/crates/controlplane/src/server/actions/config.rs index 777b714c..77aebab9 100644 --- a/crates/controlplane/src/server/actions/config.rs +++ b/crates/controlplane/src/server/actions/config.rs @@ -27,7 +27,7 @@ pub async fn config( #[allow(unused_variables)] match pending.entry($agent.id()) { Entry::Occupied(mut ent) => { - match ent.get_mut().2 { + match ent.get_mut().1 { AgentState::Inventory => (), AgentState::Node(_, ref mut n) => { $({ @@ -40,7 +40,6 @@ pub async fn config( Entry::Vacant(ent) => { ent.insert(( $agent.id(), - $agent.client_owned(), $agent.state().clone().map_node(|mut n| { $({ let $key = &mut n.$key; @@ -57,7 +56,6 @@ pub async fn config( for WithTargets { nodes, data } in configs { for agent in env.matching_agents(&nodes, &state.pool) { if let Some(h) = data.height { - let h = h.into(); set_node_field!(agent, height = (height.0 + 1, h)); } @@ -119,13 +117,6 @@ pub async fn config( let pending = pending.into_values().collect::>(); let node_map = pending_reconcile_node_map(pending.iter()); - let res = state - .reconcile_agents(pending) - .await - .map_err(ServerError::from); - - match res { - Ok(_) => Json(node_map).into_response(), - e => e.into_response(), - } + state.update_agent_states(pending).await; + Json(node_map).into_response() } diff --git a/crates/controlplane/src/server/actions/deploy.rs b/crates/controlplane/src/server/actions/deploy.rs index 20e5cd33..43175054 100644 --- a/crates/controlplane/src/server/actions/deploy.rs +++ b/crates/controlplane/src/server/actions/deploy.rs @@ -8,14 +8,13 @@ use axum::{ use http::StatusCode; use snops_common::{ action_models::DeployAction, - aot_cmds::{AotCmd, Authorization}, - state::KeyState, + aot_cmds::AotCmd, + state::{Authorization, KeyState}, }; -use tokio::sync::mpsc; use super::{execute::execute_status, Env}; use crate::{ - cannon::{error::AuthorizeError, router::AuthQuery, status::TransactionStatusSender}, + cannon::{error::AuthorizeError, router::AuthQuery}, env::{error::ExecutionError, Environment}, server::error::ServerError, state::GlobalState, @@ -28,33 +27,23 @@ 
pub async fn deploy(
     Json(action): Json<DeployAction>,
 ) -> Response {
     let query_addr = env.cannons.get(&action.cannon).map(|c| c.get_local_query());
+    let cannon_id = action.cannon;
 
     if query.is_async() {
-        return match deploy_inner(
-            &state,
-            action,
-            &env,
-            TransactionStatusSender::empty(),
-            query_addr,
-        )
-        .await
-        {
+        return match deploy_inner(&state, action, &env, query_addr).await {
             Ok(tx_id) => (StatusCode::ACCEPTED, Json(tx_id)).into_response(),
             Err(e) => ServerError::from(e).into_response(),
         };
     }
 
-    let (tx, rx) = mpsc::channel(10);
-    match deploy_inner(
-        &state,
-        action,
-        &env,
-        TransactionStatusSender::new(tx),
-        query_addr,
-    )
-    .await
-    {
-        Ok(tx_id) => execute_status(tx_id, rx).await.into_response(),
+    match deploy_inner(&state, action, &env, query_addr).await {
+        Ok(tx_id) => {
+            use snops_common::events::EventFilter::*;
+            let subscriber = state
+                .events
+                .subscribe_on(TransactionIs(tx_id.clone()) & EnvIs(env.id) & CannonIs(cannon_id));
+            execute_status(tx_id, subscriber).await.into_response()
+        }
         Err(e) => ServerError::from(e).into_response(),
     }
 }
@@ -63,9 +52,8 @@ pub async fn deploy_inner(
     state: &GlobalState,
     action: DeployAction,
     env: &Environment,
-    events: TransactionStatusSender,
     query: Option<String>,
-) -> Result<String, ExecutionError> {
+) -> Result<Arc<String>, ExecutionError> {
     let DeployAction {
         cannon: cannon_id,
         private_key,
@@ -111,6 +99,8 @@
         query.as_ref(),
         priority_fee,
         fee_record.as_ref(),
+        // use cost_v1 when we are not using the native genesis
+        !env.storage.native_genesis,
     )
     .await?;
 
@@ -119,7 +109,7 @@
         serde_json::from_str(&auth_str).map_err(AuthorizeError::Json)?;
 
     // proxy it to a listen cannon
-    let tx_id = cannon.proxy_auth(authorization, events).await?;
+    let tx_id = cannon.proxy_auth(authorization).await?;
 
     Ok(tx_id)
 }
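Both deploy above and execute (next file) now scope their synchronous status wait with the same composed filter. A sketch of that composition (editorial, not part of the diff), assuming, as the `&` usage above implies, that `EventFilter` implements `BitAnd` to intersect filters:

```rust
use std::sync::Arc;

use snops_common::events::EventFilter;
use snops_common::events::EventFilter::*;
use snops_common::state::{CannonId, EnvId};

// An event must match the transaction id, the environment, and the cannon
// for the subscriber in execute_status to see it.
fn status_filter(tx_id: Arc<String>, env_id: EnvId, cannon_id: CannonId) -> EventFilter {
    TransactionIs(tx_id) & EnvIs(env_id) & CannonIs(cannon_id)
}
```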
diff --git a/crates/controlplane/src/server/actions/execute.rs b/crates/controlplane/src/server/actions/execute.rs
index dfdbc0e2..854fb0cc 100644
--- a/crates/controlplane/src/server/actions/execute.rs
+++ b/crates/controlplane/src/server/actions/execute.rs
@@ -9,28 +9,26 @@ use http::StatusCode;
 use serde_json::json;
 use snops_common::{
     action_models::{AleoValue, ExecuteAction},
-    aot_cmds::{AotCmd, Authorization},
-    state::KeyState,
+    aot_cmds::AotCmd,
+    events::{Event, EventKind},
+    state::{Authorization, KeyState},
 };
-use tokio::{select, sync::mpsc};
+use tokio::select;
 
 use super::Env;
 use crate::{
-    cannon::{
-        error::AuthorizeError,
-        router::AuthQuery,
-        status::{TransactionStatusEvent, TransactionStatusSender},
-    },
+    cannon::{error::AuthorizeError, router::AuthQuery},
     env::{error::ExecutionError, Environment},
+    events::EventSubscriber,
     server::error::{ActionError, ServerError},
     state::GlobalState,
 };
 
 pub async fn execute_status(
-    tx_id: String,
-    mut rx: mpsc::Receiver<TransactionStatusEvent>,
+    tx_id: Arc<String>,
+    mut rx: EventSubscriber,
 ) -> Result<Json<serde_json::Value>, ActionError> {
-    use TransactionStatusEvent::*;
+    use snops_common::events::TransactionEvent::*;
 
     let mut timeout = Box::pin(tokio::time::sleep(std::time::Duration::from_secs(30)));
     let mut agent_id = None;
@@ -39,23 +37,35 @@
     loop {
         select! {
             _ = &mut timeout => {
-                return Err(ActionError::ExecuteStatusTimeout { tx_id, agent_id, retries });
+                return Err(ActionError::ExecuteStatusTimeout { tx_id: tx_id.to_string(), agent_id, retries });
             },
-            Some(msg) = rx.recv() => {
-                match msg {
-                    ExecuteAborted => {
-                        return Err(ActionError::ExecuteStatusAborted { tx_id, retries});
+            Ok(ev) = rx.next() => {
+                let Event { content: EventKind::Transaction(ev), agent, .. } = ev.as_ref() else {
+                    continue;
+                };
+
+                match ev {
+                    ExecuteAborted(reason) => {
+                        return Err(ActionError::ExecuteStatusAborted {
+                            tx_id: tx_id.to_string(),
+                            retries,
+                            reason: reason.clone(),
+                        });
                     },
-                    ExecuteFailed(msg) => {
-                        return Err(ActionError::ExecuteStatusFailed { message: msg, tx_id, retries });
+                    ExecuteFailed(message) => {
+                        return Err(ActionError::ExecuteStatusFailed {
+                            message: message.to_string(),
+                            tx_id: tx_id.to_string(),
+                            retries,
+                        });
                    },
-                    Executing(id) => {
-                        agent_id = Some(id.to_string());
+                    Executing => {
+                        agent_id = agent.map(|id| id.to_string());
                    },
                    ExecuteAwaitingCompute => {
                        retries += 1;
                    },
-                    ExecuteComplete(transaction) => {
+                    ExecuteComplete { transaction } => {
                        return Ok(Json(json!({
                            "agent_id": agent_id,
                            "retries": retries,
@@ -76,33 +86,23 @@ pub async fn execute(
     Json(action): Json<ExecuteAction>,
 ) -> Response {
     let query_addr = env.cannons.get(&action.cannon).map(|c| c.get_local_query());
+    let cannon_id = action.cannon;
 
     if query.is_async() {
-        return match execute_inner(
-            &state,
-            action,
-            &env,
-            TransactionStatusSender::empty(),
-            query_addr,
-        )
-        .await
-        {
+        return match execute_inner(&state, action, &env, query_addr).await {
             Ok(tx_id) => (StatusCode::ACCEPTED, Json(tx_id)).into_response(),
             Err(e) => ServerError::from(e).into_response(),
         };
     }
 
-    let (tx, rx) = mpsc::channel(10);
-    match execute_inner(
-        &state,
-        action,
-        &env,
-        TransactionStatusSender::new(tx),
-        query_addr,
-    )
-    .await
-    {
-        Ok(tx_id) => execute_status(tx_id, rx).await.into_response(),
+    match execute_inner(&state, action, &env, query_addr).await {
+        Ok(tx_id) => {
+            use snops_common::events::EventFilter::*;
+            let subscriber = state
+                .events
+                .subscribe_on(TransactionIs(tx_id.clone()) & EnvIs(env.id) & CannonIs(cannon_id));
+            execute_status(tx_id, subscriber).await.into_response()
+        }
         Err(e) => ServerError::from(e).into_response(),
     }
 }
@@ -111,9 +111,8 @@ pub async fn execute_inner(
     state: &GlobalState,
     action: ExecuteAction,
     env: &Environment,
-    events: TransactionStatusSender,
     query: Option<String>,
-) -> Result<String, ExecutionError> {
+) -> Result<Arc<String>, ExecutionError> {
     let ExecuteAction {
         cannon: cannon_id,
         private_key,
@@ -177,6 +176,8 @@
         query.as_ref(),
         priority_fee,
         fee_record.as_ref(),
+        // use cost_v1 when we are not using the native genesis
+        !env.storage.native_genesis,
     )
     .await?;
 
@@ -185,7 +186,7 @@
         serde_json::from_str(&auth_str).map_err(AuthorizeError::Json)?;
 
     // proxy it to a listen cannon
-    let tx_id = cannon.proxy_auth(authorization, events).await?;
+    let tx_id = cannon.proxy_auth(authorization).await?;
 
     Ok(tx_id)
 }
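The power actions in the next file all follow one ordering rule: create the event subscriber first, then mutate agent state, then drain completion events under a deadline, so that nothing emitted during the mutation is missed. A condensed sketch of that wait loop (editorial, not part of the diff; names taken from the diff, and the 30-second bound matches `wait_for_nodes` below):

```rust
use std::collections::HashSet;

use snops_common::state::AgentId;

use crate::events::EventSubscriber;

// Assumes `subscriber` was created *before* the state mutation it observes.
async fn await_reconciles(mut subscriber: EventSubscriber, mut awaiting: HashSet<AgentId>) {
    // Bounded wait: give up on stragglers instead of hanging the request.
    let expires = tokio::time::Instant::now() + std::time::Duration::from_secs(30);
    while !awaiting.is_empty() {
        tokio::select! {
            _ = tokio::time::sleep_until(expires) => break,
            Ok(event) = subscriber.next() => {
                // Each matching event carries the agent that finished reconciling.
                if let Some(agent) = event.agent {
                    awaiting.remove(&agent);
                }
            }
        }
    }
}
```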
diff --git a/crates/controlplane/src/server/actions/power.rs b/crates/controlplane/src/server/actions/power.rs
index 2c32fd7b..1cc79f97 100644
--- a/crates/controlplane/src/server/actions/power.rs
+++ b/crates/controlplane/src/server/actions/power.rs
@@ -1,81 +1,137 @@
+use std::collections::{HashMap, HashSet};
+
 use axum::{
     response::{IntoResponse, Response},
     Json,
 };
-use snops_common::action_models::WithTargets;
+use snops_common::{
+    action_models::WithTargets,
+    node_targets::NodeTargets,
+    state::{AgentId, AgentState, EnvId, ReconcileOptions},
+};
+use tracing::info;
 
 use super::Env;
-use crate::{
-    server::error::ServerError,
-    state::{pending_reconcile_node_map, Agent},
-};
+use crate::state::{pending_reconcile_node_map, GlobalState};
+
+async fn wait_for_nodes(
+    state: &GlobalState,
+    env_id: EnvId,
+    nodes: NodeTargets,
+    pending: Vec<(AgentId, AgentState)>,
+) -> Response {
+    let mut awaiting_agents = pending.iter().map(|a| a.0).collect::<HashSet<_>>();
+    let node_map = pending_reconcile_node_map(pending.iter());
+
+    // create the subscriber before updating agent states in order to
+    // avoid missing any events
+    use snops_common::events::prelude::*;
+    let mut subscriber = state
+        .events
+        .subscribe_on(NodeTargetIs(nodes) & EnvIs(env_id) & AgentReconcileComplete);
+
+    state.update_agent_states(pending).await;
+
+    // wait at most 30 seconds for all agents to reconcile
+    let expires = tokio::time::Instant::now() + std::time::Duration::from_secs(30);
+    while !awaiting_agents.is_empty() {
+        tokio::select! {
+            _ = tokio::time::sleep_until(expires) => {
+                break;
+            }
+            Ok(event) = subscriber.next() => {
+                if let Some(agent) = event.agent {
+                    awaiting_agents.remove(&agent);
+                }
+            }
+        }
+    }
+
+    Json(node_map).into_response()
+}
 
 pub async fn online(
     Env { env, state, .. }: Env,
     Json(WithTargets { nodes, .. }): Json<WithTargets>,
 ) -> Response {
+    info!("env {} invoked online action for {nodes}", env.id);
     let pending = env
         .matching_agents(&nodes, &state.pool)
         .filter_map(|a| {
-            let agent: &Agent = a.value();
-            agent.filter_map_to_reconcile(|mut s| {
+            a.value().filter_map_to_reconcile(|mut s| {
                 (!s.online).then(|| {
                     s.online = true;
                     s
                 })
             })
         })
-        .collect::<Vec<_>>(); // TODO
-
-    let node_map = pending_reconcile_node_map(pending.iter());
-
-    let res = state
-        .reconcile_agents(pending)
-        .await
-        .map_err(ServerError::from);
+        .collect::<Vec<_>>();
 
-    match res {
-        Ok(_) => Json(node_map).into_response(),
-        e => e.into_response(),
-    }
+    wait_for_nodes(&state, env.id, nodes, pending).await
 }
 
 pub async fn offline(
     Env { env, state, .. }: Env,
     Json(WithTargets { nodes, .. }): Json<WithTargets>,
 ) -> Response {
+    info!("env {} invoked offline action for {nodes}", env.id);
     let pending = env
         .matching_agents(&nodes, &state.pool)
         .filter_map(|a| {
-            let agent: &Agent = a.value();
-            agent.filter_map_to_reconcile(|mut s| {
+            a.value().filter_map_to_reconcile(|mut s| {
                 s.online.then(|| {
                     s.online = false;
                     s
                 })
            })
        })
-        .collect::<Vec<_>>(); // TODO
+        .collect::<Vec<_>>();
 
-    let node_map = pending_reconcile_node_map(pending.iter());
+    wait_for_nodes(&state, env.id, nodes, pending).await
+}
 
-    let res = state
-        .reconcile_agents(pending)
-        .await
-        .map_err(ServerError::from);
+pub async fn reboot(
+    Env { env, state, .. }: Env,
+    Json(WithTargets { nodes, .. }): Json<WithTargets>,
+) -> Response {
+    let node_map = env
+        .matching_agents(&nodes, &state.pool)
+        .filter_map(|a| a.node_key().map(|k| (k.clone(), a.id)))
+        .collect::<HashMap<_, _>>();
 
-    match res {
-        Ok(_) => Json(node_map).into_response(),
-        e => e.into_response(),
-    }
-}
+    let mut awaiting_agents = node_map.values().copied().collect::<HashSet<_>>();
+
+    // create the subscriber before updating agent states in order to
+    // avoid missing any events
+    use snops_common::events::prelude::*;
+    let mut subscriber = state
+        .events
+        .subscribe_on(NodeTargetIs(nodes) & EnvIs(env.id) & AgentReconcileComplete);
 
-pub async fn reboot(env: Env, json: Json<WithTargets>) -> Response {
-    let offline_res = offline(env.clone(), json.clone()).await;
+    state
+        .queue_many_reconciles(
+            awaiting_agents.iter().copied(),
+            ReconcileOptions {
+                force_shutdown: true,
+                ..Default::default()
+            },
+        )
+        .await;
 
-    if !offline_res.status().is_success() {
-        offline_res
-    } else {
-        online(env, json).await
+    // wait at most 30 seconds for all agents to reconcile
+    let expires = tokio::time::Instant::now() + std::time::Duration::from_secs(30);
+    while !awaiting_agents.is_empty() {
+        tokio::select!
{ + _ = tokio::time::sleep_until(expires) => { + break; + } + Ok(event) = subscriber.next() => { + if let Some(agent) = event.agent { + awaiting_agents.remove(&agent); + } + } + } } + + Json(node_map).into_response() } diff --git a/crates/controlplane/src/server/agent_ws.rs b/crates/controlplane/src/server/agent_ws.rs new file mode 100644 index 00000000..e249c82b --- /dev/null +++ b/crates/controlplane/src/server/agent_ws.rs @@ -0,0 +1,386 @@ +use std::sync::Arc; + +use ::jwt::VerifyWithKey; +use axum::{ + extract::{ + ws::{Message, WebSocket}, + Query, State, WebSocketUpgrade, + }, + http::HeaderMap, + response::{IntoResponse, Response}, +}; +use futures_util::stream::StreamExt; +use http::StatusCode; +use semver::Version; +use serde::Deserialize; +use snops_common::events::AgentEvent; +use snops_common::{ + constant::HEADER_AGENT_KEY, + prelude::*, + rpc::control::{ + agent::{AgentServiceClient, Handshake}, + ControlService, + }, +}; +use tarpc::{context, server::Channel}; +use tokio::select; +use tracing::{error, info, warn}; + +use super::{jwt::Claims, rpc::ControlRpcServer}; +use crate::{ + agent_version::agent_version_ok, + server::{ + jwt::JWT_SECRET, + rpc::{MuxedMessageIncoming, MuxedMessageOutgoing}, + }, + state::{Agent, AgentEventHelpers, AgentFlags, AppState, EmitEvent}, +}; + +#[derive(Debug, Deserialize)] +pub struct AgentWsQuery { + pub id: Option, + pub version: Option, + #[serde(flatten)] + pub flags: AgentFlags, +} + +pub async fn agent_ws_handler( + ws: WebSocketUpgrade, + headers: HeaderMap, + State(state): State, + Query(query): Query, +) -> Response { + // Ensure agent version is compatible + if query.version.as_ref().is_none_or(|v| !agent_version_ok(v)) { + return StatusCode::UPGRADE_REQUIRED.into_response(); + }; + + match (&state.agent_key, headers.get(HEADER_AGENT_KEY)) { + // assert key equals passed header + (Some(key), Some(header)) if key == header.to_str().unwrap_or_default() => (), + + // forbid if key is incorrect + (Some(_), _) => { + warn!("An agent has attempted to connect with a mismatching agent key"); + return StatusCode::UNAUTHORIZED.into_response(); + } + + // allow if no key is present + _ => (), + } + + ws.on_upgrade(|socket| handle_socket(socket, headers, state, query)) +} + +async fn handle_socket( + mut socket: WebSocket, + headers: HeaderMap, + state: AppState, + query: AgentWsQuery, +) { + // Safe because handle socket is only called if version is Some + let agent_version = query.version.unwrap(); + + let claims = headers + .get("Authorization") + .and_then(|auth| -> Option { + let auth = auth.to_str().ok()?; + if !auth.starts_with("Bearer ") { + return None; + } + + let token = &auth[7..]; + + // get claims out of the specified JWT + token.verify_with_key(&*JWT_SECRET).ok() + }) + .filter(|claims| { + // ensure the id is correct + if let Some(id) = query.id { + if claims.id != id { + warn!("connecting agent specified an id different than the claim"); + return false; + } + } + + true + }); + + // TODO: the client should provide us with some information about itself (num + // cpus, etc.) 
before we categorize it and add it as an agent to the agent pool + + // set up the RPC channels + let (client_response_in, client_transport, mut client_request_out) = RpcTransport::new(); + let (server_request_in, server_transport, mut server_response_out) = RpcTransport::new(); + + // set up the client, facing the agent server + let client = + AgentServiceClient::new(tarpc::client::Config::default(), client_transport).spawn(); + + let (id, handshake) = 'insertion: { + let client = client.clone(); + let mut handshake = Handshake { + loki: state.cli.loki.as_ref().map(|u| u.to_string()), + // Encourage the agent to refetch its info on connect + reconcile_opts: ReconcileOptions { + refetch_info: true, + ..Default::default() + }, + ..Default::default() + }; + + // attempt to reconnect if claims were passed + 'reconnect: { + if let Some(claims) = claims { + let Some(mut agent) = state.pool.get_mut(&claims.id) else { + warn!("Connecting agent is trying to identify as an unrecognized agent"); + break 'reconnect; + }; + + let id = agent.id(); + if agent.is_connected() { + warn!( + "Connecting agent is trying to identify as an already-connected agent {id}" + ); + break 'reconnect; + } + + // compare the stored nonce with the JWT's nonce + if agent.claims().nonce != claims.nonce { + warn!("Connecting agent {id} is trying to identify with an invalid nonce"); + break 'reconnect; + } + AgentEvent::Connected { + version: agent_version.to_string(), + } + .with_agent(&agent) + .emit(&state); + + match agent.env() { + Some(env) if !state.envs.contains_key(&env) => { + info!("setting agent {id} to Inventory state due to missing env {env}"); + agent.set_state(AgentState::Inventory); + } + _ => {} + } + + // attach the current known agent state to the handshake + agent.state().clone_into(&mut handshake.state); + + // mark the agent as connected, update the flags as well + agent.mark_connected(client.clone(), query.flags); + + info!("Agent {id} reconnected with version {agent_version}"); + if let Err(e) = state.db.agents.save(&id, &agent) { + error!("failed to save agent {id} to the database: {e}"); + } + + break 'insertion (id, handshake); + } + } + + // otherwise, we need to create an agent and give it a new JWT + // TODO: remove unnamed agents + let id = query.id.unwrap_or_else(AgentId::rand); + + // check if an agent with this id is already online + if state + .pool + .get(&id) + .map(|a| a.is_connected()) + .unwrap_or_default() + { + warn!("An agent is trying to identify as an already-connected agent {id}"); + let _ = socket.send(Message::Close(None)).await; + return; + } + + // create a new agent + let agent = Agent::new(client.to_owned(), id, query.flags); + + // sign the jwt + let signed_jwt = agent.sign_jwt(); + handshake.jwt = Some(signed_jwt); + + // insert a new agent into the pool + if let Err(e) = state.db.agents.save(&id, &agent) { + error!("failed to save agent {id} to the database: {e}"); + } + state.pool.insert(id, agent); + + info!( + "Agent {id} connected with version {agent_version}; pool is now {} nodes", + state.pool.len() + ); + + (id, handshake) + }; + + // Handshake with the client in a separate task because we don't want to hold up + // pool insertion + let state2 = Arc::clone(&state); + let client2 = client.clone(); + tokio::spawn(async move { + let agent = state2.pool.get(&id)?; + let event = AgentEvent::HandshakeComplete.with_agent(&agent); + + // Prevent readonly agent from being held over the handshake RPC + drop(agent); + + match client2.handshake(context::current(), 
handshake).await { + Ok(()) => event.emit(&state2), + Err(e) => error!("failed to perform agent {id} handshake: {e}"), + } + + Some(()) + }); + + // Fetch the agent's network addresses on connect/reconnect + let state2 = Arc::clone(&state); + tokio::spawn(async move { + let Ok((ports, external, internal)) = client.get_addrs(context::current()).await else { + return; + }; + let Some(mut agent) = state2.pool.get_mut(&id) else { + return; + }; + + info!( + "Agent {id} [{}], labels: {:?}, addrs: {external:?} {internal:?} @ {ports}, local pk: {}", + agent.modes(), + agent.str_labels(), + if agent.has_local_pk() { "yes" } else { "no" }, + ); + + let is_port_change = agent.set_ports(ports); + let is_ip_change = agent.set_addrs(external, internal); + + if let Err(e) = state2.db.agents.save(&id, &agent) { + error!("failed to save agent {id} to the database: {e}"); + } + + if !is_ip_change && !is_port_change { + return; + } + let Some(env_id) = agent.env() else { return }; + + // Prevent mutable agent from being held over the network address update RPC + drop(agent); + + let Some(env) = state2.get_env(env_id) else { + return; + }; + + info!("Agent {id} updated its network addresses... Submitting changes to associated peers"); + env.update_peer_addr(&state2, id, is_port_change, is_ip_change) + .await; + }); + + // set up the server, for incoming RPC requests + let server = tarpc::server::BaseChannel::with_defaults(server_transport); + let server_handle = tokio::spawn( + server + .execute( + ControlRpcServer { + state: state.to_owned(), + agent: id, + } + .serve(), + ) + .for_each(|r| async move { + tokio::spawn(r); + }), + ); + + loop { + select! { + // handle incoming messages + msg = socket.recv() => { + match msg { + Some(Err(e)) => { + error!("Agent {id} failed to receive a message: {e}"); + break; + } + None => break, + Some(Ok(Message::Binary(bin))) => { + let msg = match snops_common::rpc::codec::decode(&bin) { + Ok(msg) => msg, + Err(e) => { + error!("Agent {id} failed to deserialize a message: {e}"); + break; + } + }; + + match msg { + MuxedMessageIncoming::Parent(msg) => { + if let Err(e) = server_request_in.send(msg) { + error!("Agent {id} internal RPC channel closed: {e}"); + break; + } + }, + MuxedMessageIncoming::Child(msg) => { + if let Err(e) = client_response_in.send(msg) { + error!("Agent {id} internal RPC channel closed: {e}"); + break; + } + } + } + } + _ => (), + } + } + + // handle outgoing requests + msg = client_request_out.recv() => { + let Some(msg) = msg else { + error!("Agent {id} internal RPC channel closed"); + break; + }; + let bin = match snops_common::rpc::codec::encode(&MuxedMessageOutgoing::Child(msg)) { + Ok(bin) => bin, + Err(e) => { + error!("Agent {id} failed to serialize request: {e}"); + break; + } + }; + if let Err(e) = socket.send(Message::Binary(bin)).await { + error!("Agent {id} failed to send request to agent {id}: {e}"); + break; + } + } + + // handle outgoing responses + msg = server_response_out.recv() => { + let Some(msg) = msg else { + error!("Agent {id} internal RPC channel closed"); + break; + }; + let bin = match snops_common::rpc::codec::encode(&MuxedMessageOutgoing::Parent(msg)) { + Ok(bin) => bin, + Err(e) => { + error!("Agent {id} failed to serialize response: {e}"); + break; + } + }; + if let Err(e) = socket.send(Message::Binary(bin)).await { + error!("Agent {id} failed to send response to agent {id}: {e}"); + break; + } + } + } + } + + // abort the RPC server handle + server_handle.abort(); + + // remove the client from the agent in the 
agent pool + if let Some(mut agent) = state.pool.get_mut(&id) { + agent.mark_disconnected(); + + state + .events + .emit(AgentEvent::Disconnected.with_agent(&agent)); + } + + info!("Agent {id} disconnected"); +} diff --git a/crates/controlplane/src/server/api.rs b/crates/controlplane/src/server/api.rs index af1c895e..128ca728 100644 --- a/crates/controlplane/src/server/api.rs +++ b/crates/controlplane/src/server/api.rs @@ -19,10 +19,11 @@ use snops_common::{ }; use tarpc::context; -use super::{actions, error::ServerError, models::AgentStatusResponse, AppState}; +use super::{actions, error::ServerError, event_ws, models::AgentStatusResponse}; use crate::{ cannon::{router::redirect_cannon_routes, source::QueryTarget}, make_env_filter, + state::AppState, }; use crate::{ env::{EnvPeer, Environment}, @@ -41,6 +42,7 @@ macro_rules! unwrap_or_not_found { pub(super) fn routes() -> Router { Router::new() + .route("/events", get(event_ws::event_ws_handler)) .route("/log/:level", post(set_log_level)) .route("/agents", get(get_agents)) .route("/agents/:id", get(get_agent)) @@ -66,7 +68,7 @@ pub(super) fn routes() -> Router { // get(get_env_agent_key), // ) // .route("/env/:env_id/metric/:prom_ql", get()) - .route("/env/:env_id/prepare", post(post_env_prepare)) + .route("/env/:env_id/apply", post(post_env_apply)) .route("/env/:env_id/info", get(get_env_info)) .route("/env/:env_id/height", get(get_latest_height)) .route("/env/:env_id/block_info", get(get_env_block_info)) @@ -147,14 +149,6 @@ async fn set_log_level(Path(level): Path, state: State) -> Res status_ok() } -#[derive(Deserialize)] -#[serde(rename_all = "lowercase")] -enum StorageType { - Genesis, - Ledger, - Binary, -} - async fn get_env_info(Path(env_id): Path, state: State) -> Response { let env_id = unwrap_or_not_found!(id_or_none(&env_id)); let env = unwrap_or_not_found!(state.get_env(env_id)); @@ -604,7 +598,7 @@ async fn get_env_agent_key( Json(AgentStatusResponse::from(agent.value())).into_response() } -async fn post_env_prepare( +async fn post_env_apply( // This env_id is allowed to be in the Path because it would be allocated // anyway Path(env_id): Path, @@ -616,11 +610,8 @@ async fn post_env_prepare( Err(e) => return ServerError::from(e).into_response(), }; - // TODO: some live state to report to the calling CLI or something would be - // really nice - - match Environment::prepare(env_id, documents, state).await { - Ok(env_id) => (StatusCode::OK, Json(json!({ "id": env_id }))).into_response(), + match Environment::apply(env_id, documents, state).await { + Ok(node_map) => Json(json!(node_map)).into_response(), Err(e) => ServerError::from(e).into_response(), } } diff --git a/crates/controlplane/src/server/content.rs b/crates/controlplane/src/server/content.rs index 682bb066..c6066ee5 100644 --- a/crates/controlplane/src/server/content.rs +++ b/crates/controlplane/src/server/content.rs @@ -15,14 +15,13 @@ use snops_common::{ use tower::Service; use tower_http::services::ServeFile; -use super::AppState; use crate::{ schema::{ error::StorageError, storage::{DEFAULT_AGENT_BINARY, DEFAULT_AOT_BINARY}, }, server::error::ServerError, - state::GlobalState, + state::{AppState, GlobalState}, unwrap_or_not_found, }; @@ -82,25 +81,20 @@ async fn serve_binary( ) -> Response { let storage = unwrap_or_not_found!(state.storage.get(&(network, storage_id))).clone(); - let (id, entry) = match storage.resolve_binary_entry(binary_id) { - Ok(res) => res, - Err(e) => return ServerError::from(e).into_response(), - }; - - respond_from_entry(id, entry, 
req).await + match storage.resolve_binary_entry(binary_id) { + Ok((id, entry)) => respond_from_entry(id, entry, req).await, + Err(e) => ServerError::from(e).into_response(), + } } /// Given a binary entry, respond with the binary or a redirect to the binary async fn respond_from_entry(id: InternedId, entry: &BinaryEntry, req: Request) -> Response { match &entry.source { BinarySource::Url(url) => Redirect::temporary(url.as_str()).into_response(), - BinarySource::Path(file) => { - if !file.exists() { - return ServerError::from(StorageError::BinaryFileMissing(id, file.clone())) - .into_response(); - } - ServeFile::new(file).call(req).await.into_response() + BinarySource::Path(file) if !file.exists() => { + ServerError::from(StorageError::BinaryFileMissing(id, file.clone())).into_response() } + BinarySource::Path(file) => ServeFile::new(file).call(req).await.into_response(), } } @@ -110,7 +104,6 @@ async fn serve_file( req: Request, ) -> Response { let storage = unwrap_or_not_found!(state.storage.get(&(network, storage_id))).clone(); - let file_path = storage.path(&state).join(&file); match file.as_str() { // ensure genesis is only served if native genesis is disabled @@ -119,16 +112,12 @@ async fn serve_file( return StatusCode::NOT_FOUND.into_response(); } } - // allow ledger.tar.gz to be served - "ledger.tar.gz" => {} - // allow checkpoints to be served - _ if file.ends_with(".checkpoint") => {} - // serve the version file - "version" => {} // otherwise, return a 404 _ => return StatusCode::NOT_FOUND.into_response(), } + let file_path = storage.path(&state).join(&file); + // ensure the file exists if !file_path.exists() { return StatusCode::NOT_FOUND.into_response(); diff --git a/crates/controlplane/src/server/error.rs b/crates/controlplane/src/server/error.rs index b27a1c08..8c37c1ee 100644 --- a/crates/controlplane/src/server/error.rs +++ b/crates/controlplane/src/server/error.rs @@ -3,7 +3,8 @@ use http::StatusCode; use serde::{ser::SerializeStruct, Serialize, Serializer}; use serde_json::json; use snops_common::{ - aot_cmds::AotCmdError, db::error::DatabaseError, impl_into_status_code, impl_into_type_str, + aot_cmds::AotCmdError, db::error::DatabaseError, events::TransactionAbortReason, + impl_into_status_code, impl_into_type_str, }; use thiserror::Error; @@ -12,13 +13,10 @@ use crate::{ env::error::{EnvError, EnvRequestError, ExecutionError}, error::DeserializeError, schema::error::{SchemaError, StorageError}, - state::error::BatchReconcileError, }; #[derive(Debug, Error, strum_macros::AsRefStr)] pub enum ServerError { - #[error(transparent)] - BatchReconcile(#[from] BatchReconcileError), #[error("Content resource `{0}` not found")] ContentNotFound(String), #[error(transparent)] @@ -50,7 +48,6 @@ pub enum ServerError { } impl_into_status_code!(ServerError, |value| match value { - BatchReconcile(e) => e.into(), ContentNotFound(_) => axum::http::StatusCode::NOT_FOUND, Cannon(e) => e.into(), Deserialize(e) => e.into(), @@ -68,7 +65,6 @@ impl_into_status_code!(ServerError, |value| match value { }); impl_into_type_str!(ServerError, |value| match value { - BatchReconcile(e) => format!("{}.{e}", value.as_ref()), Cannon(e) => format!("{}.{}", value.as_ref(), String::from(e)), Env(e) => format!("{}.{}", value.as_ref(), String::from(e)), Execute(e) => format!("{}.{}", value.as_ref(), String::from(e)), @@ -121,7 +117,11 @@ pub enum ActionError { retries: i32, }, #[error("execution aborted")] - ExecuteStatusAborted { tx_id: String, retries: i32 }, + ExecuteStatusAborted { + tx_id: String, + 
retries: i32, + reason: TransactionAbortReason, + }, #[error("execution failed")] ExecuteStatusFailed { message: String, diff --git a/crates/controlplane/src/server/event_ws.rs b/crates/controlplane/src/server/event_ws.rs new file mode 100644 index 00000000..b0af7513 --- /dev/null +++ b/crates/controlplane/src/server/event_ws.rs @@ -0,0 +1,120 @@ +use std::collections::HashMap; + +use axum::{ + extract::{ + ws::{Message, WebSocket}, + Query, State, WebSocketUpgrade, + }, + response::Response, +}; +use serde::Deserialize; +use snops_common::events::{EventFilter, EventWsRequest}; +use tokio::select; + +use crate::{events::EventSubscriber, state::AppState}; + +#[derive(Debug, Deserialize)] +pub struct EventWsQuery { + #[serde(default)] + pub filter: Option, +} + +pub async fn event_ws_handler( + ws: WebSocketUpgrade, + State(state): State, + Query(query): Query, +) -> Response { + ws.on_upgrade(|socket| async { + EventWsHandler::new(state, query.filter) + .handle_ws(socket) + .await + }) +} + +struct EventWsHandler { + base_filter: Option, + subscriber: EventSubscriber, + extra_filters: HashMap, +} + +impl EventWsHandler { + fn new(state: AppState, base_filter: Option) -> Self { + let subscriber = match base_filter.clone() { + Some(filter) => state.events.subscribe_on(filter), + // Listen to no events by default + None => state.events.subscribe_on(!EventFilter::Unfiltered), + }; + Self { + base_filter, + subscriber, + extra_filters: Default::default(), + } + } + + /// Update the subscriber filter based on the base filter and extra filters + fn update_subscriber(&mut self) { + if self.extra_filters.is_empty() && self.base_filter.is_none() { + self.subscriber.set_filter(!EventFilter::Unfiltered); + return; + } + + let base_filter = self.base_filter.clone().unwrap_or(EventFilter::Unfiltered); + + self.subscriber.set_filter( + base_filter + & EventFilter::AnyOf(self.extra_filters.values().cloned().collect::>()), + ); + } + + /// Handle a request from the websocket to subscribe or unsubscribe from + /// events + fn handle_request(&mut self, req: EventWsRequest) { + match req { + EventWsRequest::Subscribe { id, filter } => { + self.extra_filters.insert(id, filter); + self.update_subscriber(); + } + EventWsRequest::Unsubscribe { id } => { + self.extra_filters.remove(&id); + self.update_subscriber(); + } + } + } + + /// Handle the websocket connection, sending events to the client and + /// handling requests to subscribe or unsubscribe from the client + async fn handle_ws(&mut self, mut socket: WebSocket) { + loop { + select! 
{ + msg = socket.recv() => { + // Parse the message + let req = match msg { + Some(Ok(Message::Text(text))) => serde_json::from_str::(&text), + Some(Ok(Message::Binary(bin))) => serde_json::from_slice::(&bin), + Some(Err(_)) | None => break, + _ => continue, + }; + // Handle the request + match req { + Ok(req) => self.handle_request(req), + Err(_e) => break, + } + } + // Forward events to the client + Ok(event) = self.subscriber.next() => { + let json = match serde_json::to_string(&event) { + Ok(json) => json, + Err(e) => { + tracing::error!("failed to serialize event for websocket: {e}"); + break; + } + }; + if let Err(e) = socket.send(Message::Text(json)).await { + tracing::error!("failed to send event to websocket: {e}"); + break; + } + } + } + } + } +} diff --git a/crates/controlplane/src/server/mod.rs b/crates/controlplane/src/server/mod.rs index 5138b738..277cceaf 100644 --- a/crates/controlplane/src/server/mod.rs +++ b/crates/controlplane/src/server/mod.rs @@ -1,47 +1,19 @@ -use std::{net::SocketAddr, sync::Arc, time::Duration}; +use std::{net::SocketAddr, sync::Arc}; -use ::jwt::VerifyWithKey; -use axum::{ - extract::{ - ws::{Message, WebSocket}, - Query, State, WebSocketUpgrade, - }, - http::HeaderMap, - middleware, - response::{IntoResponse, Response}, - routing::get, - Extension, Router, -}; -use futures_util::stream::StreamExt; -use http::StatusCode; -use serde::Deserialize; -use snops_common::{ - constant::HEADER_AGENT_KEY, - prelude::*, - rpc::control::{ - agent::{AgentServiceClient, Handshake}, - ControlService, - }, -}; -use tarpc::server::Channel; -use tokio::select; -use tracing::{error, info, warn}; +use axum::{middleware, routing::get, Extension, Router}; -use self::{ - error::StartError, - jwt::{Claims, JWT_SECRET}, - rpc::ControlRpcServer, -}; +use self::error::StartError; use crate::{ logging::{log_request, req_stamp}, - server::rpc::{MuxedMessageIncoming, MuxedMessageOutgoing}, - state::{Agent, AgentFlags, AppState, GlobalState}, + state::GlobalState, }; pub mod actions; +mod agent_ws; mod api; mod content; pub mod error; +mod event_ws; pub mod jwt; pub mod models; pub mod prometheus; @@ -49,7 +21,7 @@ mod rpc; pub async fn start(state: Arc, socket_addr: SocketAddr) -> Result<(), StartError> { let app = Router::new() - .route("/agent", get(agent_ws_handler)) + .route("/agent", get(agent_ws::agent_ws_handler)) .nest("/api/v1", api::routes()) .nest("/prometheus", prometheus::routes()) .nest("/content", content::init_routes(&state).await) @@ -68,292 +40,3 @@ pub async fn start(state: Arc, socket_addr: SocketAddr) -> Result<( Ok(()) } - -#[derive(Debug, Deserialize)] -struct AgentWsQuery { - id: Option, - #[serde(flatten)] - flags: AgentFlags, -} - -async fn agent_ws_handler( - ws: WebSocketUpgrade, - headers: HeaderMap, - State(state): State, - Query(query): Query, -) -> Response { - match (&state.agent_key, headers.get(HEADER_AGENT_KEY)) { - // assert key equals passed header - (Some(key), Some(header)) if key == header.to_str().unwrap_or_default() => (), - - // forbid if key is incorrect - (Some(_), _) => { - warn!("an agent has attempted to connect with a mismatching agent key"); - return StatusCode::UNAUTHORIZED.into_response(); - } - - // allow if no key is present - _ => (), - } - - ws.on_upgrade(|socket| handle_socket(socket, headers, state, query)) - .into_response() -} - -async fn handle_socket( - mut socket: WebSocket, - headers: HeaderMap, - state: AppState, - query: AgentWsQuery, -) { - let claims = headers - .get("Authorization") - .and_then(|auth| -> 
diff --git a/crates/controlplane/src/server/mod.rs b/crates/controlplane/src/server/mod.rs
index 5138b738..277cceaf 100644
--- a/crates/controlplane/src/server/mod.rs
+++ b/crates/controlplane/src/server/mod.rs
@@ -1,47 +1,19 @@
-use std::{net::SocketAddr, sync::Arc, time::Duration};
+use std::{net::SocketAddr, sync::Arc};
 
-use ::jwt::VerifyWithKey;
-use axum::{
-    extract::{
-        ws::{Message, WebSocket},
-        Query, State, WebSocketUpgrade,
-    },
-    http::HeaderMap,
-    middleware,
-    response::{IntoResponse, Response},
-    routing::get,
-    Extension, Router,
-};
-use futures_util::stream::StreamExt;
-use http::StatusCode;
-use serde::Deserialize;
-use snops_common::{
-    constant::HEADER_AGENT_KEY,
-    prelude::*,
-    rpc::control::{
-        agent::{AgentServiceClient, Handshake},
-        ControlService,
-    },
-};
-use tarpc::server::Channel;
-use tokio::select;
-use tracing::{error, info, warn};
+use axum::{middleware, routing::get, Extension, Router};
 
-use self::{
-    error::StartError,
-    jwt::{Claims, JWT_SECRET},
-    rpc::ControlRpcServer,
-};
+use self::error::StartError;
 use crate::{
     logging::{log_request, req_stamp},
-    server::rpc::{MuxedMessageIncoming, MuxedMessageOutgoing},
-    state::{Agent, AgentFlags, AppState, GlobalState},
+    state::GlobalState,
 };
 
 pub mod actions;
+mod agent_ws;
 mod api;
 mod content;
 pub mod error;
+mod event_ws;
 pub mod jwt;
 pub mod models;
 pub mod prometheus;
@@ -49,7 +21,7 @@ mod rpc;
 pub async fn start(state: Arc<GlobalState>, socket_addr: SocketAddr) -> Result<(), StartError> {
     let app = Router::new()
-        .route("/agent", get(agent_ws_handler))
+        .route("/agent", get(agent_ws::agent_ws_handler))
         .nest("/api/v1", api::routes())
         .nest("/prometheus", prometheus::routes())
         .nest("/content", content::init_routes(&state).await)
@@ -68,292 +40,3 @@ pub async fn start(state: Arc<GlobalState>, socket_addr: SocketAddr) -> Result<(
     Ok(())
 }
-
-#[derive(Debug, Deserialize)]
-struct AgentWsQuery {
-    id: Option<AgentId>,
-    #[serde(flatten)]
-    flags: AgentFlags,
-}
-
-async fn agent_ws_handler(
-    ws: WebSocketUpgrade,
-    headers: HeaderMap,
-    State(state): State<AppState>,
-    Query(query): Query<AgentWsQuery>,
-) -> Response {
-    match (&state.agent_key, headers.get(HEADER_AGENT_KEY)) {
-        // assert key equals passed header
-        (Some(key), Some(header)) if key == header.to_str().unwrap_or_default() => (),
-
-        // forbid if key is incorrect
-        (Some(_), _) => {
-            warn!("an agent has attempted to connect with a mismatching agent key");
-            return StatusCode::UNAUTHORIZED.into_response();
-        }
-
-        // allow if no key is present
-        _ => (),
-    }
-
-    ws.on_upgrade(|socket| handle_socket(socket, headers, state, query))
-        .into_response()
-}
-
-async fn handle_socket(
-    mut socket: WebSocket,
-    headers: HeaderMap,
-    state: AppState,
-    query: AgentWsQuery,
-) {
-    let claims = headers
-        .get("Authorization")
-        .and_then(|auth| -> Option<Claims> {
-            let auth = auth.to_str().ok()?;
-            if !auth.starts_with("Bearer ") {
-                return None;
-            }
-
-            let token = &auth[7..];
-
-            // get claims out of the specified JWT
-            token.verify_with_key(&*JWT_SECRET).ok()
-        })
-        .filter(|claims| {
-            // ensure the id is correct
-            if let Some(id) = query.id {
-                if claims.id != id {
-                    warn!("connecting agent specified an id different than the claim");
-                    return false;
-                }
-            }
-
-            true
-        });
-
-    // TODO: the client should provide us with some information about itself (num
-    // cpus, etc.) before we categorize it and add it as an agent to the agent pool
-
-    // set up the RPC channels
-    let (client_response_in, client_transport, mut client_request_out) = RpcTransport::new();
-    let (server_request_in, server_transport, mut server_response_out) = RpcTransport::new();
-
-    // set up the client, facing the agent server
-    let client =
-        AgentServiceClient::new(tarpc::client::Config::default(), client_transport).spawn();
-
-    let id: AgentId = 'insertion: {
-        let client = client.clone();
-        let mut handshake = Handshake {
-            loki: state.cli.loki.as_ref().map(|u| u.to_string()),
-            ..Default::default()
-        };
-
-        // attempt to reconnect if claims were passed
-        'reconnect: {
-            if let Some(claims) = claims {
-                let Some(mut agent) = state.pool.get_mut(&claims.id) else {
-                    warn!("connecting agent is trying to identify as an unrecognized agent");
-                    break 'reconnect;
-                };
-
-                let id = agent.id();
-                if agent.is_connected() {
-                    warn!(
-                        "connecting agent is trying to identify as an already-connected agent {id}"
-                    );
-                    break 'reconnect;
-                }
-
-                // compare the stored nonce with the JWT's nonce
-                if agent.claims().nonce != claims.nonce {
-                    warn!("connecting agent {id} is trying to identify with an invalid nonce");
-                    break 'reconnect;
-                }
-
-                if let AgentState::Node(env, _) = agent.state() {
-                    if !state.envs.contains_key(env) {
-                        info!("setting agent {id} to Inventory state due to missing env {env}");
-                        agent.set_state(AgentState::Inventory);
-                    }
-                }
-
-                // attach the current known agent state to the handshake
-                agent.state().clone_into(&mut handshake.state);
-
-                // mark the agent as connected, update the flags as well
-                agent.mark_connected(client, query.flags);
-
-                info!("agent {id} reconnected");
-                if let Err(e) = state.db.agents.save(&id, &agent) {
-                    error!("failed to save agent {id} to the database: {e}");
-                }
-
-                // handshake with client
-                // note: this may cause a reconciliation, so this *may* be non-instant
-                // unwrap safety: this agent was just `mark_connected` with a valid client
-                let client = agent.rpc().cloned().unwrap();
-
-                // drop agent ref to allow for mutable borrow in handshake requests
-                drop(agent);
-
-                tokio::spawn(async move {
-                    // we do this in a separate task because we don't want to hold up pool insertion
-                    let mut ctx = tarpc::context::current();
-                    ctx.deadline += Duration::from_secs(300);
-                    match client.handshake(ctx, handshake).await {
-                        Ok(Ok(())) => (),
-                        Ok(Err(e)) => {
-                            error!("failed to perform agent {id} handshake reconciliation: {e}")
-                        }
-                        Err(e) => error!("failed to perform agent {id} handshake: {e}"),
-                    }
-                });
-
-                break 'insertion id;
-            }
-        }
-
-        // otherwise, we need to create an agent and give it a new JWT
-        // TODO: remove unnamed agents
-        let id = query.id.unwrap_or_else(AgentId::rand);
-
-        // check if an agent with this id is already online
-        if state
-            .pool
-            .get(&id)
-            .map(|a| a.is_connected())
-            .unwrap_or_default()
-        {
-            warn!("an agent is trying to identify as an already-connected agent {id}");
-            let _ =
socket.send(Message::Close(None)).await; - return; - } - - // create a new agent - let agent = Agent::new(client.to_owned(), id, query.flags); - - // sign the jwt - let signed_jwt = agent.sign_jwt(); - handshake.jwt = Some(signed_jwt); - - // handshake with the client - tokio::spawn(async move { - // we do this in a separate task because we don't want to hold up pool insertion - let mut ctx = tarpc::context::current(); - ctx.deadline += Duration::from_secs(300); - match client.handshake(ctx, handshake).await { - Ok(Ok(())) => (), - Ok(Err(e)) => error!("failed to perform agent {id} handshake reconciliation: {e}"), - Err(e) => error!("failed to perform agent {id} handshake: {e}"), - } - }); - - // insert a new agent into the pool - if let Err(e) = state.db.agents.save(&id, &agent) { - error!("failed to save agent {id} to the database: {e}"); - } - state.pool.insert(id, agent); - - info!( - "agent {id} connected; pool is now {} nodes", - state.pool.len() - ); - - id - }; - - // fetch the agent's network addresses on connect/reconnect - let state2 = Arc::clone(&state); - tokio::spawn(async move { - if let Ok((ports, external, internal)) = client.get_addrs(tarpc::context::current()).await { - if let Some(mut agent) = state2.pool.get_mut(&id) { - info!( - "agent {id} [{}], labels: {:?}, addrs: {external:?} {internal:?} @ {ports}, local pk: {}", - agent.modes(), - agent.str_labels(), - if agent.has_local_pk() { "yes" } else { "no" }, - ); - agent.set_ports(ports); - agent.set_addrs(external, internal); - if let Err(e) = state2.db.agents.save(&id, &agent) { - error!("failed to save agent {id} to the database: {e}"); - } - } - } - }); - - // set up the server, for incoming RPC requests - let server = tarpc::server::BaseChannel::with_defaults(server_transport); - let server_handle = tokio::spawn( - server - .execute( - ControlRpcServer { - state: state.to_owned(), - agent: id, - } - .serve(), - ) - .for_each(|r| async move { - tokio::spawn(r); - }), - ); - - loop { - select! 
 {
-            // handle incoming messages
-            msg = socket.recv() => {
-                match msg {
-                    Some(Err(_)) | None => break,
-                    Some(Ok(Message::Binary(bin))) => {
-                        let msg = match bincode::deserialize(&bin) {
-                            Ok(msg) => msg,
-                            Err(e) => {
-                                error!("failed to deserialize a message from agent {id}: {e}");
-                                continue;
-                            }
-                        };
-
-                        match msg {
-                            MuxedMessageIncoming::Parent(msg) => server_request_in.send(msg).expect("internal RPC channel closed"),
-                            MuxedMessageIncoming::Child(msg) => client_response_in.send(msg).expect("internal RPC channel closed"),
-                        }
-                    }
-                    _ => (),
-                }
-            }
-
-            // handle outgoing requests
-            msg = client_request_out.recv() => {
-                let msg = msg.expect("internal RPC channel closed");
-                let bin = bincode::serialize(&MuxedMessageOutgoing::Child(msg)).expect("failed to serialize request");
-                if socket.send(Message::Binary(bin)).await.is_err() {
-                    break;
-                }
-            }
-
-            // handle outgoing responses
-            msg = server_response_out.recv() => {
-                let msg = msg.expect("internal RPC channel closed");
-                let bin = bincode::serialize(&MuxedMessageOutgoing::Parent(msg)).expect("failed to serialize response");
-                if socket.send(Message::Binary(bin)).await.is_err() {
-                    break;
-                }
-            }
-        }
-    }
-
-    // abort the RPC server handle
-    server_handle.abort();
-
-    // remove the client from the agent in the agent pool
-    {
-        // TODO: remove agent after 10 minutes of inactivity
-
-        if let Some(mut agent) = state.pool.get_mut(&id) {
-            agent.mark_disconnected();
-        }
-
-        info!("agent {id} disconnected");
-    }
-}
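Reviewer note (not part of the diff): the removed handler above multiplexes two tarpc transports over the single agent websocket, and agent_ws.rs presumably carries the same scheme over. The framing, sketched with stand-in message types (`ParentMsg`/`ChildMsg` are illustrative, not the real request/response types):

```rust
use serde::{Deserialize, Serialize};
use tokio::sync::mpsc::UnboundedSender;

// Each bincode frame on the socket is tagged with the channel it belongs to:
// Parent frames feed the controlplane's RPC server (agent-initiated requests),
// Child frames feed the agent-facing RPC client (responses to our own calls).
#[derive(Serialize, Deserialize)]
enum Muxed<ParentMsg, ChildMsg> {
    Parent(ParentMsg),
    Child(ChildMsg),
}

// Route a decoded frame to the matching in-process channel.
fn route<P, C>(msg: Muxed<P, C>, parent_tx: &UnboundedSender<P>, child_tx: &UnboundedSender<C>) {
    match msg {
        Muxed::Parent(p) => drop(parent_tx.send(p)),
        Muxed::Child(c) => drop(child_tx.send(c)),
    }
}
```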
diff --git a/crates/controlplane/src/server/prometheus.rs b/crates/controlplane/src/server/prometheus.rs
index 8ba53962..af76bb53 100644
--- a/crates/controlplane/src/server/prometheus.rs
+++ b/crates/controlplane/src/server/prometheus.rs
@@ -1,12 +1,11 @@
-use std::{collections::HashMap, fmt::Write};
+use std::collections::HashMap;
 
 use axum::{extract::State, response::IntoResponse, routing::get, Json, Router};
+use rayon::iter::{ParallelBridge, ParallelIterator};
 use serde::Serialize;
 use snops_common::state::AgentState;
-use tracing::debug;
 
-use super::AppState;
-use crate::{cli::PrometheusLocation, env::EnvPeer};
+use crate::{cli::PrometheusLocation, state::AppState};
 
 pub(super) fn routes() -> Router {
     Router::new().route("/httpsd", get(get_httpsd))
 }
@@ -14,105 +13,51 @@ pub(super) fn routes() -> Router {
 #[derive(Debug, Clone, Serialize)]
 pub struct StaticConfig {
     pub targets: [String; 1],
-    pub labels: HashMap<String, String>,
-}
-
-/// Caching container for the Prometheus HTTP service discovery response. Marked
-/// 'dirty' when environment agents are reallocated.
-#[derive(Debug, Clone, Default)]
-pub enum HttpsdResponse {
-    #[default]
-    Dirty,
-    Clean(Vec<StaticConfig>),
-}
-
-impl HttpsdResponse {
-    pub fn set_dirty(&mut self) {
-        *self = Self::Dirty;
-    }
+    pub labels: HashMap<&'static str, String>,
 }
 
 async fn get_httpsd(State(state): State<AppState>) -> impl IntoResponse {
-    let mut prom_httpsd = state.prom_httpsd.lock().await;
-
-    let static_configs = match &*prom_httpsd {
-        // use the cached response
-        HttpsdResponse::Clean(static_configs) => static_configs.to_owned(),
-
-        // recompute the response and save it
-        HttpsdResponse::Dirty => {
-            debug!("httpsd response is dirty, regenerating...");
-            let mut static_configs = vec![];
-
-            for agent in state.pool.iter() {
-                let Some(mut agent_addr) =
-                    (match (state.cli.prometheus_location, agent.has_label_str("local")) {
-                        // agent is external: serve its external IP
-                        (_, false) => agent
-                            .addrs()
-                            .and_then(|addrs| addrs.external.as_ref())
-                            .map(ToString::to_string),
-
-                        // prometheus and agent are local: use internal IP
-                        (PrometheusLocation::Internal, true) => agent
-                            .addrs()
-                            .and_then(|addrs| addrs.internal.first())
-                            .map(ToString::to_string),
-
-                        // prometheus in docker but agent is local: use host.docker.internal
-                        (PrometheusLocation::Docker, true) => {
-                            Some(String::from("host.docker.internal"))
-                        }
-
-                        // prometheus is external but agent is local: agent might not be forwarded;
-                        // TODO
-                        (PrometheusLocation::External, true) => continue,
-                    })
-                else {
-                    continue;
-                };
-
-                match agent.state() {
-                    AgentState::Node(env_id, _) => {
-                        // get the environment this agent belongs to
-                        let Some(env) = state.get_env(*env_id) else {
-                            continue;
-                        };
-
-                        // get the node key that corresponds to this agent
-                        let Some(node_key) =
-                            env.node_peers.get_by_right(&EnvPeer::Internal(agent.id()))
-                        else {
-                            continue;
-                        };
-
-                        agent_addr
-                            .write_fmt(format_args!(":{}", agent.metrics_port()))
-                            .unwrap();
-
-                        static_configs.push(StaticConfig {
-                            targets: [agent_addr],
-                            labels: [
-                                ("env_id".into(), env_id.to_string()),
-                                ("node_key".into(), node_key.to_string()),
-                            ]
-                            .into_iter()
-                            .collect(),
-                        });
-                    }
-
-                    _ => {
-                        // future-proofing; this comment also disables the
-                        // clippy lint
-                    }
-                }
-            }
-
-            *prom_httpsd = HttpsdResponse::Clean(static_configs.to_owned());
-
-            static_configs
-        }
-    };
+    let static_configs = state
+        .pool
+        .iter()
+        .par_bridge()
+        .filter_map(|agent| {
+            let agent_addr = (match (state.cli.prometheus_location, agent.has_label_str("local")) {
+                // agent is external: serve its external IP
+                (_, false) => agent
+                    .addrs()
+                    .and_then(|addrs| addrs.external.as_ref())
+                    .map(ToString::to_string),
+
+                // prometheus and agent are local: use internal IP
+                (PrometheusLocation::Internal, true) => agent
+                    .addrs()
+                    .and_then(|addrs| addrs.internal.first())
+                    .map(ToString::to_string),
+
+                // prometheus in docker but agent is local: use host.docker.internal
+                (PrometheusLocation::Docker, true) => Some(String::from("host.docker.internal")),
+
+                // prometheus is external but agent is local: agent might not be forwarded;
+                // TODO
+                (PrometheusLocation::External, true) => return None,
+            })?;
+
+            let AgentState::Node(env_id, node) = agent.state() else {
+                return None;
+            };
+
+            Some(StaticConfig {
+                targets: [format!("{agent_addr}:{}", agent.metrics_port())],
+                labels: [
+                    ("env_id", env_id.to_string()),
+                    ("node_key", node.node_key.to_string()),
+                ]
+                .into_iter()
+                .collect(),
+            })
+        })
+        .collect::<Vec<_>>();
 
     Json(static_configs)
 }
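Reviewer note (not part of the diff): dropping the dirty-flag cache trades a lock plus invalidation bookkeeping for an O(agents) recompute per poll, parallelized with rayon; for Prometheus-style scrape intervals that seems a reasonable trade. For reference, a sketch of the payload shape the rewritten `get_httpsd` serves (addresses and label values are illustrative):

```rust
use crate::server::prometheus::StaticConfig;

// StaticConfig serializes to the Prometheus HTTP service discovery format:
// [{"targets":["10.0.0.5:9000"],"labels":{"env_id":"default","node_key":"validator/0"}}]
fn example_config() -> StaticConfig {
    StaticConfig {
        targets: [String::from("10.0.0.5:9000")],
        labels: [
            ("env_id", String::from("default")),
            ("node_key", String::from("validator/0")),
        ]
        .into_iter()
        .collect(),
    }
}
```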
diff --git a/crates/controlplane/src/server/rpc.rs b/crates/controlplane/src/server/rpc.rs
index cbadd2b3..367a12b5 100644
--- a/crates/controlplane/src/server/rpc.rs
+++ b/crates/controlplane/src/server/rpc.rs
@@ -1,12 +1,11 @@
-use std::{
-    collections::{HashMap, HashSet},
-    net::IpAddr,
-};
+use std::{collections::HashMap, net::IpAddr, time::Instant};
 
 use chrono::Utc;
+use snops_common::events::AgentEvent;
 use snops_common::{
-    api::EnvInfo,
+    api::AgentEnvInfo,
     define_rpc_mux,
+    prelude::{error::ReconcileError, ReconcileStatus},
     rpc::{
         control::{
             agent::{AgentServiceRequest, AgentServiceResponse},
@@ -22,10 +21,10 @@ use snops_common::{
 use tarpc::context;
 use tracing::warn;
 
-use super::AppState;
+use crate::state::{AgentEventHelpers, EmitEvent};
 use crate::{
     error::StateError,
-    state::{AddrMap, AgentAddrs},
+    state::{AddrMap, AgentAddrs, AppState, GetGlobalState, GlobalState},
 };
 
 define_rpc_mux!(parent;
@@ -43,20 +42,20 @@ impl ControlService for ControlRpcServer {
     async fn resolve_addrs(
         self,
         _: context::Context,
-        mut peers: HashSet<AgentId>,
+        mut peers: Vec<AgentId>,
     ) -> Result<HashMap<AgentId, IpAddr>, ResolveError> {
-        peers.insert(self.agent);
+        peers.push(self.agent);
 
         let addr_map = self
             .state
-            .get_addr_map(Some(&peers))
+            .get_addr_map(&peers)
             .await
             .map_err(|_| ResolveError::AgentHasNoAddresses)?;
         resolve_addrs(&addr_map, self.agent, &peers).map_err(|_| ResolveError::SourceAgentNotFound)
     }
 
-    async fn get_env_info(self, _: context::Context, env_id: EnvId) -> Option<EnvInfo> {
-        Some(self.state.get_env(env_id)?.info(&self.state))
+    async fn get_env_info(self, _: context::Context, env_id: EnvId) -> Option<AgentEnvInfo> {
+        Some(self.state.get_env(env_id)?.agent_info())
     }
 
     async fn post_transfer_status(
@@ -81,6 +80,7 @@ impl ControlService for ControlRpcServer {
                 downloaded_bytes: 0,
                 total_bytes: total,
                 interruption: None,
+                handle: None,
             },
         );
     }
@@ -144,6 +144,10 @@ impl ControlService for ControlRpcServer {
             update_time: Utc::now(),
         };
 
+        AgentEvent::BlockInfo(info.clone())
+            .with_agent(&agent)
+            .emit(&self);
+
         agent.status.block_info = Some(info.clone());
 
         let agent_id = agent.id();
         let client = agent.client_owned().clone();
@@ -198,7 +202,60 @@ impl ControlService for ControlRpcServer {
             return;
         };
 
-        agent.status.node_status = status;
+        // Prevent redundant events
+        if agent.status.node_status == status {
+            return;
+        }
+
+        agent.status.node_status = status.clone();
+        AgentEvent::NodeStatus(status)
+            .with_agent(&agent)
+            .emit(&self);
+    }
+
+    async fn post_reconcile_status(
+        self,
+        _: context::Context,
+        status: Result<ReconcileStatus<bool>, ReconcileError>,
+    ) {
+        let Some(mut agent) = self.state.pool.get_mut(&self.agent) else {
+            return;
+        };
+
+        agent.status.reconcile = Some((Instant::now(), status.clone()));
+
+        // Emit events for this reconcile
+
+        let ev = AgentEvent::ReconcileComplete.with_agent(&agent);
+        let is_complete = status
+            .as_ref()
+            .is_ok_and(|e| e.requeue_after.is_none() && e.inner.is_some());
+
+        ev.replace_content(match status {
+            Ok(res) => AgentEvent::Reconcile(res),
+            Err(err) => AgentEvent::ReconcileError(err),
+        })
+        .emit(&self);
+
+        if is_complete {
+            ev.emit(&self);
+        }
+    }
+}
+
+pub fn resolve_one_addr(src_addrs: &AgentAddrs, target_addrs: &AgentAddrs) -> Option<IpAddr> {
+    match (
+        src_addrs.external,
+        target_addrs.external,
+        target_addrs.internal.first(),
+    ) {
+        // if peers have the same external address, use the first internal address
+        (Some(src_ext), Some(peer_ext), Some(peer_int)) if src_ext == peer_ext => Some(*peer_int),
+        // if both peers have only internal addresses, use the internal address
+        (None, None, Some(peer_int)) => Some(*peer_int),
+        // otherwise use the external address
+        (_, Some(peer_ext), _)
 => Some(peer_ext),
+        _ => None,
+    }
+}
@@ -207,16 +264,12 @@
 fn resolve_addrs(
     addr_map: &AddrMap,
     src: AgentId,
-    peers: &HashSet<AgentId>,
+    peers: &[AgentId],
 ) -> Result<HashMap<AgentId, IpAddr>, StateError> {
     let src_addrs = addr_map
         .get(&src)
        .ok_or_else(|| StateError::SourceAgentNotFound(src))?;
 
-    let all_internal = addr_map
-        .values()
-        .all(|AgentAddrs { external, .. }| external.is_none());
-
     Ok(peers
         .iter()
         .filter_map(|id| {
@@ -225,24 +278,13 @@ fn resolve_addrs(
             return None;
         }
 
-            // if the agent has no addresses, skip it
-            let addrs = addr_map.get(id)?;
-
-            // if there are no external addresses in the entire addr map,
-            // use the first internal address
-            if all_internal {
-                return addrs.internal.first().copied().map(|addr| (*id, addr));
-            }
-
-            match (src_addrs.external, addrs.external, addrs.internal.first()) {
-                // if peers have the same external address, use the first internal address
-                (Some(src_ext), Some(peer_ext), Some(peer_int)) if src_ext == peer_ext => {
-                    Some((*id, *peer_int))
-                }
-                // otherwise use the external address
-                (_, Some(peer_ext), _) => Some((*id, peer_ext)),
-                _ => None,
-            }
+            Some((*id, resolve_one_addr(src_addrs, addr_map.get(id)?)?))
         })
         .collect())
 }
+
+impl<'a> GetGlobalState<'a> for &'a ControlRpcServer {
+    fn global_state(self) -> &'a GlobalState {
+        &self.state
+    }
+}
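Reviewer note (not part of the diff): extracting `resolve_one_addr` also replaces the old all-internal special case with a direct match on the address pair. A test sketch of the three rules, assuming the `AgentAddrs` fields hold `IpAddr`s as reconstructed above (addresses are illustrative):

```rust
#[cfg(test)]
mod resolve_rules {
    use std::net::{IpAddr, Ipv4Addr};

    use super::resolve_one_addr;
    use crate::state::AgentAddrs;

    fn ip(last: u8) -> IpAddr {
        IpAddr::V4(Ipv4Addr::new(10, 0, 0, last))
    }

    #[test]
    fn rules() {
        // same external address: both agents sit behind one NAT, use internal
        let src = AgentAddrs { external: Some(ip(1)), internal: vec![ip(2)] };
        let tgt = AgentAddrs { external: Some(ip(1)), internal: vec![ip(3)] };
        assert_eq!(resolve_one_addr(&src, &tgt), Some(ip(3)));

        // both internal-only: use the peer's first internal address
        let src = AgentAddrs { external: None, internal: vec![ip(2)] };
        let tgt = AgentAddrs { external: None, internal: vec![ip(3)] };
        assert_eq!(resolve_one_addr(&src, &tgt), Some(ip(3)));

        // otherwise: fall back to the peer's external address
        let src = AgentAddrs { external: None, internal: vec![ip(2)] };
        let tgt = AgentAddrs { external: Some(ip(9)), internal: vec![] };
        assert_eq!(resolve_one_addr(&src, &tgt), Some(ip(9)));
    }
}
```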
diff --git a/crates/controlplane/src/state/agent.rs b/crates/controlplane/src/state/agent.rs
index 57c854a3..7e25f87b 100644
--- a/crates/controlplane/src/state/agent.rs
+++ b/crates/controlplane/src/state/agent.rs
@@ -11,9 +11,12 @@ use rand::{Rng, SeedableRng};
 use rand_chacha::ChaChaRng;
 use serde::{Deserialize, Serialize};
 use snops_common::{
+    events::Event,
     lasso::Spur,
     rpc::control::agent::AgentServiceClient,
-    state::{AgentId, AgentModeOptions, AgentState, AgentStatus, EnvId, NodeState, PortConfig},
+    state::{
+        AgentId, AgentModeOptions, AgentState, AgentStatus, EnvId, NodeKey, NodeState, PortConfig,
+    },
     INTERN,
 };
@@ -177,6 +180,13 @@ impl Agent {
         }
     }
 
+    pub fn node_key(&self) -> Option<&NodeKey> {
+        match &self.state {
+            AgentState::Node(_, state) => Some(&state.node_key),
+            _ => None,
+        }
+    }
+
     /// The ID of this agent.
     pub fn id(&self) -> AgentId {
         self.id
@@ -235,8 +245,10 @@ impl Agent {
     }
 
     /// Set the ports of the agent. This does **not** trigger a reconcile
-    pub fn set_ports(&mut self, ports: PortConfig) {
+    pub fn set_ports(&mut self, ports: PortConfig) -> bool {
+        let changed = self.ports.as_ref() != Some(&ports);
         self.ports = Some(ports);
+        changed
     }
 
     // Gets the bft port of the agent. Assumes the agent is ready, returns 0 if not.
@@ -278,8 +290,11 @@ impl Agent {
     /// Set the external and internal addresses of the agent. This does **not**
     /// trigger a reconcile
-    pub fn set_addrs(&mut self, external: Option<IpAddr>, internal: Vec<IpAddr>) {
-        self.addrs = Some(AgentAddrs { external, internal });
+    pub fn set_addrs(&mut self, external: Option<IpAddr>, internal: Vec<IpAddr>) -> bool {
+        let addrs = AgentAddrs { external, internal };
+        let changed = self.addrs.as_ref() != Some(&addrs);
+        self.addrs = Some(addrs);
+        changed
     }
 
     pub fn map_to_reconcile<F>(&self, f: F) -> PendingAgentReconcile
     where
         F: Fn(NodeState) -> NodeState,
     {
         (
             self.id(),
-            self.client_owned(),
             match &self.state {
                 AgentState::Node(id, state) => AgentState::Node(*id, Box::new(f(*state.clone()))),
                 s => s.clone(),
@@ -302,7 +316,6 @@
     {
         Some((
             self.id(),
-            self.client_owned(),
             match &self.state {
                 AgentState::Node(id, state) => AgentState::Node(*id, Box::new(f(*state.clone())?)),
                 _ => return None,
@@ -318,7 +331,7 @@ pub enum AgentConnection {
 }
 
 /// This is the representation of a public addr or a list of internal addrs.
-#[derive(Debug, Clone, Serialize, Deserialize)]
+#[derive(Debug, Clone, Serialize, Deserialize, PartialEq)]
 pub struct AgentAddrs {
     pub external: Option<IpAddr>,
     pub internal: Vec<IpAddr>,
@@ -336,3 +349,19 @@ impl AgentAddrs {
         self.external.is_some() || !self.internal.is_empty()
     }
 }
+
+pub trait AgentEventHelpers {
+    fn with_agent(self, agent: &Agent) -> Event;
+}
+
+impl<T: Into<Event>> AgentEventHelpers for T {
+    fn with_agent(self, agent: &Agent) -> Event {
+        let mut event = self.into();
+        event.agent = Some(agent.id);
+        if let AgentState::Node(env_id, node) = &agent.state {
+            event.node_key = Some(node.node_key.clone());
+            event.env = Some(*env_id);
+        }
+        event
+    }
+}
diff --git a/crates/controlplane/src/state/error.rs b/crates/controlplane/src/state/error.rs
deleted file mode 100644
index 3728ed08..00000000
--- a/crates/controlplane/src/state/error.rs
+++ /dev/null
@@ -1,10 +0,0 @@
-use snops_common::impl_into_status_code;
-use thiserror::Error;
-
-#[derive(Debug, Error)]
-#[error("batch reconciliation failed with `{failures}` failed reconciliations")]
-pub struct BatchReconcileError {
-    pub failures: usize,
-}
-
-impl_into_status_code!(BatchReconcileError);
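Reviewer note (not part of the diff): `AgentEventHelpers` pairs with the `EmitEvent` and `GetGlobalState` traits added to global.rs below, making event emission a one-liner at RPC call sites. A hypothetical call site using only names introduced in this diff:

```rust
use snops_common::events::AgentEvent;

use crate::state::{Agent, AgentEventHelpers, EmitEvent, GlobalState};

// Hypothetical: mark a reconcile as complete for `agent`.
fn emit_reconcile_complete(state: &GlobalState, agent: &Agent) {
    AgentEvent::ReconcileComplete
        .with_agent(agent) // stamps agent id, env id, and node key onto the event
        .emit(state);      // fans out through GlobalState::events to subscribers
}
```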
diff --git a/crates/controlplane/src/state/global.rs b/crates/controlplane/src/state/global.rs
index 5061fb43..f945bc71 100644
--- a/crates/controlplane/src/state/global.rs
+++ b/crates/controlplane/src/state/global.rs
@@ -1,4 +1,4 @@
-use std::{collections::HashSet, fmt::Display, net::SocketAddr, path::PathBuf, sync::Arc};
+use std::{fmt::Display, net::SocketAddr, path::PathBuf, sync::Arc};
 
 use chrono::Utc;
 use dashmap::DashMap;
@@ -7,13 +7,14 @@ use prometheus_http_query::Client as PrometheusClient;
 use serde::de::DeserializeOwned;
 use snops_common::{
     constant::ENV_AGENT_KEY,
+    events::Event,
     node_targets::NodeTargets,
     state::{
         AgentId, AgentPeer, AgentState, EnvId, LatestBlockInfo, NetworkId, NodeType, StorageId,
     },
     util::OpaqueDebug,
 };
-use tokio::sync::{Mutex, Semaphore};
+use tokio::sync::Semaphore;
 use tracing::info;
 
 use super::{
@@ -25,8 +26,9 @@ use crate::{
     db::Database,
     env::{cache::NetworkCache, error::EnvRequestError, Environment, PortType},
     error::StateError,
+    events::Events,
     schema::storage::{LoadedStorage, STORAGE_DIR},
-    server::{error::StartError, prometheus::HttpsdResponse},
+    server::error::StartError,
     ReloadHandler,
 };
@@ -44,8 +46,8 @@ pub struct GlobalState {
     pub storage: StorageMap,
     pub envs: EnvMap,
     pub env_network_cache: OpaqueDebug<DashMap<EnvId, Arc<NetworkCache>>>,
+    pub events: Events,
 
-    pub prom_httpsd: Mutex<HttpsdResponse>,
     pub prometheus: OpaqueDebug<Option<PrometheusClient>>,
 
     pub log_level_handler: ReloadHandler,
@@ -95,7 +97,7 @@ impl GlobalState {
             pool,
             storage,
             envs: EnvMap::default(),
-            prom_httpsd: Default::default(),
+            events: Default::default(),
             prometheus: OpaqueDebug(prometheus),
             db: OpaqueDebug(db),
             env_network_cache: Default::default(),
@@ -161,13 +163,10 @@ impl GlobalState {
     /// Get a peer-to-addr mapping for a set of agents
     /// Locks pools for reading
-    pub async fn get_addr_map(
-        &self,
-        filter: Option<&HashSet<AgentId>>,
-    ) -> Result<AddrMap, StateError> {
-        self.pool
+    pub async fn get_addr_map(&self, filter: &[AgentId]) -> Result<AddrMap, StateError> {
+        filter
             .iter()
-            .filter(|agent| filter.is_none() || filter.is_some_and(|p| p.contains(&agent.id())))
+            .filter_map(|id| self.pool.get(id))
             .map(|agent| {
                 let addrs = agent
                     .addrs
@@ -359,3 +358,31 @@ impl GlobalState {
         Err(EnvRequestError::NoResponsiveNodes)
     }
 }
+
+pub trait GetGlobalState<'a> {
+    /// Returns the global state.
+    fn global_state(self) -> &'a GlobalState;
+}
+
+impl<'a> GetGlobalState<'a> for &'a GlobalState {
+    fn global_state(self) -> &'a GlobalState {
+        self
+    }
+}
+
+impl<'a> GetGlobalState<'a> for &'a Arc<GlobalState> {
+    fn global_state(self) -> &'a GlobalState {
+        self
+    }
+}
+
+pub trait EmitEvent {
+    fn emit<'a>(self, state: impl GetGlobalState<'a>);
+}
+
+impl EmitEvent for Event {
+    #[inline]
+    fn emit<'a>(self, state: impl GetGlobalState<'a>) {
+        state.global_state().events.emit(self);
+    }
+}
diff --git a/crates/controlplane/src/state/mod.rs b/crates/controlplane/src/state/mod.rs
index 97e5bd32..3773fd45 100644
--- a/crates/controlplane/src/state/mod.rs
+++ b/crates/controlplane/src/state/mod.rs
@@ -5,7 +5,6 @@ use snops_common::state::{AgentId, EnvId, NetworkId, StorageId};
 
 mod agent;
 mod agent_flags;
-pub mod error;
 pub mod external_peers;
 mod global;
 mod reconcile;
diff --git a/crates/controlplane/src/state/reconcile.rs b/crates/controlplane/src/state/reconcile.rs
index 0549af6e..103d1cd4 100644
--- a/crates/controlplane/src/state/reconcile.rs
+++ b/crates/controlplane/src/state/reconcile.rs
@@ -1,20 +1,20 @@
 use std::collections::HashMap;
 
 use futures_util::future::join_all;
-use snops_common::state::{AgentId, AgentState, NodeKey};
+use snops_common::state::{AgentId, AgentState, NodeKey, ReconcileOptions};
 use tracing::{error, info};
 
-use super::{error::BatchReconcileError, AgentClient, GlobalState};
+use super::GlobalState;
 
 /// The tuple to pass into `reconcile_agents`.
-pub type PendingAgentReconcile = (AgentId, Option<AgentClient>, AgentState);
+pub type PendingAgentReconcile = (AgentId, AgentState);
 
 /// Get a node map (key => agent ID) from an agent reconciliation iterator.
 pub fn pending_reconcile_node_map<'a>(
     pending: impl Iterator<Item = &'a PendingAgentReconcile>,
 ) -> HashMap<NodeKey, AgentId> {
     pending
-        .map(|(id, _, state)| match state {
+        .map(|(id, state)| match state {
             AgentState::Node(_, node) => (node.node_key.clone(), *id),
             _ => unreachable!(),
         })
@@ -22,24 +22,22 @@
 }
 
 impl GlobalState {
+    pub async fn update_agent_states(&self, iter: impl IntoIterator<Item = PendingAgentReconcile>) {
+        self.update_agent_states_opts(iter, Default::default())
+            .await;
+    }
+
     /// Reconcile a bunch of agents at once.
-    pub async fn reconcile_agents(
+    pub async fn update_agent_states_opts(
         &self,
         iter: impl IntoIterator<Item = PendingAgentReconcile>,
-    ) -> Result<(), BatchReconcileError> {
-        let mut handles = vec![];
+        opts: ReconcileOptions,
+    ) {
         let mut agent_ids = vec![];
 
-        for (id, client, target) in iter {
-            agent_ids.push(id);
-
-            // if the client is present, queue a reconcile
-            if let Some(client) = client {
-                handles.push(tokio::spawn(async move { client.reconcile(target).await }));
-
-            // otherwise just change the agent state so it'll inventory on
-            // reconnect
-            } else if let Some(mut agent) = self.pool.get_mut(&id) {
+        for (id, target) in iter {
+            if let Some(mut agent) = self.pool.get_mut(&id) {
+                agent_ids.push(id);
                 agent.set_state(target);
                 if let Err(e) = self.db.agents.save(&id, &agent) {
                     error!("failed to save agent {id} to the database: {e}");
@@ -47,54 +45,56 @@
             }
         }
 
+        self.queue_many_reconciles(agent_ids, opts).await;
+    }
+
+    pub async fn queue_many_reconciles(
+        &self,
+        iter: impl IntoIterator<Item = AgentId>,
+        opts: ReconcileOptions,
+    ) -> (usize, usize) {
+        let mut handles = vec![];
+        let mut agent_ids = vec![];
+
+        for id in iter {
+            let agent = self.pool.get(&id);
+            let Some(agent) = agent else {
+                continue;
+            };
+            let Some(client) = agent.client_owned() else {
+                continue;
+            };
+
+            agent_ids.push(id);
+            let target = agent.state.clone();
+
+            handles.push(tokio::spawn(async move {
+                client.set_agent_state(target, opts).await
+            }));
+        }
+
         if handles.is_empty() {
-            return Ok(());
+            return (0, 0);
         }
 
-        let num_reconciliations = handles.len();
+        let num_reqs = handles.len();
 
-        info!("beginning reconciliation...");
+        info!("Requesting reconcile from {num_reqs} agents...");
         let reconciliations = join_all(handles).await;
-        info!("reconciliation complete, updating agent states...");
 
         let mut success = 0;
         for (agent_id, result) in agent_ids.into_iter().zip(reconciliations) {
-            let Some(mut agent) = self.pool.get_mut(&agent_id) else {
-                continue;
-            };
-
             match result {
-                Ok(Ok(Ok(agent_state))) => {
-                    agent.set_state(agent_state);
-                    if let Err(e) = self.db.agents.save(&agent_id, &agent) {
-                        error!("failed to save agent {agent_id} to the database: {e}");
-                    }
-
+                Ok(Ok(())) => {
                     success += 1;
                 }
-                Ok(Ok(Err(e))) => error!(
-                    "agent {} experienced a reconcilation error: {e}",
-                    agent.id(),
-                ),
-
-                Ok(Err(e)) => error!("agent {} experienced a rpc error: {e}", agent.id(),),
-                Err(e) => error!("agent {} experienced a join error: {e}", agent.id(),),
+                Ok(Err(e)) => error!("agent {agent_id} experienced a rpc error: {e}"),
+                Err(e) => error!("join error during agent {agent_id} reconcile request: {e}"),
             }
         }
 
-        info!(
-            "reconciliation result: {success}/{} nodes reconciled",
-            num_reconciliations
-        );
-
-        self.prom_httpsd.lock().await.set_dirty();
+        info!("Requested {success}/{num_reqs} agents");
 
-        if success == num_reconciliations {
-            Ok(())
-        } else {
-            Err(BatchReconcileError {
-                failures: num_reconciliations - success,
-            })
-        }
+        (success, num_reqs)
     }
 }
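Reviewer note (not part of the diff): reconciliation is now two-phase. The desired state is persisted first, then connected agents are asked to converge; disconnected agents pick the state up on reconnect instead of failing the batch. A hypothetical caller under the new API:

```rust
use snops_common::state::{AgentId, AgentState};

use crate::state::GlobalState;

// Persists AgentState::Inventory for every agent, then queues reconciles
// with default ReconcileOptions; offline agents converge on reconnect.
async fn inventory_agents(state: &GlobalState, ids: Vec<AgentId>) {
    state
        .update_agent_states(ids.into_iter().map(|id| (id, AgentState::Inventory)))
        .await;
}
```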
diff --git a/crates/controlplane/src/state/rpc.rs b/crates/controlplane/src/state/rpc.rs
index 6123d9b2..f55f82ae 100644
--- a/crates/controlplane/src/state/rpc.rs
+++ b/crates/controlplane/src/state/rpc.rs
@@ -2,11 +2,10 @@ use std::{fmt::Display, time::Duration};
 
 use serde::de::DeserializeOwned;
 use snops_common::{
-    rpc::{
-        control::agent::AgentServiceClient,
-        error::{ReconcileError, SnarkosRequestError},
+    rpc::{control::agent::AgentServiceClient, error::SnarkosRequestError},
+    state::{
+        snarkos_status::SnarkOSLiteBlock, AgentId, AgentState, EnvId, NetworkId, ReconcileOptions,
     },
-    state::{snarkos_status::SnarkOSLiteBlock, AgentState, EnvId, NetworkId},
 };
 use tarpc::{client::RpcError, context};
@@ -16,16 +15,16 @@ use crate::error::StateError;
 pub struct AgentClient(pub(crate) AgentServiceClient);
 
 impl AgentClient {
-    pub async fn reconcile(
+    pub async fn set_agent_state(
         &self,
         to: AgentState,
-    ) -> Result<Result<AgentState, ReconcileError>, RpcError> {
-        let mut ctx = context::current();
-        ctx.deadline += Duration::from_secs(300);
-        self.0
-            .reconcile(ctx, to.clone())
-            .await
-            .map(|res| res.map(|_| to))
+        opts: ReconcileOptions,
+    ) -> Result<(), RpcError> {
+        self.0.set_agent_state(context::current(), to, opts).await
+    }
+
+    pub async fn clear_peer_addr(&self, peer: AgentId) -> Result<(), RpcError> {
+        self.0.clear_peer_addr(context::current(), peer).await
     }
 
     pub async fn snarkos_get<T: DeserializeOwned>(
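Reviewer note (not part of the diff): `set_agent_state` is now fire-and-forget. The 300-second reconcile deadline is gone, and completion is reported asynchronously by the agent through `post_reconcile_status` (server/rpc.rs above) as `Reconcile`/`ReconcileComplete` events. A hypothetical caller, assuming `ReconcileOptions` implements `Default` as its use above suggests:

```rust
use snops_common::state::{AgentState, ReconcileOptions};
use tarpc::client::RpcError;

use crate::state::AgentClient;

// Returns once the agent acknowledges the new target state; progress and
// completion arrive later as events, not in this return value.
async fn request_reconcile(client: &AgentClient, target: AgentState) -> Result<(), RpcError> {
    client.set_agent_state(target, ReconcileOptions::default()).await
}
```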
diff --git a/crates/controlplane/src/state/transactions.rs b/crates/controlplane/src/state/transactions.rs
index 18f0205f..6f54d1a1 100644
--- a/crates/controlplane/src/state/transactions.rs
+++ b/crates/controlplane/src/state/transactions.rs
@@ -2,15 +2,15 @@ use std::{sync::Arc, time::Duration};
 
 use chrono::{TimeDelta, Utc};
 use futures_util::future;
-use snops_common::state::{CannonId, EnvId};
+use snops_common::{
+    events::{EventHelpers, TransactionEvent},
+    state::{CannonId, EnvId, TransactionSendState},
+};
 use tokio::time::timeout;
 use tracing::{info, trace};
 
-use super::GlobalState;
-use crate::cannon::{
-    status::{TransactionSendState, TransactionStatusSender},
-    tracker::TransactionTracker,
-};
+use super::{EmitEvent, GlobalState};
+use crate::cannon::tracker::TransactionTracker;
 
 /// This task re-sends all transactions that have not been confirmed,
 /// re-computes all transactions that have not been computed, and removes
@@ -33,7 +33,7 @@ pub async fn tracking_task(state: Arc<GlobalState>) {
             for tx_id in pending.to_execute {
                 if let Err(e) = cannon
                     .auth_sender
-                    .send((tx_id.clone(), TransactionStatusSender::empty()))
+                    .send(tx_id.clone())
                 {
                     tracing::error!(
                         "cannon {env_id}.{cannon_id} failed to send auth {tx_id} to cannon: {e:?}"
@@ -57,30 +57,33 @@ pub async fn tracking_task(state: Arc<GlobalState>) {
                 let state = state.clone();
                 let cannon_target = cannon.sink.target.as_ref();
                 async move {
-                    if let Some(cache) = state.env_network_cache.get(&env_id) {
-                        if cache.has_transaction(&tx_id) {
-                            trace!("cannon {env_id}.{cannon_id} confirmed transaction {tx_id} (cache hit)");
-                            return Some(tx_id);
-                        }
+                    let (tx_id, hash) = if let Some(hash) = state.env_network_cache.get(&env_id).and_then(|cache| cache.find_transaction(&tx_id).cloned()) {
+                        trace!("cannon {env_id}.{cannon_id} confirmed transaction {tx_id} (cache hit)");
+                        (tx_id, hash.to_string())
                     }
                     // check if the transaction not is in the cache, then check the peers
-                    if let Some(target) = cannon_target {
+                    else if let Some(target) = cannon_target {
                         match timeout(Duration::from_secs(1), state.snarkos_get::<Option<String>>(env_id, format!("/find/blockHash/{tx_id}"), target)).await {
-                            Ok(Ok(Some(_hash))) => {
+                            Ok(Ok(Some(hash))) => {
                                 trace!("cannon {env_id}.{cannon_id} confirmed transaction {tx_id} (get request)");
-                                return Some(tx_id)
-                            }
-                            Ok(Ok(None)) => {
-                                // the transaction is not in the cache
+                                (tx_id, hash)
                             }
-                            _ => {}
+                            // the transaction is not in the cache
+                            _ => return None,
                         }
+                    } else {
+                        return None;
+                    };
-
-                    }
+                    // Emit a confirmed event
+                    TransactionEvent::Confirmed { hash }
+                        .with_cannon(cannon_id)
+                        .with_env_id(env_id)
+                        .with_transaction(Arc::clone(&tx_id)).emit(&state);
 
-                    None
+                    Some(tx_id)
                 }})).await;
 
             // remove all the transactions that are confirmed or expired
@@ -100,10 +103,10 @@ pub async fn tracking_task(state: Arc<GlobalState>) {
 }
 
 struct PendingTransactions {
-    to_execute: Vec<String>,
-    to_broadcast: Vec<String>,
-    to_remove: Vec<String>,
-    to_confirm: Vec<(String, Option<u32>)>,
+    to_execute: Vec<Arc<String>>,
+    to_broadcast: Vec<Arc<String>>,
+    to_remove: Vec<Arc<String>>,
+    to_confirm: Vec<(Arc<String>, Option<u32>)>,
 }
 
 /// Get a list of transactions that need to be executed, broadcasted, removed,
@@ -125,15 +128,22 @@ fn get_pending_transactions(state: &GlobalState) -> Vec<((EnvId, CannonId), Pend
             for tx in cannon.transactions.iter() {
                 let tx_id = tx.key().to_owned();
-                let key = (env_id, cannon_id, tx_id.to_owned());
+                let key = (env_id, cannon_id, Arc::clone(&tx_id));
                 let attempts = TransactionTracker::get_attempts(state, &key);
 
+                let ev = TransactionEvent::Executing
+                    .with_cannon(cannon_id)
+                    .with_env_id(env_id)
+                    .with_transaction(Arc::clone(&tx_id));
+
                 match tx.status {
                     // any authorized transaction that is not started should be queued
                     TransactionSendState::Authorized => {
                         if cannon.sink.authorize_attempts.is_some_and(|a| attempts > a) {
                             info!("cannon {env_id}.{cannon_id} removed auth {tx_id} (too many attempts)");
                             to_remove.push(tx_id);
+                            ev.replace_content(TransactionEvent::ExecuteExceeded { attempts })
+                                .emit(state);
                         } else {
                             to_execute.push((tx_id, tx.index));
                         }
@@ -145,6 +155,8 @@ fn get_pending_transactions(state: &GlobalState) -> Vec<((EnvId, CannonId), Pend
                     {
                         if cannon.sink.authorize_attempts.is_some_and(|a| attempts > a) {
                             info!("cannon {env_id}.{cannon_id} removed auth {tx_id} (too many attempts)");
+                            ev.replace_content(TransactionEvent::ExecuteExceeded { attempts })
+                                .emit(state);
                             to_remove.push(tx_id);
                         } else {
                             to_execute.push((tx_id, tx.index));
@@ -154,6 +166,8 @@
                     TransactionSendState::Unsent => {
                         if cannon.sink.broadcast_attempts.is_some_and(|a| attempts > a) {
                             info!("cannon {env_id}.{cannon_id} removed broadcast {tx_id} (too many attempts)");
+                            ev.replace_content(TransactionEvent::BroadcastExceeded { attempts })
+                                .emit(state);
                             to_remove.push(tx_id);
                         } else {
                             to_broadcast.push((tx_id, tx.index));
@@ -190,6 +204,10 @@
                     {
                         if cannon.sink.broadcast_attempts.is_some_and(|a| attempts > a) {
                             info!("cannon {env_id}.{cannon_id} removed broadcast {tx_id} (too many attempts)");
+                            ev.replace_content(TransactionEvent::BroadcastExceeded {
+                                attempts,
+                            })
+                            .emit(state);
                             to_remove.push(tx_id);
                         } else {
                             to_broadcast.push((tx_id, tx.index));
diff --git a/crates/xtask/src/main.rs b/crates/xtask/src/main.rs
index 2c482662..6b395844 100644
--- a/crates/xtask/src/main.rs
+++ b/crates/xtask/src/main.rs
@@ -268,7 +268,11 @@ fn clipages(sh: &Shell) -> Result<()> {
 fn manpages(sh: &Shell) -> Result<()> {
     cmd!(sh, "cargo run -p snarkos-aot --features=docpages -- man").run()?;
     cmd!(sh, "cargo run -p snops --features=docpages -- man").run()?;
-    cmd!(sh, "cargo run -p snops-agent --features=docpages -- man").run()?;
+    cmd!(
+        sh,
+        "cargo run -p snops-agent --features=docpages -- --id foo man"
+    )
+    .run()?;
     cmd!(sh, "cargo run -p snops-cli --features=docpages -- man").run()?;
     Ok(())
 }
diff --git a/index.html b/index.html
index 10cc8d26..33179779 100644
--- a/index.html
+++ b/index.html
@@ -208,17 +208,17 @@

Easy Setup

run snarkOS, or what transactions to execute.

  •  - In another terminal, build the cli: cargo install --path ./crates/snops-cli
     + In another terminal, install the cli: cargo install --path ./crates/snops-cli, or build it with cargo xtask build cli and use it from target/release-big/snops-cli.

     The cli is used to interact with the controlplane and manage environments. It provides JSON-based output. We recommend pairing our cli with jq when leveraging other scripts and tools.

  •  - Build the agent: cargo build --profile release-big -p snops-agent
     + Build the agent: cargo xtask build agent

     The agent is a lightweight service that starts up snarkos-aot, which automatically configures snarkos nodes or executes transactions.

  •  - Build snarkos-aot (for running nodes): cargo build --profile release-big -p snarkos-aot
     + Build snarkos-aot (for running nodes): cargo xtask build aot

     snarkos-aot is an alternative snarkOS CLI providing more developer-oriented features as well as tooling for distributed transaction generation and execution.

  •
@@ -233,14 +233,14 @@

    Easy Setup

    agent configuration should connect to a locally operated controlplane.

    - Local Isonets
    - This example requires 4 agents and the control plane to be running.
    + Local Isolated Networks (Isonets)
    + This example requires 4 agents and the control plane to be running. It allows you to run a devnet with a custom genesis block.

    -  1. Start the environment: snops-cli env prepare specs/test-4-validators.yaml
    +  1. Start the environment: snops-cli env apply specs/test-4-validators.yaml
       2. Check the current network height: snops-cli env height
       3. Look at the latest block: snops-cli env block
       4. Look at the genesis block: snops-cli env block 0
    -  5. Stop the environment: snops-cli env clean
    +  5. Stop the environment: snops-cli env delete

    Isonet Transfers

    Using the setup for a Local Isonet, executing Aleo programs has never @@ -328,7 +328,7 @@

    SnarkOS-aot Quickstart

    snarkos-aot provides various CLI tools to help with developing and executing Aleo programs as well as interact with snarkOS ledgers.

    - Build snarkos-aot with: cargo install --profile release-big -p snarkos-aot.
    + Build snarkos-aot with: cargo xtask build aot. The compiled binary can be found in target/release-big/snarkos-aot.

    Use the NETWORK environment variable to specify mainnet (default), testnet, or canary.

diff --git a/sdk_ts/index.ts b/sdk_ts/index.ts
index 0e7f215d..b34db8a1 100644
--- a/sdk_ts/index.ts
+++ b/sdk_ts/index.ts
@@ -141,8 +141,8 @@ class SnopsApi {
     return await this.get(`env/${env_id}/agents/${node_ty}/${node_key}`);
   }
 
-  async envPrepare(env_id: string, prepare: any): Promise<any> {
-    return this.post(`env/${env_id}/prepare`, prepare);
+  async envApply(env_id: string, prepare: any): Promise<any> {
+    return this.post(`env/${env_id}/apply`, prepare);
   }
 
   async envInfo(env_id: string): Promise<any> {
@@ -332,7 +332,7 @@ class Env {
   }
 
   async prepare(prepare: any) {
-    return await this.api.envPrepare(this.env_id, prepare);
+    return await this.api.envApply(this.env_id, prepare);
   }
 
   async info() {
diff --git a/snops_book/architecture/CONTROL_PLANE.md b/snops_book/architecture/CONTROL_PLANE.md
index 02a2099e..2724e6ab 100644
--- a/snops_book/architecture/CONTROL_PLANE.md
+++ b/snops_book/architecture/CONTROL_PLANE.md
@@ -83,7 +83,7 @@ have connected to it.
 
 Agents have two States:
 
 - _Inventoried_: An agent is in inventory mode if it is not currently running a snarkOS node.
-- _Associated_: It becomes associated with an **environment** when one is prepared. As the control plane will delegate agents in inventory to the **environment**.
+- _Associated_: It becomes associated with an **environment** when one is applied, as the control plane will delegate agents in inventory to the **environment**.
 
 ### Metrics and Logging
diff --git a/snops_book/user_guide/clis/SNARKOS_AOT.md b/snops_book/user_guide/clis/SNARKOS_AOT.md
index 4c46858c..b5254f3f 100644
--- a/snops_book/user_guide/clis/SNARKOS_AOT.md
+++ b/snops_book/user_guide/clis/SNARKOS_AOT.md
@@ -24,7 +24,6 @@ This document contains the help content for the `snarkos-aot` command-line progr
 * [`snarkos-aot ledger checkpoint apply`↴](#snarkos-aot-ledger-checkpoint-apply)
 * [`snarkos-aot ledger checkpoint view`↴](#snarkos-aot-ledger-checkpoint-view)
 * [`snarkos-aot ledger checkpoint clean`↴](#snarkos-aot-ledger-checkpoint-clean)
-* [`snarkos-aot run`↴](#snarkos-aot-run)
 * [`snarkos-aot auth`↴](#snarkos-aot-auth)
 * [`snarkos-aot auth execute`↴](#snarkos-aot-auth-execute)
 * [`snarkos-aot auth program`↴](#snarkos-aot-auth-program)
@@ -39,6 +38,7 @@ This document contains the help content for the `snarkos-aot` command-line progr
 * [`snarkos-aot program cost`↴](#snarkos-aot-program-cost)
 * [`snarkos-aot man`↴](#snarkos-aot-man)
 * [`snarkos-aot md`↴](#snarkos-aot-md)
+* [`snarkos-aot run`↴](#snarkos-aot-run)
 
 ## `snarkos-aot`
 
@@ -51,11 +51,11 @@
 The different AOT commands
 
 * `genesis` — This command helps generate a custom genesis block given an initial private key, seed, and committee size
 * `accounts` — Given a seed and a count, generate a number of accounts
 * `ledger` — Commands for interacting with the ledger
-* `run` — A wrapper around the snarkos node run commands that provide additional logging and configurability
 * `auth` — A command to help generate various different types of authorizations and execute them
 * `program` — A command to help gather information about a program, including its cost and imports
 * `man` — For generating cli manpages. Only with the mangen feature enabled
 * `md` — For generating cli markdown.
Only with the clipages feature enabled +* `run` — A wrapper around the snarkos node run commands that provide additional logging and configurability ###### **Options:** @@ -374,46 +374,6 @@ Cleanup old checkpoints -## `snarkos-aot run` - -A wrapper around the snarkos node run commands that provide additional logging and configurability - -**Usage:** `snarkos-aot run [OPTIONS] --ledger --type <--private-key |--private-key-file >` - -###### **Options:** - -* `-g`, `--genesis ` — A path to the genesis block to initialize the ledger from -* `-l`, `--ledger ` — The ledger from which to view a block - - Default value: `./ledger` -* `-t`, `--type ` — The type of node to run: validator, prover, or client -* `--private-key ` — Specify the account private key of the node -* `--private-key-file ` — Specify the account private key of the node -* `--bind ` — Specify the IP(v4 or v6) address to bind to - - Default value: `0.0.0.0` -* `--node ` — Specify the IP address and port for the node server - - Default value: `4130` -* `--bft ` — Specify the IP address and port for the BFT - - Default value: `5000` -* `--rest ` — Specify the IP address and port for the REST server - - Default value: `3030` -* `--metrics ` — Specify the port for the metrics server - - Default value: `9000` -* `--peers ` — Specify the IP address and port of the peer(s) to connect to -* `--validators ` — Specify the IP address and port of the validator(s) to connect to -* `--rest-rps ` — Specify the requests per second (RPS) rate limit per IP for the REST server - - Default value: `1000` -* `--retention-policy ` — The retention policy for the checkpoint manager. i.e. how often to create checkpoints -* `--agent-rpc-port ` — When present, connects to an agent RPC server on the given port - - - ## `snarkos-aot auth` A command to help generate various different types of authorizations and execute them @@ -489,6 +449,9 @@ Authorize a program execution * `--record ` — The record for a private fee * `-q`, `--query ` — Query to load the program with * `--seed ` — The seed to use for the authorization generation +* `--cost-v1` — Enable cost v1 for the transaction cost estimation (v2 by default) + + Default value: `false` @@ -512,6 +475,9 @@ Authorize the fee for a program execution * `-i`, `--id ` — The ID of the deployment or program execution * `-c`, `--cost ` — Estimated cost of the deployment or program execution * `--seed ` — The seed to use for the authorization generation +* `--cost-v1` — Enable cost v1 for the transaction cost estimation (v2 by default) + + Default value: `false` @@ -554,11 +520,14 @@ Estimate the cost of a program execution or deployment ###### **Options:** -* `--query ` — The query to use for the program +* `-q`, `--query ` — The query to use for the program * `-a`, `--auth ` — Authorization for an execution of some kind * `-f`, `--fee-auth ` — The optional fee authorization for said execution * `-o`, `--owner ` — The owner of the program if deploying * `-d`, `--deployment ` — The deployment of the program if deploying +* `--cost-v1` — Enable cost v1 for the transaction cost estimation (v2 by default) + + Default value: `false` @@ -585,6 +554,9 @@ Deploy a program to the network * `--record ` — The record for a private fee * `-q`, `--query ` — The query to use for the program * `--seed ` — The seed to use for the authorization generation +* `--cost-v1` — Enable cost v1 for the transaction cost estimation (v2 by default) + + Default value: `false` @@ -666,6 +638,9 @@ Compute the cost to execute a function in a given 
program ###### **Options:** * `-q`, `--query ` — Query to load the program with +* `--cost-v1` — Enable cost v1 for the transaction cost estimation (v2 by default) + + Default value: `false` @@ -697,6 +672,46 @@ For generating cli markdown. Only with the clipages feature enabled +## `snarkos-aot run` + +A wrapper around the snarkos node run commands that provide additional logging and configurability + +**Usage:** `snarkos-aot run [OPTIONS] --ledger --type <--private-key |--private-key-file >` + +###### **Options:** + +* `-g`, `--genesis ` — A path to the genesis block to initialize the ledger from +* `-l`, `--ledger ` — The ledger from which to view a block + + Default value: `./ledger` +* `-t`, `--type ` — The type of node to run: validator, prover, or client +* `--private-key ` — Specify the account private key of the node +* `--private-key-file ` — Specify the account private key of the node +* `--bind ` — Specify the IP(v4 or v6) address to bind to + + Default value: `0.0.0.0` +* `--node ` — Specify the IP address and port for the node server + + Default value: `4130` +* `--bft ` — Specify the IP address and port for the BFT + + Default value: `5000` +* `--rest ` — Specify the IP address and port for the REST server + + Default value: `3030` +* `--metrics ` — Specify the port for the metrics server + + Default value: `9000` +* `--peers ` — Specify the IP address and port of the peer(s) to connect to +* `--validators ` — Specify the IP address and port of the validator(s) to connect to +* `--rest-rps ` — Specify the requests per second (RPS) rate limit per IP for the REST server + + Default value: `1000` +* `--retention-policy ` — The retention policy for the checkpoint manager. i.e. how often to create checkpoints +* `--agent-rpc-port ` — When present, connects to an agent RPC server on the given port + + +
    diff --git a/snops_book/user_guide/clis/SNOPS_CLI.md b/snops_book/user_guide/clis/SNOPS_CLI.md index f5d6bf53..305d778e 100644 --- a/snops_book/user_guide/clis/SNOPS_CLI.md +++ b/snops_book/user_guide/clis/SNOPS_CLI.md @@ -12,6 +12,7 @@ This document contains the help content for the `snops-cli` command-line program * [`snops-cli agent kill`↴](#snops-cli-agent-kill) * [`snops-cli agent list`↴](#snops-cli-agent-list) * [`snops-cli agent tps`↴](#snops-cli-agent-tps) +* [`snops-cli agent status`↴](#snops-cli-agent-status) * [`snops-cli agent set-log-level`↴](#snops-cli-agent-set-log-level) * [`snops-cli agent set-snarkos-log-level`↴](#snops-cli-agent-set-snarkos-log-level) * [`snops-cli env`↴](#snops-cli-env) @@ -30,17 +31,18 @@ This document contains the help content for the `snops-cli` command-line program * [`snops-cli env height`↴](#snops-cli-env-height) * [`snops-cli env transaction`↴](#snops-cli-env-transaction) * [`snops-cli env transaction-details`↴](#snops-cli-env-transaction-details) -* [`snops-cli env clean`↴](#snops-cli-env-clean) +* [`snops-cli env delete`↴](#snops-cli-env-delete) * [`snops-cli env info`↴](#snops-cli-env-info) * [`snops-cli env list`↴](#snops-cli-env-list) * [`snops-cli env topology`↴](#snops-cli-env-topology) * [`snops-cli env topology-resolved`↴](#snops-cli-env-topology-resolved) -* [`snops-cli env prepare`↴](#snops-cli-env-prepare) +* [`snops-cli env apply`↴](#snops-cli-env-apply) * [`snops-cli env mapping`↴](#snops-cli-env-mapping) * [`snops-cli env mappings`↴](#snops-cli-env-mappings) * [`snops-cli env program`↴](#snops-cli-env-program) * [`snops-cli env storage`↴](#snops-cli-env-storage) * [`snops-cli set-log-level`↴](#snops-cli-set-log-level) +* [`snops-cli events`↴](#snops-cli-events) * [`snops-cli man`↴](#snops-cli-man) * [`snops-cli md`↴](#snops-cli-md) @@ -53,7 +55,8 @@ This document contains the help content for the `snops-cli` command-line program * `autocomplete` — Generate shell completions * `agent` — For interacting with snop agents * `env` — For interacting with snop environments -* `set-log-level` — +* `set-log-level` — +* `events` — Listen to events from the control plane, optionally filtered * `man` — For generating cli manpages. Only with the mangen feature enabled * `md` — For generating cli markdown. Only with the clipages feature enabled @@ -93,8 +96,9 @@ For interacting with snop agents * `kill` — Kill the specific agent * `list` — List all agents. 
Ignores the agent id * `tps` — Get the specific agent's TPS -* `set-log-level` — -* `set-snarkos-log-level` — +* `status` — Get the specific agent's status +* `set-log-level` — Set the log level of the agent +* `set-snarkos-log-level` — Set the log level of the node running on an agent ###### **Arguments:** @@ -156,8 +160,18 @@ Get the specific agent's TPS +## `snops-cli agent status` + +Get the specific agent's status + +**Usage:** `snops-cli agent status` + + + ## `snops-cli agent set-log-level` +Set the log level of the agent + **Usage:** `snops-cli agent set-log-level ` ###### **Arguments:** @@ -168,6 +182,8 @@ Get the specific agent's TPS ## `snops-cli agent set-snarkos-log-level` +Set the log level of the node running on an agent + **Usage:** `snops-cli agent set-snarkos-log-level ` ###### **Arguments:** @@ -184,21 +200,21 @@ For interacting with snop environments ###### **Subcommands:** -* `action` — Actions you can apply on a specific environment +* `action` — Run an action on an environment * `agent` — Get an env's specific agent by * `agents` — List an env's agents -* `auth` — +* `auth` — * `balance` — Lookup an account's balance * `block` — Lookup a block or get the latest block * `height` — Get the latest height from all agents in the env * `transaction` — Lookup a transaction's block by a transaction id * `transaction-details` — Lookup a transaction's details by a transaction id -* `clean` — Clean a specific environment +* `delete` — Delete a specific environment * `info` — Get an env's latest block/state root info * `list` — List all environments. Ignores the env id * `topology` — Show the current topology of a specific environment * `topology-resolved` — Show the resolved topology of a specific environment. Shows only internal agents -* `prepare` — Prepare a (test) environment +* `apply` — Apply an environment spec * `mapping` — Lookup a mapping by program id and mapping name * `mappings` — Lookup a program's mappings only * `program` — Lookup a program by its id @@ -214,7 +230,7 @@ For interacting with snop environments ## `snops-cli env action` -Actions you can apply on a specific environment +Run an action on an environment **Usage:** `snops-cli env action ` @@ -233,11 +249,15 @@ Actions you can apply on a specific environment Turn the specified agents(and nodes) offline -**Usage:** `snops-cli env action offline [NODES]...` +**Usage:** `snops-cli env action offline [OPTIONS] [NODES]...` ###### **Arguments:** -* `` +* `` — The nodes to take offline. (eg. `validator/any`) + +###### **Options:** + +* `--async` — When present, don't wait for reconciles to finish before returning @@ -245,11 +265,15 @@ Turn the specified agents(and nodes) offline Turn the specified agents(and nodes) online -**Usage:** `snops-cli env action online [NODES]...` +**Usage:** `snops-cli env action online [OPTIONS] [NODES]...` ###### **Arguments:** -* `` +* `` — The nodes to turn online (eg. `validator/any`) + +###### **Options:** + +* `--async` — When present, don't wait for reconciles to finish before returning @@ -257,11 +281,15 @@ Turn the specified agents(and nodes) online Reboot the specified agents(and nodes) -**Usage:** `snops-cli env action reboot [NODES]...` +**Usage:** `snops-cli env action reboot [OPTIONS] [NODES]...` ###### **Arguments:** -* `` +* `` — The nodes to reboot (eg. `validator/any`) + +###### **Options:** + +* `--async` — When present, don't wait for reconciles to finish before returning @@ -278,7 +306,7 @@ Execute an aleo program function on the environment. i.e. 
credits.aleo/transfer_ ###### **Options:** -* `-p`, `--private-key ` — Private key to use, can be `committee.0` to use committee member 0's key +* `--private-key ` — Private key to use, can be `committee.0` to use committee member 0's key * `--fee-private-key ` — Private key to use for the fee. Defaults to the same as --private-key * `-c`, `--cannon ` — Desired cannon to fire the transaction * `--priority-fee ` — The optional priority fee to use @@ -316,7 +344,7 @@ Configure the state of the target nodes ###### **Arguments:** -* `` — The nodes to configure +* `` — The nodes to configure. (eg. `validator/any`) ###### **Options:** @@ -327,6 +355,11 @@ Configure the state of the target nodes * `--height ` — Configure the height of the target nodes * `-p`, `--peers ` — Configure the peers of the target nodes, or `none` * `-v`, `--validators ` — Configure the validators of the target nodes, or `none` +* `-e`, `--env ` — Set environment variables for a node: `--env FOO=bar` +* `-d`, `--del-env ` +* `-b`, `--binary ` — Configure the binary for a node +* `--private-key ` — Configure the private key for a node +* `--async` @@ -425,11 +458,11 @@ Lookup a transaction's details by a transaction id -## `snops-cli env clean` +## `snops-cli env delete` -Clean a specific environment +Delete a specific environment -**Usage:** `snops-cli env clean` +**Usage:** `snops-cli env delete` @@ -465,15 +498,19 @@ Show the resolved topology of a specific environment. Shows only internal agents -## `snops-cli env prepare` +## `snops-cli env apply` -Prepare a (test) environment +Apply an environment spec -**Usage:** `snops-cli env prepare ` +**Usage:** `snops-cli env apply [OPTIONS] ` ###### **Arguments:** -* `` — The test spec file +* `` — The environment spec file + +###### **Options:** + +* `--async` — When present, don't wait for reconciles to finish before returning @@ -533,6 +570,20 @@ Get an env's storage info +## `snops-cli events` + +Listen to events from the control plane, optionally filtered + +**Usage:** `snops-cli events [FILTER]` + +###### **Arguments:** + +* `` — The event filter to apply, such as `agent-connected` or `all-of(env-is(default),node-target-is(validator/any))` + + Default value: `unfiltered` + + + ## `snops-cli man` For generating cli manpages. Only with the mangen feature enabled diff --git a/snops_book/user_guide/envs/CANNONS.md b/snops_book/user_guide/envs/CANNONS.md index 0b3d6963..9a28f8de 100644 --- a/snops_book/user_guide/envs/CANNONS.md +++ b/snops_book/user_guide/envs/CANNONS.md @@ -7,7 +7,7 @@ The cannon document is an optional where you can specify: - where to send transactions (to a file, or a node in the topology) -The cannon document is not required for a `environment` to run, but the document needs to be present at `prepare` time to work. +The cannon document is not required for a `environment` to run, but the document needs to be present at `apply` time to work. This document is required if you want to use a [cannon timeline action](TIMELINES.md#cannon). diff --git a/snops_book/user_guide/envs/STORAGE.md b/snops_book/user_guide/envs/STORAGE.md index 15a70aa6..1f2ccfa9 100644 --- a/snops_book/user_guide/envs/STORAGE.md +++ b/snops_book/user_guide/envs/STORAGE.md @@ -94,7 +94,7 @@ The size of the binary in bytes. An optional number used if you want to wipe the old storage. -The recommendation is to increment this number, `clean` the env, and then `prepare` it again. +The recommendation is to increment this number, `delete` the env, and then `apply` it again. 
The default value is `0`. diff --git a/snops_book/user_guide/running/README.md b/snops_book/user_guide/running/README.md index 77a32e27..6a3ee8f9 100644 --- a/snops_book/user_guide/running/README.md +++ b/snops_book/user_guide/running/README.md @@ -38,7 +38,7 @@ Additionally you can enable [metrics and logging](./METRICS_AND_LOGGING.md), to ### Starting agents -