fix(bandwidth_scheduler) - Remove BandwidthRequestValues which can ne…

…ver be granted (#12747) As @matejpavlovic pointed out in his NEP review, the last few `BandwidthRequestValues` can never be granted because granting them along with base bandwidth would always cause the shard to exceed `max_shard_bandwidth`. In the current configuration with 6 shards the `base_bandwidth` is set to `61139` and the last few values are `4194304, 4278056, 4389028, 4500000`. But `4278056` can never be granted on a link because `4278056 + 5 * 61139 = 4583751`, which is more than `max_shard_bandwidth` (4_500_000). That makes the last three values useless, they just take up space that could be used for more precise bandwidth requests. Let's solve it by adding a new parameter to `BandwidthSchedulerParams`: `max_single_grant` `max_single_grant` describes the maximum amount of bandwidth that can be granted on a single link. `BandwidthRequestValues` now go only up to `max_single_grant`, as there is no point in requesting more than the maximum that can be granted. We can also get rid of the weird logic of setting one value to `max_receipt_size` because the last value always corresponds to `max_single_grant`, and that's enough to send a max size receipt. Base bandwidth is now calculated using `max_single_grant` instead of `max_receipt_size`. I think it makes much more sense than the previous logic with `max_receipt_size`. Decoupling `max_receipt_size` and `max_single_grant` is also more future proof, it'll keep working even if we reduce `max_receipt_size` to say 1MB in the future. The PR is divided into commits for easier review. --------- Co-authored-by: Matej Pavlovic <[email protected]>
near · Jan 17, 2025 · 515b5fa · 515b5fa
1 parent b6f82d6
commit 515b5fa
Show file tree

Hide file tree

Showing 4 changed files with 83 additions and 88 deletions.
diff --git a/core/primitives/src/bandwidth_scheduler.rs b/core/primitives/src/bandwidth_scheduler.rs
@@ -145,10 +145,11 @@ impl BandwidthRequest {
         let mut bitmap = BandwidthRequestBitmap::new();
         let values = BandwidthRequestValues::new(params).values;
 
+        // Find the first value which allows to send a max size receipt
         let max_receipt_size_value_pos = values
             .iter()
-            .position(|&value| value == params.max_receipt_size)
-            .expect("max_receipt_size should be in the values list");
+            .position(|&value| value >= params.max_receipt_size)
+            .expect("max_receipt_size is less than max_single_grant, a value should be found");
         bitmap.set_bit(max_receipt_size_value_pos, true);
 
         BandwidthRequest { to_shard: to_shard.into(), requested_values_bitmap: bitmap }
@@ -177,7 +178,7 @@ fn interpolate(min: u64, max: u64, i: u64, n: u64) -> u64 {
 impl BandwidthRequestValues {
     pub fn new(params: &BandwidthSchedulerParams) -> BandwidthRequestValues {
         // values[-1] = base_bandwidth
-        // values[values.len() - 1] = max_shard_bandwidth
+        // values[values.len() - 1] = max_single_grant
         // values[i] = linear interpolation between values[-1] and values[values.len() - 1]
         // TODO(bandwidth_scheduler) - consider using exponential interpolation.
         let mut values = [0; BANDWIDTH_REQUEST_VALUES_NUM];
@@ -187,32 +188,8 @@ impl BandwidthRequestValues {
         for i in 0..values.len() {
             let i_u64: u64 = i.try_into().expect("Converting usize to u64 shouldn't fail");
 
-            values[i] = interpolate(
-                params.base_bandwidth,
-                params.max_shard_bandwidth,
-                i_u64 + 1,
-                values_len,
-            );
-        }
-
-        // The value that is closest to max_receipt_size is set to max_receipt_size.
-        // Without this we could end up in a situation where one shard wants to send
-        // a maximum size receipt to another and requests the corresponding value from the list,
-        // but the sum of this value and base bandwidth exceeds max_shard_bandwidth.
-        // There's a guarantee that (num_shards - 1) * base_bandwidth + max_receipt_size <= max_shard_bandwidth,
-        // but there's no guarantee that (num_shards - 1) * base_bandwidth + request_value <= max_shard_bandwidth.
-        let mut closest_to_max: Bandwidth = 0;
-        for value in &values {
-            if value.abs_diff(params.max_receipt_size)
-                < closest_to_max.abs_diff(params.max_receipt_size)
-            {
-                closest_to_max = *value;
-            }
-        }
-        for value in values.iter_mut() {
-            if *value == closest_to_max {
-                *value = params.max_receipt_size;
-            }
+            values[i] =
+                interpolate(params.base_bandwidth, params.max_single_grant, i_u64 + 1, values_len);
         }
 
         BandwidthRequestValues { values }
@@ -335,9 +312,13 @@ pub struct LinkAllowance {
 #[derive(Clone, Copy, Debug, PartialEq, Eq)]
 pub struct BandwidthSchedulerParams {
     /// This much bandwidth is granted by default.
+    /// base_bandwidth = (max_shard_bandwidth - max_single_grant) / (num_shards - 1)
     pub base_bandwidth: Bandwidth,
     /// The maximum amount of data that a shard can send or receive at a single height.
     pub max_shard_bandwidth: Bandwidth,
+    /// The maximum amount of bandwidth that can be granted on a single link.
+    /// Should be at least as big as `max_receipt_size`.
+    pub max_single_grant: Bandwidth,
     /// Maximum size of a single receipt.
     pub max_receipt_size: Bandwidth,
     /// Maximum bandwidth allowance that a link can accumulate.
@@ -348,36 +329,76 @@ impl BandwidthSchedulerParams {
     /// Calculate values of scheduler params based on the current configuration
     pub fn new(num_shards: NonZeroU64, runtime_config: &RuntimeConfig) -> BandwidthSchedulerParams {
         // TODO(bandwidth_scheduler) - put these parameters in RuntimeConfig.
-        let max_shard_bandwidth: Bandwidth = 4_500_000;
+        let max_shard_bandwidth = 4_500_000;
+        let max_single_grant = 4 * 1024 * 1024;
         let max_allowance = max_shard_bandwidth;
         let max_base_bandwidth = 100_000;
-
         let max_receipt_size = runtime_config.wasm_config.limit_config.max_receipt_size;
 
-        if max_shard_bandwidth < max_receipt_size {
-            panic!(
-                "max_shard_bandwidth is smaller than max_receipt_size! ({} < {}).
-                Invalid configuration - it'll be impossible to send a maximum size receipt.",
-                max_shard_bandwidth, max_receipt_size
-            );
-        }
+        Self::calculate(
+            max_shard_bandwidth,
+            max_single_grant,
+            max_allowance,
+            max_base_bandwidth,
+            max_receipt_size,
+            num_shards.get(),
+        )
+    }
 
-        // A receipt with maximum size must still be able to get through
-        // after base bandwidth is granted to everyone. We have to ensure that:
-        // base_bandwidth * (num_shards - 1) + max_receipt_size <= max_shard_bandwidth
-        let available_bandwidth = max_shard_bandwidth - max_receipt_size;
-        let mut base_bandwidth = available_bandwidth / std::cmp::max(1, num_shards.get() - 1);
+    fn calculate(
+        max_shard_bandwidth: Bandwidth,
+        max_single_grant: Bandwidth,
+        max_allowance: Bandwidth,
+        max_base_bandwidth: Bandwidth,
+        max_receipt_size: Bandwidth,
+        num_shards: u64,
+    ) -> BandwidthSchedulerParams {
+        assert!(
+            max_single_grant >= max_receipt_size,
+            "A max_single_grant can't be lower than max_receipt_size - it'll be impossible to send a max size receipt"
+        );
+        assert!(
+            max_single_grant <= max_shard_bandwidth,
+            "A single grant must not be greater than max_shard_bandwidth"
+        );
+
+        // Granting `max_single_grant` on one link and `base_bandwidth` on all other links can't
+        // exceed `max_shard_bandwidth`, we have to ensure that:
+        // base_bandwidth * (num_shards - 1) + max_single_grant <= max_shard_bandwidth
+        // Base bandwidth is calculated by taking the bandwidth that would remain available after
+        // granting `max_single_grant` on one link and dividing it equally between the other links.
+        let available_bandwidth = max_shard_bandwidth - max_single_grant;
+        let mut base_bandwidth = available_bandwidth / std::cmp::max(1, num_shards - 1);
         if base_bandwidth > max_base_bandwidth {
             base_bandwidth = max_base_bandwidth;
         }
 
         BandwidthSchedulerParams {
             base_bandwidth,
             max_shard_bandwidth,
+            max_single_grant,
             max_receipt_size,
             max_allowance,
         }
     }
+
+    /// Example params, used only in tests
+    pub fn for_test(num_shards: u64) -> BandwidthSchedulerParams {
+        let max_shard_bandwidth = 4_500_000;
+        let max_single_grant = 4 * 1024 * 1024;
+        let max_allowance = max_shard_bandwidth;
+        let max_base_bandwidth = 100_000;
+        let max_receipt_size = 4 * 1024 * 1024;
+
+        Self::calculate(
+            max_shard_bandwidth,
+            max_single_grant,
+            max_allowance,
+            max_base_bandwidth,
+            max_receipt_size,
+            num_shards,
+        )
+    }
 }
 
 #[cfg(test)]
@@ -428,6 +449,7 @@ mod tests {
         let expected = BandwidthSchedulerParams {
             base_bandwidth: 100_000,
             max_shard_bandwidth: 4_500_000,
+            max_single_grant: 4 * 1024 * 1024,
             max_receipt_size,
             max_allowance: 4_500_000,
         };
@@ -446,6 +468,7 @@ mod tests {
         let expected = BandwidthSchedulerParams {
             base_bandwidth: (4_500_000 - max_receipt_size) / 5,
             max_shard_bandwidth: 4_500_000,
+            max_single_grant: 4 * 1024 * 1024,
             max_receipt_size,
             max_allowance: 4_500_000,
         };
@@ -495,18 +518,17 @@ mod tests {
         let values = BandwidthRequestValues::new(&params);
 
         assert!(values.values[0] > params.base_bandwidth);
-        assert_eq!(values.values[BANDWIDTH_REQUEST_VALUES_NUM - 1], params.max_shard_bandwidth);
-        assert!(values.values.contains(&max_receipt_size));
+        assert_eq!(values.values[BANDWIDTH_REQUEST_VALUES_NUM - 1], params.max_single_grant);
 
         assert_eq!(params.base_bandwidth, 61139);
         assert_eq!(
             values.values,
             [
-                172110, 283082, 394053, 505025, 615996, 726968, 837939, 948911, 1059882, 1170854,
-                1281825, 1392797, 1503768, 1614740, 1725711, 1836683, 1947654, 2058626, 2169597,
-                2280569, 2391541, 2502512, 2613484, 2724455, 2835427, 2946398, 3057370, 3168341,
-                3279313, 3390284, 3501256, 3612227, 3723199, 3834170, 3945142, 4056113, 4194304,
-                4278056, 4389028, 4500000
+                164468, 267797, 371126, 474455, 577784, 681113, 784442, 887772, 991101, 1094430,
+                1197759, 1301088, 1404417, 1507746, 1611075, 1714405, 1817734, 1921063, 2024392,
+                2127721, 2231050, 2334379, 2437708, 2541038, 2644367, 2747696, 2851025, 2954354,
+                3057683, 3161012, 3264341, 3367671, 3471000, 3574329, 3677658, 3780987, 3884316,
+                3987645, 4090974, 4194304
             ]
         );
     }

diff --git a/core/store/src/trie/outgoing_metadata.rs b/core/store/src/trie/outgoing_metadata.rs
@@ -690,24 +690,15 @@ mod tests {
     }
 
     /// The diff between two consecutive values that can be requested in a bandwidth request
-    /// is ~110 kB. The upper bound of the receipt group size is 100 kB. The groups are small
+    /// is ~103 kB. The upper bound of the receipt group size is 100 kB. The groups are small
     /// enough that bandwidth requests produced from group sizes are optimal, optimal meaning
     /// the same as if the requests were made based on individual receipt sizes.
-    /// The only exception is around `max_receipt_size` - the value that is the closest to
-    /// `max_receipt_size` is set to `max_receipt_size`, which causes the diff to be smaller.
     #[test]
     fn test_receipt_groups_produce_optimal_bandwidth_request() {
-        let scheduler_params = BandwidthSchedulerParams {
-            base_bandwidth: 50_000,
-            max_shard_bandwidth: 4_500_000,
-            max_receipt_size: 4 * 1024 * 1024,
-            max_allowance: 4_500_000,
-        };
+        let scheduler_params = BandwidthSchedulerParams::for_test(6);
         let request_values = BandwidthRequestValues::new(&scheduler_params).values;
-        assert!(request_values[1] - request_values[0] > 110_000);
+        assert!(request_values[1] - request_values[0] > 102_000);
 
-        let max_receipt_size_pos =
-            request_values.iter().position(|v| v == &scheduler_params.max_receipt_size).unwrap();
         let groups_config =
             ReceiptGroupsConfig { size_upper_bound: ByteSize::kb(100), gas_upper_bound: Gas::MAX };
 
@@ -743,19 +734,11 @@ mod tests {
                     &scheduler_params,
                 )
                 .unwrap();
-                if let Some(mut ideal_request) = ideal_bandwidth_request {
-                    let mut groups_request = groups_bandwidth_request.unwrap();
 
-                    // Zero out everything after `max_receipt_size` in both requests before comparison.
-                    for i in max_receipt_size_pos..request_values.len() {
-                        ideal_request.requested_values_bitmap.set_bit(i, false);
-                        groups_request.requested_values_bitmap.set_bit(i, false);
-                    }
-                    assert_eq!(ideal_request, groups_request);
-                }
+                // Bandwidth request produced from receipt groups should be optimal (same as from
+                // individual receipt sizes).
+                assert_eq!(ideal_bandwidth_request, groups_bandwidth_request);
 
-                // Bandwidth request produced from receipt groups should be optimal (same as from individual
-                // receipt sizes). (up to `max_receipt_size`)
                 let new_receipt_size = ByteSize::b(get_random_receipt_size_for_test(rng));
                 buffered_receipts.push_back(new_receipt_size);
                 test_queue.update_on_receipt_pushed(new_receipt_size, 1, &groups_config);

diff --git a/runtime/runtime/src/bandwidth_scheduler/scheduler.rs b/runtime/runtime/src/bandwidth_scheduler/scheduler.rs
@@ -763,12 +763,7 @@ mod tests {
         let shard_layout = ShardLayout::multi_shard(num_shards, 0);
 
         // Standard params
-        let params = BandwidthSchedulerParams {
-            base_bandwidth: 50_000,
-            max_shard_bandwidth: 4_500_000,
-            max_receipt_size: 4 * 1024 * 1024,
-            max_allowance: 4_500_000,
-        };
+        let params = BandwidthSchedulerParams::for_test(num_shards);
 
         // Every link has a different allowance, which should make the `requests_by_allowance` BTreeMap
         // as large as possible.

diff --git a/runtime/runtime/src/bandwidth_scheduler/simulator.rs b/runtime/runtime/src/bandwidth_scheduler/simulator.rs
@@ -419,12 +419,7 @@ impl ChainSimulator {
 
     fn scheduler_params(&self) -> BandwidthSchedulerParams {
         // TODO(bandwidth_scheduler) - use current RuntimeConfig?
-        BandwidthSchedulerParams {
-            base_bandwidth: (4_500_000 - 4 * 1024 * 1024) / self.shard_layout.num_shards() as u64,
-            max_shard_bandwidth: 4_500_000,
-            max_receipt_size: 4 * 1024 * 1024,
-            max_allowance: 4_500_000,
-        }
+        BandwidthSchedulerParams::for_test(self.shard_layout.num_shards())
     }
 
     /// Make sure that post state of previous chunk application is always pre state for the next application.
@@ -592,9 +587,9 @@ fn test_bandwidth_scheduler_simulator_missing_chunks() {
         .missing_chunk_probability(0.1)
         .build();
     let summary = run_scenario(scenario);
-    assert!(summary.bandwidth_utilization > 0.6); // 75% utilization
-    assert!(summary.link_imbalance_ratio < 1.4); // < 40% difference on links
-    assert!(summary.worst_link_estimation_ratio > 0.55); // 55% of estimated link throughput
+    assert!(summary.bandwidth_utilization > 0.7); // > 70% utilization
+    assert!(summary.link_imbalance_ratio < 1.6); // < 60% difference on links
+    assert!(summary.worst_link_estimation_ratio > 0.50); // 50% of estimated link throughput
 
     // Incoming max_shard_bandwidth is not respected! When a chunk is missing, the receipts that
     // were sent previously will arrive later and they can mix with other incoming receipts, and the