diff --git a/.github/workflows/check.yml b/.github/workflows/check.yml
index 195ad733f..0406d6c0e 100644
--- a/.github/workflows/check.yml
+++ b/.github/workflows/check.yml
@@ -175,6 +175,9 @@ jobs:
       - uses: actions/checkout@v4
       - uses: ./.github/actions/rm
       - uses: dtolnay/rust-toolchain@nightly
+        with:
+          # More recent nightlies don't work, probably https://github.com/rust-lang/rust/issues/122399
+          toolchain: nightly-2024-03-11
       - name: Add Miri
         run: rustup component add miri
       - name: Setup Miri
diff --git a/ipa-core/Cargo.toml b/ipa-core/Cargo.toml
index 033a56931..6b064d09f 100644
--- a/ipa-core/Cargo.toml
+++ b/ipa-core/Cargo.toml
@@ -149,7 +149,7 @@
 tower = { version = "0.4.13", optional = true }
 tower-http = { version = "0.4.0", optional = true, features = ["trace"] }
 tracing = "0.1"
 tracing-subscriber = { version = "0.3", features = ["env-filter"] }
-typenum = "1.16"
+typenum = { version = "1.17", features = ["i128"] }
 # hpke is pinned to it
 x25519-dalek = "2.0.0-rc.3"
diff --git a/ipa-core/src/ff/boolean_array.rs b/ipa-core/src/ff/boolean_array.rs
index 78265ca27..4137d0eeb 100644
--- a/ipa-core/src/ff/boolean_array.rs
+++ b/ipa-core/src/ff/boolean_array.rs
@@ -1,3 +1,5 @@
+use std::fmt::{Debug, Formatter};
+
 use bitvec::{
     prelude::{BitArr, Lsb0},
     slice::Iter,
@@ -254,9 +256,16 @@ macro_rules! boolean_array_impl {
         type Store = BitArr!(for $bits, in u8, Lsb0);

         /// A Boolean array with $bits bits.
-        #[derive(Clone, Copy, PartialEq, Eq, Debug)]
+        #[derive(Clone, Copy, PartialEq, Eq)]
         pub struct $name(pub(super) Store);

+        impl Debug for $name {
+            fn fmt(&self, f: &mut Formatter<'_>) -> std::fmt::Result {
+                f.write_str(stringify!($name))?;
+                self.0.data.fmt(f)
+            }
+        }
+
         impl $name {
             #[cfg(all(test, unit_test))]
             const STORE_LEN: usize = bitvec::mem::elts::<u8>($bits);
@@ -697,6 +706,13 @@ macro_rules! boolean_array_impl {
                 "Failed to deserialize a valid value: {ba:?}"
             );
         }
+
+        #[test]
+        fn debug() {
+            let expected = format!("{}{:?}", stringify!($name), $name::ZERO.0.data);
+            let actual = format!("{:?}", $name::ZERO);
+            assert_eq!(expected, actual);
+        }
     }
 }
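For orientation (not part of the patch): the manual impl prints the type name followed by the raw backing bytes, which is much more compact than the derived form. Roughly, assuming `BA8` with a single backing byte:

// derived (old): something like `BA8(BitArray<u8, Lsb0> { data: [0], .. })` — verbose
// manual  (new): `BA8[0]` — the type name, then `self.0.data` formatted as a byte slice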
diff --git a/ipa-core/src/ff/prime_field.rs b/ipa-core/src/ff/prime_field.rs
index 589f9b2c1..f15530f77 100644
--- a/ipa-core/src/ff/prime_field.rs
+++ b/ipa-core/src/ff/prime_field.rs
@@ -1,4 +1,4 @@
-use std::fmt::Display;
+use std::{fmt::Display, mem};

 use generic_array::GenericArray;

@@ -14,6 +14,42 @@ pub trait PrimeField: Field + U128Conversions {
     type PrimeInteger: Into<u128>;

     const PRIME: Self::PrimeInteger;
+
+    /// Returns the multiplicative inverse of `self`.
+    ///
+    /// The default implementation uses the extended Euclidean algorithm
+    /// (`https://en.wikipedia.org/wiki/Extended_Euclidean_algorithm`), modified to work on
+    /// unsigned integers by keeping track of `sign` separately.
+    ///
+    /// The function operates on `u128` rather than field elements since we need divisions.
+    ///
+    /// ## Panics
+    /// When `self` is `Self::ZERO`
+    #[must_use]
+    fn invert(&self) -> Self {
+        assert_ne!(*self, Self::ZERO);
+
+        let mut t = 0u128;
+        let mut newt = 1u128;
+        let mut r = Self::PRIME.into();
+        let mut newr = self.as_u128();
+        let mut sign = 1u128;
+
+        while newr != 0 {
+            let quotient = r / newr;
+            mem::swap(&mut t, &mut newt);
+            mem::swap(&mut r, &mut newr);
+            newt += quotient * t;
+            newr -= quotient * r;
+
+            // flip sign
+            sign = 1 - sign;
+        }
+
+        // when the sign is negative (`sign == 1`), output `PRIME - t`, otherwise `t`
+        // unwrap is safe, since the result is less than `PRIME`
+        Self::try_from((1 - sign) * t + sign * (Self::PRIME.into() - t)).unwrap()
+    }
 }

 #[derive(thiserror::Error, Debug)]
@@ -295,6 +331,14 @@ macro_rules! field_impl {
             let err = $field::deserialize(&buf).unwrap_err();
             assert!(matches!(err, GreaterThanPrimeError(..)))
         }
+
+        #[test]
+        fn invert(element: $field) {
+            if element != $field::ZERO {
+                assert_eq!($field::ONE, element * element.invert());
+            }
+        }
     }
 }
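For illustration (not part of the patch), the same sign-tracking Euclidean loop as a standalone function over `u128`, with a hand-checked example for the `Fp31` modulus:

// Minimal sketch of the default `invert` above, on bare u128 values.
fn invert_u128(a: u128, prime: u128) -> u128 {
    assert_ne!(a % prime, 0);
    let (mut t, mut newt) = (0u128, 1u128);
    let (mut r, mut newr) = (prime, a % prime);
    let mut sign = 1u128; // 1 means "t currently carries a negative sign"
    while newr != 0 {
        let quotient = r / newr;
        std::mem::swap(&mut t, &mut newt);
        std::mem::swap(&mut r, &mut newr);
        newt += quotient * t;
        newr -= quotient * r;
        sign = 1 - sign; // flip sign
    }
    (1 - sign) * t + sign * (prime - t)
}

// Worked example for p = 31: invert_u128(3, 31) == 21, and indeed 3 * 21 = 63 = 2 * 31 + 1.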
diff --git a/ipa-core/src/helpers/futures.rs b/ipa-core/src/helpers/futures.rs
new file mode 100644
index 000000000..70bcf4ca3
--- /dev/null
+++ b/ipa-core/src/helpers/futures.rs
@@ -0,0 +1,44 @@
+use std::{
+    future::Future,
+    pin::Pin,
+    task::{Context, Poll},
+};
+
+use pin_project::pin_project;
+
+#[pin_project(project = MaybeFutureProj)]
+pub enum MaybeFuture<Fut: Future> {
+    Future(#[pin] Fut),
+    Value(Option<Fut::Output>),
+}
+
+impl<Fut: Future> Future for MaybeFuture<Fut> {
+    type Output = Fut::Output;
+
+    fn poll(self: Pin<&mut Self>, cx: &mut Context<'_>) -> Poll<Self::Output> {
+        match self.project() {
+            MaybeFutureProj::Future(fut) => fut.poll(cx),
+            MaybeFutureProj::Value(val) => Poll::Ready(val.take().unwrap()),
+        }
+    }
+}
+
+impl<Fut: Future> MaybeFuture<Fut> {
+    pub fn future(fut: Fut) -> Self {
+        MaybeFuture::Future(fut)
+    }
+
+    pub fn value(val: Fut::Output) -> Self {
+        MaybeFuture::Value(Some(val))
+    }
+}
+
+impl<Fut: Future<Output = Result<(), E>>, E> MaybeFuture<Fut> {
+    pub fn future_or_ok<F: FnOnce() -> Fut>(condition: bool, f: F) -> Self {
+        if condition {
+            MaybeFuture::Future(f())
+        } else {
+            MaybeFuture::Value(Some(Ok(())))
+        }
+    }
+}
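A minimal usage sketch (not part of the patch; the async block stands in for a real send future): both branches produce the same concrete type, so a conditional send can be awaited without boxing.

async fn maybe_send(do_send: bool) -> Result<(), ()> {
    // `future_or_ok` picks the `Value(Some(Ok(())))` arm when the condition is false,
    // so `.await` resolves immediately without polling a real future.
    MaybeFuture::future_or_ok(do_send, || async { Ok::<(), ()>(()) }).await
}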
diff --git a/ipa-core/src/helpers/mod.rs b/ipa-core/src/helpers/mod.rs
index dbf9348ac..f9f9acc42 100644
--- a/ipa-core/src/helpers/mod.rs
+++ b/ipa-core/src/helpers/mod.rs
@@ -8,6 +8,7 @@ use generic_array::GenericArray;
 mod buffers;
 mod error;
+mod futures;
 mod gateway;
 pub(crate) mod prss_protocol;
 mod transport;
@@ -18,6 +19,7 @@ use std::ops::{Index, IndexMut};
 #[cfg(test)]
 pub use buffers::OrderingSender;
 pub use error::Error;
+pub use futures::MaybeFuture;

 #[cfg(feature = "stall-detection")]
 mod gateway_exports {
diff --git a/ipa-core/src/protocol/basics/mod.rs b/ipa-core/src/protocol/basics/mod.rs
index 33ef5979c..7e0ba176f 100644
--- a/ipa-core/src/protocol/basics/mod.rs
+++ b/ipa-core/src/protocol/basics/mod.rs
@@ -13,7 +13,7 @@
 pub use check_zero::check_zero;
 pub use if_else::{if_else, select};
 pub use mul::{BooleanArrayMul, MultiplyZeroPositions, SecureMul, ZeroPositions};
 pub use reshare::Reshare;
-pub use reveal::Reveal;
+pub use reveal::{reveal, Reveal};
 pub use share_known_value::ShareKnownValue;
 pub use sum_of_product::SumOfProducts;
diff --git a/ipa-core/src/protocol/basics/reveal.rs b/ipa-core/src/protocol/basics/reveal.rs
index fc7f1ab87..4b9c21da5 100644
--- a/ipa-core/src/protocol/basics/reveal.rs
+++ b/ipa-core/src/protocol/basics/reveal.rs
@@ -1,5 +1,7 @@
-use async_trait::async_trait;
+use std::future::Future;
+
 use embed_doc_image::embed_doc_image;
+use futures::TryFutureExt;

 use crate::{
     error::Error,
@@ -11,6 +13,7 @@
 };
 #[cfg(feature = "descriptive-gate")]
 use crate::{
+    helpers::MaybeFuture,
     protocol::context::UpgradedMaliciousContext,
     secret_sharing::replicated::malicious::{
         AdditiveShare as MaliciousReplicated, ExtendableField,
     },
 };

 /// Trait for reveal protocol to open a shared secret to all helpers inside the MPC ring.
-#[async_trait]
 pub trait Reveal<C: Context, const N: usize = 1>: Sized {
     type Output;

-    /// reveal the secret to all helpers in MPC circuit. Note that after method is called,
-    /// it must be assumed that the secret value has been revealed to at least one of the helpers.
-    /// Even in case when method never terminates, returns an error, etc.
-    async fn reveal<'fut>(&self, ctx: C, record_id: RecordId) -> Result<Self::Output, Error>
+    /// Reveal a shared secret to all helpers in the MPC ring.
+    ///
+    /// Note that once this method has been called, it must be assumed that the secret value
+    /// has been revealed to at least one of the helpers, even if the method never terminates,
+    /// returns an error, etc.
+    fn reveal<'fut>(
+        &'fut self,
+        ctx: C,
+        record_id: RecordId,
+    ) -> impl Future<Output = Result<Self::Output, Error>> + Send + 'fut
     where
-        C: 'fut;
+        C: 'fut,
+    {
+        // Passing `excluded = None` guarantees any ok result is `Some`.
+        self.generic_reveal(ctx, record_id, None)
+            .map_ok(Option::unwrap)
+    }

-    /// partial reveal protocol to open a shared secret to all helpers except helper `left_out` inside the MPC ring.
-    async fn partial_reveal<'fut>(
-        &self,
+    /// Partial reveal protocol to open a shared secret to all helpers except helper `excluded` inside the MPC ring.
+    fn partial_reveal<'fut>(
+        &'fut self,
         ctx: C,
         record_id: RecordId,
-        left_out: Role,
-    ) -> Result<Option<Self::Output>, Error>
+        excluded: Role,
+    ) -> impl Future<Output = Result<Option<Self::Output>, Error>> + Send + 'fut
+    where
+        C: 'fut,
+    {
+        self.generic_reveal(ctx, record_id, Some(excluded))
+    }
+
+    /// Generic reveal implementation usable for both `reveal` and `partial_reveal`.
+    ///
+    /// When `excluded` is `None`, open a shared secret to all helpers in the MPC ring.
+    /// When `excluded` is `Some`, open a shared secret to all helpers except the helper
+    /// specified in `excluded`.
+    fn generic_reveal<'fut>(
+        &'fut self,
+        ctx: C,
+        record_id: RecordId,
+        excluded: Option<Role>,
+    ) -> impl Future<Output = Result<Option<Self::Output>, Error>> + Send + 'fut
     where
         C: 'fut;
 }
@@ -50,43 +80,17 @@
 /// ![Reveal steps][reveal]
 /// Each helper sends their left share to the right helper. The helper then reconstructs their secret by adding the three shares
 /// i.e. their own shares and received share.
-#[async_trait]
 #[embed_doc_image("reveal", "images/reveal.png")]
 impl<C: Context, V: SharedValue + Vectorizable<N>, const N: usize> Reveal<C, N> for Replicated<V, N> {
     type Output = <V as Vectorizable<N>>::Array;

-    async fn reveal<'fut>(
-        &self,
-        ctx: C,
-        record_id: RecordId,
-    ) -> Result<<V as Vectorizable<N>>::Array, Error>
-    where
-        C: 'fut,
-    {
-        let left = self.left_arr();
-        let right = self.right_arr();
-
-        ctx.send_channel::<<V as Vectorizable<N>>::Array>(ctx.role().peer(Direction::Right))
-            .send(record_id, left)
-            .await?;
-
-        // Sleep until `helper's left` sends their share
-        let share: <V as Vectorizable<N>>::Array = ctx
-            .recv_channel(ctx.role().peer(Direction::Left))
-            .receive(record_id)
-            .await?;
-
-        Ok(share + left + right)
-    }
-
-    /// TODO: implement reveal through partial reveal where `left_out` is optional
-    async fn partial_reveal<'fut>(
-        &self,
+    async fn generic_reveal<'fut>(
+        &'fut self,
         ctx: C,
         record_id: RecordId,
-        left_out: Role,
+        excluded: Option<Role>,
     ) -> Result<Option<<V as Vectorizable<N>>::Array>, Error>
     where
         C: 'fut,
@@ -94,16 +98,17 @@ impl<C: Context, V: SharedValue + Vectorizable<N>, const N: usize> Reveal<C, N>
         let left = self.left_arr();
         let right = self.right_arr();

-        // send except to left_out
-        if ctx.role().peer(Direction::Right) != left_out {
+        // Send shares, unless the target helper is excluded
+        if Some(ctx.role().peer(Direction::Right)) != excluded {
             ctx.send_channel::<<V as Vectorizable<N>>::Array>(ctx.role().peer(Direction::Right))
                 .send(record_id, left)
                 .await?;
         }

-        if ctx.role() == left_out {
+        if Some(ctx.role()) == excluded {
             Ok(None)
         } else {
+            // Sleep until `helper's left` sends their share
             let share: <V as Vectorizable<N>>::Array = ctx
                 .recv_channel(ctx.role().peer(Direction::Left))
                 .receive(record_id)
@@ -119,15 +124,15 @@ impl<C: Context, V: SharedValue + Vectorizable<N>, const N: usize> Reveal<C, N>
 /// to both helpers (right and left) and upon receiving 2 shares from peers it validates that they
 /// indeed match.
 #[cfg(feature = "descriptive-gate")]
-#[async_trait]
 impl<'a, F: ExtendableField> Reveal<UpgradedMaliciousContext<'a, F>, 1> for MaliciousReplicated<F> {
     type Output = <F as Vectorizable<1>>::Array;

-    async fn reveal<'fut>(
-        &self,
+    async fn generic_reveal<'fut>(
+        &'fut self,
         ctx: UpgradedMaliciousContext<'a, F>,
         record_id: RecordId,
-    ) -> Result<<F as Vectorizable<1>>::Array, Error>
+        excluded: Option<Role>,
+    ) -> Result<Option<<F as Vectorizable<1>>::Array>, Error>
     where
         UpgradedMaliciousContext<'a, F>: 'fut,
     {
@@ -141,54 +146,19 @@ impl<'a, F: ExtendableField> Reveal<UpgradedMaliciousContext<'a, F>, 1> for MaliciousReplicated<F>
         let right_sender = ctx.send_channel(ctx.role().peer(Direction::Right));
         let right_receiver = ctx.recv_channel::<F>(ctx.role().peer(Direction::Right));

-        // Send share to helpers to the right and left
-        try_join(
-            left_sender.send(record_id, right),
-            right_sender.send(record_id, left),
-        )
-        .await?;
-
-        let (share_from_left, share_from_right) = try_join(
-            left_receiver.receive(record_id),
-            right_receiver.receive(record_id),
-        )
-        .await?;
-
-        if share_from_left == share_from_right {
-            Ok((left + right + share_from_left).into_array())
-        } else {
-            Err(Error::MaliciousRevealFailed)
-        }
-    }
-
-    async fn partial_reveal<'fut>(
-        &self,
-        ctx: UpgradedMaliciousContext<'a, F>,
-        record_id: RecordId,
-        left_out: Role,
-    ) -> Result<Option<<F as Vectorizable<1>>::Array>, Error>
-    where
-        UpgradedMaliciousContext<'a, F>: 'fut,
-    {
-        use futures::future::try_join;
-
-        use crate::secret_sharing::replicated::malicious::ThisCodeIsAuthorizedToDowngradeFromMalicious;
+        // Send shares to the left and right helpers, unless excluded.
+        let send_left_fut =
+            MaybeFuture::future_or_ok(Some(ctx.role().peer(Direction::Left)) != excluded, || {
+                left_sender.send(record_id, right)
+            });

-        let (left, right) = self.x().access_without_downgrade().as_tuple();
-        let left_sender = ctx.send_channel(ctx.role().peer(Direction::Left));
-        let left_receiver = ctx.recv_channel::<F>(ctx.role().peer(Direction::Left));
-        let right_sender = ctx.send_channel(ctx.role().peer(Direction::Right));
-        let right_receiver = ctx.recv_channel::<F>(ctx.role().peer(Direction::Right));
+        let send_right_fut =
+            MaybeFuture::future_or_ok(Some(ctx.role().peer(Direction::Right)) != excluded, || {
+                right_sender.send(record_id, left)
+            });
+        try_join(send_left_fut, send_right_fut).await?;

-        // Send share to helpers to the right and left
-        // send except to left_out
-        if ctx.role().peer(Direction::Left) != left_out {
-            left_sender.send(record_id, right).await?;
-        }
-        if ctx.role().peer(Direction::Right) != left_out {
-            right_sender.send(record_id, left).await?;
-        }
-        if ctx.role() == left_out {
+        if Some(ctx.role()) == excluded {
             Ok(None)
         } else {
             let (share_from_left, share_from_right) = try_join(
@@ -206,6 +176,20 @@ impl<'a, F: ExtendableField> Reveal<UpgradedMaliciousContext<'a, F>, 1> for MaliciousReplicated<F>
     }
 }

+// Workaround for https://github.com/rust-lang/rust/issues/100013. Calling this wrapper function
+// instead of `Reveal::reveal` seems to hide the `impl Future` GAT.
+pub fn reveal<'fut, C, S>(
+    ctx: C,
+    record_id: RecordId,
+    v: &'fut S,
+) -> impl Future<Output = Result<S::Output, Error>> + Send + 'fut
+where
+    C: Context + 'fut,
+    S: Reveal<C>,
+{
+    S::reveal(v, ctx, record_id)
+}
+
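Call-site sketch of the workaround (it mirrors the updated call in comparison.rs further down):

// Method form, which can trip rust-lang/rust#100013 in generic contexts:
//     let b = F::from_array(&(r.b_p + a).reveal(ctx, record_id).await?);
// Free-function form used instead:
//     let b = F::from_array(&reveal(ctx, record_id, &(r.b_p + a)).await?);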
 #[cfg(all(test, unit_test))]
 mod tests {
     use std::iter::zip;
@@ -231,6 +215,7 @@
         },
         IntoShares, SharedValue,
     },
+    test_executor::run,
     test_fixture::{join3v, Runner, TestWorld},
 };

@@ -267,12 +252,12 @@
         let mut rng = thread_rng();
         let world = TestWorld::default();

-        for &left_out in Role::all() {
+        for &excluded in Role::all() {
             let input = rng.gen::<TestField>();
             let results = world
                 .semi_honest(input, |ctx, share| async move {
                     share
-                        .partial_reveal(ctx.set_total_records(1), RecordId::from(0), left_out)
+                        .partial_reveal(ctx.set_total_records(1), RecordId::from(0), excluded)
                         .await
                         .unwrap()
                         .map(|revealed| TestField::from_array(&revealed))
                 })
                 .await;

             for &helper in Role::all() {
-                if helper == left_out {
+                if helper == excluded {
                     assert_eq!(None, results[helper]);
                 } else {
                     assert_eq!(Some(input), results[helper]);
@@ -360,7 +345,7 @@
         let mut rng = thread_rng();
         let world = TestWorld::default();

-        for &left_out in Role::all() {
+        for &excluded in Role::all() {
             let sh_ctx = world.malicious_contexts();
             let v = sh_ctx.map(UpgradableContext::validator);
             let m_ctx: [_; 3] = v
                 .iter()
                 .map(|v| v.context().set_total_records(1))
                 .collect::<Vec<_>>()
                 .try_into()
                 .unwrap();
@@ -382,7 +367,7 @@
             let results = join_all(zip(m_ctx.clone().into_iter(), m_shares).map(
                 |(m_ctx, m_share)| async move {
                     m_share
-                        .partial_reveal(m_ctx, record_id, left_out)
+                        .partial_reveal(m_ctx, record_id, excluded)
                         .await
                         .unwrap()
                 },
             ))
             .await;

             for &helper in Role::all() {
-                if helper == left_out {
+                if helper == excluded {
                     assert_eq!(None, results[helper]);
                 } else {
                     assert_eq!(Some(input.into_array()), results[helper]);
                 }
             }
         }
         Ok(())
     }

@@ -401,83 +386,89 @@
-    #[tokio::test]
-    pub async fn malicious_validation_fail() -> Result<(), Error> {
-        let mut rng = thread_rng();
-        let world = TestWorld::default();
-        let sh_ctx = world.malicious_contexts();
-        let v = sh_ctx.map(UpgradableContext::validator);
-        let m_ctx: [_; 3] = v
-            .iter()
-            .map(|v| v.context().set_total_records(1))
-            .collect::<Vec<_>>()
-            .try_into()
-            .unwrap();
-
-        let record_id = RecordId::from(0);
-        let input: Fp31 = rng.gen();
+    #[test]
+    pub fn malicious_validation_fail() {
+        run(|| async {
+            let mut rng = thread_rng();
+            let world = TestWorld::default();
+            let sh_ctx = world.malicious_contexts();
+            let v = sh_ctx.map(UpgradableContext::validator);
+            let m_ctx: [_; 3] = v
+                .iter()
+                .map(|v| v.context().set_total_records(1))
+                .collect::<Vec<_>>()
+                .try_into()
+                .unwrap();

-        let m_shares = join3v(
-            zip(m_ctx.iter(), input.share_with(&mut rng))
-                .map(|(m_ctx, share)| async { m_ctx.upgrade(share).await }),
-        )
-        .await;
-        let result = try_join3(
-            m_shares[0].reveal(m_ctx[0].clone(), record_id),
-            m_shares[1].reveal(m_ctx[1].clone(), record_id),
-            reveal_with_additive_attack(
-                m_ctx[2].clone(),
-                record_id,
-                &m_shares[2],
-                false,
-                Fp31::ONE,
-            ),
-        )
-        .await;
+            let record_id = RecordId::from(0);
+            let input: Fp31 = rng.gen();

-        assert!(matches!(result, Err(Error::MaliciousRevealFailed)));
+            let m_shares = join3v(
+                zip(m_ctx.iter(), input.share_with(&mut rng))
+                    .map(|(m_ctx, share)| async { m_ctx.upgrade(share).await }),
+            )
+            .await;
+            let result = try_join3(
+                m_shares[0].reveal(m_ctx[0].clone(), record_id),
+                m_shares[1].reveal(m_ctx[1].clone(), record_id),
+                reveal_with_additive_attack(
+                    m_ctx[2].clone(),
+                    record_id,
+                    &m_shares[2],
+                    false,
+                    Fp31::ONE,
+                ),
+            )
+            .await;

-        Ok(())
+            assert!(matches!(result, Err(Error::MaliciousRevealFailed)));
+        });
     }

-    #[tokio::test]
-    pub async fn malicious_partial_validation_fail() -> Result<(), Error> {
-        let mut rng = thread_rng();
-        let world = TestWorld::default();
-        let sh_ctx = world.malicious_contexts();
-        let v = sh_ctx.map(UpgradableContext::validator);
-        let m_ctx: [_; 3] = v
-            .iter()
-            .map(|v| v.context().set_total_records(1))
-            .collect::<Vec<_>>()
-            .try_into()
-            .unwrap();
-
-        let record_id = RecordId::from(0);
-        let input: Fp31 = rng.gen();
+    #[test]
+    pub fn malicious_partial_validation_fail() {
+        run(|| async {
+            let mut rng = thread_rng();
+            let world = TestWorld::default();
+            let sh_ctx = world.malicious_contexts();
+            let v = sh_ctx.map(UpgradableContext::validator);
+            let m_ctx: [_; 3] = v
+                .iter()
+                .map(|v| v.context().set_total_records(1))
+                .collect::<Vec<_>>()
+                .try_into()
+                .unwrap();

-        let m_shares = join3v(
-            zip(m_ctx.iter(), input.share_with(&mut rng))
-                .map(|(m_ctx, share)| async { m_ctx.upgrade(share).await }),
-        )
-        .await;
-        let result = try_join3(
-            m_shares[0].partial_reveal(m_ctx[0].clone(), record_id, Role::H3),
-            m_shares[1].partial_reveal(m_ctx[1].clone(), record_id, Role::H3),
-            reveal_with_additive_attack(m_ctx[2].clone(), record_id, &m_shares[2], true, Fp31::ONE),
-        )
-        .await;
+            let record_id = RecordId::from(0);
+            let input: Fp31 = rng.gen();

-        assert!(matches!(result, Err(Error::MaliciousRevealFailed)));
+            let m_shares = join3v(
+                zip(m_ctx.iter(), input.share_with(&mut rng))
+                    .map(|(m_ctx, share)| async { m_ctx.upgrade(share).await }),
+            )
+            .await;
+            let result = try_join3(
+                m_shares[0].partial_reveal(m_ctx[0].clone(), record_id, Role::H3),
+                m_shares[1].partial_reveal(m_ctx[1].clone(), record_id, Role::H3),
+                reveal_with_additive_attack(
+                    m_ctx[2].clone(),
+                    record_id,
+                    &m_shares[2],
+                    true,
+                    Fp31::ONE,
+                ),
+            )
+            .await;

-        Ok(())
+            assert!(matches!(result, Err(Error::MaliciousRevealFailed)));
+        });
     }

     pub async fn reveal_with_additive_attack<F: ExtendableField>(
         ctx: UpgradedMaliciousContext<'_, F>,
         record_id: RecordId,
         input: &MaliciousReplicated<F>,
-        left_out: bool,
+        excluded: bool,
         additive_error: F,
     ) -> Result<Option<F>, Error> {
         let (left, right) = input.x().access_without_downgrade().as_tuple();
@@ -493,7 +484,7 @@
         )
         .await?;

-        if left_out {
+        if excluded {
             Ok(None)
         } else {
             let (share_from_left, _share_from_right): (F, F) =
diff --git a/ipa-core/src/protocol/boolean/comparison.rs b/ipa-core/src/protocol/boolean/comparison.rs
index 7407b1ab4..94046be95 100644
--- a/ipa-core/src/protocol/boolean/comparison.rs
+++ b/ipa-core/src/protocol/boolean/comparison.rs
@@ -5,6 +5,7 @@
 use crate::{
     error::Error,
     ff::{Field, PrimeField},
     protocol::{
+        basics::reveal,
         boolean::random_bits_generator::RandomBitsGenerator,
         context::{Context, UpgradedContext},
         step::BitOpStep,
@@ -82,11 +83,7 @@
     let r = rbg.generate(record_id).await?;

     // Mask `a` with random `r` and reveal.
-    let b = F::from_array(
-        &(r.b_p + a)
-            .reveal(ctx.narrow(&Step::Reveal), record_id)
-            .await?,
-    );
+    let b = F::from_array(&reveal(ctx.narrow(&Step::Reveal), record_id, &(r.b_p + a)).await?);

     let RBounds { r_lo, r_hi, invert } = compute_r_bounds(b.as_u128(), c, F::PRIME.into());
diff --git a/ipa-core/src/protocol/boolean/solved_bits.rs b/ipa-core/src/protocol/boolean/solved_bits.rs
index 2f2d3227e..1cf64c945 100644
--- a/ipa-core/src/protocol/boolean/solved_bits.rs
+++ b/ipa-core/src/protocol/boolean/solved_bits.rs
@@ -7,7 +7,7 @@
 use crate::{
     error::Error,
     ff::{Field, PrimeField},
     protocol::{
-        basics::Reveal,
+        basics::{reveal, Reveal},
         boolean::{
             bitwise_less_than_prime::BitwiseLessThanPrime, generate_random_bits::one_random_bit,
         },
@@ -136,7 +136,8 @@
     let c_b =
         BitwiseLessThanPrime::less_than_prime(ctx.narrow(&Step::IsPLessThanB), record_id, b_b)
             .await?;
-    if F::from_array(&c_b.reveal(ctx.narrow(&Step::RevealC), record_id).await?) == F::ZERO {
+
+    if F::from_array(&reveal(ctx.narrow(&Step::RevealC), record_id, &c_b).await?) == F::ZERO {
         return Ok(false);
     }
     Ok(true)
diff --git a/ipa-core/src/protocol/ipa_prf/malicious_security/lagrange.rs b/ipa-core/src/protocol/ipa_prf/malicious_security/lagrange.rs
new file mode 100644
index 000000000..59e0f9432
--- /dev/null
+++ b/ipa-core/src/protocol/ipa_prf/malicious_security/lagrange.rs
@@ -0,0 +1,276 @@
+use std::{borrow::Borrow, fmt::Debug};
+
+use generic_array::{ArrayLength, GenericArray};
+use typenum::{Unsigned, U1};
+
+use crate::ff::{Field, PrimeField, Serializable};
+
+/// The canonical Lagrange denominator is defined as the denominator of the Lagrange base polynomials
+/// (`https://en.wikipedia.org/wiki/Lagrange_polynomial`),
+/// where the "x coordinates" of the input points `x_0` to `x_(N-1)` are `F::ZERO` to `(N-1)*F::ONE`
+/// and the degree of the polynomials is `N-1`.
+pub struct CanonicalLagrangeDenominator<F: Field, N: ArrayLength> {
+    denominator: GenericArray<F, N>,
+}
+
+impl<F, N> CanonicalLagrangeDenominator<F, N>
+where
+    F: PrimeField + TryFrom<u128>,
+    <F as TryFrom<u128>>::Error: Debug,
+    N: ArrayLength,
+{
+    /// Generates canonical Lagrange denominators.
+    ///
+    /// ## Panics
+    /// When the field size is too small for `N` evaluation points
+    pub fn new() -> Self {
+        // assertion that the field is large enough
+        // when it is large enough, `F::try_from().unwrap()` below does not panic
+        assert!(
+            N::U128 < F::PRIME.into(),
+            "Field size {} is not large enough to hold {} points",
+            F::PRIME.into(),
+            N::U128
+        );
+
+        // assertion that the table is not too large for the stack
+        assert!(<F as Serializable>::Size::USIZE * N::USIZE < 2024);
+
+        Self {
+            denominator: (0..N::U128)
+                .map(|i| {
+                    (0..N::U128)
+                        .filter(|&j| i != j)
+                        .map(|j| F::try_from(i).unwrap() - F::try_from(j).unwrap())
+                        .fold(F::ONE, |acc, a| acc * a)
+                        .invert()
+                })
+                .collect(),
+        }
+    }
+}
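A small worked instance (not part of the patch; `Fp31`, `N = 3`, canonical points x = 0, 1, 2):

// d_i = prod_{j != i} (x_i - x_j), stored inverted:
//   d_0 = (0 - 1)(0 - 2) =  2 (mod 31)  ->  2^-1 = 16   (since 2 * 16 = 32 ≡ 1)
//   d_1 = (1 - 0)(1 - 2) = 30 (mod 31)  -> 30^-1 = 30   (30 ≡ -1 is self-inverse)
//   d_2 = (2 - 0)(2 - 1) =  2 (mod 31)  ->  2^-1 = 16
// so `denominator` holds [16, 30, 16].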
+/// `LagrangeTable` is a precomputed table for Lagrange evaluation.
+/// It allows computing points on a polynomial, i.e. output points,
+/// given enough points on the polynomial, i.e. input points,
+/// by using the `eval` function.
+/// The "x coordinates" are implicit.
+/// The "y coordinates" of the input points are inputs to `eval`.
+/// The output of `eval` are the "y coordinates" of the output points.
+/// The "x coordinates" of the input points `x_0` to `x_(N-1)` are `F::ZERO` to `(N-1)*F::ONE`.
+/// The `LagrangeTable` also specifies `M` "x coordinates" for the output points.
+/// The "x coordinates" of the output points `x_N` to `x_(N+M-1)` are `N*F::ONE` to `(N+M-1)*F::ONE`
+/// when generated using `from(denominator)`,
+/// unless generated using `new(denominator, x_output)` for a specific output "x coordinate" `x_output`.
+pub struct LagrangeTable<F: Field, N: ArrayLength, M: ArrayLength> {
+    table: GenericArray<GenericArray<F, N>, M>,
+}
+
+impl<F, N> LagrangeTable<F, N, U1>
+where
+    F: Field + TryFrom<u128>,
+    <F as TryFrom<u128>>::Error: Debug,
+    N: ArrayLength,
+{
+    /// Generates a `LagrangeTable` from a `CanonicalLagrangeDenominator` for a single output point.
+    /// The "x coordinate" of the output point is `x_output`.
+    pub fn new(denominator: &CanonicalLagrangeDenominator<F, N>, x_output: &F) -> Self {
+        // assertion that the table is not too large for the stack
+        assert!(<F as Serializable>::Size::USIZE * N::USIZE < 2024);
+
+        let table = Self::compute_table_row(x_output, denominator);
+        LagrangeTable::<F, N, U1> {
+            table: GenericArray::from_array([table; 1]),
+        }
+    }
+}
+
+impl<F, N, M> LagrangeTable<F, N, M>
+where
+    F: Field,
+    N: ArrayLength,
+    M: ArrayLength,
+{
+    /// This function uses the `LagrangeTable` to evaluate `polynomial` on the _output_ "x coordinates"
+    /// that were used to generate this table.
+    /// It is assumed that the `y_coordinates` provided to this function correspond to the values of the _input_ "x coordinates"
+    /// that were used to generate this table.
+    pub fn eval<I, J>(&self, y_coordinates: I) -> GenericArray<F, M>
+    where
+        I: IntoIterator<Item = J> + Copy,
+        I::IntoIter: ExactSizeIterator,
+        J: Borrow<F>,
+    {
+        debug_assert_eq!(y_coordinates.into_iter().len(), N::USIZE);
+
+        self.table
+            .iter()
+            .map(|table_row| {
+                table_row
+                    .iter()
+                    .zip(y_coordinates)
+                    .fold(F::ZERO, |acc, (&base, y)| acc + base * (*y.borrow()))
+            })
+            .collect()
+    }
+
+    /// Helper function to compute a single row of a `LagrangeTable`.
+    ///
+    /// ## Panics
+    /// When the field size is too small for `N` evaluation points
+    fn compute_table_row(
+        x_output: &F,
+        denominator: &CanonicalLagrangeDenominator<F, N>,
+    ) -> GenericArray<F, N>
+    where
+        F: Field + TryFrom<u128>,
+        <F as TryFrom<u128>>::Error: Debug,
+        N: ArrayLength,
+    {
+        (0..N::U128)
+            .map(|i| {
+                (0..N::U128)
+                    .filter(|&j| j != i)
+                    .fold(F::ONE, |acc, j| acc * (*x_output - F::try_from(j).unwrap()))
+            })
+            .zip(&denominator.denominator)
+            .map(|(numerator, denominator)| *denominator * numerator)
+            .collect()
+    }
+}
+
+impl<F, N, M> From<CanonicalLagrangeDenominator<F, N>> for LagrangeTable<F, N, M>
+where
+    F: PrimeField,
+    N: ArrayLength,
+    M: ArrayLength,
+{
+    fn from(value: CanonicalLagrangeDenominator<F, N>) -> Self {
+        // assertion that the field is large enough
+        // when it is large enough, `F::try_from().unwrap()` below does not panic
+        assert!(
+            N::U128 + M::U128 < F::PRIME.into(),
+            "Field size {} is not large enough to hold {} + {} points",
+            F::PRIME.into(),
+            N::U128,
+            M::U128
+        );
+
+        // assertion that the table is not too large for the stack
+        assert!(<F as Serializable>::Size::USIZE * N::USIZE * M::USIZE < 2024);
+
+        LagrangeTable {
+            table: (N::U128..(N::U128 + M::U128))
+                .map(|i| Self::compute_table_row(&F::try_from(i).unwrap(), &value))
+                .collect(),
+        }
+    }
+}
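A hand-checked extrapolation sketch (not part of the patch), assuming the test-module imports below are in scope, with `Fp31` swapped in for the field and `U1`/`U3` from `typenum`:

#[test]
fn extrapolate_x_squared_plus_one() {
    // f(x) = x^2 + 1 over Fp31: f(0), f(1), f(2) = 1, 2, 5. The table row for
    // x_output = 3 is [1, 28, 3], and 1*1 + 28*2 + 3*5 = 72 ≡ 10 = f(3) (mod 31).
    let y = [1u128, 2, 5].map(|y| Fp31::try_from(y).unwrap());
    let denominator = CanonicalLagrangeDenominator::<Fp31, U3>::new();
    let table = LagrangeTable::<Fp31, U3, U1>::new(&denominator, &Fp31::try_from(3).unwrap());
    assert_eq!(table.eval(&y)[0].as_u128(), 10);
}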
+#[cfg(all(test, unit_test))]
+mod test {
+    use std::{borrow::Borrow, fmt::Debug};
+
+    use generic_array::{ArrayLength, GenericArray};
+    use proptest::{prelude::*, proptest};
+    use typenum::{U1, U32, U7, U8};
+
+    use crate::{
+        ff::PrimeField,
+        protocol::ipa_prf::malicious_security::lagrange::{
+            CanonicalLagrangeDenominator, LagrangeTable,
+        },
+    };
+
+    type TestField = crate::ff::Fp32BitPrime;
+
+    #[derive(Debug, PartialEq, Clone)]
+    struct MonomialFormPolynomial<F: PrimeField, N: ArrayLength> {
+        coefficients: GenericArray<F, N>,
+    }
+
+    impl<F, N> MonomialFormPolynomial<F, N>
+    where
+        F: PrimeField,
+        N: ArrayLength,
+    {
+        fn gen_y_values_of_canonical_points(self) -> GenericArray<F, N> {
+            // Sadly, we cannot just use the range (0..N::U128) because it does not implement `ExactSizeIterator`
+            let canonical_points =
+                (0..N::USIZE).map(|i| F::try_from(u128::try_from(i).unwrap()).unwrap());
+            self.eval(canonical_points)
+        }
+
+        /// Test helper function that evaluates a polynomial in monomial form, i.e. `sum_i c_i x^i`,
+        /// on points `x_output`, where `c_0` to `c_(N-1)` are stored in `coefficients`.
+        fn eval<I, J, M>(&self, x_output: I) -> GenericArray<F, M>
+        where
+            I: IntoIterator<Item = J>,
+            I::IntoIter: ExactSizeIterator,
+            J: Borrow<F>,
+            M: ArrayLength,
+        {
+            x_output
+                .into_iter()
+                .map(|x| {
+                    // monomial base, i.e. `x^k`
+                    // evaluate p via `sum_k coefficient_k * x^k`
+                    let (_, y) = self
+                        .coefficients
+                        .iter()
+                        .fold((F::ONE, F::ZERO), |(base, y), &coef| {
+                            (base * (*x.borrow()), y + coef * base)
+                        });
+                    y
+                })
+                .collect()
+        }
+    }
+
+    fn lagrange_single_output_point_using_new(
+        output_point: TestField,
+        input_points: [TestField; 32],
+    ) {
+        let polynomial_monomial_form = MonomialFormPolynomial {
+            coefficients: GenericArray::<TestField, U32>::from_array(input_points),
+        };
+        let output_expected = polynomial_monomial_form.eval(&[output_point]);
+        let denominator = CanonicalLagrangeDenominator::<TestField, U32>::new();
+        // generate table using `new`
+        let lagrange_table = LagrangeTable::<TestField, U32, U1>::new(&denominator, &output_point);
+        let output =
+            lagrange_table.eval(&polynomial_monomial_form.gen_y_values_of_canonical_points());
+        assert_eq!(output, output_expected);
+    }
+
+    proptest! {
+        #[test]
+        fn proptest_lagrange_single_output_point_using_new(output_point: TestField, input_points in prop::array::uniform32(any::<TestField>())){
+            lagrange_single_output_point_using_new(output_point, input_points);
+        }
+    }
+    fn lagrange_canonical_using_from(input_points: [TestField; 8]) {
+        let polynomial_monomial_form = MonomialFormPolynomial {
+            coefficients: GenericArray::<TestField, U8>::from_array(input_points),
+        };
+        // the canonical x coordinates are 0..=7; the outputs use coordinates 8..=14:
+        let x_coordinates_output =
+            (0..7).map(|i| TestField::try_from(u128::try_from(i).unwrap() + 8).unwrap());
+        let output_expected = polynomial_monomial_form.eval(x_coordinates_output);
+        let denominator = CanonicalLagrangeDenominator::<TestField, U8>::new();
+        // generate table using `from`
+        let lagrange_table = LagrangeTable::<TestField, U8, U7>::from(denominator);
+        let output =
+            lagrange_table.eval(&polynomial_monomial_form.gen_y_values_of_canonical_points());
+        assert_eq!(output, output_expected);
+    }
+
+    proptest! {
+        #[test]
+        fn proptest_lagrange_canonical_using_from(input_points in prop::array::uniform8(any::<TestField>()))
+        {
+            lagrange_canonical_using_from(input_points);
+        }
+    }
+}
diff --git a/ipa-core/src/protocol/ipa_prf/malicious_security/mod.rs b/ipa-core/src/protocol/ipa_prf/malicious_security/mod.rs
new file mode 100644
index 000000000..0e7f6bf3a
--- /dev/null
+++ b/ipa-core/src/protocol/ipa_prf/malicious_security/mod.rs
@@ -0,0 +1,2 @@
+pub mod lagrange;
+pub mod prover;
diff --git a/ipa-core/src/protocol/ipa_prf/malicious_security/prover.rs b/ipa-core/src/protocol/ipa_prf/malicious_security/prover.rs
new file mode 100644
index 000000000..9536b5349
--- /dev/null
+++ b/ipa-core/src/protocol/ipa_prf/malicious_security/prover.rs
@@ -0,0 +1,143 @@
+use std::{
+    iter::zip,
+    ops::{Add, Sub},
+};
+
+use generic_array::{ArrayLength, GenericArray};
+use typenum::{Diff, Sum, U1};
+
+use crate::{
+    ff::PrimeField,
+    protocol::ipa_prf::malicious_security::lagrange::{
+        CanonicalLagrangeDenominator, LagrangeTable,
+    },
+};
+
+pub struct ZeroKnowledgeProof<F: PrimeField, N: ArrayLength> {
+    g: GenericArray<F, N>,
+}
+
+pub struct ProofGenerator<F: PrimeField> {
+    u: Vec<F>,
+    v: Vec<F>,
+}
+
+type TwoNMinusOne<N> = Diff<Sum<N, N>, U1>;
+
+///
+/// Distributed Zero Knowledge Proofs algorithm drawn from
+/// `https://eprint.iacr.org/2023/909.pdf`
+///
+#[allow(non_camel_case_types)]
+impl<F> ProofGenerator<F>
+where
+    F: PrimeField,
+{
+    pub fn new(u: Vec<F>, v: Vec<F>) -> Self {
+        debug_assert_eq!(u.len(), v.len(), "u and v must be of equal length");
+        Self { u, v }
+    }
+    pub fn compute_proof<λ: ArrayLength>(
+        &self,
+        r: F,
+    ) -> (ZeroKnowledgeProof<F, TwoNMinusOne<λ>>, ProofGenerator<F>)
+    where
+        λ: ArrayLength + Add + Sub<U1>,
+        <λ as Add>::Output: Sub<U1>,
+        <<λ as Add>::Output as Sub<U1>>::Output: ArrayLength,
+        <λ as Sub<U1>>::Output: ArrayLength,
+    {
+        debug_assert_eq!(self.u.len() % λ::USIZE, 0); // We should pad with zeroes eventually
+
+        let s = self.u.len() / λ::USIZE;
+
+        assert!(
+            s > 1,
+            "When the output is this small, you should call `compute_final_proof`"
+        );
+
+        let mut next_proof_generator = ProofGenerator {
+            u: Vec::<F>::with_capacity(s),
+            v: Vec::<F>::with_capacity(s),
+        };
+
+        let denominator = CanonicalLagrangeDenominator::<F, λ>::new();
+        let lagrange_table_r = LagrangeTable::<F, λ, U1>::new(&denominator, &r);
+        let lagrange_table = LagrangeTable::<F, λ, <λ as Sub<U1>>::Output>::from(denominator);
+        let extrapolated_points = (0..s).map(|i| {
+            let start = i * λ::USIZE;
+            let end = start + λ::USIZE;
+            let p = &self.u[start..end];
+            let q = &self.v[start..end];
+            let p_extrapolated = lagrange_table.eval(p);
+            let q_extrapolated = lagrange_table.eval(q);
+            let p_r = lagrange_table_r.eval(p)[0];
+            let q_r = lagrange_table_r.eval(q)[0];
+            next_proof_generator.u.push(p_r);
+            next_proof_generator.v.push(q_r);
+            // `p.into_iter()` has elements that are `&F`;
+            // `p_extrapolated.into_iter()` has elements that are `F`.
+            // So these iterators cannot be chained.
+            zip(p, q)
+                .map(|(a, b)| *a * *b)
+                .chain(zip(p_extrapolated, q_extrapolated).map(|(a, b)| a * b))
+                .collect::<GenericArray<F, TwoNMinusOne<λ>>>()
+        });
+        let proof = ZeroKnowledgeProof {
+            g: extrapolated_points
+                .reduce(|acc, pts| zip(acc, pts).map(|(a, b)| a + b).collect())
+                .unwrap(),
+        };
+        (proof, next_proof_generator)
+    }
+}
+
+#[cfg(all(test, unit_test))]
+mod test {
+    use typenum::U4;
+
+    use super::ProofGenerator;
+    use crate::ff::{Fp31, U128Conversions};
+
+    #[test]
+    fn sample_proof() {
+        const U: [u128; 32] = [
+            0, 30, 0, 16, 0, 1, 0, 15, 0, 0, 0, 16, 0, 30, 0, 16, 29, 1, 1, 15, 0, 0, 1, 15, 2,
+            30, 30, 16, 0, 0, 30, 16,
+        ];
+        const V: [u128; 32] = [
+            0, 0, 0, 30, 0, 0, 0, 1, 30, 30, 30, 30, 0, 0, 30, 30, 0, 30, 0, 30, 0, 0, 0, 1, 0,
+            0, 1, 1, 0, 0, 1, 1,
+        ];
+        const EXPECTED: [u128; 7] = [0, 30, 29, 30, 5, 28, 13];
+        const R1: u128 = 22;
+        const EXPECTED_NEXT_U: [u128; 8] = [0, 0, 26, 0, 7, 18, 24, 13];
+        const EXPECTED_NEXT_V: [u128; 8] = [10, 21, 30, 28, 15, 21, 3, 3];
+        let pg: ProofGenerator<Fp31> = ProofGenerator::new(
+            U.into_iter().map(|x| Fp31::try_from(x).unwrap()).collect(),
+            V.into_iter().map(|x| Fp31::try_from(x).unwrap()).collect(),
+        );
+        let (proof, next_proof_generator) = pg.compute_proof::<U4>(Fp31::try_from(R1).unwrap());
+        assert_eq!(
+            proof.g.into_iter().map(|x| x.as_u128()).collect::<Vec<_>>(),
+            EXPECTED,
+        );
+        assert_eq!(
+            next_proof_generator
+                .u
+                .into_iter()
+                .map(|x| x.as_u128())
+                .collect::<Vec<_>>(),
+            EXPECTED_NEXT_U,
+        );
+        assert_eq!(
+            next_proof_generator
+                .v
+                .into_iter()
+                .map(|x| x.as_u128())
+                .collect::<Vec<_>>(),
+            EXPECTED_NEXT_V,
+        );
+    }
+}
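Length bookkeeping for the sample test above (illustrative): each round consumes `u` and `v` in chunks of λ, emits a proof of 2λ − 1 points, and hands the next prover vectors of length `len / λ`.

// λ = 4 (U4), |u| = |v| = 32:
//   proof length   = 2 * 4 - 1 = 7   (matches EXPECTED)
//   next |u| = |v| = 32 / 4    = 8   (matches EXPECTED_NEXT_U / EXPECTED_NEXT_V)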
diff --git a/ipa-core/src/protocol/ipa_prf/mod.rs b/ipa-core/src/protocol/ipa_prf/mod.rs
index 2c241f9ad..aab014b52 100644
--- a/ipa-core/src/protocol/ipa_prf/mod.rs
+++ b/ipa-core/src/protocol/ipa_prf/mod.rs
@@ -33,6 +33,8 @@
 mod boolean_ops;
 pub mod prf_eval;
 pub mod prf_sharding;

+#[cfg(all(test, unit_test))]
+mod malicious_security;
 mod quicksort;
 mod shuffle;
diff --git a/ipa-core/src/secret_sharing/mod.rs b/ipa-core/src/secret_sharing/mod.rs
index 7fa77a89f..64365bd51 100644
--- a/ipa-core/src/secret_sharing/mod.rs
+++ b/ipa-core/src/secret_sharing/mod.rs
@@ -1,65 +1,15 @@
-//! # Vectorization
-//!
-//! Vectorization refers to adapting an implementation that previously operated on one value at a
-//! time, to instead operate on `N` values at a time. Vectorization improves performance in two ways:
-//!
-//! 1. Vectorized code can make use of special CPU instructions (Intel AVX, ARM NEON) that operate
-//!    on multiple values at a time. This reduces the CPU time required to perform computations.
-//!    We also use vectorization to refer to "bit packing" of boolean values, i.e., packing
-//!    64 boolean values into a single u64 rather than using a byte (or even a word) for each
-//!    value.
-//! 2. Aside from the core arithmetic operations that are involved in our MPC, a substantial
-//!    amount of other code is needed to send values between helpers, schedule futures for
-//!    execution, etc. Vectorization can result in a greater amount of arithmetic work being
-//!    performed for a given amount of overhead work, thus increasing the efficiency of the
-//!    implementation.
-//!
-//! ## Vectorization traits
-//!
-//! There are two sets of traits related to vectorization.
-//!
-//! If you are writing protocols, the trait of interest is `FieldSimd<N>`, which can be specified in
-//! a trait bound, something like `F: Field + FieldSimd<N>`.
-//!
-//! The other traits are `Vectorizable` (for `SharedValue`s) and `FieldVectorizable`. These traits
-//! are needed to work around a limitation in the rust type system. In most cases, you do not need
-//! to reference the `Vectorizable` or `FieldVectorizable` traits directly when implementing
-//! protocols. Usually the vector type is hidden within `AdditiveShare`, but if you are writing a
-//! vectorized low-level primitive, you may need to refer to it directly, as
-//! `<F as Vectorizable<N>>::Array`. It is even more rare to need to use `FieldVectorizable`; see its
-//! documentation and the documentation of `FieldSimd` for details.
-//!
-//! We require that each supported vectorization configuration (i.e. combination of data type and
-//! width) be explicitly identified, by implementing the `Vectorizable` and/or `FieldVectorizable`
-//! traits for the base data type (e.g. `Fp32BitPrime`). This is for two reasons:
-//! 1. Rust doesn't yet support evaluating expressions involving const parameters at compile time,
-//!    which makes it difficult or impossible to write generic serialization routines for
-//!    arbitrary widths.
-//! 2. As a measure of protection against inadvertently using a configuration that will not be
-//!    efficient (i.e. an excessive vector width).
-//!
-//! ## Adding a new supported vectorization
-//!
-//! To add a new supported vectorization:
-//!
-//! 1. Add a `FieldSimd` impl (in `secret_sharing/mod.rs`)
-//! 2. Add a `FromRandom` impl (in `array.rs` or `boolean_array.rs`)
-//! 3. Add a `Serializable` impl (in `array.rs` or `boolean_array.rs`)
-//! 4. Add `Vectorizable` and `FieldVectorizable` impls (with the primitive type def in
-//!    e.g. `galois_field.rs`)
-
 pub mod replicated;

-mod array;
 mod decomposed;
 mod into_shares;
 mod scheme;
+mod vector;

 use std::{
     fmt::Debug,
-    ops::{Add, AddAssign, Mul, MulAssign, Neg, Sub, SubAssign},
+    ops::{Mul, MulAssign, Neg},
 };

-pub use array::StdArray;
 pub(crate) use decomposed::BitDecomposed;
 use generic_array::ArrayLength;
 pub use into_shares::IntoShares;
@@ -69,16 +19,15 @@ use rand::{
     Rng,
 };
 pub use scheme::{Bitwise, Linear, LinearRefOps, SecretSharing};
+pub use vector::{
+    FieldArray, FieldSimd, FieldVectorizable, SharedValueArray, StdArray, Vectorizable,
+};

+#[cfg(any(test, feature = "test-fixture", feature = "cli"))]
+use crate::secret_sharing::replicated::semi_honest::AdditiveShare;
 use crate::{
-    error::LengthError,
-    ff::{
-        boolean::Boolean,
-        boolean_array::{BA20, BA256, BA3, BA32, BA5, BA64, BA8},
-        AddSub, AddSubAssign, Field, Fp32BitPrime, Serializable,
-    },
-    protocol::prss::FromRandom,
-    secret_sharing::replicated::{semi_honest::AdditiveShare, ReplicatedSecretSharing},
+    ff::{AddSub, AddSubAssign, Serializable},
+    secret_sharing::replicated::ReplicatedSecretSharing,
 };

 /// Operations supported for weak shared values.
@@ -170,148 +119,6 @@ macro_rules! impl_shared_value_common {
     };
 }

-// Note that we can either make `trait Vectorizable: SharedValue`, or we can make `trait
-// SharedValue: Vectorizable<1>`, but doing both creates a cycle. (Similarly for
-// `FieldVectorizable` / `Field`.)
-//
-// Although it is somewhat unnatural, we choose to do the latter, because it allows existing
-// high-level protocols unaware of vectorization to call vectorized versions of core protocols (with
-// a width of 1) without updating all of the trait bounds. This does mean that the trait definitions
-// do not prevent implementing `Vectorizable` for something that is not a `SharedValue`, but please
-// don't do that.
-
-/// Trait for `SharedValue`s supporting operations on `N`-wide vectors.
-pub trait Vectorizable<const N: usize>: Sized {
-    type Array: SharedValueArray<Self>;
-}
-
-/// Trait for `Field`s supporting operations on `N`-wide vectors.
-///
-/// We would like `F` to be `FieldVectorizable` if it satisfies all of the following:
-/// 1. `F: Field`.
-/// 2. `<F as Vectorizable<N>>::Array: FieldArray<F>`. Rust does not support expressing a
-///    constraint on a super-trait's associated type directly. Instead, this effect is achieved
-///    by constraining the `ArrayAlias` associated type and then constraining that
-///    `Vectorizable::Array == FieldVectorizable::ArrayAlias` where necessary (e.g. in the
-///    definition and blanket impl of the `FieldSimd` trait). We call it `ArrayAlias` instead of
-///    `Array` so that references to the `Array` associated type do not require qualification
-///    with a trait name.
-/// 3. `F: Vectorizable<N>`. This is implied by the previous two, because `FieldArray`
-///    is a sub-trait of `SharedValueArray`. (See the `FieldSimd` documentation for another
-///    important consequence of this sub-trait relationship.)
-pub trait FieldVectorizable<const N: usize>: SharedValue + Sized {
-    type ArrayAlias: FieldArray<Self>;
-}
-
-// Convenience alias to express a supported vectorization when writing protocols.
-//
-// Typically appears like this: `F: Field + FieldSimd<N>`.
-//
-// We could define a `SharedValueSimd` trait that is the analog of this for `SharedValue`s, but
-// there are not currently any protocols that need it.
-//
-// Because we have constrained the associated types `Vectorizable::Array` and
-// `FieldVectorizable::ArrayAlias` to be equal, the type they refer to must satisfy the union of all
-// trait bounds applicable to either. However, in some cases the compiler has trouble proving
-// properties related to this. (See rust issues [41118] and [60471].) A typical workaround for
-// problems of this sort is to redundantly list a trait bound on both associated types, but for us
-// that is not necessary in most cases because `FieldArray` is a sub-trait of `SharedValueArray`.
-//
-// Another consequence of this limitation of the compiler is that if you write the bound `F: Field +
-// FieldSimd<N> + Vectorizable<N, Array = S>`, you will get the error ``type annotations needed:
-// cannot satisfy `<F as Vectorizable<N>>::Array == <F as FieldVectorizable<N>>::ArrayAlias```.
-// The compiler is not smart enough to coalesce the constraints and see that `S`,
-// `<F as Vectorizable<N>>::Array`, and `<F as FieldVectorizable<N>>::ArrayAlias` must all refer
-// to the same type.
-//
-// [41118](https://github.com/rust-lang/rust/issues/41118)
-// [60471](https://github.com/rust-lang/rust/issues/60471)
-pub trait FieldSimd<const N: usize>:
-    Field + Vectorizable<N, Array = <Self as FieldVectorizable<N>>::ArrayAlias> + FieldVectorizable<N>
-{
-}
-
-// Portions of the implementation treat non-vectorized operations as a vector with `N = 1`. This
-// blanket impl (and the fact that `F: Field` is the only trait bound) is important in allowing code
-// that writes `F: Field` to continue working without modification.
-impl<F: Field> FieldSimd<1> for F {}
-
-// Supported vectorizations
-
-impl FieldSimd<32> for Fp32BitPrime {}
-
-macro_rules! boolean_vector {
-    ($dim:expr, $vec:ty) => {
-        impl Vectorizable<$dim> for Boolean {
-            type Array = $vec;
-        }
-
-        impl FieldVectorizable<$dim> for Boolean {
-            type ArrayAlias = $vec;
-        }
-
-        impl FieldSimd<$dim> for Boolean {}
-
-        impl From<AdditiveShare<$vec>> for AdditiveShare<Boolean, $dim> {
-            fn from(value: AdditiveShare<$vec>) -> Self {
-                AdditiveShare::new_arr(value.left(), value.right())
-            }
-        }
-
-        impl From<AdditiveShare<Boolean, $dim>> for AdditiveShare<$vec> {
-            fn from(value: AdditiveShare<Boolean, $dim>) -> Self {
-                AdditiveShare::new(*value.left_arr(), *value.right_arr())
-            }
-        }
-    };
-}
-
-boolean_vector!(3, BA3);
-boolean_vector!(5, BA5);
-boolean_vector!(8, BA8);
-boolean_vector!(20, BA20);
-boolean_vector!(32, BA32);
-boolean_vector!(64, BA64);
-boolean_vector!(256, BA256);
-
-pub trait SharedValueArray<V>:
-    Clone
-    + Eq
-    + Debug
-    + Send
-    + Sync
-    + Sized
-    + Sendable
-    + TryFrom<Vec<V>, Error = LengthError>
-    + FromIterator<V>
-    + IntoIterator<Item = V>
-    + Add<Self, Output = Self>
-    + for<'a> Add<&'a Self, Output = Self>
-    + AddAssign<Self>
-    + for<'a> AddAssign<&'a Self>
-    + Neg<Output = Self>
-    + Sub<Self, Output = Self>
-    + for<'a> Sub<&'a Self, Output = Self>
-    + SubAssign<Self>
-    + for<'a> SubAssign<&'a Self>
-{
-    const ZERO_ARRAY: Self;
-
-    fn from_fn<F: FnMut(usize) -> V>(f: F) -> Self;
-}
-
-// Some `SharedValue` types (and thus their arrays) implement `FromRandom`, but `RP25519` does not.
-// We overload this distinction on `FieldArray` instead of creating a separate `ArrayFromRandom` trait,
-// to avoid making the `Vectorizable` / `FieldVectorizable` situation that much more complicated.
-pub trait FieldArray<F: SharedValue>:
-    SharedValueArray<F>
-    + FromRandom
-    + Mul<F, Output = Self>
-    + for<'a> Mul<&'a F, Output = Self>
-    + for<'a> Mul<&'a Self, Output = Self>
-{
-}
-
 #[cfg(any(test, feature = "test-fixture", feature = "cli"))]
 impl<V> IntoShares<AdditiveShare<V>> for V
 where
diff --git a/ipa-core/src/secret_sharing/array.rs b/ipa-core/src/secret_sharing/vector/array.rs
similarity index 99%
rename from ipa-core/src/secret_sharing/array.rs
rename to ipa-core/src/secret_sharing/vector/array.rs
index f38e608c0..13f476b2e 100644
--- a/ipa-core/src/secret_sharing/array.rs
+++ b/ipa-core/src/secret_sharing/vector/array.rs
@@ -23,7 +23,7 @@
 /// * It disables by-index access to individual elements of the array, which
 ///   should never be necessary in properly vectorized code.
 #[derive(Clone, Debug, Eq, PartialEq)]
-pub struct StdArray<V: SharedValue, const N: usize>([V; N]);
+pub struct StdArray<V: SharedValue, const N: usize>(pub(super) [V; N]);

 impl<V, const N: usize> PartialEq<Vec<V>> for StdArray<V, N>
 where
diff --git a/ipa-core/src/secret_sharing/vector/impls.rs b/ipa-core/src/secret_sharing/vector/impls.rs
new file mode 100644
index 000000000..e29d8712b
--- /dev/null
+++ b/ipa-core/src/secret_sharing/vector/impls.rs
@@ -0,0 +1,49 @@
+//! Supported vectorizations
+
+use crate::{
+    ff::{
+        boolean::Boolean,
+        boolean_array::{BA20, BA256, BA3, BA32, BA5, BA64, BA8},
+        Fp32BitPrime,
+    },
+    secret_sharing::{
+        replicated::semi_honest::AdditiveShare, FieldSimd, FieldVectorizable,
+        ReplicatedSecretSharing, Vectorizable,
+    },
+};
+
+impl FieldSimd<32> for Fp32BitPrime {}
+
+macro_rules! boolean_vector {
+    ($dim:expr, $vec:ty) => {
+        impl Vectorizable<$dim> for Boolean {
+            type Array = $vec;
+        }
+
+        impl FieldVectorizable<$dim> for Boolean {
+            type ArrayAlias = $vec;
+        }
+
+        impl FieldSimd<$dim> for Boolean {}
+
+        impl From<AdditiveShare<$vec>> for AdditiveShare<Boolean, $dim> {
+            fn from(value: AdditiveShare<$vec>) -> Self {
+                AdditiveShare::new_arr(value.left(), value.right())
+            }
+        }
+
+        impl From<AdditiveShare<Boolean, $dim>> for AdditiveShare<$vec> {
+            fn from(value: AdditiveShare<Boolean, $dim>) -> Self {
+                AdditiveShare::new(*value.left_arr(), *value.right_arr())
+            }
+        }
+    };
+}
+
+boolean_vector!(3, BA3);
+boolean_vector!(5, BA5);
+boolean_vector!(8, BA8);
+boolean_vector!(20, BA20);
+boolean_vector!(32, BA32);
+boolean_vector!(64, BA64);
+boolean_vector!(256, BA256);
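For orientation (not part of the patch), one invocation of the macro above expands to impls equivalent to:

// boolean_vector!(8, BA8):
impl Vectorizable<8> for Boolean {
    type Array = BA8;
}
impl FieldVectorizable<8> for Boolean {
    type ArrayAlias = BA8;
}
impl FieldSimd<8> for Boolean {}
// ...plus the two `From` conversions between `AdditiveShare<Boolean, 8>` and `AdditiveShare<BA8>`.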
diff --git a/ipa-core/src/secret_sharing/vector/mod.rs b/ipa-core/src/secret_sharing/vector/mod.rs
new file mode 100644
index 000000000..e276d7d22
--- /dev/null
+++ b/ipa-core/src/secret_sharing/vector/mod.rs
@@ -0,0 +1,56 @@
+//! Vectorized secret shares
+//!
+//! Vectorization refers to adapting an implementation that previously operated on one value at a
+//! time, to instead operate on `N` values at a time. Vectorization improves performance in two ways:
+//!
+//! 1. Vectorized code can make use of special CPU instructions (Intel AVX, ARM NEON) that operate
+//!    on multiple values at a time. This reduces the CPU time required to perform computations.
+//!    We also use vectorization to refer to "bit packing" of boolean values, i.e., packing
+//!    64 boolean values into a single u64 rather than using a byte (or even a word) for each
+//!    value.
+//! 2. Aside from the core arithmetic operations that are involved in our MPC, a substantial
+//!    amount of other code is needed to send values between helpers, schedule futures for
+//!    execution, etc. Vectorization can result in a greater amount of arithmetic work being
+//!    performed for a given amount of overhead work, thus increasing the efficiency of the
+//!    implementation.
+//!
+//! ## Vectorization traits
+//!
+//! There are two sets of traits related to vectorization.
+//!
+//! If you are writing protocols, the trait of interest is `FieldSimd<N>`, which can be specified in
+//! a trait bound, something like `F: Field + FieldSimd<N>`.
+//!
+//! The other traits are `Vectorizable` (for `SharedValue`s) and `FieldVectorizable`. These traits
+//! are needed to work around a limitation in the rust type system. In most cases, you do not need
+//! to reference the `Vectorizable` or `FieldVectorizable` traits directly when implementing
+//! protocols. Usually the vector type is hidden within `AdditiveShare`, but if you are writing a
+//! vectorized low-level primitive, you may need to refer to it directly, as
+//! `<F as Vectorizable<N>>::Array`. It is even more rare to need to use `FieldVectorizable`; see its
+//! documentation and the documentation of `FieldSimd` for details.
+//!
+//! We require that each supported vectorization configuration (i.e. combination of data type and
+//! width) be explicitly identified, by implementing the `Vectorizable` and/or `FieldVectorizable`
+//! traits for the base data type (e.g. `Fp32BitPrime`). This is for two reasons:
+//! 1. Rust doesn't yet support evaluating expressions involving const parameters at compile time,
+//!    which makes it difficult or impossible to write generic serialization routines for
+//!    arbitrary widths.
+//! 2. As a measure of protection against inadvertently using a configuration that will not be
+//!    efficient (i.e. an excessive vector width).
+//!
+//! ## Adding a new supported vectorization
+//!
+//! To add a new supported vectorization:
+//!
+//! 1. Add a `FromRandom` impl (in `array.rs` or `boolean_array.rs`)
+//! 2. Add a `Serializable` impl (in `array.rs` or `boolean_array.rs`)
+//! 3. Add a `FieldSimd` impl (in `secret_sharing/vector/impls.rs`)
+//! 4. Add `Vectorizable` and `FieldVectorizable` impls (either with the primitive type def in
+//!    e.g. `galois_field.rs`, or in `vector/impls.rs`)
+
+mod array;
+mod impls;
+mod traits;
+
+pub use array::StdArray;
+pub use traits::{FieldArray, FieldSimd, FieldVectorizable, SharedValueArray, Vectorizable};
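A protocol-side sketch (hypothetical helper, not in this patch) of the `F: Field + FieldSimd<N>` bound described above:

// Any field/width combination with a `FieldSimd<N>` impl can be used generically.
fn add_arrays<F: Field + FieldSimd<N>, const N: usize>(
    a: &<F as Vectorizable<N>>::Array,
    b: &<F as Vectorizable<N>>::Array,
) -> <F as Vectorizable<N>>::Array {
    a.clone() + b
}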
diff --git a/ipa-core/src/secret_sharing/vector/traits.rs b/ipa-core/src/secret_sharing/vector/traits.rs
new file mode 100644
index 000000000..b44316b70
--- /dev/null
+++ b/ipa-core/src/secret_sharing/vector/traits.rs
@@ -0,0 +1,115 @@
+use std::{
+    fmt::Debug,
+    ops::{Add, AddAssign, Mul, Neg, Sub, SubAssign},
+};
+
+use crate::{
+    error::LengthError,
+    ff::Field,
+    protocol::prss::FromRandom,
+    secret_sharing::{Sendable, SharedValue},
+};
+
+// Note that we can either make `trait Vectorizable: SharedValue`, or we can make `trait
+// SharedValue: Vectorizable<1>`, but doing both creates a cycle. (Similarly for
+// `FieldVectorizable` / `Field`.)
+//
+// Although it is somewhat unnatural, we choose to do the latter, because it allows existing
+// high-level protocols unaware of vectorization to call vectorized versions of core protocols (with
+// a width of 1) without updating all of the trait bounds. This does mean that the trait definitions
+// do not prevent implementing `Vectorizable` for something that is not a `SharedValue`, but please
+// don't do that.
+
+/// Trait for `SharedValue`s supporting operations on `N`-wide vectors.
+pub trait Vectorizable<const N: usize>: Sized {
+    type Array: SharedValueArray<Self>;
+}
+
+/// Trait for `Field`s supporting operations on `N`-wide vectors.
+///
+/// We would like `F` to be `FieldVectorizable` if it satisfies all of the following:
+/// 1. `F: Field`.
+/// 2. `<F as Vectorizable<N>>::Array: FieldArray<F>`. Rust does not support expressing a
+///    constraint on a super-trait's associated type directly. Instead, this effect is achieved
+///    by constraining the `ArrayAlias` associated type and then constraining that
+///    `Vectorizable::Array == FieldVectorizable::ArrayAlias` where necessary (e.g. in the
+///    definition and blanket impl of the `FieldSimd` trait). We call it `ArrayAlias` instead of
+///    `Array` so that references to the `Array` associated type do not require qualification
+///    with a trait name.
+/// 3. `F: Vectorizable<N>`. This is implied by the previous two, because `FieldArray`
+///    is a sub-trait of `SharedValueArray`. (See the `FieldSimd` documentation for another
+///    important consequence of this sub-trait relationship.)
+pub trait FieldVectorizable<const N: usize>: SharedValue + Sized {
+    type ArrayAlias: FieldArray<Self>;
+}
+
+// Convenience alias to express a supported vectorization when writing protocols.
+//
+// Typically appears like this: `F: Field + FieldSimd<N>`.
+//
+// We could define a `SharedValueSimd` trait that is the analog of this for `SharedValue`s, but
+// there are not currently any protocols that need it.
+//
+// Because we have constrained the associated types `Vectorizable::Array` and
+// `FieldVectorizable::ArrayAlias` to be equal, the type they refer to must satisfy the union of all
+// trait bounds applicable to either. However, in some cases the compiler has trouble proving
+// properties related to this. (See rust issues [41118] and [60471].) A typical workaround for
+// problems of this sort is to redundantly list a trait bound on both associated types, but for us
+// that is not necessary in most cases because `FieldArray` is a sub-trait of `SharedValueArray`.
+//
+// Another consequence of this limitation of the compiler is that if you write the bound `F: Field +
+// FieldSimd<N> + Vectorizable<N, Array = S>`, you will get the error ``type annotations needed:
+// cannot satisfy `<F as Vectorizable<N>>::Array == <F as FieldVectorizable<N>>::ArrayAlias```.
+// The compiler is not smart enough to coalesce the constraints and see that `S`,
+// `<F as Vectorizable<N>>::Array`, and `<F as FieldVectorizable<N>>::ArrayAlias` must all refer
+// to the same type.
+//
+// [41118](https://github.com/rust-lang/rust/issues/41118)
+// [60471](https://github.com/rust-lang/rust/issues/60471)
+pub trait FieldSimd<const N: usize>:
+    Field + Vectorizable<N, Array = <Self as FieldVectorizable<N>>::ArrayAlias> + FieldVectorizable<N>
+{
+}
+
+// Portions of the implementation treat non-vectorized operations as a vector with `N = 1`. This
+// blanket impl (and the fact that `F: Field` is the only trait bound) is important in allowing code
+// that writes `F: Field` to continue working without modification.
+impl<F: Field> FieldSimd<1> for F {}
+
+pub trait SharedValueArray<V>:
+    Clone
+    + Eq
+    + Debug
+    + Send
+    + Sync
+    + Sized
+    + Sendable
+    + TryFrom<Vec<V>, Error = LengthError>
+    + FromIterator<V>
+    + IntoIterator<Item = V>
+    + Add<Self, Output = Self>
+    + for<'a> Add<&'a Self, Output = Self>
+    + AddAssign<Self>
+    + for<'a> AddAssign<&'a Self>
+    + Neg<Output = Self>
+    + Sub<Self, Output = Self>
+    + for<'a> Sub<&'a Self, Output = Self>
+    + SubAssign<Self>
+    + for<'a> SubAssign<&'a Self>
+{
+    const ZERO_ARRAY: Self;
+
+    fn from_fn<F: FnMut(usize) -> V>(f: F) -> Self;
+}
+
+// Some `SharedValue` types (and thus their arrays) implement `FromRandom`, but `RP25519` does not.
+// We overload this distinction on `FieldArray` instead of creating a separate `ArrayFromRandom` trait,
+// to avoid making the `Vectorizable` / `FieldVectorizable` situation that much more complicated.
+pub trait FieldArray<F: SharedValue>:
+    SharedValueArray<F>
+    + FromRandom
+    + Mul<F, Output = Self>
+    + for<'a> Mul<&'a F, Output = Self>
+    + for<'a> Mul<&'a Self, Output = Self>
+{
+}
diff --git a/scripts/collect_steps.py b/scripts/collect_steps.py
index 240e85573..ea34ee3c1 100755
--- a/scripts/collect_steps.py
+++ b/scripts/collect_steps.py
@@ -12,7 +12,7 @@
 QUERY_SIZE = 100
 # attribution_window_seconds = 0 runs an optimized protocol, so 0 and anything larger
 ATTRIBUTION_WINDOW = [0, 86400]
-ROOT_STEP_PREFIX = "protocol/alloc::string::String::run-0"
+ROOT_STEP_PREFIX = "protocol/ipa_core::test_fixture::world::TestExecutionStep::iter0"
 BREAKDOWN_KEYS = 256
 USER_CAP = [8, 16, 32, 64, 128]
 SECURITY_MODEL = "semi-honest"