From 68d95363220107ef56524f820db54f99ba167e48 Mon Sep 17 00:00:00 2001 From: Paul <108982045+ChengYueJia@users.noreply.github.com> Date: Sun, 5 Nov 2023 18:28:47 +0800 Subject: [PATCH] feat: add avx2 Goldilocks (#151) * feat: add draft struct * feat: impl add * bugfix for avx by lo * add aknowledge * add aknowledge * use nightly version rustc --- .github/workflows/ci.yml | 4 +- algebraic/src/arch/mod.rs | 2 + algebraic/src/arch/x86_64/avx2_field_gl.rs | 798 +++++++++++++++++++ algebraic/src/arch/x86_64/avx512_field_gl.rs | 751 +++++++++++++++++ algebraic/src/arch/x86_64/mod.rs | 20 + algebraic/src/lib.rs | 4 + algebraic/src/packable.rs | 40 + algebraic/src/packed.rs | 127 +++ 8 files changed, 1744 insertions(+), 2 deletions(-) create mode 100644 algebraic/src/arch/mod.rs create mode 100644 algebraic/src/arch/x86_64/avx2_field_gl.rs create mode 100644 algebraic/src/arch/x86_64/avx512_field_gl.rs create mode 100644 algebraic/src/arch/x86_64/mod.rs create mode 100644 algebraic/src/packable.rs create mode 100644 algebraic/src/packed.rs diff --git a/.github/workflows/ci.yml b/.github/workflows/ci.yml index 03de5ae8..81d7ec21 100644 --- a/.github/workflows/ci.yml +++ b/.github/workflows/ci.yml @@ -34,7 +34,7 @@ jobs: - uses: actions-rs/toolchain@v1 with: profile: minimal - toolchain: stable + toolchain: nightly override: true - run: rustup component add rustfmt - uses: actions-rs/cargo@v1 @@ -49,7 +49,7 @@ jobs: - uses: actions-rs/toolchain@v1 with: profile: minimal - toolchain: stable + toolchain: nightly override: true - run: rustup component add clippy - run: cargo clippy --all-targets -- -D warnings \ No newline at end of file diff --git a/algebraic/src/arch/mod.rs b/algebraic/src/arch/mod.rs new file mode 100644 index 00000000..832557ef --- /dev/null +++ b/algebraic/src/arch/mod.rs @@ -0,0 +1,2 @@ +#[cfg(target_arch = "x86_64")] +pub mod x86_64; diff --git a/algebraic/src/arch/x86_64/avx2_field_gl.rs b/algebraic/src/arch/x86_64/avx2_field_gl.rs new file mode 100644 index 00000000..03fc5d5d --- /dev/null +++ b/algebraic/src/arch/x86_64/avx2_field_gl.rs @@ -0,0 +1,798 @@ +//! Porting from plonky2: +//! https://github.com/0xPolygonZero/plonky2/blob/main/field/src/arch/x86_64/avx2_goldilocks_field.rs +//! +//! How to build/run/test: +//! RUSTFLAGS='-C target-feature=+avx2' cargo build --release +//! +use crate::ff::*; +use crate::field_gl::{Fr, FrRepr as GoldilocksField}; +use core::arch::x86_64::*; +use core::fmt; +use core::fmt::{Debug, Formatter}; +use core::mem::transmute; +use core::ops::{Add, AddAssign, Div, DivAssign, Mul, MulAssign, Neg, Sub, SubAssign}; +// use crate::packed::PackedField; + +/// AVX2 Goldilocks Field +/// +/// Ideally `Avx2GoldilocksField` would wrap `__m256i`. Unfortunately, `__m256i` has an alignment of +/// 32B, which would preclude us from casting `[GoldilocksField; 4]` (alignment 8B) to +/// `Avx2GoldilocksField`. We need to ensure that `Avx2GoldilocksField` has the same alignment as +/// `GoldilocksField`. Thus we wrap `[GoldilocksField; 4]` and use the `new` and `get` methods to +/// convert to and from `__m256i`. 
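+///
+/// Illustrative usage (a minimal sketch; it assumes the crate is importable as
+/// `algebraic` and an AVX2-enabled build, as described in the module docs above):
+///
+/// ```ignore
+/// use algebraic::arch::x86_64::avx2_field_gl::Avx2GoldilocksField;
+/// use algebraic::field_gl::FrRepr as GoldilocksField;
+///
+/// let a = [GoldilocksField([1u64]); 4];
+/// let b = [GoldilocksField([2u64]); 4];
+/// // Reinterpret each 4-element slice as one packed vector, then add lane-wise.
+/// let sum = *Avx2GoldilocksField::from_slice(&a) + *Avx2GoldilocksField::from_slice(&b);
+/// assert_eq!(sum.as_slice(), [GoldilocksField([3u64]); 4]);
+/// ```
+///
+/// `from_slice` asserts that the slice has exactly `WIDTH` (= 4) elements.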
+#[derive(Copy, Clone)] +#[repr(transparent)] +pub struct Avx2GoldilocksField(pub [GoldilocksField; 4]); + +const WIDTH: usize = 4; + +impl Avx2GoldilocksField { + #[inline] + pub fn new(x: __m256i) -> Self { + unsafe { transmute(x) } + } + #[inline] + pub fn get(&self) -> __m256i { + unsafe { transmute(*self) } + } + // } + // unsafe impl PackedField for Avx2GoldilocksField { + #[inline] + pub fn from_slice(slice: &[GoldilocksField]) -> &Self { + assert_eq!(slice.len(), WIDTH); + unsafe { &*slice.as_ptr().cast() } + } + #[inline] + pub fn from_slice_mut(slice: &mut [GoldilocksField]) -> &mut Self { + assert_eq!(slice.len(), WIDTH); + unsafe { &mut *slice.as_mut_ptr().cast() } + } + #[inline] + pub fn as_slice(&self) -> &[GoldilocksField] { + &self.0[..] + } + #[inline] + pub fn as_slice_mut(&mut self) -> &mut [GoldilocksField] { + &mut self.0[..] + } + #[inline] + pub fn square(&self) -> Avx2GoldilocksField { + Self::new(unsafe { square(self.get()) }) + } + + #[inline] + fn interleave(&self, other: Self, block_len: usize) -> (Self, Self) { + let (v0, v1) = (self.get(), other.get()); + let (res0, res1) = match block_len { + 1 => unsafe { interleave1(v0, v1) }, + 2 => unsafe { interleave2(v0, v1) }, + 4 => (v0, v1), + _ => panic!("unsupported block_len"), + }; + (Self::new(res0), Self::new(res1)) + } +} + +impl Add for Avx2GoldilocksField { + type Output = Self; + #[inline] + fn add(self, rhs: Self) -> Self { + Self::new(unsafe { add(self.get(), rhs.get()) }) + } +} +impl Add for Avx2GoldilocksField { + type Output = Self; + #[inline] + fn add(self, rhs: GoldilocksField) -> Self { + self + Self::from(rhs) + } +} +impl Add for GoldilocksField { + type Output = Avx2GoldilocksField; + #[inline] + fn add(self, rhs: Self::Output) -> Self::Output { + Self::Output::from(self) + rhs + } +} +impl AddAssign for Avx2GoldilocksField { + #[inline] + fn add_assign(&mut self, rhs: Self) { + *self = *self + rhs; + } +} +impl AddAssign for Avx2GoldilocksField { + #[inline] + fn add_assign(&mut self, rhs: GoldilocksField) { + *self = *self + rhs; + } +} + +impl Debug for Avx2GoldilocksField { + #[inline] + fn fmt(&self, f: &mut Formatter<'_>) -> fmt::Result { + write!(f, "({:?})", self.get()) + } +} + +impl Default for Avx2GoldilocksField { + #[inline] + fn default() -> Self { + Self([GoldilocksField::from(0); 4]) + } +} + +impl Div for Avx2GoldilocksField { + type Output = Self; + #[inline] + fn div(self, rhs: GoldilocksField) -> Self { + let rhs_value = Fr::from_repr(rhs).unwrap(); + let rhs_inverse = rhs_value.inverse().unwrap().into_repr(); + self * rhs_inverse + } +} +impl DivAssign for Avx2GoldilocksField { + #[inline] + fn div_assign(&mut self, rhs: GoldilocksField) { + let rhs_value = Fr::from_repr(rhs).unwrap(); + let rhs_inverse = rhs_value.inverse().unwrap().into_repr(); + *self *= rhs_inverse; + } +} + +impl From for Avx2GoldilocksField { + fn from(x: GoldilocksField) -> Self { + Self([x; 4]) + } +} + +impl Mul for Avx2GoldilocksField { + type Output = Self; + #[inline] + fn mul(self, rhs: Self) -> Self { + Self::new(unsafe { mul(self.get(), rhs.get()) }) + } +} +impl Mul for Avx2GoldilocksField { + type Output = Self; + #[inline] + fn mul(self, rhs: GoldilocksField) -> Self { + self * Self::from(rhs) + } +} +impl Mul for GoldilocksField { + type Output = Avx2GoldilocksField; + #[inline] + fn mul(self, rhs: Avx2GoldilocksField) -> Self::Output { + Self::Output::from(self) * rhs + } +} +impl MulAssign for Avx2GoldilocksField { + #[inline] + fn mul_assign(&mut self, rhs: Self) { + *self = *self * 
rhs; + } +} +impl MulAssign for Avx2GoldilocksField { + #[inline] + fn mul_assign(&mut self, rhs: GoldilocksField) { + *self = *self * rhs; + } +} + +impl Neg for Avx2GoldilocksField { + type Output = Self; + #[inline] + fn neg(self) -> Self { + Self::new(unsafe { neg(self.get()) }) + } +} +// +// impl Product for Avx2GoldilocksField { +// #[inline] +// fn product>(iter: I) -> Self { +// iter.reduce(|x, y| x * y).unwrap_or(Self::ONES) +// } +// } + +impl Sub for Avx2GoldilocksField { + type Output = Self; + #[inline] + fn sub(self, rhs: Self) -> Self { + Self::new(unsafe { sub(self.get(), rhs.get()) }) + } +} +impl Sub for Avx2GoldilocksField { + type Output = Self; + #[inline] + fn sub(self, rhs: GoldilocksField) -> Self { + self - Self::from(rhs) + } +} +impl Sub for GoldilocksField { + type Output = Avx2GoldilocksField; + #[inline] + fn sub(self, rhs: Avx2GoldilocksField) -> Self::Output { + Self::Output::from(self) - rhs + } +} +impl SubAssign for Avx2GoldilocksField { + #[inline] + fn sub_assign(&mut self, rhs: Self) { + *self = *self - rhs; + } +} +impl SubAssign for Avx2GoldilocksField { + #[inline] + fn sub_assign(&mut self, rhs: GoldilocksField) { + *self = *self - rhs; + } +} + +// impl Sum for Avx2GoldilocksField { +// #[inline] +// fn sum>(iter: I) -> Self { +// iter.reduce(|x, y| x + y).unwrap_or(Self::ZEROS) +// } +// } + +// Resources: +// 1. Intel Intrinsics Guide for explanation of each intrinsic: +// https://software.intel.com/sites/landingpage/IntrinsicsGuide/ +// 2. uops.info lists micro-ops for each instruction: https://uops.info/table.html +// 3. Intel optimization manual for introduction to x86 vector extensions and best practices: +// https://software.intel.com/content/www/us/en/develop/download/intel-64-and-ia-32-architectures-optimization-reference-manual.html + +// Preliminary knowledge: +// 1. Vector code usually avoids branching. Instead of branches, we can do input selection with +// _mm256_blendv_epi8 or similar instruction. If all we're doing is conditionally zeroing a +// vector element then _mm256_and_si256 or _mm256_andnot_si256 may be used and are cheaper. +// +// 2. AVX does not support addition with carry but 128-bit (2-word) addition can be easily +// emulated. The method recognizes that for a + b overflowed iff (a + b) < a: +// i. res_lo = a_lo + b_lo +// ii. carry_mask = res_lo < a_lo +// iii. res_hi = a_hi + b_hi - carry_mask +// Notice that carry_mask is subtracted, not added. This is because AVX comparison instructions +// return -1 (all bits 1) for true and 0 for false. +// +// 3. AVX does not have unsigned 64-bit comparisons. Those can be emulated with signed comparisons +// by recognizing that a __m256i { + _mm256_xor_si256(x, SIGN_BIT) +} + +/// Convert to canonical representation. +/// The argument is assumed to be shifted by 1 << 63 (i.e. x_s = x + 1<<63, where x is the field +/// value). The returned value is similarly shifted by 1 << 63 (i.e. we return y_s = y + (1<<63), +/// where 0 <= y < FIELD_ORDER). +#[inline] +unsafe fn canonicalize_s(x_s: __m256i) -> __m256i { + // If x >= FIELD_ORDER then corresponding mask bits are all 0; otherwise all 1. + let mask = _mm256_cmpgt_epi64(SHIFTED_FIELD_ORDER, x_s); + // wrapback_amt is -FIELD_ORDER if mask is 0; otherwise 0. + let wrapback_amt = _mm256_andnot_si256(mask, EPSILON); + _mm256_add_epi64(x_s, wrapback_amt) +} + +/// Addition u64 + u64 -> u64. Assumes that x + y < 2^64 + FIELD_ORDER. The second argument is +/// pre-shifted by 1 << 63. The result is similarly shifted. 
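+///
+/// Per lane, the operation is equivalent to the following scalar sketch
+/// (illustrative only; the vector code below detects overflow with a signed
+/// comparison on the shifted values instead of `overflowing_add`):
+///
+/// ```ignore
+/// fn add_no_double_overflow_64_64s_s_scalar(x: u64, y_s: u64) -> u64 {
+///     let (res_wrapped_s, overflowed) = x.overflowing_add(y_s);
+///     // Overflow means the true sum lost 2^64; adding back
+///     // EPSILON = 2^32 - 1 == -FIELD_ORDER (mod 2^64) re-wraps it.
+///     res_wrapped_s.wrapping_add(if overflowed { 0xffff_ffff } else { 0 })
+/// }
+/// ```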
+#[inline] +unsafe fn add_no_double_overflow_64_64s_s(x: __m256i, y_s: __m256i) -> __m256i { + let res_wrapped_s = _mm256_add_epi64(x, y_s); + let mask = _mm256_cmpgt_epi64(y_s, res_wrapped_s); // -1 if overflowed else 0. + let wrapback_amt = _mm256_srli_epi64::<32>(mask); // -FIELD_ORDER if overflowed else 0. + let res_s = _mm256_add_epi64(res_wrapped_s, wrapback_amt); + res_s +} + +#[inline] +unsafe fn add(x: __m256i, y: __m256i) -> __m256i { + let y_s = shift(y); + let res_s = add_no_double_overflow_64_64s_s(x, canonicalize_s(y_s)); + shift(res_s) +} + +#[inline] +unsafe fn sub(x: __m256i, y: __m256i) -> __m256i { + let mut y_s = shift(y); + y_s = canonicalize_s(y_s); + let x_s = shift(x); + let mask = _mm256_cmpgt_epi64(y_s, x_s); // -1 if sub will underflow (y > x) else 0. + let wrapback_amt = _mm256_srli_epi64::<32>(mask); // -FIELD_ORDER if underflow else 0. + let res_wrapped = _mm256_sub_epi64(x_s, y_s); + let res = _mm256_sub_epi64(res_wrapped, wrapback_amt); + res +} + +#[inline] +unsafe fn neg(y: __m256i) -> __m256i { + let y_s = shift(y); + _mm256_sub_epi64(SHIFTED_FIELD_ORDER, canonicalize_s(y_s)) +} + +/// Full 64-bit by 64-bit multiplication. This emulated multiplication is 1.33x slower than the +/// scalar instruction, but may be worth it if we want our data to live in vector registers. +#[inline] +unsafe fn mul64_64(x: __m256i, y: __m256i) -> (__m256i, __m256i) { + // We want to move the high 32 bits to the low position. The multiplication instruction ignores + // the high 32 bits, so it's ok to just duplicate it into the low position. This duplication can + // be done on port 5; bitshifts run on ports 0 and 1, competing with multiplication. + // This instruction is only provided for 32-bit floats, not integers. Idk why Intel makes the + // distinction; the casts are free and it guarantees that the exact bit pattern is preserved. + // Using a swizzle instruction of the wrong domain (float vs int) does not increase latency + // since Haswell. + let x_hi = _mm256_castps_si256(_mm256_movehdup_ps(_mm256_castsi256_ps(x))); + let y_hi = _mm256_castps_si256(_mm256_movehdup_ps(_mm256_castsi256_ps(y))); + + // All four pairwise multiplications + let mul_ll = _mm256_mul_epu32(x, y); + let mul_lh = _mm256_mul_epu32(x, y_hi); + let mul_hl = _mm256_mul_epu32(x_hi, y); + let mul_hh = _mm256_mul_epu32(x_hi, y_hi); + + // Bignum addition + // Extract high 32 bits of mul_ll and add to mul_hl. This cannot overflow. + let mul_ll_hi = _mm256_srli_epi64::<32>(mul_ll); + let t0 = _mm256_add_epi64(mul_hl, mul_ll_hi); + // Extract low 32 bits of t0 and add to mul_lh. Again, this cannot overflow. + // Also, extract high 32 bits of t0 and add to mul_hh. + let t0_lo = _mm256_and_si256(t0, EPSILON); + let t0_hi = _mm256_srli_epi64::<32>(t0); + let t1 = _mm256_add_epi64(mul_lh, t0_lo); + let t2 = _mm256_add_epi64(mul_hh, t0_hi); + // Lastly, extract the high 32 bits of t1 and add to t2. + let t1_hi = _mm256_srli_epi64::<32>(t1); + let res_hi = _mm256_add_epi64(t2, t1_hi); + + // Form res_lo by combining the low half of mul_ll with the low half of t1 (shifted into high + // position). + let t1_lo = _mm256_castps_si256(_mm256_moveldup_ps(_mm256_castsi256_ps(t1))); + let res_lo = _mm256_blend_epi32::<0xaa>(mul_ll, t1_lo); + + (res_hi, res_lo) +} + +/// Full 64-bit squaring. This routine is 1.2x faster than the scalar instruction. +#[inline] +unsafe fn square64(x: __m256i) -> (__m256i, __m256i) { + // Get high 32 bits of x. See comment in mul64_64_s. 
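+    // Writing x = x_hi * 2^32 + x_lo, we have
+    //     x^2 = mul_hh * 2^64 + 2 * mul_lh * 2^32 + mul_ll
+    //         = mul_hh * 2^64 + mul_lh * 2^33 + mul_ll,
+    // which is why the cross term below is shifted by 33 bits rather than 32.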
+ let x_hi = _mm256_castps_si256(_mm256_movehdup_ps(_mm256_castsi256_ps(x))); + + // All pairwise multiplications. + let mul_ll = _mm256_mul_epu32(x, x); + let mul_lh = _mm256_mul_epu32(x, x_hi); + let mul_hh = _mm256_mul_epu32(x_hi, x_hi); + + // Bignum addition, but mul_lh is shifted by 33 bits (not 32). + let mul_ll_hi = _mm256_srli_epi64::<33>(mul_ll); + let t0 = _mm256_add_epi64(mul_lh, mul_ll_hi); + let t0_hi = _mm256_srli_epi64::<31>(t0); + let res_hi = _mm256_add_epi64(mul_hh, t0_hi); + + // Form low result by adding the mul_ll and the low 31 bits of mul_lh (shifted to the high + // position). + let mul_lh_lo = _mm256_slli_epi64::<33>(mul_lh); + let res_lo = _mm256_add_epi64(mul_ll, mul_lh_lo); + + (res_hi, res_lo) +} + +/// Goldilocks addition of a "small" number. `x_s` is pre-shifted by 2**63. `y` is assumed to be <= +/// `0xffffffff00000000`. The result is shifted by 2**63. +#[inline] +unsafe fn add_small_64s_64_s(x_s: __m256i, y: __m256i) -> __m256i { + let res_wrapped_s = _mm256_add_epi64(x_s, y); + // 32-bit compare is faster than 64-bit. It's safe as long as x > res_wrapped iff x >> 32 > + // res_wrapped >> 32. The case of x >> 32 > res_wrapped >> 32 is trivial and so is <. The case + // where x >> 32 = res_wrapped >> 32 remains. If x >> 32 = res_wrapped >> 32, then y >> 32 = + // 0xffffffff and the addition of the low 32 bits generated a carry. This can never occur if y + // <= 0xffffffff00000000: if y >> 32 = 0xffffffff, then no carry can occur. + let mask = _mm256_cmpgt_epi32(x_s, res_wrapped_s); // -1 if overflowed else 0. + // The mask contains 0xffffffff in the high 32 bits if wraparound occured and 0 otherwise. + let wrapback_amt = _mm256_srli_epi64::<32>(mask); // -FIELD_ORDER if overflowed else 0. + let res_s = _mm256_add_epi64(res_wrapped_s, wrapback_amt); + res_s +} + +/// Goldilocks subtraction of a "small" number. `x_s` is pre-shifted by 2**63. `y` is assumed to be +/// <= `0xffffffff00000000`. The result is shifted by 2**63. +#[inline] +unsafe fn sub_small_64s_64_s(x_s: __m256i, y: __m256i) -> __m256i { + let res_wrapped_s = _mm256_sub_epi64(x_s, y); + // 32-bit compare is faster than 64-bit. It's safe as long as res_wrapped > x iff res_wrapped >> + // 32 > x >> 32. The case of res_wrapped >> 32 > x >> 32 is trivial and so is <. The case where + // res_wrapped >> 32 = x >> 32 remains. If res_wrapped >> 32 = x >> 32, then y >> 32 = + // 0xffffffff and the subtraction of the low 32 bits generated a borrow. This can never occur if + // y <= 0xffffffff00000000: if y >> 32 = 0xffffffff, then no borrow can occur. + let mask = _mm256_cmpgt_epi32(res_wrapped_s, x_s); // -1 if underflowed else 0. + // The mask contains 0xffffffff in the high 32 bits if wraparound occured and 0 otherwise. + let wrapback_amt = _mm256_srli_epi64::<32>(mask); // -FIELD_ORDER if underflowed else 0. + let res_s = _mm256_sub_epi64(res_wrapped_s, wrapback_amt); + res_s +} + +#[inline] +unsafe fn reduce128(x: (__m256i, __m256i)) -> __m256i { + let (hi0, lo0) = x; + let lo0_s = shift(lo0); + let hi_hi0 = _mm256_srli_epi64::<32>(hi0); + let lo1_s = sub_small_64s_64_s(lo0_s, hi_hi0); + let t1 = _mm256_mul_epu32(hi0, EPSILON); + let lo2_s = add_small_64s_64_s(lo1_s, t1); + let lo2 = shift(lo2_s); + lo2 +} + +/// Multiply two integers modulo FIELD_ORDER. +#[inline] +unsafe fn mul(x: __m256i, y: __m256i) -> __m256i { + reduce128(mul64_64(x, y)) +} + +// /// Square an integer modulo FIELD_ORDER. 
+#[inline] +unsafe fn square(x: __m256i) -> __m256i { + reduce128(square64(x)) +} + +#[inline] +unsafe fn interleave1(x: __m256i, y: __m256i) -> (__m256i, __m256i) { + let a = _mm256_unpacklo_epi64(x, y); + let b = _mm256_unpackhi_epi64(x, y); + (a, b) +} + +#[inline] +unsafe fn interleave2(x: __m256i, y: __m256i) -> (__m256i, __m256i) { + let y_lo = _mm256_castsi256_si128(y); // This has 0 cost. + + // 1 places y_lo in the high half of x; 0 would place it in the lower half. + let a = _mm256_inserti128_si256::<1>(x, y_lo); + // NB: _mm256_permute2x128_si256 could be used here as well but _mm256_inserti128_si256 has + // lower latency on Zen 3 processors. + + // Each nibble of the constant has the following semantics: + // 0 => src1[low 128 bits] + // 1 => src1[high 128 bits] + // 2 => src2[low 128 bits] + // 3 => src2[high 128 bits] + // The low (resp. high) nibble chooses the low (resp. high) 128 bits of the result. + let b = _mm256_permute2x128_si256::<0x31>(x, y); + + (a, b) +} + +#[cfg(test)] +mod tests { + use super::Avx2GoldilocksField; + use crate::ff::*; + use crate::field_gl::{Fr, FrRepr as GoldilocksField}; + use std::time::Instant; + // use crate::packed::PackedField; + + fn test_vals_a() -> [GoldilocksField; 4] { + [ + GoldilocksField([14479013849828404771u64]), + GoldilocksField([9087029921428221768u64]), + GoldilocksField([2441288194761790662u64]), + GoldilocksField([5646033492608483824u64]), + ] + } + fn test_vals_b() -> [GoldilocksField; 4] { + [ + GoldilocksField([17891926589593242302u64]), + GoldilocksField([11009798273260028228u64]), + GoldilocksField([2028722748960791447u64]), + GoldilocksField([7929433601095175579u64]), + ] + } + + #[test] + fn test_add() { + let a_arr = test_vals_a(); + let b_arr = test_vals_b(); + let start = Instant::now(); + let packed_a = Avx2GoldilocksField::from_slice(&a_arr); + let packed_b = Avx2GoldilocksField::from_slice(&b_arr); + let packed_res = *packed_a + *packed_b; + let arr_res = packed_res.as_slice(); + let avx2_duration = start.elapsed(); + // println!("arr_res: {:?}", arr_res); + + let start = Instant::now(); + let expected = a_arr + .iter() + .zip(b_arr) + .map(|(&a, b)| Fr::from_repr(a).unwrap() + Fr::from_repr(b).unwrap()); + let expected_values: Vec = expected.collect(); + // println!("expected values: {:?}", expected_values); + let non_accelerated_duration = start.elapsed(); + for (exp, &res) in expected_values.iter().zip(arr_res) { + assert_eq!(res, exp.into_repr()); + } + + println!("test_add_AVX2_accelerated time: {:?}", avx2_duration); + println!( + "test_add_Non_accelerated time: {:?}", + non_accelerated_duration + ); + } + + #[test] + fn test_mul() { + let a_arr = test_vals_a(); + let b_arr = test_vals_b(); + let start = Instant::now(); + let packed_a = *Avx2GoldilocksField::from_slice(&a_arr); + let packed_b = *Avx2GoldilocksField::from_slice(&b_arr); + let packed_res = packed_a * packed_b; + let arr_res = packed_res.as_slice(); + let avx2_duration = start.elapsed(); + // println!("arr_res: {:?}", arr_res); + + let start = Instant::now(); + let expected = a_arr + .iter() + .zip(b_arr) + .map(|(&a, b)| Fr::from_repr(a).unwrap() * Fr::from_repr(b).unwrap()); + let expected_values: Vec = expected.collect(); + let non_accelerated_duration = start.elapsed(); + // println!("expected values: {:?}", expected_values); + + for (exp, &res) in expected_values.iter().zip(arr_res) { + assert_eq!(res, exp.into_repr()); + } + + println!("test_mul_AVX2_accelerated time: {:?}", avx2_duration); + println!( + "test_mul_Non_accelerated 
time: {:?}", + non_accelerated_duration + ); + } + + #[test] + fn test_div() { + let a_arr = test_vals_a(); + let start = Instant::now(); + let packed_a = *Avx2GoldilocksField::from_slice(&a_arr); + let packed_res = packed_a / GoldilocksField([7929433601095175579u64]); + let arr_res = packed_res.as_slice(); + let avx2_duration = start.elapsed(); + // println!("arr_res: {:?}", arr_res); + + let start = Instant::now(); + let expected = a_arr.iter().map(|&a| { + Fr::from_repr(a).unwrap() + / Fr::from_repr(GoldilocksField([7929433601095175579u64])).unwrap() + }); + let expected_values: Vec = expected.collect(); + let non_accelerated_duration = start.elapsed(); + // println!("expected values: {:?}", expected_values); + + for (exp, &res) in expected_values.iter().zip(arr_res) { + assert_eq!(res, exp.into_repr()); + } + + println!("test_div_AVX2_accelerated time: {:?}", avx2_duration); + println!( + "test_div_Non_accelerated time: {:?}", + non_accelerated_duration + ); + } + + #[test] + fn test_square() { + let a_arr = test_vals_a(); + let start = Instant::now(); + let packed_a = *Avx2GoldilocksField::from_slice(&a_arr); + let packed_res = packed_a.square(); + let arr_res = packed_res.as_slice(); + let avx2_duration = start.elapsed(); + // println!("arr_res: {:?}", arr_res); + + let start = Instant::now(); + let mut expected_values = Vec::new(); + for &a in &a_arr { + match Fr::from_repr(a) { + Ok(mut fr) => { + fr.square(); + expected_values.push(fr); + } + Err(_) => { + continue; + } + } + } + let non_accelerated_duration = start.elapsed(); + // println!("expected values: {:?}", expected_values); + for (exp, &res) in expected_values.iter().zip(arr_res) { + assert_eq!(res, exp.into_repr()); + } + println!("test_square_AVX2_accelerated time: {:?}", avx2_duration); + println!( + "test_square_Non_accelerated time: {:?}", + non_accelerated_duration + ); + } + + #[test] + fn test_neg() { + let a_arr = test_vals_a(); + let start = Instant::now(); + let packed_a = *Avx2GoldilocksField::from_slice(&a_arr); + let packed_res = -packed_a; + let arr_res = packed_res.as_slice(); + let avx2_duration = start.elapsed(); + // println!("arr_res: {:?}", arr_res); + + let start = Instant::now(); + let expected = a_arr.iter().map(|&a| -Fr::from_repr(a).unwrap()); + let expected_values: Vec = expected.collect(); + let non_accelerated_duration = start.elapsed(); + // println!("expected values: {:?}", expected_values); + + for (exp, &res) in expected_values.iter().zip(arr_res) { + assert_eq!(res, exp.into_repr()); + } + + println!("test_neg_AVX2_accelerated time: {:?}", avx2_duration); + println!( + "test_neg_Non_accelerated time: {:?}", + non_accelerated_duration + ); + } + + #[test] + fn test_sub() { + let a_arr = test_vals_a(); + let b_arr = test_vals_b(); + let start = Instant::now(); + let packed_a = *Avx2GoldilocksField::from_slice(&a_arr); + let packed_b = *Avx2GoldilocksField::from_slice(&b_arr); + let packed_res = packed_a - packed_b; + let arr_res = packed_res.as_slice(); + let avx2_duration = start.elapsed(); + // println!("arr_res: {:?}", arr_res); + + let start = Instant::now(); + let expected = a_arr + .iter() + .zip(b_arr) + .map(|(&a, b)| Fr::from_repr(a).unwrap() - Fr::from_repr(b).unwrap()); + let expected_values: Vec = expected.collect(); + let non_accelerated_duration = start.elapsed(); + // println!("expected values: {:?}", expected_values); + + for (exp, &res) in expected_values.iter().zip(arr_res) { + assert_eq!(res, exp.into_repr()); + } + + println!("test_sub_AVX2_accelerated time: {:?}", 
avx2_duration); + println!( + "test_sub_Non_accelerated time: {:?}", + non_accelerated_duration + ); + } + + #[test] + fn test_interleave_is_involution() { + let a_arr = test_vals_a(); + let b_arr = test_vals_b(); + + let packed_a = *Avx2GoldilocksField::from_slice(&a_arr); + let packed_b = *Avx2GoldilocksField::from_slice(&b_arr); + { + // Interleave, then deinterleave. + let (x, y) = packed_a.interleave(packed_b, 1); + let (res_a, res_b) = x.interleave(y, 1); + assert_eq!(res_a.as_slice(), a_arr); + assert_eq!(res_b.as_slice(), b_arr); + } + { + let (x, y) = packed_a.interleave(packed_b, 2); + let (res_a, res_b) = x.interleave(y, 2); + assert_eq!(res_a.as_slice(), a_arr); + assert_eq!(res_b.as_slice(), b_arr); + } + { + let (x, y) = packed_a.interleave(packed_b, 4); + let (res_a, res_b) = x.interleave(y, 4); + assert_eq!(res_a.as_slice(), a_arr); + assert_eq!(res_b.as_slice(), b_arr); + } + } + + #[test] + fn test_interleave() { + let in_a: [GoldilocksField; 4] = [ + GoldilocksField([0]), + GoldilocksField([1]), + GoldilocksField([2]), + GoldilocksField([3]), + ]; + let in_b: [GoldilocksField; 4] = [ + GoldilocksField([10]), + GoldilocksField([11]), + GoldilocksField([12]), + GoldilocksField([13]), + ]; + let int1_a: [GoldilocksField; 4] = [ + GoldilocksField([0]), + GoldilocksField([10]), + GoldilocksField([2]), + GoldilocksField([12]), + ]; + let int1_b: [GoldilocksField; 4] = [ + GoldilocksField([1]), + GoldilocksField([11]), + GoldilocksField([3]), + GoldilocksField([13]), + ]; + let int2_a: [GoldilocksField; 4] = [ + GoldilocksField([0]), + GoldilocksField([1]), + GoldilocksField([10]), + GoldilocksField([11]), + ]; + let int2_b: [GoldilocksField; 4] = [ + GoldilocksField([2]), + GoldilocksField([3]), + GoldilocksField([12]), + GoldilocksField([13]), + ]; + + let packed_a = *Avx2GoldilocksField::from_slice(&in_a); + let packed_b = *Avx2GoldilocksField::from_slice(&in_b); + { + let (x1, y1) = packed_a.interleave(packed_b, 1); + assert_eq!(x1.as_slice(), int1_a); + assert_eq!(y1.as_slice(), int1_b); + } + { + let (x2, y2) = packed_a.interleave(packed_b, 2); + assert_eq!(x2.as_slice(), int2_a); + assert_eq!(y2.as_slice(), int2_b); + } + { + let (x4, y4) = packed_a.interleave(packed_b, 4); + assert_eq!(x4.as_slice(), in_a); + assert_eq!(y4.as_slice(), in_b); + } + } +} diff --git a/algebraic/src/arch/x86_64/avx512_field_gl.rs b/algebraic/src/arch/x86_64/avx512_field_gl.rs new file mode 100644 index 00000000..fe9e8942 --- /dev/null +++ b/algebraic/src/arch/x86_64/avx512_field_gl.rs @@ -0,0 +1,751 @@ +//! Porting from plonky2 +//! https://github.com/0xPolygonZero/plonky2/blob/main/field/src/arch/x86_64/avx512_goldilocks_field.rs +//! +//! How to build/run/test: +//! RUSTFLAGS='-C target-feature=+avx512f,+avx512bw,+avx512cd,+avx512dq,+avx512vl' cargo build --release +use crate::ff::*; +use crate::field_gl::{Fr, FrRepr as GoldilocksField}; +use core::arch::x86_64::*; +use core::fmt; +use core::fmt::{Debug, Formatter}; +use core::mem::transmute; +use core::ops::{Add, AddAssign, Div, DivAssign, Mul, MulAssign, Neg, Sub, SubAssign}; + +/// AVX512 Goldilocks Field +/// +/// Ideally `Avx512GoldilocksField` would wrap `__m512i`. Unfortunately, `__m512i` has an alignment +/// of 64B, which would preclude us from casting `[GoldilocksField; 8]` (alignment 8B) to +/// `Avx512GoldilocksField`. We need to ensure that `Avx512GoldilocksField` has the same alignment as +/// `GoldilocksField`. Thus we wrap `[GoldilocksField; 8]` and use the `new` and `get` methods to +/// convert to and from `__m512i`. 
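+///
+/// Illustrative usage (a minimal sketch; it assumes the crate is importable as
+/// `algebraic` and an AVX-512 build with the target features listed in the
+/// module docs above):
+///
+/// ```ignore
+/// use algebraic::arch::x86_64::avx512_field_gl::Avx512GoldilocksField;
+/// use algebraic::field_gl::FrRepr as GoldilocksField;
+///
+/// // Eight Goldilocks lanes per vector instead of the four in the AVX2 backend.
+/// let a = [GoldilocksField([3u64]); 8];
+/// let b = [GoldilocksField([5u64]); 8];
+/// let prod = *Avx512GoldilocksField::from_slice(&a) * *Avx512GoldilocksField::from_slice(&b);
+/// assert_eq!(prod.as_slice(), [GoldilocksField([15u64]); 8]);
+/// ```
+///
+/// `from_slice` asserts that the slice has exactly `WIDTH` (= 8) elements.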
+#[derive(Copy, Clone)] +#[repr(transparent)] +pub struct Avx512GoldilocksField(pub [GoldilocksField; 8]); + +const WIDTH: usize = 8; + +impl Avx512GoldilocksField { + #[inline] + fn new(x: __m512i) -> Self { + unsafe { transmute(x) } + } + #[inline] + fn get(&self) -> __m512i { + unsafe { transmute(*self) } + } + #[inline] + pub fn from_slice(slice: &[GoldilocksField]) -> &Self { + assert_eq!(slice.len(), WIDTH); + unsafe { &*slice.as_ptr().cast() } + } + #[inline] + pub fn from_slice_mut(slice: &mut [GoldilocksField]) -> &mut Self { + assert_eq!(slice.len(), WIDTH); + unsafe { &mut *slice.as_mut_ptr().cast() } + } + #[inline] + pub fn as_slice(&self) -> &[GoldilocksField] { + &self.0[..] + } + #[inline] + pub fn as_slice_mut(&mut self) -> &mut [GoldilocksField] { + &mut self.0[..] + } + #[inline] + pub fn square(&self) -> Avx512GoldilocksField { + Self::new(unsafe { square(self.get()) }) + } + + #[inline] + fn interleave(&self, other: Self, block_len: usize) -> (Self, Self) { + let (v0, v1) = (self.get(), other.get()); + let (res0, res1) = match block_len { + 1 => unsafe { interleave1(v0, v1) }, + 2 => unsafe { interleave2(v0, v1) }, + 4 => unsafe { interleave4(v0, v1) }, + 8 => (v0, v1), + _ => panic!("unsupported block_len"), + }; + (Self::new(res0), Self::new(res1)) + } +} + +impl Add for Avx512GoldilocksField { + type Output = Self; + #[inline] + fn add(self, rhs: Self) -> Self { + Self::new(unsafe { add(self.get(), rhs.get()) }) + } +} +impl Add for Avx512GoldilocksField { + type Output = Self; + #[inline] + fn add(self, rhs: GoldilocksField) -> Self { + self + Self::from(rhs) + } +} +impl Add for GoldilocksField { + type Output = Avx512GoldilocksField; + #[inline] + fn add(self, rhs: Self::Output) -> Self::Output { + Self::Output::from(self) + rhs + } +} +impl AddAssign for Avx512GoldilocksField { + #[inline] + fn add_assign(&mut self, rhs: Self) { + *self = *self + rhs; + } +} +impl AddAssign for Avx512GoldilocksField { + #[inline] + fn add_assign(&mut self, rhs: GoldilocksField) { + *self = *self + rhs; + } +} + +impl Debug for Avx512GoldilocksField { + #[inline] + fn fmt(&self, f: &mut Formatter<'_>) -> fmt::Result { + write!(f, "({:?})", self.get()) + } +} + +impl Default for Avx512GoldilocksField { + #[inline] + fn default() -> Self { + Self([GoldilocksField::from(0); 8]) + } +} + +impl Div for Avx512GoldilocksField { + type Output = Self; + #[inline] + fn div(self, rhs: GoldilocksField) -> Self { + let rhs_value = Fr::from_repr(rhs).unwrap(); + let rhs_inverse = rhs_value.inverse().unwrap().into_repr(); + self * rhs_inverse + } +} +impl DivAssign for Avx512GoldilocksField { + #[inline] + fn div_assign(&mut self, rhs: GoldilocksField) { + let rhs_value = Fr::from_repr(rhs).unwrap(); + let rhs_inverse = rhs_value.inverse().unwrap().into_repr(); + *self *= rhs_inverse; + } +} + +impl From for Avx512GoldilocksField { + fn from(x: GoldilocksField) -> Self { + Self([x; 8]) + } +} + +impl Mul for Avx512GoldilocksField { + type Output = Self; + #[inline] + fn mul(self, rhs: Self) -> Self { + Self::new(unsafe { mul(self.get(), rhs.get()) }) + } +} +impl Mul for Avx512GoldilocksField { + type Output = Self; + #[inline] + fn mul(self, rhs: GoldilocksField) -> Self { + self * Self::from(rhs) + } +} +impl Mul for GoldilocksField { + type Output = Avx512GoldilocksField; + #[inline] + fn mul(self, rhs: Avx512GoldilocksField) -> Self::Output { + Self::Output::from(self) * rhs + } +} +impl MulAssign for Avx512GoldilocksField { + #[inline] + fn mul_assign(&mut self, rhs: Self) { + *self = 
*self * rhs; + } +} +impl MulAssign for Avx512GoldilocksField { + #[inline] + fn mul_assign(&mut self, rhs: GoldilocksField) { + *self = *self * rhs; + } +} + +impl Neg for Avx512GoldilocksField { + type Output = Self; + #[inline] + fn neg(self) -> Self { + Self::new(unsafe { neg(self.get()) }) + } +} + +// impl Product for Avx512GoldilocksField { +// #[inline] +// fn product>(iter: I) -> Self { +// iter.reduce(|x, y| x * y).unwrap_or(Self::ONES) +// } +// } + +impl Sub for Avx512GoldilocksField { + type Output = Self; + #[inline] + fn sub(self, rhs: Self) -> Self { + Self::new(unsafe { sub(self.get(), rhs.get()) }) + } +} +impl Sub for Avx512GoldilocksField { + type Output = Self; + #[inline] + fn sub(self, rhs: GoldilocksField) -> Self { + self - Self::from(rhs) + } +} +impl Sub for GoldilocksField { + type Output = Avx512GoldilocksField; + #[inline] + fn sub(self, rhs: Avx512GoldilocksField) -> Self::Output { + Self::Output::from(self) - rhs + } +} +impl SubAssign for Avx512GoldilocksField { + #[inline] + fn sub_assign(&mut self, rhs: Self) { + *self = *self - rhs; + } +} +impl SubAssign for Avx512GoldilocksField { + #[inline] + fn sub_assign(&mut self, rhs: GoldilocksField) { + *self = *self - rhs; + } +} + +// impl Sum for Avx512GoldilocksField { +// #[inline] +// fn sum>(iter: I) -> Self { +// iter.reduce(|x, y| x + y).unwrap_or(Self::ZEROS) +// } +// } + +const FIELD_ORDER: __m512i = unsafe { transmute([GOLDILOCKS_FIELD_ORDER; 8]) }; +const EPSILON: __m512i = unsafe { transmute([GoldilocksField([4294967295u64]); 8]) }; +// Goldilocks Order +const GOLDILOCKS_FIELD_ORDER: u64 = 0xFFFFFFFF00000001; + +#[inline] +unsafe fn canonicalize(x: __m512i) -> __m512i { + let mask = _mm512_cmpge_epu64_mask(x, FIELD_ORDER); + _mm512_mask_sub_epi64(x, mask, x, FIELD_ORDER) +} + +#[inline] +unsafe fn add_no_double_overflow_64_64(x: __m512i, y: __m512i) -> __m512i { + let res_wrapped = _mm512_add_epi64(x, y); + let mask = _mm512_cmplt_epu64_mask(res_wrapped, y); // mask set if add overflowed + let res = _mm512_mask_sub_epi64(res_wrapped, mask, res_wrapped, FIELD_ORDER); + res +} + +#[inline] +unsafe fn sub_no_double_overflow_64_64(x: __m512i, y: __m512i) -> __m512i { + let mask = _mm512_cmplt_epu64_mask(x, y); // mask set if sub will underflow (x < y) + let res_wrapped = _mm512_sub_epi64(x, y); + let res = _mm512_mask_add_epi64(res_wrapped, mask, res_wrapped, FIELD_ORDER); + res +} + +#[inline] +unsafe fn add(x: __m512i, y: __m512i) -> __m512i { + add_no_double_overflow_64_64(x, canonicalize(y)) +} + +#[inline] +unsafe fn sub(x: __m512i, y: __m512i) -> __m512i { + sub_no_double_overflow_64_64(x, canonicalize(y)) +} + +#[inline] +unsafe fn neg(y: __m512i) -> __m512i { + _mm512_sub_epi64(FIELD_ORDER, canonicalize(y)) +} + +const LO_32_BITS_MASK: __mmask16 = unsafe { transmute(0b0101010101010101u16) }; + +#[inline] +unsafe fn mul64_64(x: __m512i, y: __m512i) -> (__m512i, __m512i) { + // We want to move the high 32 bits to the low position. The multiplication instruction ignores + // the high 32 bits, so it's ok to just duplicate it into the low position. This duplication can + // be done on port 5; bitshifts run on port 0, competing with multiplication. + // This instruction is only provided for 32-bit floats, not integers. Idk why Intel makes the + // distinction; the casts are free and it guarantees that the exact bit pattern is preserved. + // Using a swizzle instruction of the wrong domain (float vs int) does not increase latency + // since Haswell. 
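+    // With x = x_hi * 2^32 + x_lo and y = y_hi * 2^32 + y_lo, the full product is
+    //     x * y = mul_hh * 2^64 + (mul_hl + mul_lh) * 2^32 + mul_ll,
+    // and the additions below accumulate these partial products with 32-bit carries.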
+ let x_hi = _mm512_castps_si512(_mm512_movehdup_ps(_mm512_castsi512_ps(x))); + let y_hi = _mm512_castps_si512(_mm512_movehdup_ps(_mm512_castsi512_ps(y))); + + // All four pairwise multiplications + let mul_ll = _mm512_mul_epu32(x, y); + let mul_lh = _mm512_mul_epu32(x, y_hi); + let mul_hl = _mm512_mul_epu32(x_hi, y); + let mul_hh = _mm512_mul_epu32(x_hi, y_hi); + + // Bignum addition + // Extract high 32 bits of mul_ll and add to mul_hl. This cannot overflow. + let mul_ll_hi = _mm512_srli_epi64::<32>(mul_ll); + let t0 = _mm512_add_epi64(mul_hl, mul_ll_hi); + // Extract low 32 bits of t0 and add to mul_lh. Again, this cannot overflow. + // Also, extract high 32 bits of t0 and add to mul_hh. + let t0_lo = _mm512_and_si512(t0, EPSILON); + let t0_hi = _mm512_srli_epi64::<32>(t0); + let t1 = _mm512_add_epi64(mul_lh, t0_lo); + let t2 = _mm512_add_epi64(mul_hh, t0_hi); + // Lastly, extract the high 32 bits of t1 and add to t2. + let t1_hi = _mm512_srli_epi64::<32>(t1); + let res_hi = _mm512_add_epi64(t2, t1_hi); + + // Form res_lo by combining the low half of mul_ll with the low half of t1 (shifted into high + // position). + let t1_lo = _mm512_castps_si512(_mm512_moveldup_ps(_mm512_castsi512_ps(t1))); + let res_lo = _mm512_mask_blend_epi32(LO_32_BITS_MASK, t1_lo, mul_ll); + + (res_hi, res_lo) +} + +#[inline] +unsafe fn square64(x: __m512i) -> (__m512i, __m512i) { + // Get high 32 bits of x. See comment in mul64_64_s. + let x_hi = _mm512_castps_si512(_mm512_movehdup_ps(_mm512_castsi512_ps(x))); + + // All pairwise multiplications. + let mul_ll = _mm512_mul_epu32(x, x); + let mul_lh = _mm512_mul_epu32(x, x_hi); + let mul_hh = _mm512_mul_epu32(x_hi, x_hi); + + // Bignum addition, but mul_lh is shifted by 33 bits (not 32). + let mul_ll_hi = _mm512_srli_epi64::<33>(mul_ll); + let t0 = _mm512_add_epi64(mul_lh, mul_ll_hi); + let t0_hi = _mm512_srli_epi64::<31>(t0); + let res_hi = _mm512_add_epi64(mul_hh, t0_hi); + + // Form low result by adding the mul_ll and the low 31 bits of mul_lh (shifted to the high + // position). 
+ let mul_lh_lo = _mm512_slli_epi64::<33>(mul_lh); + let res_lo = _mm512_add_epi64(mul_ll, mul_lh_lo); + + (res_hi, res_lo) +} + +#[inline] +unsafe fn reduce128(x: (__m512i, __m512i)) -> __m512i { + let (hi0, lo0) = x; + let hi_hi0 = _mm512_srli_epi64::<32>(hi0); + let lo1 = sub_no_double_overflow_64_64(lo0, hi_hi0); + let t1 = _mm512_mul_epu32(hi0, EPSILON); + let lo2 = add_no_double_overflow_64_64(lo1, t1); + lo2 +} + +#[inline] +unsafe fn mul(x: __m512i, y: __m512i) -> __m512i { + reduce128(mul64_64(x, y)) +} + +#[inline] +unsafe fn square(x: __m512i) -> __m512i { + reduce128(square64(x)) +} + +#[inline] +unsafe fn interleave1(x: __m512i, y: __m512i) -> (__m512i, __m512i) { + let a = _mm512_unpacklo_epi64(x, y); + let b = _mm512_unpackhi_epi64(x, y); + (a, b) +} + +const INTERLEAVE2_IDX_A: __m512i = unsafe { + transmute([ + 0o00u64, 0o01u64, 0o10u64, 0o11u64, 0o04u64, 0o05u64, 0o14u64, 0o15u64, + ]) +}; +const INTERLEAVE2_IDX_B: __m512i = unsafe { + transmute([ + 0o02u64, 0o03u64, 0o12u64, 0o13u64, 0o06u64, 0o07u64, 0o16u64, 0o17u64, + ]) +}; + +#[inline] +unsafe fn interleave2(x: __m512i, y: __m512i) -> (__m512i, __m512i) { + let a = _mm512_permutex2var_epi64(x, INTERLEAVE2_IDX_A, y); + let b = _mm512_permutex2var_epi64(x, INTERLEAVE2_IDX_B, y); + (a, b) +} + +#[inline] +unsafe fn interleave4(x: __m512i, y: __m512i) -> (__m512i, __m512i) { + let a = _mm512_shuffle_i64x2::<0x44>(x, y); + let b = _mm512_shuffle_i64x2::<0xee>(x, y); + (a, b) +} + +#[cfg(test)] +mod tests { + use super::Avx512GoldilocksField; + use crate::ff::*; + use crate::field_gl::{Fr, FrRepr as GoldilocksField}; + use std::time::Instant; + + fn test_vals_a() -> [GoldilocksField; 8] { + [ + GoldilocksField([14479013849828404771u64]), + GoldilocksField([9087029921428221768u64]), + GoldilocksField([2441288194761790662u64]), + GoldilocksField([5646033492608483824u64]), + GoldilocksField([2779181197214900072u64]), + GoldilocksField([2989742820063487116u64]), + GoldilocksField([727880025589250743u64]), + GoldilocksField([3803926346107752679u64]), + ] + } + fn test_vals_b() -> [GoldilocksField; 8] { + [ + GoldilocksField([17891926589593242302u64]), + GoldilocksField([11009798273260028228u64]), + GoldilocksField([2028722748960791447u64]), + GoldilocksField([7929433601095175579u64]), + GoldilocksField([6632528436085461172u64]), + GoldilocksField([2145438710786785567u64]), + GoldilocksField([11821483668392863016u64]), + GoldilocksField([15638272883309521929u64]), + ] + } + + #[test] + fn test_add1() { + let a_arr = test_vals_a(); + let b_arr = test_vals_b(); + let start = Instant::now(); + let packed_a = *Avx512GoldilocksField::from_slice(&a_arr); + let packed_b = *Avx512GoldilocksField::from_slice(&b_arr); + let packed_res = packed_a + packed_b; + let arr_res = packed_res.as_slice(); + let avx512_duration = start.elapsed(); + let start = Instant::now(); + let expected = a_arr + .iter() + .zip(b_arr) + .map(|(&a, b)| Fr::from_repr(a).unwrap() + Fr::from_repr(b).unwrap()); + let expected_values: Vec = expected.collect(); + // println!("expected values: {:?}", expected_values); + let non_accelerated_duration = start.elapsed(); + for (exp, &res) in expected_values.iter().zip(arr_res) { + assert_eq!(res, exp.into_repr()); + } + + println!("test_add_AVX512_accelerated time: {:?}", avx512_duration); + println!( + "test_add_Non_accelerated time: {:?}", + non_accelerated_duration + ); + } + + #[test] + fn test_mul() { + let a_arr = test_vals_a(); + let b_arr = test_vals_b(); + let start = Instant::now(); + let packed_a = 
*Avx512GoldilocksField::from_slice(&a_arr); + let packed_b = *Avx512GoldilocksField::from_slice(&b_arr); + let packed_res = packed_a * packed_b; + let arr_res = packed_res.as_slice(); + let avx512_duration = start.elapsed(); + + let start = Instant::now(); + let expected = a_arr + .iter() + .zip(b_arr) + .map(|(&a, b)| Fr::from_repr(a).unwrap() * Fr::from_repr(b).unwrap()); + let expected_values: Vec = expected.collect(); + let non_accelerated_duration = start.elapsed(); + // println!("expected values: {:?}", expected_values); + + for (exp, &res) in expected_values.iter().zip(arr_res) { + assert_eq!(res, exp.into_repr()); + } + + println!("test_mul_AVX512_accelerated time: {:?}", avx512_duration); + println!( + "test_mul_Non_accelerated time: {:?}", + non_accelerated_duration + ); + } + + #[test] + fn test_div() { + let a_arr = test_vals_a(); + let start = Instant::now(); + let packed_a = *Avx512GoldilocksField::from_slice(&a_arr); + let packed_res = packed_a / GoldilocksField([7929433601095175579u64]); + let arr_res = packed_res.as_slice(); + let avx512_duration = start.elapsed(); + // println!("arr_res: {:?}", arr_res); + + let start = Instant::now(); + let expected = a_arr.iter().map(|&a| { + Fr::from_repr(a).unwrap() + / Fr::from_repr(GoldilocksField([7929433601095175579u64])).unwrap() + }); + let expected_values: Vec = expected.collect(); + let non_accelerated_duration = start.elapsed(); + // println!("expected values: {:?}", expected_values); + + for (exp, &res) in expected_values.iter().zip(arr_res) { + assert_eq!(res, exp.into_repr()); + } + + println!("test_div_AVX512_accelerated time: {:?}", avx512_duration); + println!( + "test_div_Non_accelerated time: {:?}", + non_accelerated_duration + ); + } + + #[test] + fn test_square() { + let a_arr = test_vals_a(); + let start = Instant::now(); + let packed_a = *Avx512GoldilocksField::from_slice(&a_arr); + let packed_res = packed_a.square(); + let arr_res = packed_res.as_slice(); + let avx512_duration = start.elapsed(); + // println!("arr_res: {:?}", arr_res); + + let start = Instant::now(); + let mut expected_values = Vec::new(); + for &a in &a_arr { + match Fr::from_repr(a) { + Ok(mut fr) => { + fr.square(); + expected_values.push(fr); + } + Err(_) => { + continue; + } + } + } + let non_accelerated_duration = start.elapsed(); + // println!("expected values: {:?}", expected_values); + for (exp, &res) in expected_values.iter().zip(arr_res) { + assert_eq!(res, exp.into_repr()); + } + println!("test_square_AVX512_accelerated time: {:?}", avx512_duration); + println!( + "test_square_Non_accelerated time: {:?}", + non_accelerated_duration + ); + } + + #[test] + fn test_neg() { + let a_arr = test_vals_a(); + let start = Instant::now(); + let packed_a = *Avx512GoldilocksField::from_slice(&a_arr); + let packed_res = -packed_a; + let arr_res = packed_res.as_slice(); + let avx512_duration = start.elapsed(); + // println!("arr_res: {:?}", arr_res); + + let start = Instant::now(); + let expected = a_arr.iter().map(|&a| -Fr::from_repr(a).unwrap()); + let expected_values: Vec = expected.collect(); + let non_accelerated_duration = start.elapsed(); + // println!("expected values: {:?}", expected_values); + + for (exp, &res) in expected_values.iter().zip(arr_res) { + assert_eq!(res, exp.into_repr()); + } + + println!("test_neg_AVX512_accelerated time: {:?}", avx512_duration); + println!( + "test_neg_Non_accelerated time: {:?}", + non_accelerated_duration + ); + } + + #[test] + fn test_sub() { + let a_arr = test_vals_a(); + let b_arr = test_vals_b(); + 
let start = Instant::now(); + let packed_a = *Avx512GoldilocksField::from_slice(&a_arr); + let packed_b = *Avx512GoldilocksField::from_slice(&b_arr); + let packed_res = packed_a - packed_b; + let arr_res = packed_res.as_slice(); + let avx512_duration = start.elapsed(); + // println!("arr_res: {:?}", arr_res); + + let start = Instant::now(); + let expected = a_arr + .iter() + .zip(b_arr) + .map(|(&a, b)| Fr::from_repr(a).unwrap() - Fr::from_repr(b).unwrap()); + let expected_values: Vec = expected.collect(); + let non_accelerated_duration = start.elapsed(); + // println!("expected values: {:?}", expected_values); + + for (exp, &res) in expected_values.iter().zip(arr_res) { + assert_eq!(res, exp.into_repr()); + } + + println!("test_sub_AVX512_accelerated time: {:?}", avx512_duration); + println!( + "test_sub_Non_accelerated time: {:?}", + non_accelerated_duration + ); + } + + #[test] + fn test_interleave_is_involution() { + let a_arr = test_vals_a(); + let b_arr = test_vals_b(); + + let packed_a = *Avx512GoldilocksField::from_slice(&a_arr); + let packed_b = *Avx512GoldilocksField::from_slice(&b_arr); + { + // Interleave, then deinterleave. + let (x, y) = packed_a.interleave(packed_b, 1); + let (res_a, res_b) = x.interleave(y, 1); + assert_eq!(res_a.as_slice(), a_arr); + assert_eq!(res_b.as_slice(), b_arr); + } + { + let (x, y) = packed_a.interleave(packed_b, 2); + let (res_a, res_b) = x.interleave(y, 2); + assert_eq!(res_a.as_slice(), a_arr); + assert_eq!(res_b.as_slice(), b_arr); + } + { + let (x, y) = packed_a.interleave(packed_b, 4); + let (res_a, res_b) = x.interleave(y, 4); + assert_eq!(res_a.as_slice(), a_arr); + assert_eq!(res_b.as_slice(), b_arr); + } + { + let (x, y) = packed_a.interleave(packed_b, 8); + let (res_a, res_b) = x.interleave(y, 8); + assert_eq!(res_a.as_slice(), a_arr); + assert_eq!(res_b.as_slice(), b_arr); + } + } + + #[test] + fn test_interleave() { + let in_a: [GoldilocksField; 8] = [ + GoldilocksField([0u64]), + GoldilocksField([1u64]), + GoldilocksField([2u64]), + GoldilocksField([3u64]), + GoldilocksField([4u64]), + GoldilocksField([5u64]), + GoldilocksField([6u64]), + GoldilocksField([7u64]), + ]; + let in_b: [GoldilocksField; 8] = [ + GoldilocksField([10u64]), + GoldilocksField([11u64]), + GoldilocksField([12u64]), + GoldilocksField([13u64]), + GoldilocksField([14u64]), + GoldilocksField([15u64]), + GoldilocksField([16u64]), + GoldilocksField([17u64]), + ]; + let int1_a: [GoldilocksField; 8] = [ + GoldilocksField([0u64]), + GoldilocksField([10u64]), + GoldilocksField([2u64]), + GoldilocksField([12u64]), + GoldilocksField([4u64]), + GoldilocksField([14u64]), + GoldilocksField([6u64]), + GoldilocksField([16u64]), + ]; + let int1_b: [GoldilocksField; 8] = [ + GoldilocksField([1u64]), + GoldilocksField([11u64]), + GoldilocksField([3u64]), + GoldilocksField([13u64]), + GoldilocksField([5u64]), + GoldilocksField([15u64]), + GoldilocksField([7u64]), + GoldilocksField([17u64]), + ]; + let int2_a: [GoldilocksField; 8] = [ + GoldilocksField([0u64]), + GoldilocksField([1u64]), + GoldilocksField([10u64]), + GoldilocksField([11u64]), + GoldilocksField([4u64]), + GoldilocksField([5u64]), + GoldilocksField([14u64]), + GoldilocksField([15u64]), + ]; + let int2_b: [GoldilocksField; 8] = [ + GoldilocksField([2u64]), + GoldilocksField([3u64]), + GoldilocksField([12u64]), + GoldilocksField([13u64]), + GoldilocksField([6u64]), + GoldilocksField([7u64]), + GoldilocksField([16u64]), + GoldilocksField([17u64]), + ]; + let int4_a: [GoldilocksField; 8] = [ + GoldilocksField([0u64]), + 
GoldilocksField([1u64]), + GoldilocksField([2u64]), + GoldilocksField([3u64]), + GoldilocksField([10u64]), + GoldilocksField([11u64]), + GoldilocksField([12u64]), + GoldilocksField([13u64]), + ]; + let int4_b: [GoldilocksField; 8] = [ + GoldilocksField([4u64]), + GoldilocksField([5u64]), + GoldilocksField([6u64]), + GoldilocksField([7u64]), + GoldilocksField([14u64]), + GoldilocksField([15u64]), + GoldilocksField([16u64]), + GoldilocksField([17u64]), + ]; + + let packed_a = *Avx512GoldilocksField::from_slice(&in_a); + let packed_b = *Avx512GoldilocksField::from_slice(&in_b); + { + let (x1, y1) = packed_a.interleave(packed_b, 1); + assert_eq!(x1.as_slice(), int1_a); + assert_eq!(y1.as_slice(), int1_b); + } + { + let (x2, y2) = packed_a.interleave(packed_b, 2); + assert_eq!(x2.as_slice(), int2_a); + assert_eq!(y2.as_slice(), int2_b); + } + { + let (x4, y4) = packed_a.interleave(packed_b, 4); + assert_eq!(x4.as_slice(), int4_a); + assert_eq!(y4.as_slice(), int4_b); + } + { + let (x8, y8) = packed_a.interleave(packed_b, 8); + assert_eq!(x8.as_slice(), in_a); + assert_eq!(y8.as_slice(), in_b); + } + } +} diff --git a/algebraic/src/arch/x86_64/mod.rs b/algebraic/src/arch/x86_64/mod.rs new file mode 100644 index 00000000..2bdff650 --- /dev/null +++ b/algebraic/src/arch/x86_64/mod.rs @@ -0,0 +1,20 @@ +#[cfg(all( + target_feature = "avx2", + not(all( + target_feature = "avx512bw", + target_feature = "avx512cd", + target_feature = "avx512dq", + target_feature = "avx512f", + target_feature = "avx512vl" + )) +))] +pub mod avx2_field_gl; + +#[cfg(all( + target_feature = "avx512bw", + target_feature = "avx512cd", + target_feature = "avx512dq", + target_feature = "avx512f", + target_feature = "avx512vl" +))] +pub mod avx512_field_gl; diff --git a/algebraic/src/lib.rs b/algebraic/src/lib.rs index be0b90b3..e48706a0 100644 --- a/algebraic/src/lib.rs +++ b/algebraic/src/lib.rs @@ -1,4 +1,5 @@ #![allow(clippy::unit_arg)] +#![feature(stdsimd)] #[macro_use] extern crate serde; @@ -11,6 +12,8 @@ extern crate num_bigint; extern crate num_traits; extern crate rand; +pub mod arch; + pub mod circom_circuit; pub mod errors; pub mod field_gl; @@ -26,6 +29,7 @@ pub use franklin_crypto::bellman as bellman_ce; #[cfg(test)] mod field_gl_test; +// mod packed; #[cfg(target_arch = "wasm32")] extern crate wasm_bindgen; diff --git a/algebraic/src/packable.rs b/algebraic/src/packable.rs new file mode 100644 index 00000000..c6f8cd36 --- /dev/null +++ b/algebraic/src/packable.rs @@ -0,0 +1,40 @@ +use crate::packed::PackedField; +use crate::types::Field; + +/// Points us to the default packing for a particular field. There may me multiple choices of +/// PackedField for a particular Field (e.g. every Field is also a PackedField), but this is the +/// recommended one. The recommended packing varies by target_arch and target_feature. 
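+///
+/// Illustrative generic use (a sketch only; `sum_packed` is a hypothetical helper,
+/// and it assumes `F::Packing` packs `F` itself and that the slice length is a
+/// multiple of the packing width):
+///
+/// ```ignore
+/// use crate::packable::Packable;
+/// use crate::packed::PackedField;
+///
+/// fn sum_packed<F: Packable>(values: &[F]) -> F::Packing
+/// where
+///     F::Packing: PackedField<Scalar = F>,
+/// {
+///     // Reinterpret the scalar slice as packed vectors and accumulate lane-wise;
+///     // the caller can fold the lanes of the returned vector afterwards.
+///     <F::Packing as PackedField>::pack_slice(values)
+///         .iter()
+///         .fold(F::Packing::ZEROS, |acc, &x| acc + x)
+/// }
+/// ```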
+pub trait Packable: Field { + type Packing: PackedField; +} + +impl Packable for F { + default type Packing = Self; +} + +#[cfg(all( + target_arch = "x86_64", + target_feature = "avx2", + not(all( + target_feature = "avx512bw", + target_feature = "avx512cd", + target_feature = "avx512dq", + target_feature = "avx512f", + target_feature = "avx512vl" + )) +))] +impl Packable for crate::goldilocks_field::GoldilocksField { + type Packing = crate::arch::x86_64::avx2_goldilocks_field::Avx2GoldilocksField; +} + +#[cfg(all( + target_arch = "x86_64", + target_feature = "avx512bw", + target_feature = "avx512cd", + target_feature = "avx512dq", + target_feature = "avx512f", + target_feature = "avx512vl" +))] +impl Packable for crate::goldilocks_field::GoldilocksField { + type Packing = crate::arch::x86_64::avx512_goldilocks_field::Avx512GoldilocksField; +} diff --git a/algebraic/src/packed.rs b/algebraic/src/packed.rs new file mode 100644 index 00000000..0236962d --- /dev/null +++ b/algebraic/src/packed.rs @@ -0,0 +1,127 @@ +use core::fmt::Debug; +use core::iter::{Product, Sum}; +use core::ops::{Add, AddAssign, Div, Mul, MulAssign, Neg, Sub, SubAssign}; +use core::slice; + +// use crate::ops::Square; +use crate::ff::*; + +/// # Safety +/// - WIDTH is assumed to be a power of 2. +/// - If P implements PackedField then P must be castable to/from [P::Scalar; P::WIDTH] without UB. +pub unsafe trait PackedField: + 'static + + Add + + Add + + AddAssign + + AddAssign + + Copy + + Debug + + Default + + From + // TODO: Implement packed / packed division + + Div + + Mul + + Mul + + MulAssign + + MulAssign + // + Square + + Neg + + Product + + Send + + Sub + + Sub + + SubAssign + + SubAssign + + Sum + + Sync +where + Self::Scalar: Add, + Self::Scalar: Mul, + Self::Scalar: Sub, +{ + type Scalar: Field; + + const WIDTH: usize; + const ZEROS: Self; + const ONES: Self; + + fn from_slice(slice: &[Self::Scalar]) -> &Self; + fn from_slice_mut(slice: &mut [Self::Scalar]) -> &mut Self; + fn as_slice(&self) -> &[Self::Scalar]; + fn as_slice_mut(&mut self) -> &mut [Self::Scalar]; + + /// Take interpret two vectors as chunks of block_len elements. Unpack and interleave those + /// chunks. This is best seen with an example. If we have: + /// A = [x0, y0, x1, y1], + /// B = [x2, y2, x3, y3], + /// then + /// interleave(A, B, 1) = ([x0, x2, x1, x3], [y0, y2, y1, y3]). + /// Pairs that were adjacent in the input are at corresponding positions in the output. + /// r lets us set the size of chunks we're interleaving. If we set block_len = 2, then for + /// A = [x0, x1, y0, y1], + /// B = [x2, x3, y2, y3], + /// we obtain + /// interleave(A, B, block_len) = ([x0, x1, x2, x3], [y0, y1, y2, y3]). + /// We can also think about this as stacking the vectors, dividing them into 2x2 matrices, and + /// transposing those matrices. + /// When block_len = WIDTH, this operation is a no-op. block_len must divide WIDTH. Since + /// WIDTH is specified to be a power of 2, block_len must also be a power of 2. It cannot be 0 + /// and it cannot be > WIDTH. 
+ fn interleave(&self, other: Self, block_len: usize) -> (Self, Self); + + fn pack_slice(buf: &[Self::Scalar]) -> &[Self] { + assert!( + buf.len() % Self::WIDTH == 0, + "Slice length (got {}) must be a multiple of packed field width ({}).", + buf.len(), + Self::WIDTH + ); + let buf_ptr = buf.as_ptr().cast::(); + let n = buf.len() / Self::WIDTH; + unsafe { slice::from_raw_parts(buf_ptr, n) } + } + fn pack_slice_mut(buf: &mut [Self::Scalar]) -> &mut [Self] { + assert!( + buf.len() % Self::WIDTH == 0, + "Slice length (got {}) must be a multiple of packed field width ({}).", + buf.len(), + Self::WIDTH + ); + let buf_ptr = buf.as_mut_ptr().cast::(); + let n = buf.len() / Self::WIDTH; + unsafe { slice::from_raw_parts_mut(buf_ptr, n) } + } + + fn doubles(&self) -> Self { + *self * Self::Scalar::TWO + } +} + +unsafe impl PackedField for F { + type Scalar = Self; + + const WIDTH: usize = 1; + const ZEROS: Self = F::ZERO; + const ONES: Self = F::ONE; + + fn from_slice(slice: &[Self::Scalar]) -> &Self { + &slice[0] + } + fn from_slice_mut(slice: &mut [Self::Scalar]) -> &mut Self { + &mut slice[0] + } + fn as_slice(&self) -> &[Self::Scalar] { + slice::from_ref(self) + } + fn as_slice_mut(&mut self) -> &mut [Self::Scalar] { + slice::from_mut(self) + } + + fn interleave(&self, other: Self, block_len: usize) -> (Self, Self) { + match block_len { + 1 => (*self, other), + _ => panic!("unsupported block length"), + } + } +}