From 68d95363220107ef56524f820db54f99ba167e48 Mon Sep 17 00:00:00 2001 From: Paul <108982045+ChengYueJia@users.noreply.github.com> Date: Sun, 5 Nov 2023 18:28:47 +0800 Subject: [PATCH] feat: add avx2 Goldilocks (#151) * feat: add draft struct * feat: impl add * bugfix for avx by lo * add aknowledge * add aknowledge * use nightly version rustc --- .github/workflows/ci.yml | 4 +- algebraic/src/arch/mod.rs | 2 + algebraic/src/arch/x86_64/avx2_field_gl.rs | 798 +++++++++++++++++++ algebraic/src/arch/x86_64/avx512_field_gl.rs | 751 +++++++++++++++++ algebraic/src/arch/x86_64/mod.rs | 20 + algebraic/src/lib.rs | 4 + algebraic/src/packable.rs | 40 + algebraic/src/packed.rs | 127 +++ 8 files changed, 1744 insertions(+), 2 deletions(-) create mode 100644 algebraic/src/arch/mod.rs create mode 100644 algebraic/src/arch/x86_64/avx2_field_gl.rs create mode 100644 algebraic/src/arch/x86_64/avx512_field_gl.rs create mode 100644 algebraic/src/arch/x86_64/mod.rs create mode 100644 algebraic/src/packable.rs create mode 100644 algebraic/src/packed.rs diff --git a/.github/workflows/ci.yml b/.github/workflows/ci.yml index 03de5ae8..81d7ec21 100644 --- a/.github/workflows/ci.yml +++ b/.github/workflows/ci.yml @@ -34,7 +34,7 @@ jobs: - uses: actions-rs/toolchain@v1 with: profile: minimal - toolchain: stable + toolchain: nightly override: true - run: rustup component add rustfmt - uses: actions-rs/cargo@v1 @@ -49,7 +49,7 @@ jobs: - uses: actions-rs/toolchain@v1 with: profile: minimal - toolchain: stable + toolchain: nightly override: true - run: rustup component add clippy - run: cargo clippy --all-targets -- -D warnings \ No newline at end of file diff --git a/algebraic/src/arch/mod.rs b/algebraic/src/arch/mod.rs new file mode 100644 index 00000000..832557ef --- /dev/null +++ b/algebraic/src/arch/mod.rs @@ -0,0 +1,2 @@ +#[cfg(target_arch = "x86_64")] +pub mod x86_64; diff --git a/algebraic/src/arch/x86_64/avx2_field_gl.rs b/algebraic/src/arch/x86_64/avx2_field_gl.rs new file mode 100644 index 00000000..03fc5d5d --- /dev/null +++ b/algebraic/src/arch/x86_64/avx2_field_gl.rs @@ -0,0 +1,798 @@ +//! Porting from plonky2: +//! https://github.com/0xPolygonZero/plonky2/blob/main/field/src/arch/x86_64/avx2_goldilocks_field.rs +//! +//! How to build/run/test: +//! RUSTFLAGS='-C target-feature=+avx2' cargo build --release +//! +use crate::ff::*; +use crate::field_gl::{Fr, FrRepr as GoldilocksField}; +use core::arch::x86_64::*; +use core::fmt; +use core::fmt::{Debug, Formatter}; +use core::mem::transmute; +use core::ops::{Add, AddAssign, Div, DivAssign, Mul, MulAssign, Neg, Sub, SubAssign}; +// use crate::packed::PackedField; + +/// AVX2 Goldilocks Field +/// +/// Ideally `Avx2GoldilocksField` would wrap `__m256i`. Unfortunately, `__m256i` has an alignment of +/// 32B, which would preclude us from casting `[GoldilocksField; 4]` (alignment 8B) to +/// `Avx2GoldilocksField`. We need to ensure that `Avx2GoldilocksField` has the same alignment as +/// `GoldilocksField`. Thus we wrap `[GoldilocksField; 4]` and use the `new` and `get` methods to +/// convert to and from `__m256i`. 
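+///
+/// Illustrative usage (a minimal sketch; it assumes the crate is importable as
+/// `algebraic` and an AVX2-enabled build, as described in the module docs above):
+///
+/// ```ignore
+/// use algebraic::arch::x86_64::avx2_field_gl::Avx2GoldilocksField;
+/// use algebraic::field_gl::FrRepr as GoldilocksField;
+///
+/// let a = [GoldilocksField([1u64]); 4];
+/// let b = [GoldilocksField([2u64]); 4];
+/// // Reinterpret each 4-element slice as one packed vector, then add lane-wise.
+/// let sum = *Avx2GoldilocksField::from_slice(&a) + *Avx2GoldilocksField::from_slice(&b);
+/// assert_eq!(sum.as_slice(), [GoldilocksField([3u64]); 4]);
+/// ```
+///
+/// `from_slice` asserts that the slice has exactly `WIDTH` (= 4) elements.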
+#[derive(Copy, Clone)] +#[repr(transparent)] +pub struct Avx2GoldilocksField(pub [GoldilocksField; 4]); + +const WIDTH: usize = 4; + +impl Avx2GoldilocksField { + #[inline] + pub fn new(x: __m256i) -> Self { + unsafe { transmute(x) } + } + #[inline] + pub fn get(&self) -> __m256i { + unsafe { transmute(*self) } + } + // } + // unsafe impl PackedField for Avx2GoldilocksField { + #[inline] + pub fn from_slice(slice: &[GoldilocksField]) -> &Self { + assert_eq!(slice.len(), WIDTH); + unsafe { &*slice.as_ptr().cast() } + } + #[inline] + pub fn from_slice_mut(slice: &mut [GoldilocksField]) -> &mut Self { + assert_eq!(slice.len(), WIDTH); + unsafe { &mut *slice.as_mut_ptr().cast() } + } + #[inline] + pub fn as_slice(&self) -> &[GoldilocksField] { + &self.0[..] + } + #[inline] + pub fn as_slice_mut(&mut self) -> &mut [GoldilocksField] { + &mut self.0[..] + } + #[inline] + pub fn square(&self) -> Avx2GoldilocksField { + Self::new(unsafe { square(self.get()) }) + } + + #[inline] + fn interleave(&self, other: Self, block_len: usize) -> (Self, Self) { + let (v0, v1) = (self.get(), other.get()); + let (res0, res1) = match block_len { + 1 => unsafe { interleave1(v0, v1) }, + 2 => unsafe { interleave2(v0, v1) }, + 4 => (v0, v1), + _ => panic!("unsupported block_len"), + }; + (Self::new(res0), Self::new(res1)) + } +} + +impl Add for Avx2GoldilocksField { + type Output = Self; + #[inline] + fn add(self, rhs: Self) -> Self { + Self::new(unsafe { add(self.get(), rhs.get()) }) + } +} +impl Add for Avx2GoldilocksField { + type Output = Self; + #[inline] + fn add(self, rhs: GoldilocksField) -> Self { + self + Self::from(rhs) + } +} +impl Add for GoldilocksField { + type Output = Avx2GoldilocksField; + #[inline] + fn add(self, rhs: Self::Output) -> Self::Output { + Self::Output::from(self) + rhs + } +} +impl AddAssign for Avx2GoldilocksField { + #[inline] + fn add_assign(&mut self, rhs: Self) { + *self = *self + rhs; + } +} +impl AddAssign for Avx2GoldilocksField { + #[inline] + fn add_assign(&mut self, rhs: GoldilocksField) { + *self = *self + rhs; + } +} + +impl Debug for Avx2GoldilocksField { + #[inline] + fn fmt(&self, f: &mut Formatter<'_>) -> fmt::Result { + write!(f, "({:?})", self.get()) + } +} + +impl Default for Avx2GoldilocksField { + #[inline] + fn default() -> Self { + Self([GoldilocksField::from(0); 4]) + } +} + +impl Div for Avx2GoldilocksField { + type Output = Self; + #[inline] + fn div(self, rhs: GoldilocksField) -> Self { + let rhs_value = Fr::from_repr(rhs).unwrap(); + let rhs_inverse = rhs_value.inverse().unwrap().into_repr(); + self * rhs_inverse + } +} +impl DivAssign for Avx2GoldilocksField { + #[inline] + fn div_assign(&mut self, rhs: GoldilocksField) { + let rhs_value = Fr::from_repr(rhs).unwrap(); + let rhs_inverse = rhs_value.inverse().unwrap().into_repr(); + *self *= rhs_inverse; + } +} + +impl From for Avx2GoldilocksField { + fn from(x: GoldilocksField) -> Self { + Self([x; 4]) + } +} + +impl Mul for Avx2GoldilocksField { + type Output = Self; + #[inline] + fn mul(self, rhs: Self) -> Self { + Self::new(unsafe { mul(self.get(), rhs.get()) }) + } +} +impl Mul for Avx2GoldilocksField { + type Output = Self; + #[inline] + fn mul(self, rhs: GoldilocksField) -> Self { + self * Self::from(rhs) + } +} +impl Mul for GoldilocksField { + type Output = Avx2GoldilocksField; + #[inline] + fn mul(self, rhs: Avx2GoldilocksField) -> Self::Output { + Self::Output::from(self) * rhs + } +} +impl MulAssign for Avx2GoldilocksField { + #[inline] + fn mul_assign(&mut self, rhs: Self) { + *self = *self * 
rhs; + } +} +impl MulAssign for Avx2GoldilocksField { + #[inline] + fn mul_assign(&mut self, rhs: GoldilocksField) { + *self = *self * rhs; + } +} + +impl Neg for Avx2GoldilocksField { + type Output = Self; + #[inline] + fn neg(self) -> Self { + Self::new(unsafe { neg(self.get()) }) + } +} +// +// impl Product for Avx2GoldilocksField { +// #[inline] +// fn product>(iter: I) -> Self { +// iter.reduce(|x, y| x * y).unwrap_or(Self::ONES) +// } +// } + +impl Sub for Avx2GoldilocksField { + type Output = Self; + #[inline] + fn sub(self, rhs: Self) -> Self { + Self::new(unsafe { sub(self.get(), rhs.get()) }) + } +} +impl Sub for Avx2GoldilocksField { + type Output = Self; + #[inline] + fn sub(self, rhs: GoldilocksField) -> Self { + self - Self::from(rhs) + } +} +impl Sub for GoldilocksField { + type Output = Avx2GoldilocksField; + #[inline] + fn sub(self, rhs: Avx2GoldilocksField) -> Self::Output { + Self::Output::from(self) - rhs + } +} +impl SubAssign for Avx2GoldilocksField { + #[inline] + fn sub_assign(&mut self, rhs: Self) { + *self = *self - rhs; + } +} +impl SubAssign for Avx2GoldilocksField { + #[inline] + fn sub_assign(&mut self, rhs: GoldilocksField) { + *self = *self - rhs; + } +} + +// impl Sum for Avx2GoldilocksField { +// #[inline] +// fn sum>(iter: I) -> Self { +// iter.reduce(|x, y| x + y).unwrap_or(Self::ZEROS) +// } +// } + +// Resources: +// 1. Intel Intrinsics Guide for explanation of each intrinsic: +// https://software.intel.com/sites/landingpage/IntrinsicsGuide/ +// 2. uops.info lists micro-ops for each instruction: https://uops.info/table.html +// 3. Intel optimization manual for introduction to x86 vector extensions and best practices: +// https://software.intel.com/content/www/us/en/develop/download/intel-64-and-ia-32-architectures-optimization-reference-manual.html + +// Preliminary knowledge: +// 1. Vector code usually avoids branching. Instead of branches, we can do input selection with +// _mm256_blendv_epi8 or similar instruction. If all we're doing is conditionally zeroing a +// vector element then _mm256_and_si256 or _mm256_andnot_si256 may be used and are cheaper. +// +// 2. AVX does not support addition with carry but 128-bit (2-word) addition can be easily +// emulated. The method recognizes that for a + b overflowed iff (a + b) < a: +// i. res_lo = a_lo + b_lo +// ii. carry_mask = res_lo < a_lo +// iii. res_hi = a_hi + b_hi - carry_mask +// Notice that carry_mask is subtracted, not added. This is because AVX comparison instructions +// return -1 (all bits 1) for true and 0 for false. +// +// 3. AVX does not have unsigned 64-bit comparisons. Those can be emulated with signed comparisons +// by recognizing that a __m256i { + _mm256_xor_si256(x, SIGN_BIT) +} + +/// Convert to canonical representation. +/// The argument is assumed to be shifted by 1 << 63 (i.e. x_s = x + 1<<63, where x is the field +/// value). The returned value is similarly shifted by 1 << 63 (i.e. we return y_s = y + (1<<63), +/// where 0 <= y < FIELD_ORDER). +#[inline] +unsafe fn canonicalize_s(x_s: __m256i) -> __m256i { + // If x >= FIELD_ORDER then corresponding mask bits are all 0; otherwise all 1. + let mask = _mm256_cmpgt_epi64(SHIFTED_FIELD_ORDER, x_s); + // wrapback_amt is -FIELD_ORDER if mask is 0; otherwise 0. + let wrapback_amt = _mm256_andnot_si256(mask, EPSILON); + _mm256_add_epi64(x_s, wrapback_amt) +} + +/// Addition u64 + u64 -> u64. Assumes that x + y < 2^64 + FIELD_ORDER. The second argument is +/// pre-shifted by 1 << 63. The result is similarly shifted. 
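+///
+/// Per lane, the operation is equivalent to the following scalar sketch
+/// (illustrative only; the vector code below detects overflow with a signed
+/// comparison on the shifted values instead of `overflowing_add`):
+///
+/// ```ignore
+/// fn add_no_double_overflow_64_64s_s_scalar(x: u64, y_s: u64) -> u64 {
+///     let (res_wrapped_s, overflowed) = x.overflowing_add(y_s);
+///     // Overflow means the true sum lost 2^64; adding back
+///     // EPSILON = 2^32 - 1 == -FIELD_ORDER (mod 2^64) re-wraps it.
+///     res_wrapped_s.wrapping_add(if overflowed { 0xffff_ffff } else { 0 })
+/// }
+/// ```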
+#[inline] +unsafe fn add_no_double_overflow_64_64s_s(x: __m256i, y_s: __m256i) -> __m256i { + let res_wrapped_s = _mm256_add_epi64(x, y_s); + let mask = _mm256_cmpgt_epi64(y_s, res_wrapped_s); // -1 if overflowed else 0. + let wrapback_amt = _mm256_srli_epi64::<32>(mask); // -FIELD_ORDER if overflowed else 0. + let res_s = _mm256_add_epi64(res_wrapped_s, wrapback_amt); + res_s +} + +#[inline] +unsafe fn add(x: __m256i, y: __m256i) -> __m256i { + let y_s = shift(y); + let res_s = add_no_double_overflow_64_64s_s(x, canonicalize_s(y_s)); + shift(res_s) +} + +#[inline] +unsafe fn sub(x: __m256i, y: __m256i) -> __m256i { + let mut y_s = shift(y); + y_s = canonicalize_s(y_s); + let x_s = shift(x); + let mask = _mm256_cmpgt_epi64(y_s, x_s); // -1 if sub will underflow (y > x) else 0. + let wrapback_amt = _mm256_srli_epi64::<32>(mask); // -FIELD_ORDER if underflow else 0. + let res_wrapped = _mm256_sub_epi64(x_s, y_s); + let res = _mm256_sub_epi64(res_wrapped, wrapback_amt); + res +} + +#[inline] +unsafe fn neg(y: __m256i) -> __m256i { + let y_s = shift(y); + _mm256_sub_epi64(SHIFTED_FIELD_ORDER, canonicalize_s(y_s)) +} + +/// Full 64-bit by 64-bit multiplication. This emulated multiplication is 1.33x slower than the +/// scalar instruction, but may be worth it if we want our data to live in vector registers. +#[inline] +unsafe fn mul64_64(x: __m256i, y: __m256i) -> (__m256i, __m256i) { + // We want to move the high 32 bits to the low position. The multiplication instruction ignores + // the high 32 bits, so it's ok to just duplicate it into the low position. This duplication can + // be done on port 5; bitshifts run on ports 0 and 1, competing with multiplication. + // This instruction is only provided for 32-bit floats, not integers. Idk why Intel makes the + // distinction; the casts are free and it guarantees that the exact bit pattern is preserved. + // Using a swizzle instruction of the wrong domain (float vs int) does not increase latency + // since Haswell. + let x_hi = _mm256_castps_si256(_mm256_movehdup_ps(_mm256_castsi256_ps(x))); + let y_hi = _mm256_castps_si256(_mm256_movehdup_ps(_mm256_castsi256_ps(y))); + + // All four pairwise multiplications + let mul_ll = _mm256_mul_epu32(x, y); + let mul_lh = _mm256_mul_epu32(x, y_hi); + let mul_hl = _mm256_mul_epu32(x_hi, y); + let mul_hh = _mm256_mul_epu32(x_hi, y_hi); + + // Bignum addition + // Extract high 32 bits of mul_ll and add to mul_hl. This cannot overflow. + let mul_ll_hi = _mm256_srli_epi64::<32>(mul_ll); + let t0 = _mm256_add_epi64(mul_hl, mul_ll_hi); + // Extract low 32 bits of t0 and add to mul_lh. Again, this cannot overflow. + // Also, extract high 32 bits of t0 and add to mul_hh. + let t0_lo = _mm256_and_si256(t0, EPSILON); + let t0_hi = _mm256_srli_epi64::<32>(t0); + let t1 = _mm256_add_epi64(mul_lh, t0_lo); + let t2 = _mm256_add_epi64(mul_hh, t0_hi); + // Lastly, extract the high 32 bits of t1 and add to t2. + let t1_hi = _mm256_srli_epi64::<32>(t1); + let res_hi = _mm256_add_epi64(t2, t1_hi); + + // Form res_lo by combining the low half of mul_ll with the low half of t1 (shifted into high + // position). + let t1_lo = _mm256_castps_si256(_mm256_moveldup_ps(_mm256_castsi256_ps(t1))); + let res_lo = _mm256_blend_epi32::<0xaa>(mul_ll, t1_lo); + + (res_hi, res_lo) +} + +/// Full 64-bit squaring. This routine is 1.2x faster than the scalar instruction. +#[inline] +unsafe fn square64(x: __m256i) -> (__m256i, __m256i) { + // Get high 32 bits of x. See comment in mul64_64_s. 
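+    // Writing x = x_hi * 2^32 + x_lo, we have
+    //     x^2 = mul_hh * 2^64 + 2 * mul_lh * 2^32 + mul_ll
+    //         = mul_hh * 2^64 + mul_lh * 2^33 + mul_ll,
+    // which is why the cross term below is shifted by 33 bits rather than 32.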
+ let x_hi = _mm256_castps_si256(_mm256_movehdup_ps(_mm256_castsi256_ps(x))); + + // All pairwise multiplications. + let mul_ll = _mm256_mul_epu32(x, x); + let mul_lh = _mm256_mul_epu32(x, x_hi); + let mul_hh = _mm256_mul_epu32(x_hi, x_hi); + + // Bignum addition, but mul_lh is shifted by 33 bits (not 32). + let mul_ll_hi = _mm256_srli_epi64::<33>(mul_ll); + let t0 = _mm256_add_epi64(mul_lh, mul_ll_hi); + let t0_hi = _mm256_srli_epi64::<31>(t0); + let res_hi = _mm256_add_epi64(mul_hh, t0_hi); + + // Form low result by adding the mul_ll and the low 31 bits of mul_lh (shifted to the high + // position). + let mul_lh_lo = _mm256_slli_epi64::<33>(mul_lh); + let res_lo = _mm256_add_epi64(mul_ll, mul_lh_lo); + + (res_hi, res_lo) +} + +/// Goldilocks addition of a "small" number. `x_s` is pre-shifted by 2**63. `y` is assumed to be <= +/// `0xffffffff00000000`. The result is shifted by 2**63. +#[inline] +unsafe fn add_small_64s_64_s(x_s: __m256i, y: __m256i) -> __m256i { + let res_wrapped_s = _mm256_add_epi64(x_s, y); + // 32-bit compare is faster than 64-bit. It's safe as long as x > res_wrapped iff x >> 32 > + // res_wrapped >> 32. The case of x >> 32 > res_wrapped >> 32 is trivial and so is <. The case + // where x >> 32 = res_wrapped >> 32 remains. If x >> 32 = res_wrapped >> 32, then y >> 32 = + // 0xffffffff and the addition of the low 32 bits generated a carry. This can never occur if y + // <= 0xffffffff00000000: if y >> 32 = 0xffffffff, then no carry can occur. + let mask = _mm256_cmpgt_epi32(x_s, res_wrapped_s); // -1 if overflowed else 0. + // The mask contains 0xffffffff in the high 32 bits if wraparound occured and 0 otherwise. + let wrapback_amt = _mm256_srli_epi64::<32>(mask); // -FIELD_ORDER if overflowed else 0. + let res_s = _mm256_add_epi64(res_wrapped_s, wrapback_amt); + res_s +} + +/// Goldilocks subtraction of a "small" number. `x_s` is pre-shifted by 2**63. `y` is assumed to be +/// <= `0xffffffff00000000`. The result is shifted by 2**63. +#[inline] +unsafe fn sub_small_64s_64_s(x_s: __m256i, y: __m256i) -> __m256i { + let res_wrapped_s = _mm256_sub_epi64(x_s, y); + // 32-bit compare is faster than 64-bit. It's safe as long as res_wrapped > x iff res_wrapped >> + // 32 > x >> 32. The case of res_wrapped >> 32 > x >> 32 is trivial and so is <. The case where + // res_wrapped >> 32 = x >> 32 remains. If res_wrapped >> 32 = x >> 32, then y >> 32 = + // 0xffffffff and the subtraction of the low 32 bits generated a borrow. This can never occur if + // y <= 0xffffffff00000000: if y >> 32 = 0xffffffff, then no borrow can occur. + let mask = _mm256_cmpgt_epi32(res_wrapped_s, x_s); // -1 if underflowed else 0. + // The mask contains 0xffffffff in the high 32 bits if wraparound occured and 0 otherwise. + let wrapback_amt = _mm256_srli_epi64::<32>(mask); // -FIELD_ORDER if underflowed else 0. + let res_s = _mm256_sub_epi64(res_wrapped_s, wrapback_amt); + res_s +} + +#[inline] +unsafe fn reduce128(x: (__m256i, __m256i)) -> __m256i { + let (hi0, lo0) = x; + let lo0_s = shift(lo0); + let hi_hi0 = _mm256_srli_epi64::<32>(hi0); + let lo1_s = sub_small_64s_64_s(lo0_s, hi_hi0); + let t1 = _mm256_mul_epu32(hi0, EPSILON); + let lo2_s = add_small_64s_64_s(lo1_s, t1); + let lo2 = shift(lo2_s); + lo2 +} + +/// Multiply two integers modulo FIELD_ORDER. +#[inline] +unsafe fn mul(x: __m256i, y: __m256i) -> __m256i { + reduce128(mul64_64(x, y)) +} + +// /// Square an integer modulo FIELD_ORDER. 
+#[inline] +unsafe fn square(x: __m256i) -> __m256i { + reduce128(square64(x)) +} + +#[inline] +unsafe fn interleave1(x: __m256i, y: __m256i) -> (__m256i, __m256i) { + let a = _mm256_unpacklo_epi64(x, y); + let b = _mm256_unpackhi_epi64(x, y); + (a, b) +} + +#[inline] +unsafe fn interleave2(x: __m256i, y: __m256i) -> (__m256i, __m256i) { + let y_lo = _mm256_castsi256_si128(y); // This has 0 cost. + + // 1 places y_lo in the high half of x; 0 would place it in the lower half. + let a = _mm256_inserti128_si256::<1>(x, y_lo); + // NB: _mm256_permute2x128_si256 could be used here as well but _mm256_inserti128_si256 has + // lower latency on Zen 3 processors. + + // Each nibble of the constant has the following semantics: + // 0 => src1[low 128 bits] + // 1 => src1[high 128 bits] + // 2 => src2[low 128 bits] + // 3 => src2[high 128 bits] + // The low (resp. high) nibble chooses the low (resp. high) 128 bits of the result. + let b = _mm256_permute2x128_si256::<0x31>(x, y); + + (a, b) +} + +#[cfg(test)] +mod tests { + use super::Avx2GoldilocksField; + use crate::ff::*; + use crate::field_gl::{Fr, FrRepr as GoldilocksField}; + use std::time::Instant; + // use crate::packed::PackedField; + + fn test_vals_a() -> [GoldilocksField; 4] { + [ + GoldilocksField([14479013849828404771u64]), + GoldilocksField([9087029921428221768u64]), + GoldilocksField([2441288194761790662u64]), + GoldilocksField([5646033492608483824u64]), + ] + } + fn test_vals_b() -> [GoldilocksField; 4] { + [ + GoldilocksField([17891926589593242302u64]), + GoldilocksField([11009798273260028228u64]), + GoldilocksField([2028722748960791447u64]), + GoldilocksField([7929433601095175579u64]), + ] + } + + #[test] + fn test_add() { + let a_arr = test_vals_a(); + let b_arr = test_vals_b(); + let start = Instant::now(); + let packed_a = Avx2GoldilocksField::from_slice(&a_arr); + let packed_b = Avx2GoldilocksField::from_slice(&b_arr); + let packed_res = *packed_a + *packed_b; + let arr_res = packed_res.as_slice(); + let avx2_duration = start.elapsed(); + // println!("arr_res: {:?}", arr_res); + + let start = Instant::now(); + let expected = a_arr + .iter() + .zip(b_arr) + .map(|(&a, b)| Fr::from_repr(a).unwrap() + Fr::from_repr(b).unwrap()); + let expected_values: Vec = expected.collect(); + // println!("expected values: {:?}", expected_values); + let non_accelerated_duration = start.elapsed(); + for (exp, &res) in expected_values.iter().zip(arr_res) { + assert_eq!(res, exp.into_repr()); + } + + println!("test_add_AVX2_accelerated time: {:?}", avx2_duration); + println!( + "test_add_Non_accelerated time: {:?}", + non_accelerated_duration + ); + } + + #[test] + fn test_mul() { + let a_arr = test_vals_a(); + let b_arr = test_vals_b(); + let start = Instant::now(); + let packed_a = *Avx2GoldilocksField::from_slice(&a_arr); + let packed_b = *Avx2GoldilocksField::from_slice(&b_arr); + let packed_res = packed_a * packed_b; + let arr_res = packed_res.as_slice(); + let avx2_duration = start.elapsed(); + // println!("arr_res: {:?}", arr_res); + + let start = Instant::now(); + let expected = a_arr + .iter() + .zip(b_arr) + .map(|(&a, b)| Fr::from_repr(a).unwrap() * Fr::from_repr(b).unwrap()); + let expected_values: Vec = expected.collect(); + let non_accelerated_duration = start.elapsed(); + // println!("expected values: {:?}", expected_values); + + for (exp, &res) in expected_values.iter().zip(arr_res) { + assert_eq!(res, exp.into_repr()); + } + + println!("test_mul_AVX2_accelerated time: {:?}", avx2_duration); + println!( + "test_mul_Non_accelerated 
time: {:?}", + non_accelerated_duration + ); + } + + #[test] + fn test_div() { + let a_arr = test_vals_a(); + let start = Instant::now(); + let packed_a = *Avx2GoldilocksField::from_slice(&a_arr); + let packed_res = packed_a / GoldilocksField([7929433601095175579u64]); + let arr_res = packed_res.as_slice(); + let avx2_duration = start.elapsed(); + // println!("arr_res: {:?}", arr_res); + + let start = Instant::now(); + let expected = a_arr.iter().map(|&a| { + Fr::from_repr(a).unwrap() + / Fr::from_repr(GoldilocksField([7929433601095175579u64])).unwrap() + }); + let expected_values: Vec = expected.collect(); + let non_accelerated_duration = start.elapsed(); + // println!("expected values: {:?}", expected_values); + + for (exp, &res) in expected_values.iter().zip(arr_res) { + assert_eq!(res, exp.into_repr()); + } + + println!("test_div_AVX2_accelerated time: {:?}", avx2_duration); + println!( + "test_div_Non_accelerated time: {:?}", + non_accelerated_duration + ); + } + + #[test] + fn test_square() { + let a_arr = test_vals_a(); + let start = Instant::now(); + let packed_a = *Avx2GoldilocksField::from_slice(&a_arr); + let packed_res = packed_a.square(); + let arr_res = packed_res.as_slice(); + let avx2_duration = start.elapsed(); + // println!("arr_res: {:?}", arr_res); + + let start = Instant::now(); + let mut expected_values = Vec::new(); + for &a in &a_arr { + match Fr::from_repr(a) { + Ok(mut fr) => { + fr.square(); + expected_values.push(fr); + } + Err(_) => { + continue; + } + } + } + let non_accelerated_duration = start.elapsed(); + // println!("expected values: {:?}", expected_values); + for (exp, &res) in expected_values.iter().zip(arr_res) { + assert_eq!(res, exp.into_repr()); + } + println!("test_square_AVX2_accelerated time: {:?}", avx2_duration); + println!( + "test_square_Non_accelerated time: {:?}", + non_accelerated_duration + ); + } + + #[test] + fn test_neg() { + let a_arr = test_vals_a(); + let start = Instant::now(); + let packed_a = *Avx2GoldilocksField::from_slice(&a_arr); + let packed_res = -packed_a; + let arr_res = packed_res.as_slice(); + let avx2_duration = start.elapsed(); + // println!("arr_res: {:?}", arr_res); + + let start = Instant::now(); + let expected = a_arr.iter().map(|&a| -Fr::from_repr(a).unwrap()); + let expected_values: Vec = expected.collect(); + let non_accelerated_duration = start.elapsed(); + // println!("expected values: {:?}", expected_values); + + for (exp, &res) in expected_values.iter().zip(arr_res) { + assert_eq!(res, exp.into_repr()); + } + + println!("test_neg_AVX2_accelerated time: {:?}", avx2_duration); + println!( + "test_neg_Non_accelerated time: {:?}", + non_accelerated_duration + ); + } + + #[test] + fn test_sub() { + let a_arr = test_vals_a(); + let b_arr = test_vals_b(); + let start = Instant::now(); + let packed_a = *Avx2GoldilocksField::from_slice(&a_arr); + let packed_b = *Avx2GoldilocksField::from_slice(&b_arr); + let packed_res = packed_a - packed_b; + let arr_res = packed_res.as_slice(); + let avx2_duration = start.elapsed(); + // println!("arr_res: {:?}", arr_res); + + let start = Instant::now(); + let expected = a_arr + .iter() + .zip(b_arr) + .map(|(&a, b)| Fr::from_repr(a).unwrap() - Fr::from_repr(b).unwrap()); + let expected_values: Vec = expected.collect(); + let non_accelerated_duration = start.elapsed(); + // println!("expected values: {:?}", expected_values); + + for (exp, &res) in expected_values.iter().zip(arr_res) { + assert_eq!(res, exp.into_repr()); + } + + println!("test_sub_AVX2_accelerated time: {:?}", 
avx2_duration); + println!( + "test_sub_Non_accelerated time: {:?}", + non_accelerated_duration + ); + } + + #[test] + fn test_interleave_is_involution() { + let a_arr = test_vals_a(); + let b_arr = test_vals_b(); + + let packed_a = *Avx2GoldilocksField::from_slice(&a_arr); + let packed_b = *Avx2GoldilocksField::from_slice(&b_arr); + { + // Interleave, then deinterleave. + let (x, y) = packed_a.interleave(packed_b, 1); + let (res_a, res_b) = x.interleave(y, 1); + assert_eq!(res_a.as_slice(), a_arr); + assert_eq!(res_b.as_slice(), b_arr); + } + { + let (x, y) = packed_a.interleave(packed_b, 2); + let (res_a, res_b) = x.interleave(y, 2); + assert_eq!(res_a.as_slice(), a_arr); + assert_eq!(res_b.as_slice(), b_arr); + } + { + let (x, y) = packed_a.interleave(packed_b, 4); + let (res_a, res_b) = x.interleave(y, 4); + assert_eq!(res_a.as_slice(), a_arr); + assert_eq!(res_b.as_slice(), b_arr); + } + } + + #[test] + fn test_interleave() { + let in_a: [GoldilocksField; 4] = [ + GoldilocksField([0]), + GoldilocksField([1]), + GoldilocksField([2]), + GoldilocksField([3]), + ]; + let in_b: [GoldilocksField; 4] = [ + GoldilocksField([10]), + GoldilocksField([11]), + GoldilocksField([12]), + GoldilocksField([13]), + ]; + let int1_a: [GoldilocksField; 4] = [ + GoldilocksField([0]), + GoldilocksField([10]), + GoldilocksField([2]), + GoldilocksField([12]), + ]; + let int1_b: [GoldilocksField; 4] = [ + GoldilocksField([1]), + GoldilocksField([11]), + GoldilocksField([3]), + GoldilocksField([13]), + ]; + let int2_a: [GoldilocksField; 4] = [ + GoldilocksField([0]), + GoldilocksField([1]), + GoldilocksField([10]), + GoldilocksField([11]), + ]; + let int2_b: [GoldilocksField; 4] = [ + GoldilocksField([2]), + GoldilocksField([3]), + GoldilocksField([12]), + GoldilocksField([13]), + ]; + + let packed_a = *Avx2GoldilocksField::from_slice(&in_a); + let packed_b = *Avx2GoldilocksField::from_slice(&in_b); + { + let (x1, y1) = packed_a.interleave(packed_b, 1); + assert_eq!(x1.as_slice(), int1_a); + assert_eq!(y1.as_slice(), int1_b); + } + { + let (x2, y2) = packed_a.interleave(packed_b, 2); + assert_eq!(x2.as_slice(), int2_a); + assert_eq!(y2.as_slice(), int2_b); + } + { + let (x4, y4) = packed_a.interleave(packed_b, 4); + assert_eq!(x4.as_slice(), in_a); + assert_eq!(y4.as_slice(), in_b); + } + } +} diff --git a/algebraic/src/arch/x86_64/avx512_field_gl.rs b/algebraic/src/arch/x86_64/avx512_field_gl.rs new file mode 100644 index 00000000..fe9e8942 --- /dev/null +++ b/algebraic/src/arch/x86_64/avx512_field_gl.rs @@ -0,0 +1,751 @@ +//! Porting from plonky2 +//! https://github.com/0xPolygonZero/plonky2/blob/main/field/src/arch/x86_64/avx512_goldilocks_field.rs +//! +//! How to build/run/test: +//! RUSTFLAGS='-C target-feature=+avx512f,+avx512bw,+avx512cd,+avx512dq,+avx512vl' cargo build --release +use crate::ff::*; +use crate::field_gl::{Fr, FrRepr as GoldilocksField}; +use core::arch::x86_64::*; +use core::fmt; +use core::fmt::{Debug, Formatter}; +use core::mem::transmute; +use core::ops::{Add, AddAssign, Div, DivAssign, Mul, MulAssign, Neg, Sub, SubAssign}; + +/// AVX512 Goldilocks Field +/// +/// Ideally `Avx512GoldilocksField` would wrap `__m512i`. Unfortunately, `__m512i` has an alignment +/// of 64B, which would preclude us from casting `[GoldilocksField; 8]` (alignment 8B) to +/// `Avx512GoldilocksField`. We need to ensure that `Avx512GoldilocksField` has the same alignment as +/// `GoldilocksField`. Thus we wrap `[GoldilocksField; 8]` and use the `new` and `get` methods to +/// convert to and from `__m512i`. 
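+///
+/// Illustrative usage (a minimal sketch; it assumes the crate is importable as
+/// `algebraic` and an AVX-512 build with the target features listed in the
+/// module docs above):
+///
+/// ```ignore
+/// use algebraic::arch::x86_64::avx512_field_gl::Avx512GoldilocksField;
+/// use algebraic::field_gl::FrRepr as GoldilocksField;
+///
+/// // Eight Goldilocks lanes per vector instead of the four in the AVX2 backend.
+/// let a = [GoldilocksField([3u64]); 8];
+/// let b = [GoldilocksField([5u64]); 8];
+/// let prod = *Avx512GoldilocksField::from_slice(&a) * *Avx512GoldilocksField::from_slice(&b);
+/// assert_eq!(prod.as_slice(), [GoldilocksField([15u64]); 8]);
+/// ```
+///
+/// `from_slice` asserts that the slice has exactly `WIDTH` (= 8) elements.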
+#[derive(Copy, Clone)] +#[repr(transparent)] +pub struct Avx512GoldilocksField(pub [GoldilocksField; 8]); + +const WIDTH: usize = 8; + +impl Avx512GoldilocksField { + #[inline] + fn new(x: __m512i) -> Self { + unsafe { transmute(x) } + } + #[inline] + fn get(&self) -> __m512i { + unsafe { transmute(*self) } + } + #[inline] + pub fn from_slice(slice: &[GoldilocksField]) -> &Self { + assert_eq!(slice.len(), WIDTH); + unsafe { &*slice.as_ptr().cast() } + } + #[inline] + pub fn from_slice_mut(slice: &mut [GoldilocksField]) -> &mut Self { + assert_eq!(slice.len(), WIDTH); + unsafe { &mut *slice.as_mut_ptr().cast() } + } + #[inline] + pub fn as_slice(&self) -> &[GoldilocksField] { + &self.0[..] + } + #[inline] + pub fn as_slice_mut(&mut self) -> &mut [GoldilocksField] { + &mut self.0[..] + } + #[inline] + pub fn square(&self) -> Avx512GoldilocksField { + Self::new(unsafe { square(self.get()) }) + } + + #[inline] + fn interleave(&self, other: Self, block_len: usize) -> (Self, Self) { + let (v0, v1) = (self.get(), other.get()); + let (res0, res1) = match block_len { + 1 => unsafe { interleave1(v0, v1) }, + 2 => unsafe { interleave2(v0, v1) }, + 4 => unsafe { interleave4(v0, v1) }, + 8 => (v0, v1), + _ => panic!("unsupported block_len"), + }; + (Self::new(res0), Self::new(res1)) + } +} + +impl Add for Avx512GoldilocksField { + type Output = Self; + #[inline] + fn add(self, rhs: Self) -> Self { + Self::new(unsafe { add(self.get(), rhs.get()) }) + } +} +impl Add for Avx512GoldilocksField { + type Output = Self; + #[inline] + fn add(self, rhs: GoldilocksField) -> Self { + self + Self::from(rhs) + } +} +impl Add for GoldilocksField { + type Output = Avx512GoldilocksField; + #[inline] + fn add(self, rhs: Self::Output) -> Self::Output { + Self::Output::from(self) + rhs + } +} +impl AddAssign for Avx512GoldilocksField { + #[inline] + fn add_assign(&mut self, rhs: Self) { + *self = *self + rhs; + } +} +impl AddAssign for Avx512GoldilocksField { + #[inline] + fn add_assign(&mut self, rhs: GoldilocksField) { + *self = *self + rhs; + } +} + +impl Debug for Avx512GoldilocksField { + #[inline] + fn fmt(&self, f: &mut Formatter<'_>) -> fmt::Result { + write!(f, "({:?})", self.get()) + } +} + +impl Default for Avx512GoldilocksField { + #[inline] + fn default() -> Self { + Self([GoldilocksField::from(0); 8]) + } +} + +impl Div for Avx512GoldilocksField { + type Output = Self; + #[inline] + fn div(self, rhs: GoldilocksField) -> Self { + let rhs_value = Fr::from_repr(rhs).unwrap(); + let rhs_inverse = rhs_value.inverse().unwrap().into_repr(); + self * rhs_inverse + } +} +impl DivAssign for Avx512GoldilocksField { + #[inline] + fn div_assign(&mut self, rhs: GoldilocksField) { + let rhs_value = Fr::from_repr(rhs).unwrap(); + let rhs_inverse = rhs_value.inverse().unwrap().into_repr(); + *self *= rhs_inverse; + } +} + +impl From for Avx512GoldilocksField { + fn from(x: GoldilocksField) -> Self { + Self([x; 8]) + } +} + +impl Mul for Avx512GoldilocksField { + type Output = Self; + #[inline] + fn mul(self, rhs: Self) -> Self { + Self::new(unsafe { mul(self.get(), rhs.get()) }) + } +} +impl Mul for Avx512GoldilocksField { + type Output = Self; + #[inline] + fn mul(self, rhs: GoldilocksField) -> Self { + self * Self::from(rhs) + } +} +impl Mul for GoldilocksField { + type Output = Avx512GoldilocksField; + #[inline] + fn mul(self, rhs: Avx512GoldilocksField) -> Self::Output { + Self::Output::from(self) * rhs + } +} +impl MulAssign for Avx512GoldilocksField { + #[inline] + fn mul_assign(&mut self, rhs: Self) { + *self = 
*self * rhs; + } +} +impl MulAssign for Avx512GoldilocksField { + #[inline] + fn mul_assign(&mut self, rhs: GoldilocksField) { + *self = *self * rhs; + } +} + +impl Neg for Avx512GoldilocksField { + type Output = Self; + #[inline] + fn neg(self) -> Self { + Self::new(unsafe { neg(self.get()) }) + } +} + +// impl Product for Avx512GoldilocksField { +// #[inline] +// fn product>(iter: I) -> Self { +// iter.reduce(|x, y| x * y).unwrap_or(Self::ONES) +// } +// } + +impl Sub for Avx512GoldilocksField { + type Output = Self; + #[inline] + fn sub(self, rhs: Self) -> Self { + Self::new(unsafe { sub(self.get(), rhs.get()) }) + } +} +impl Sub for Avx512GoldilocksField { + type Output = Self; + #[inline] + fn sub(self, rhs: GoldilocksField) -> Self { + self - Self::from(rhs) + } +} +impl Sub for GoldilocksField { + type Output = Avx512GoldilocksField; + #[inline] + fn sub(self, rhs: Avx512GoldilocksField) -> Self::Output { + Self::Output::from(self) - rhs + } +} +impl SubAssign for Avx512GoldilocksField { + #[inline] + fn sub_assign(&mut self, rhs: Self) { + *self = *self - rhs; + } +} +impl SubAssign for Avx512GoldilocksField { + #[inline] + fn sub_assign(&mut self, rhs: GoldilocksField) { + *self = *self - rhs; + } +} + +// impl Sum for Avx512GoldilocksField { +// #[inline] +// fn sum>(iter: I) -> Self { +// iter.reduce(|x, y| x + y).unwrap_or(Self::ZEROS) +// } +// } + +const FIELD_ORDER: __m512i = unsafe { transmute([GOLDILOCKS_FIELD_ORDER; 8]) }; +const EPSILON: __m512i = unsafe { transmute([GoldilocksField([4294967295u64]); 8]) }; +// Goldilocks Order +const GOLDILOCKS_FIELD_ORDER: u64 = 0xFFFFFFFF00000001; + +#[inline] +unsafe fn canonicalize(x: __m512i) -> __m512i { + let mask = _mm512_cmpge_epu64_mask(x, FIELD_ORDER); + _mm512_mask_sub_epi64(x, mask, x, FIELD_ORDER) +} + +#[inline] +unsafe fn add_no_double_overflow_64_64(x: __m512i, y: __m512i) -> __m512i { + let res_wrapped = _mm512_add_epi64(x, y); + let mask = _mm512_cmplt_epu64_mask(res_wrapped, y); // mask set if add overflowed + let res = _mm512_mask_sub_epi64(res_wrapped, mask, res_wrapped, FIELD_ORDER); + res +} + +#[inline] +unsafe fn sub_no_double_overflow_64_64(x: __m512i, y: __m512i) -> __m512i { + let mask = _mm512_cmplt_epu64_mask(x, y); // mask set if sub will underflow (x < y) + let res_wrapped = _mm512_sub_epi64(x, y); + let res = _mm512_mask_add_epi64(res_wrapped, mask, res_wrapped, FIELD_ORDER); + res +} + +#[inline] +unsafe fn add(x: __m512i, y: __m512i) -> __m512i { + add_no_double_overflow_64_64(x, canonicalize(y)) +} + +#[inline] +unsafe fn sub(x: __m512i, y: __m512i) -> __m512i { + sub_no_double_overflow_64_64(x, canonicalize(y)) +} + +#[inline] +unsafe fn neg(y: __m512i) -> __m512i { + _mm512_sub_epi64(FIELD_ORDER, canonicalize(y)) +} + +const LO_32_BITS_MASK: __mmask16 = unsafe { transmute(0b0101010101010101u16) }; + +#[inline] +unsafe fn mul64_64(x: __m512i, y: __m512i) -> (__m512i, __m512i) { + // We want to move the high 32 bits to the low position. The multiplication instruction ignores + // the high 32 bits, so it's ok to just duplicate it into the low position. This duplication can + // be done on port 5; bitshifts run on port 0, competing with multiplication. + // This instruction is only provided for 32-bit floats, not integers. Idk why Intel makes the + // distinction; the casts are free and it guarantees that the exact bit pattern is preserved. + // Using a swizzle instruction of the wrong domain (float vs int) does not increase latency + // since Haswell. 
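+    // With x = x_hi * 2^32 + x_lo and y = y_hi * 2^32 + y_lo, the full product is
+    //     x * y = mul_hh * 2^64 + (mul_hl + mul_lh) * 2^32 + mul_ll,
+    // and the additions below accumulate these partial products with 32-bit carries.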
+ let x_hi = _mm512_castps_si512(_mm512_movehdup_ps(_mm512_castsi512_ps(x))); + let y_hi = _mm512_castps_si512(_mm512_movehdup_ps(_mm512_castsi512_ps(y))); + + // All four pairwise multiplications + let mul_ll = _mm512_mul_epu32(x, y); + let mul_lh = _mm512_mul_epu32(x, y_hi); + let mul_hl = _mm512_mul_epu32(x_hi, y); + let mul_hh = _mm512_mul_epu32(x_hi, y_hi); + + // Bignum addition + // Extract high 32 bits of mul_ll and add to mul_hl. This cannot overflow. + let mul_ll_hi = _mm512_srli_epi64::<32>(mul_ll); + let t0 = _mm512_add_epi64(mul_hl, mul_ll_hi); + // Extract low 32 bits of t0 and add to mul_lh. Again, this cannot overflow. + // Also, extract high 32 bits of t0 and add to mul_hh. + let t0_lo = _mm512_and_si512(t0, EPSILON); + let t0_hi = _mm512_srli_epi64::<32>(t0); + let t1 = _mm512_add_epi64(mul_lh, t0_lo); + let t2 = _mm512_add_epi64(mul_hh, t0_hi); + // Lastly, extract the high 32 bits of t1 and add to t2. + let t1_hi = _mm512_srli_epi64::<32>(t1); + let res_hi = _mm512_add_epi64(t2, t1_hi); + + // Form res_lo by combining the low half of mul_ll with the low half of t1 (shifted into high + // position). + let t1_lo = _mm512_castps_si512(_mm512_moveldup_ps(_mm512_castsi512_ps(t1))); + let res_lo = _mm512_mask_blend_epi32(LO_32_BITS_MASK, t1_lo, mul_ll); + + (res_hi, res_lo) +} + +#[inline] +unsafe fn square64(x: __m512i) -> (__m512i, __m512i) { + // Get high 32 bits of x. See comment in mul64_64_s. + let x_hi = _mm512_castps_si512(_mm512_movehdup_ps(_mm512_castsi512_ps(x))); + + // All pairwise multiplications. + let mul_ll = _mm512_mul_epu32(x, x); + let mul_lh = _mm512_mul_epu32(x, x_hi); + let mul_hh = _mm512_mul_epu32(x_hi, x_hi); + + // Bignum addition, but mul_lh is shifted by 33 bits (not 32). + let mul_ll_hi = _mm512_srli_epi64::<33>(mul_ll); + let t0 = _mm512_add_epi64(mul_lh, mul_ll_hi); + let t0_hi = _mm512_srli_epi64::<31>(t0); + let res_hi = _mm512_add_epi64(mul_hh, t0_hi); + + // Form low result by adding the mul_ll and the low 31 bits of mul_lh (shifted to the high + // position). 
+ let mul_lh_lo = _mm512_slli_epi64::<33>(mul_lh); + let res_lo = _mm512_add_epi64(mul_ll, mul_lh_lo); + + (res_hi, res_lo) +} + +#[inline] +unsafe fn reduce128(x: (__m512i, __m512i)) -> __m512i { + let (hi0, lo0) = x; + let hi_hi0 = _mm512_srli_epi64::<32>(hi0); + let lo1 = sub_no_double_overflow_64_64(lo0, hi_hi0); + let t1 = _mm512_mul_epu32(hi0, EPSILON); + let lo2 = add_no_double_overflow_64_64(lo1, t1); + lo2 +} + +#[inline] +unsafe fn mul(x: __m512i, y: __m512i) -> __m512i { + reduce128(mul64_64(x, y)) +} + +#[inline] +unsafe fn square(x: __m512i) -> __m512i { + reduce128(square64(x)) +} + +#[inline] +unsafe fn interleave1(x: __m512i, y: __m512i) -> (__m512i, __m512i) { + let a = _mm512_unpacklo_epi64(x, y); + let b = _mm512_unpackhi_epi64(x, y); + (a, b) +} + +const INTERLEAVE2_IDX_A: __m512i = unsafe { + transmute([ + 0o00u64, 0o01u64, 0o10u64, 0o11u64, 0o04u64, 0o05u64, 0o14u64, 0o15u64, + ]) +}; +const INTERLEAVE2_IDX_B: __m512i = unsafe { + transmute([ + 0o02u64, 0o03u64, 0o12u64, 0o13u64, 0o06u64, 0o07u64, 0o16u64, 0o17u64, + ]) +}; + +#[inline] +unsafe fn interleave2(x: __m512i, y: __m512i) -> (__m512i, __m512i) { + let a = _mm512_permutex2var_epi64(x, INTERLEAVE2_IDX_A, y); + let b = _mm512_permutex2var_epi64(x, INTERLEAVE2_IDX_B, y); + (a, b) +} + +#[inline] +unsafe fn interleave4(x: __m512i, y: __m512i) -> (__m512i, __m512i) { + let a = _mm512_shuffle_i64x2::<0x44>(x, y); + let b = _mm512_shuffle_i64x2::<0xee>(x, y); + (a, b) +} + +#[cfg(test)] +mod tests { + use super::Avx512GoldilocksField; + use crate::ff::*; + use crate::field_gl::{Fr, FrRepr as GoldilocksField}; + use std::time::Instant; + + fn test_vals_a() -> [GoldilocksField; 8] { + [ + GoldilocksField([14479013849828404771u64]), + GoldilocksField([9087029921428221768u64]), + GoldilocksField([2441288194761790662u64]), + GoldilocksField([5646033492608483824u64]), + GoldilocksField([2779181197214900072u64]), + GoldilocksField([2989742820063487116u64]), + GoldilocksField([727880025589250743u64]), + GoldilocksField([3803926346107752679u64]), + ] + } + fn test_vals_b() -> [GoldilocksField; 8] { + [ + GoldilocksField([17891926589593242302u64]), + GoldilocksField([11009798273260028228u64]), + GoldilocksField([2028722748960791447u64]), + GoldilocksField([7929433601095175579u64]), + GoldilocksField([6632528436085461172u64]), + GoldilocksField([2145438710786785567u64]), + GoldilocksField([11821483668392863016u64]), + GoldilocksField([15638272883309521929u64]), + ] + } + + #[test] + fn test_add1() { + let a_arr = test_vals_a(); + let b_arr = test_vals_b(); + let start = Instant::now(); + let packed_a = *Avx512GoldilocksField::from_slice(&a_arr); + let packed_b = *Avx512GoldilocksField::from_slice(&b_arr); + let packed_res = packed_a + packed_b; + let arr_res = packed_res.as_slice(); + let avx512_duration = start.elapsed(); + let start = Instant::now(); + let expected = a_arr + .iter() + .zip(b_arr) + .map(|(&a, b)| Fr::from_repr(a).unwrap() + Fr::from_repr(b).unwrap()); + let expected_values: Vec = expected.collect(); + // println!("expected values: {:?}", expected_values); + let non_accelerated_duration = start.elapsed(); + for (exp, &res) in expected_values.iter().zip(arr_res) { + assert_eq!(res, exp.into_repr()); + } + + println!("test_add_AVX512_accelerated time: {:?}", avx512_duration); + println!( + "test_add_Non_accelerated time: {:?}", + non_accelerated_duration + ); + } + + #[test] + fn test_mul() { + let a_arr = test_vals_a(); + let b_arr = test_vals_b(); + let start = Instant::now(); + let packed_a = 
*Avx512GoldilocksField::from_slice(&a_arr); + let packed_b = *Avx512GoldilocksField::from_slice(&b_arr); + let packed_res = packed_a * packed_b; + let arr_res = packed_res.as_slice(); + let avx512_duration = start.elapsed(); + + let start = Instant::now(); + let expected = a_arr + .iter() + .zip(b_arr) + .map(|(&a, b)| Fr::from_repr(a).unwrap() * Fr::from_repr(b).unwrap()); + let expected_values: Vec = expected.collect(); + let non_accelerated_duration = start.elapsed(); + // println!("expected values: {:?}", expected_values); + + for (exp, &res) in expected_values.iter().zip(arr_res) { + assert_eq!(res, exp.into_repr()); + } + + println!("test_mul_AVX512_accelerated time: {:?}", avx512_duration); + println!( + "test_mul_Non_accelerated time: {:?}", + non_accelerated_duration + ); + } + + #[test] + fn test_div() { + let a_arr = test_vals_a(); + let start = Instant::now(); + let packed_a = *Avx512GoldilocksField::from_slice(&a_arr); + let packed_res = packed_a / GoldilocksField([7929433601095175579u64]); + let arr_res = packed_res.as_slice(); + let avx512_duration = start.elapsed(); + // println!("arr_res: {:?}", arr_res); + + let start = Instant::now(); + let expected = a_arr.iter().map(|&a| { + Fr::from_repr(a).unwrap() + / Fr::from_repr(GoldilocksField([7929433601095175579u64])).unwrap() + }); + let expected_values: Vec = expected.collect(); + let non_accelerated_duration = start.elapsed(); + // println!("expected values: {:?}", expected_values); + + for (exp, &res) in expected_values.iter().zip(arr_res) { + assert_eq!(res, exp.into_repr()); + } + + println!("test_div_AVX512_accelerated time: {:?}", avx512_duration); + println!( + "test_div_Non_accelerated time: {:?}", + non_accelerated_duration + ); + } + + #[test] + fn test_square() { + let a_arr = test_vals_a(); + let start = Instant::now(); + let packed_a = *Avx512GoldilocksField::from_slice(&a_arr); + let packed_res = packed_a.square(); + let arr_res = packed_res.as_slice(); + let avx512_duration = start.elapsed(); + // println!("arr_res: {:?}", arr_res); + + let start = Instant::now(); + let mut expected_values = Vec::new(); + for &a in &a_arr { + match Fr::from_repr(a) { + Ok(mut fr) => { + fr.square(); + expected_values.push(fr); + } + Err(_) => { + continue; + } + } + } + let non_accelerated_duration = start.elapsed(); + // println!("expected values: {:?}", expected_values); + for (exp, &res) in expected_values.iter().zip(arr_res) { + assert_eq!(res, exp.into_repr()); + } + println!("test_square_AVX512_accelerated time: {:?}", avx512_duration); + println!( + "test_square_Non_accelerated time: {:?}", + non_accelerated_duration + ); + } + + #[test] + fn test_neg() { + let a_arr = test_vals_a(); + let start = Instant::now(); + let packed_a = *Avx512GoldilocksField::from_slice(&a_arr); + let packed_res = -packed_a; + let arr_res = packed_res.as_slice(); + let avx512_duration = start.elapsed(); + // println!("arr_res: {:?}", arr_res); + + let start = Instant::now(); + let expected = a_arr.iter().map(|&a| -Fr::from_repr(a).unwrap()); + let expected_values: Vec = expected.collect(); + let non_accelerated_duration = start.elapsed(); + // println!("expected values: {:?}", expected_values); + + for (exp, &res) in expected_values.iter().zip(arr_res) { + assert_eq!(res, exp.into_repr()); + } + + println!("test_neg_AVX512_accelerated time: {:?}", avx512_duration); + println!( + "test_neg_Non_accelerated time: {:?}", + non_accelerated_duration + ); + } + + #[test] + fn test_sub() { + let a_arr = test_vals_a(); + let b_arr = test_vals_b(); + 
let start = Instant::now(); + let packed_a = *Avx512GoldilocksField::from_slice(&a_arr); + let packed_b = *Avx512GoldilocksField::from_slice(&b_arr); + let packed_res = packed_a - packed_b; + let arr_res = packed_res.as_slice(); + let avx512_duration = start.elapsed(); + // println!("arr_res: {:?}", arr_res); + + let start = Instant::now(); + let expected = a_arr + .iter() + .zip(b_arr) + .map(|(&a, b)| Fr::from_repr(a).unwrap() - Fr::from_repr(b).unwrap()); + let expected_values: Vec = expected.collect(); + let non_accelerated_duration = start.elapsed(); + // println!("expected values: {:?}", expected_values); + + for (exp, &res) in expected_values.iter().zip(arr_res) { + assert_eq!(res, exp.into_repr()); + } + + println!("test_sub_AVX512_accelerated time: {:?}", avx512_duration); + println!( + "test_sub_Non_accelerated time: {:?}", + non_accelerated_duration + ); + } + + #[test] + fn test_interleave_is_involution() { + let a_arr = test_vals_a(); + let b_arr = test_vals_b(); + + let packed_a = *Avx512GoldilocksField::from_slice(&a_arr); + let packed_b = *Avx512GoldilocksField::from_slice(&b_arr); + { + // Interleave, then deinterleave. + let (x, y) = packed_a.interleave(packed_b, 1); + let (res_a, res_b) = x.interleave(y, 1); + assert_eq!(res_a.as_slice(), a_arr); + assert_eq!(res_b.as_slice(), b_arr); + } + { + let (x, y) = packed_a.interleave(packed_b, 2); + let (res_a, res_b) = x.interleave(y, 2); + assert_eq!(res_a.as_slice(), a_arr); + assert_eq!(res_b.as_slice(), b_arr); + } + { + let (x, y) = packed_a.interleave(packed_b, 4); + let (res_a, res_b) = x.interleave(y, 4); + assert_eq!(res_a.as_slice(), a_arr); + assert_eq!(res_b.as_slice(), b_arr); + } + { + let (x, y) = packed_a.interleave(packed_b, 8); + let (res_a, res_b) = x.interleave(y, 8); + assert_eq!(res_a.as_slice(), a_arr); + assert_eq!(res_b.as_slice(), b_arr); + } + } + + #[test] + fn test_interleave() { + let in_a: [GoldilocksField; 8] = [ + GoldilocksField([0u64]), + GoldilocksField([1u64]), + GoldilocksField([2u64]), + GoldilocksField([3u64]), + GoldilocksField([4u64]), + GoldilocksField([5u64]), + GoldilocksField([6u64]), + GoldilocksField([7u64]), + ]; + let in_b: [GoldilocksField; 8] = [ + GoldilocksField([10u64]), + GoldilocksField([11u64]), + GoldilocksField([12u64]), + GoldilocksField([13u64]), + GoldilocksField([14u64]), + GoldilocksField([15u64]), + GoldilocksField([16u64]), + GoldilocksField([17u64]), + ]; + let int1_a: [GoldilocksField; 8] = [ + GoldilocksField([0u64]), + GoldilocksField([10u64]), + GoldilocksField([2u64]), + GoldilocksField([12u64]), + GoldilocksField([4u64]), + GoldilocksField([14u64]), + GoldilocksField([6u64]), + GoldilocksField([16u64]), + ]; + let int1_b: [GoldilocksField; 8] = [ + GoldilocksField([1u64]), + GoldilocksField([11u64]), + GoldilocksField([3u64]), + GoldilocksField([13u64]), + GoldilocksField([5u64]), + GoldilocksField([15u64]), + GoldilocksField([7u64]), + GoldilocksField([17u64]), + ]; + let int2_a: [GoldilocksField; 8] = [ + GoldilocksField([0u64]), + GoldilocksField([1u64]), + GoldilocksField([10u64]), + GoldilocksField([11u64]), + GoldilocksField([4u64]), + GoldilocksField([5u64]), + GoldilocksField([14u64]), + GoldilocksField([15u64]), + ]; + let int2_b: [GoldilocksField; 8] = [ + GoldilocksField([2u64]), + GoldilocksField([3u64]), + GoldilocksField([12u64]), + GoldilocksField([13u64]), + GoldilocksField([6u64]), + GoldilocksField([7u64]), + GoldilocksField([16u64]), + GoldilocksField([17u64]), + ]; + let int4_a: [GoldilocksField; 8] = [ + GoldilocksField([0u64]), + 
GoldilocksField([1u64]), + GoldilocksField([2u64]), + GoldilocksField([3u64]), + GoldilocksField([10u64]), + GoldilocksField([11u64]), + GoldilocksField([12u64]), + GoldilocksField([13u64]), + ]; + let int4_b: [GoldilocksField; 8] = [ + GoldilocksField([4u64]), + GoldilocksField([5u64]), + GoldilocksField([6u64]), + GoldilocksField([7u64]), + GoldilocksField([14u64]), + GoldilocksField([15u64]), + GoldilocksField([16u64]), + GoldilocksField([17u64]), + ]; + + let packed_a = *Avx512GoldilocksField::from_slice(&in_a); + let packed_b = *Avx512GoldilocksField::from_slice(&in_b); + { + let (x1, y1) = packed_a.interleave(packed_b, 1); + assert_eq!(x1.as_slice(), int1_a); + assert_eq!(y1.as_slice(), int1_b); + } + { + let (x2, y2) = packed_a.interleave(packed_b, 2); + assert_eq!(x2.as_slice(), int2_a); + assert_eq!(y2.as_slice(), int2_b); + } + { + let (x4, y4) = packed_a.interleave(packed_b, 4); + assert_eq!(x4.as_slice(), int4_a); + assert_eq!(y4.as_slice(), int4_b); + } + { + let (x8, y8) = packed_a.interleave(packed_b, 8); + assert_eq!(x8.as_slice(), in_a); + assert_eq!(y8.as_slice(), in_b); + } + } +} diff --git a/algebraic/src/arch/x86_64/mod.rs b/algebraic/src/arch/x86_64/mod.rs new file mode 100644 index 00000000..2bdff650 --- /dev/null +++ b/algebraic/src/arch/x86_64/mod.rs @@ -0,0 +1,20 @@ +#[cfg(all( + target_feature = "avx2", + not(all( + target_feature = "avx512bw", + target_feature = "avx512cd", + target_feature = "avx512dq", + target_feature = "avx512f", + target_feature = "avx512vl" + )) +))] +pub mod avx2_field_gl; + +#[cfg(all( + target_feature = "avx512bw", + target_feature = "avx512cd", + target_feature = "avx512dq", + target_feature = "avx512f", + target_feature = "avx512vl" +))] +pub mod avx512_field_gl; diff --git a/algebraic/src/lib.rs b/algebraic/src/lib.rs index be0b90b3..e48706a0 100644 --- a/algebraic/src/lib.rs +++ b/algebraic/src/lib.rs @@ -1,4 +1,5 @@ #![allow(clippy::unit_arg)] +#![feature(stdsimd)] #[macro_use] extern crate serde; @@ -11,6 +12,8 @@ extern crate num_bigint; extern crate num_traits; extern crate rand; +pub mod arch; + pub mod circom_circuit; pub mod errors; pub mod field_gl; @@ -26,6 +29,7 @@ pub use franklin_crypto::bellman as bellman_ce; #[cfg(test)] mod field_gl_test; +// mod packed; #[cfg(target_arch = "wasm32")] extern crate wasm_bindgen; diff --git a/algebraic/src/packable.rs b/algebraic/src/packable.rs new file mode 100644 index 00000000..c6f8cd36 --- /dev/null +++ b/algebraic/src/packable.rs @@ -0,0 +1,40 @@ +use crate::packed::PackedField; +use crate::types::Field; + +/// Points us to the default packing for a particular field. There may me multiple choices of +/// PackedField for a particular Field (e.g. every Field is also a PackedField), but this is the +/// recommended one. The recommended packing varies by target_arch and target_feature. 
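+///
+/// Illustrative generic use (a sketch only; `sum_packed` is a hypothetical helper,
+/// and it assumes `F::Packing` packs `F` itself and that the slice length is a
+/// multiple of the packing width):
+///
+/// ```ignore
+/// use crate::packable::Packable;
+/// use crate::packed::PackedField;
+///
+/// fn sum_packed<F: Packable>(values: &[F]) -> F::Packing
+/// where
+///     F::Packing: PackedField<Scalar = F>,
+/// {
+///     // Reinterpret the scalar slice as packed vectors and accumulate lane-wise;
+///     // the caller can fold the lanes of the returned vector afterwards.
+///     <F::Packing as PackedField>::pack_slice(values)
+///         .iter()
+///         .fold(F::Packing::ZEROS, |acc, &x| acc + x)
+/// }
+/// ```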
+pub trait Packable: Field { + type Packing: PackedField; +} + +impl Packable for F { + default type Packing = Self; +} + +#[cfg(all( + target_arch = "x86_64", + target_feature = "avx2", + not(all( + target_feature = "avx512bw", + target_feature = "avx512cd", + target_feature = "avx512dq", + target_feature = "avx512f", + target_feature = "avx512vl" + )) +))] +impl Packable for crate::goldilocks_field::GoldilocksField { + type Packing = crate::arch::x86_64::avx2_goldilocks_field::Avx2GoldilocksField; +} + +#[cfg(all( + target_arch = "x86_64", + target_feature = "avx512bw", + target_feature = "avx512cd", + target_feature = "avx512dq", + target_feature = "avx512f", + target_feature = "avx512vl" +))] +impl Packable for crate::goldilocks_field::GoldilocksField { + type Packing = crate::arch::x86_64::avx512_goldilocks_field::Avx512GoldilocksField; +} diff --git a/algebraic/src/packed.rs b/algebraic/src/packed.rs new file mode 100644 index 00000000..0236962d --- /dev/null +++ b/algebraic/src/packed.rs @@ -0,0 +1,127 @@ +use core::fmt::Debug; +use core::iter::{Product, Sum}; +use core::ops::{Add, AddAssign, Div, Mul, MulAssign, Neg, Sub, SubAssign}; +use core::slice; + +// use crate::ops::Square; +use crate::ff::*; + +/// # Safety +/// - WIDTH is assumed to be a power of 2. +/// - If P implements PackedField then P must be castable to/from [P::Scalar; P::WIDTH] without UB. +pub unsafe trait PackedField: + 'static + + Add + + Add + + AddAssign + + AddAssign + + Copy + + Debug + + Default + + From + // TODO: Implement packed / packed division + + Div + + Mul + + Mul + + MulAssign + + MulAssign + // + Square + + Neg + + Product + + Send + + Sub + + Sub + + SubAssign + + SubAssign + + Sum + + Sync +where + Self::Scalar: Add, + Self::Scalar: Mul, + Self::Scalar: Sub, +{ + type Scalar: Field; + + const WIDTH: usize; + const ZEROS: Self; + const ONES: Self; + + fn from_slice(slice: &[Self::Scalar]) -> &Self; + fn from_slice_mut(slice: &mut [Self::Scalar]) -> &mut Self; + fn as_slice(&self) -> &[Self::Scalar]; + fn as_slice_mut(&mut self) -> &mut [Self::Scalar]; + + /// Take interpret two vectors as chunks of block_len elements. Unpack and interleave those + /// chunks. This is best seen with an example. If we have: + /// A = [x0, y0, x1, y1], + /// B = [x2, y2, x3, y3], + /// then + /// interleave(A, B, 1) = ([x0, x2, x1, x3], [y0, y2, y1, y3]). + /// Pairs that were adjacent in the input are at corresponding positions in the output. + /// r lets us set the size of chunks we're interleaving. If we set block_len = 2, then for + /// A = [x0, x1, y0, y1], + /// B = [x2, x3, y2, y3], + /// we obtain + /// interleave(A, B, block_len) = ([x0, x1, x2, x3], [y0, y1, y2, y3]). + /// We can also think about this as stacking the vectors, dividing them into 2x2 matrices, and + /// transposing those matrices. + /// When block_len = WIDTH, this operation is a no-op. block_len must divide WIDTH. Since + /// WIDTH is specified to be a power of 2, block_len must also be a power of 2. It cannot be 0 + /// and it cannot be > WIDTH. 
+ fn interleave(&self, other: Self, block_len: usize) -> (Self, Self); + + fn pack_slice(buf: &[Self::Scalar]) -> &[Self] { + assert!( + buf.len() % Self::WIDTH == 0, + "Slice length (got {}) must be a multiple of packed field width ({}).", + buf.len(), + Self::WIDTH + ); + let buf_ptr = buf.as_ptr().cast::(); + let n = buf.len() / Self::WIDTH; + unsafe { slice::from_raw_parts(buf_ptr, n) } + } + fn pack_slice_mut(buf: &mut [Self::Scalar]) -> &mut [Self] { + assert!( + buf.len() % Self::WIDTH == 0, + "Slice length (got {}) must be a multiple of packed field width ({}).", + buf.len(), + Self::WIDTH + ); + let buf_ptr = buf.as_mut_ptr().cast::(); + let n = buf.len() / Self::WIDTH; + unsafe { slice::from_raw_parts_mut(buf_ptr, n) } + } + + fn doubles(&self) -> Self { + *self * Self::Scalar::TWO + } +} + +unsafe impl PackedField for F { + type Scalar = Self; + + const WIDTH: usize = 1; + const ZEROS: Self = F::ZERO; + const ONES: Self = F::ONE; + + fn from_slice(slice: &[Self::Scalar]) -> &Self { + &slice[0] + } + fn from_slice_mut(slice: &mut [Self::Scalar]) -> &mut Self { + &mut slice[0] + } + fn as_slice(&self) -> &[Self::Scalar] { + slice::from_ref(self) + } + fn as_slice_mut(&mut self) -> &mut [Self::Scalar] { + slice::from_mut(self) + } + + fn interleave(&self, other: Self, block_len: usize) -> (Self, Self) { + match block_len { + 1 => (*self, other), + _ => panic!("unsupported block length"), + } + } +}