Skip to content

Commit

Permalink
arithmetic: Dispatch x86_64 bn_mul_mont in Rust (Merge BoringSSL 7cb8df5
Browse files Browse the repository at this point in the history
)
  • Loading branch information
briansmith committed Jan 19, 2025
2 parents e8ffb44 + 7cb8df5 commit a3b7685
Show file tree
Hide file tree
Showing 10 changed files with 219 additions and 100 deletions.
4 changes: 4 additions & 0 deletions build.rs
Original file line number Diff line number Diff line change
Expand Up @@ -887,10 +887,14 @@ fn prefix_all_symbols(pp: char, prefix_prefix: &str, prefix: &str) -> String {
"bn_gather5",
"bn_mul_mont",
"bn_mul_mont_gather5",
"bn_mul_mont_nohw",
"bn_mul4x_mont",
"bn_mulx4x_mont",
"bn_neg_inv_mod_r_u64",
"bn_power5",
"bn_scatter5",
"bn_sqr8x_internal",
"bn_sqr8x_mont",
"bn_sqrx8x_internal",
"bsaes_ctr32_encrypt_blocks",
"bssl_constant_time_test_conditional_memcpy",
Expand Down
53 changes: 16 additions & 37 deletions crypto/fipsmodule/bn/asm/x86_64-mont.pl
Original file line number Diff line number Diff line change
Expand Up @@ -65,7 +65,7 @@
# output, so this isn't useful anyway.
$addx = 1;

# void bn_mul_mont(
# void bn_mul_mont_nohw(
$rp="%rdi"; # BN_ULONG *rp,
$ap="%rsi"; # const BN_ULONG *ap,
$bp="%rdx"; # const BN_ULONG *bp,
Expand All @@ -87,33 +87,15 @@

.extern OPENSSL_ia32cap_P

.globl bn_mul_mont
.type bn_mul_mont,\@function,6
.globl bn_mul_mont_nohw
.type bn_mul_mont_nohw,\@function,6
.align 16
bn_mul_mont:
bn_mul_mont_nohw:
.cfi_startproc
_CET_ENDBR
mov ${num}d,${num}d
mov %rsp,%rax
.cfi_def_cfa_register %rax
test \$3,${num}d
jnz .Lmul_enter
cmp \$8,${num}d
jb .Lmul_enter
___
$code.=<<___ if ($addx);
leaq OPENSSL_ia32cap_P(%rip),%r11
mov 8(%r11),%r11d
___
$code.=<<___;
cmp $ap,$bp
jne .Lmul4x_enter
test \$7,${num}d
jz .Lsqr8x_enter
jmp .Lmul4x_enter

.align 16
.Lmul_enter:
push %rbx
.cfi_push %rbx
push %rbp
Expand Down Expand Up @@ -348,27 +330,21 @@
.Lmul_epilogue:
ret
.cfi_endproc
.size bn_mul_mont,.-bn_mul_mont
.size bn_mul_mont_nohw,.-bn_mul_mont_nohw
___
{{{
my @A=("%r10","%r11");
my @N=("%r13","%rdi");
$code.=<<___;
.globl bn_mul4x_mont
.type bn_mul4x_mont,\@function,6
.align 16
bn_mul4x_mont:
.cfi_startproc
_CET_ENDBR
mov ${num}d,${num}d
mov %rsp,%rax
.cfi_def_cfa_register %rax
.Lmul4x_enter:
___
$code.=<<___ if ($addx);
and \$0x80100,%r11d
cmp \$0x80100,%r11d
je .Lmulx4x_enter
___
$code.=<<___;
push %rbx
.cfi_push %rbx
push %rbp
Expand Down Expand Up @@ -825,13 +801,15 @@
$code.=<<___;
.extern bn_sqr8x_internal # see x86_64-mont5 module

.globl bn_sqr8x_mont
.type bn_sqr8x_mont,\@function,6
.align 32
bn_sqr8x_mont:
.cfi_startproc
_CET_ENDBR
mov ${num}d,${num}d
mov %rsp,%rax
.cfi_def_cfa_register %rax
.Lsqr8x_enter:
push %rbx
.cfi_push %rbx
push %rbp
Expand Down Expand Up @@ -1024,13 +1002,14 @@
my $bp="%rdx"; # original value

$code.=<<___;
.globl bn_mulx4x_mont
.type bn_mulx4x_mont,\@function,6
.align 32
bn_mulx4x_mont:
.cfi_startproc
_CET_ENDBR
mov %rsp,%rax
.cfi_def_cfa_register %rax
.Lmulx4x_enter:
push %rbx
.cfi_push %rbx
push %rbp
Expand Down Expand Up @@ -1535,9 +1514,9 @@

.section .pdata
.align 4
.rva .LSEH_begin_bn_mul_mont
.rva .LSEH_end_bn_mul_mont
.rva .LSEH_info_bn_mul_mont
.rva .LSEH_begin_bn_mul_mont_nohw
.rva .LSEH_end_bn_mul_mont_nohw
.rva .LSEH_info_bn_mul_mont_nohw

.rva .LSEH_begin_bn_mul4x_mont
.rva .LSEH_end_bn_mul4x_mont
Expand All @@ -1555,7 +1534,7 @@
$code.=<<___;
.section .xdata
.align 8
.LSEH_info_bn_mul_mont:
.LSEH_info_bn_mul_mont_nohw:
.byte 9,0,0,0
.rva mul_handler
.rva .Lmul_body,.Lmul_epilogue # HandlerData[]
Expand Down
28 changes: 23 additions & 5 deletions crypto/fipsmodule/bn/internal.h
Original file line number Diff line number Diff line change
Expand Up @@ -166,22 +166,40 @@ typedef crypto_word_t BN_ULONG;
#endif


// If at least one of |ap| or |bp| is fully reduced, |rp| will be fully reduced.
// If neither is fully-reduced, the output may not be either.
//
// This function allocates |num| words on the stack, so |num| should be at most
// |BN_MONTGOMERY_MAX_WORDS|.
//
// TODO(davidben): The x86_64 implementation expects a 32-bit input and masks
// off upper bits. The aarch64 implementation expects a 64-bit input and does
// not. |size_t| is the safer option but not strictly correct for x86_64. But
// the |BN_MONTGOMERY_MAX_WORDS| bound makes this moot.
//
// See also discussion in |ToWord| in abi_test.h for notes on smaller-than-word
// inputs.
//
// |num| must be at least 4, at least on x86.
//
// In other forks, |bn_mul_mont| returns an |int| indicating whether it
// actually did the multiplication. All our implementations always do the
// multiplication, and forcing callers to deal with the possibility of it
// failing just leads to further problems.
//
// In other forks, |bn_mul_mont|'s |num| argument has type |int| but it is
// implicitly treated as a |size_t|; when |int| is smaller than |size_t|
// then the |movq 48(%rsp),%r9| done by x86_64-xlate.pl implicitly does the
// conversion.
OPENSSL_STATIC_ASSERT(sizeof(int) == sizeof(size_t) ||
(sizeof(int) == 4 && sizeof(size_t) == 8),
"int and size_t ABI mismatch");
#if !defined(OPENSSL_X86_64)
void bn_mul_mont(BN_ULONG *rp, const BN_ULONG *ap, const BN_ULONG *bp,
const BN_ULONG *np, const BN_ULONG *n0, size_t num);
#else
void bn_mul_mont_nohw(BN_ULONG *rp, const BN_ULONG *ap, const BN_ULONG *bp,
const BN_ULONG *np, const BN_ULONG *n0, size_t num);
static inline void bn_mul_mont(BN_ULONG *rp, const BN_ULONG *ap, const BN_ULONG *bp,
const BN_ULONG *np, const BN_ULONG *n0, size_t num) {
bn_mul_mont_nohw(rp, ap, bp, np, n0, num);
}
#endif

static inline void bn_umult_lohi(BN_ULONG *low_out, BN_ULONG *high_out,
BN_ULONG a, BN_ULONG b) {
Expand Down
5 changes: 5 additions & 0 deletions src/arithmetic.rs
Original file line number Diff line number Diff line change
Expand Up @@ -12,6 +12,9 @@
// OF CONTRACT, NEGLIGENCE OR OTHER TORTIOUS ACTION, ARISING OUT OF OR IN
// CONNECTION WITH THE USE OR PERFORMANCE OF THIS SOFTWARE.

#[macro_use]
mod ffi;

mod constant;

#[cfg(feature = "alloc")]
Expand All @@ -22,6 +25,8 @@ pub mod montgomery;

mod n0;

pub(crate) use self::ffi::BIGINT_MODULUS_MIN_LIMBS;

#[allow(dead_code)]
const BIGINT_MODULUS_MAX_LIMBS: usize = 8192 / crate::limb::LIMB_BITS;

Expand Down
9 changes: 3 additions & 6 deletions src/arithmetic/bigint.rs
Original file line number Diff line number Diff line change
Expand Up @@ -100,8 +100,7 @@ fn from_montgomery_amm<M>(limbs: BoxedLimbs<M>, m: &Modulus<M>) -> Elem<M, Unenc
one[0] = 1;
let one = &one[..m.limbs().len()];
limbs_mul_mont(
InOut::InPlace(&mut limbs),
one,
InOut::InPlace(&mut limbs, one),
m.limbs(),
m.n0(),
m.cpu_features(),
Expand Down Expand Up @@ -151,8 +150,7 @@ where
(AF, BF): ProductEncoding,
{
limbs_mul_mont(
InOut::InPlace(&mut b.limbs),
&a.limbs,
InOut::InPlace(&mut b.limbs, &a.limbs),
m.limbs(),
m.n0(),
m.cpu_features(),
Expand Down Expand Up @@ -480,8 +478,7 @@ pub fn elem_exp_consttime<M>(
let src2 = entry(previous, src2, num_limbs);
let dst = entry_mut(rest, 0, num_limbs);
limbs_mul_mont(
InOut::Disjoint(dst, src1),
src2,
InOut::Disjoint(dst, src1, src2),
m.limbs(),
m.n0(),
m.cpu_features(),
Expand Down
7 changes: 1 addition & 6 deletions src/arithmetic/bigint/modulus.rs
Original file line number Diff line number Diff line change
Expand Up @@ -21,12 +21,7 @@ use crate::{
};
use core::marker::PhantomData;

/// The x86 implementation of `bn_mul_mont`, at least, requires at least 4
/// limbs. For a long time we have required 4 limbs for all targets, though
/// this may be unnecessary. TODO: Replace this with
/// `n.len() < 256 / LIMB_BITS` so that 32-bit and 64-bit platforms behave the
/// same.
pub const MODULUS_MIN_LIMBS: usize = 4;
pub const MODULUS_MIN_LIMBS: usize = super::super::BIGINT_MODULUS_MIN_LIMBS;

pub const MODULUS_MAX_LIMBS: usize = super::super::BIGINT_MODULUS_MAX_LIMBS;

Expand Down
127 changes: 127 additions & 0 deletions src/arithmetic/ffi.rs
Original file line number Diff line number Diff line change
@@ -0,0 +1,127 @@
// Copyright 2024-2025 Brian Smith.
//
// Permission to use, copy, modify, and/or distribute this software for any
// purpose with or without fee is hereby granted, provided that the above
// copyright notice and this permission notice appear in all copies.
//
// THE SOFTWARE IS PROVIDED "AS IS" AND THE AUTHORS DISCLAIM ALL WARRANTIES
// WITH REGARD TO THIS SOFTWARE INCLUDING ALL IMPLIED WARRANTIES OF
// MERCHANTABILITY AND FITNESS. IN NO EVENT SHALL THE AUTHORS BE LIABLE FOR ANY
// SPECIAL, DIRECT, INDIRECT, OR CONSEQUENTIAL DAMAGES OR ANY DAMAGES
// WHATSOEVER RESULTING FROM LOSS OF USE, DATA OR PROFITS, WHETHER IN AN ACTION
// OF CONTRACT, NEGLIGENCE OR OTHER TORTIOUS ACTION, ARISING OUT OF OR IN
// CONNECTION WITH THE USE OR PERFORMANCE OF THIS SOFTWARE.

use super::{n0::N0, InOut};
use crate::{c, limb::Limb};

// The minimum number of limbs a modulus must have before it is passed to a
// `bn_mul_mont`-style function. `bn_mul_mont_ffi` below asserts that
// `n.len()` is at least this value, and statically asserts that this value is
// at least 4, because the x86 implementation of `bn_mul_mont`, at least,
// requires at least 4 limbs.
// TODO: Replace this with `n.len() < 256 / LIMB_BITS` so that 32-bit and
// 64-bit platforms behave the same.
pub(crate) const BIGINT_MODULUS_MIN_LIMBS: usize = 4;

/// Declares the foreign function `f` with the `bn_mul_mont`-style signature
/// (through `prefixed_extern!`) and calls it by way of
/// `crate::arithmetic::ffi::bn_mul_mont_ffi`, which asserts the length
/// requirements at run time before making the call.
///
/// Usage: `bn_mul_mont_ffi!(in_out, n, n0, cpu, unsafe { (chunk_len, T) => f })`.
///
/// `unsafe { (chunk_len, T) => f }` means it is safe to call `f` if
/// `n.len() >= chunk_len && n.len() % chunk_len == 0`, the slice(s) in
/// `in_out` have the same length as `n`, and we have constructed a value of
/// type `T`.
///
/// The expansion names `N0` and `prefixed_extern!` without importing them, so
/// both must be resolvable wherever this macro is invoked.
macro_rules! bn_mul_mont_ffi {
( $in_out:expr, $n:expr, $n0:expr, $cpu:expr, unsafe { ($CHUNK:expr, $Cpu:ty) => $f:ident }) => {{
// `c` and `Limb` are imported here so that the declaration below does not
// depend on the invocation site having them in scope.
use crate::{c, limb::Limb};
prefixed_extern! {
// `r` and/or `a` and/or `b` may alias.
// XXX: BoringSSL declares these functions to return `int`.
fn $f(
r: *mut Limb,
a: *const Limb,
b: *const Limb,
n: *const Limb,
n0: &N0,
len: c::size_t,
);
}
// SAFETY: By writing `unsafe { ... }` in the invocation, the caller vouches
// that `f` is safe to call under the conditions documented on this macro;
// `bn_mul_mont_ffi` asserts the length conditions before it calls `f`.
unsafe {
crate::arithmetic::ffi::bn_mul_mont_ffi::<$Cpu, { $CHUNK }>($in_out, $n, $n0, $cpu, $f)
}
}};
}

/// Validates the operand lengths and then invokes the `bn_mul_mont`-style
/// foreign function `f` on raw pointers derived from `in_out` and `n`.
///
/// The `(r, a, b)` pointers handed to `f` depend on the `in_out` variant:
///
/// * `SquareInPlace(r)`: `(r, r, r)`.
/// * `InPlace(r, a)`: `(r, r, a)`.
/// * `Disjoint(r, a, b)`: `(r, a, b)`.
///
/// `cpu` is not passed to `f`; it is only required so that the caller must
/// have constructed a value of type `Cpu`.
///
/// # Panics
///
/// Panics if `CHUNK` is zero, if `n.len()` is less than
/// `BIGINT_MODULUS_MIN_LIMBS` or less than `CHUNK`, if `n.len()` is not a
/// multiple of `CHUNK`, or if any slice in `in_out` differs in length from
/// `n`.
///
/// # Safety
///
/// It must be safe to call `f` with operands of `n.len()` limbs whenever all
/// of the checks listed under "Panics" pass and a value of type `Cpu` exists.
#[inline]
pub(super) unsafe fn bn_mul_mont_ffi<Cpu, const CHUNK: usize>(
    in_out: InOut<[Limb]>,
    n: &[Limb],
    n0: &N0,
    cpu: Cpu,
    f: unsafe extern "C" fn(
        r: *mut Limb,
        a: *const Limb,
        b: *const Limb,
        n: *const Limb,
        n0: &N0,
        len: c::size_t,
    ),
) {
    assert!(CHUNK > 0);

    /// Compile-time guard on the crate-wide minimum: the x86 implementation
    /// of `bn_mul_mont`, at least, requires at least 4 limbs. For a long time
    /// 4 limbs have been required for all targets, though this may be
    /// unnecessary.
    const _BIGINT_MODULUS_MIN_LIMBS_AT_LEAST_4: () = assert!(BIGINT_MODULUS_MIN_LIMBS >= 4);
    assert!(n.len() >= BIGINT_MODULUS_MIN_LIMBS);
    assert!(n.len() >= CHUNK);
    assert!(n.len() % CHUNK == 0);

    // Every operand must be exactly as long as the modulus.
    let limb_count = n.len();
    let (out, lhs, rhs) = match in_out {
        InOut::SquareInPlace(acc) => {
            assert_eq!(acc.len(), limb_count);
            (acc.as_mut_ptr(), acc.as_ptr(), acc.as_ptr())
        }
        InOut::InPlace(acc, other) => {
            assert_eq!(acc.len(), limb_count);
            assert_eq!(other.len(), limb_count);
            (acc.as_mut_ptr(), acc.as_ptr(), other.as_ptr())
        }
        InOut::Disjoint(dst, src1, src2) => {
            assert_eq!(dst.len(), limb_count);
            assert_eq!(src1.len(), limb_count);
            assert_eq!(src2.len(), limb_count);
            (dst.as_mut_ptr(), src1.as_ptr(), src2.as_ptr())
        }
    };
    let modulus = n.as_ptr();

    // Type-level evidence only; the value is never handed to `f`.
    let _: Cpu = cpu;

    // SAFETY: All length checks above passed and the caller supplied a `Cpu`
    // value, which is the condition under which the caller promised `f` is
    // safe to call.
    unsafe { f(out, lhs, rhs, modulus, n0, limb_count) };
}

// `bn_sqr8x_mont` does not share the `bn_mul_mont` signature (it takes a
// capability flag where the others take `b`, and it returns a result), so it
// gets its own wrapper instead of going through `bn_mul_mont_ffi`.
// Note that MULX is in BMI2.
//
// Squares `r` in place: `r` is passed as both the output and the input.
// Panics if `n` is empty, if `r` and the flattened `n` differ in length, or if
// the foreign function reports failure.
#[cfg(target_arch = "x86_64")]
pub(super) fn bn_sqr8x_mont(
    r: &mut [Limb],
    n: &[[Limb; 8]],
    n0: &N0,
    mulx_adx: Option<(crate::cpu::intel::Bmi2, crate::cpu::intel::Adx)>,
) {
    use crate::{bssl, polyfill::slice};
    prefixed_extern! {
        // `rp` and `ap` may alias.
        fn bn_sqr8x_mont(
            rp: *mut Limb,
            ap: *const Limb,
            mulx_adx_capable: Limb,
            np: *const Limb,
            n0: &N0,
            num: c::size_t) -> bssl::Result;
    }

    assert!(!n.is_empty());
    let modulus = slice::flatten(n);
    let num = modulus.len();
    assert_eq!(r.len(), num);

    // The foreign function receives the capability as a limb-sized flag that
    // is nonzero only when both the BMI2 and ADX witnesses were supplied.
    let mulx_adx_capable = Limb::from(mulx_adx.is_some());

    let rp = r.as_mut_ptr();
    let ap = r.as_ptr();
    let np = modulus.as_ptr();

    // SAFETY: `r` and the flattened modulus both have `num` limbs (asserted
    // above), `n` is non-empty, and the declaration allows `rp` and `ap` to
    // alias. NOTE(review): presumably the assembly needs `num` to be a
    // multiple of 8, which the `&[[Limb; 8]]` type provides -- confirm.
    let r = unsafe { bn_sqr8x_mont(rp, ap, mulx_adx_capable, np, n0, num) };
    assert!(Result::from(r).is_ok());
}
5 changes: 3 additions & 2 deletions src/arithmetic/inout.rs
Original file line number Diff line number Diff line change
Expand Up @@ -13,7 +13,8 @@
// CONNECTION WITH THE USE OR PERFORMANCE OF THIS SOFTWARE.

pub enum InOut<'io, T: ?Sized> {
InPlace(&'io mut T),
SquareInPlace(&'io mut T),
InPlace(&'io mut T, &'io T),
#[cfg_attr(target_arch = "x86_64", allow(dead_code))]
Disjoint(&'io mut T, &'io T),
Disjoint(&'io mut T, &'io T, &'io T),
}
Loading

0 comments on commit a3b7685

Please sign in to comment.