diff --git a/crypto/fipsmodule/bn/asm/armv8-mont.pl b/crypto/fipsmodule/bn/asm/armv8-mont.pl
index efee19902..1fce07edd 100644
--- a/crypto/fipsmodule/bn/asm/armv8-mont.pl
+++ b/crypto/fipsmodule/bn/asm/armv8-mont.pl
@@ -55,7 +55,7 @@
 $lo1,$hi1,$nj,$m1,$nlo,$nhi,
 $ovf, $i,$j,$tp,$tj) = map("x$_",6..17,19..24);
 
-# int bn_mul_mont(
+# void bn_mul_mont_nohw(
 $rp="x0";	# BN_ULONG *rp,
 $ap="x1";	# const BN_ULONG *ap,
 $bp="x2";	# const BN_ULONG *bp,
@@ -68,16 +68,11 @@
 
 .text
 
-.globl bn_mul_mont
-.type bn_mul_mont,%function
+.globl bn_mul_mont_nohw
+.type bn_mul_mont_nohw,%function
 .align 5
-bn_mul_mont:
+bn_mul_mont_nohw:
 	AARCH64_SIGN_LINK_REGISTER
-	tst $num,#7
-	b.eq __bn_sqr8x_mont
-	tst $num,#3
-	b.eq __bn_mul4x_mont
-.Lmul_mont:
 	stp x29,x30,[sp,#-64]!
 	add x29,sp,#0
 	stp x19,x20,[sp,#16]
@@ -272,7 +267,7 @@
 	ldr x29,[sp],#64
 	AARCH64_VALIDATE_LINK_REGISTER
 	ret
-.size bn_mul_mont,.-bn_mul_mont
+.size bn_mul_mont_nohw,.-bn_mul_mont_nohw
 ___
 {
 ########################################################################
@@ -285,14 +280,11 @@
 my ($tp,$ap_end,$na0)=($bp,$np,$carry);
 
 $code.=<<___;
-.type __bn_sqr8x_mont,%function
+.globl bn_sqr8x_mont
+.type bn_sqr8x_mont,%function
 .align 5
-__bn_sqr8x_mont:
-	// Not adding AARCH64_SIGN_LINK_REGISTER here because __bn_sqr8x_mont is jumped to
-	// only from bn_mul_mont which has already signed the return address.
-	cmp $ap,$bp
-	b.ne __bn_mul4x_mont
-.Lsqr8x_mont:
+bn_sqr8x_mont:
+	AARCH64_SIGN_LINK_REGISTER
 	stp x29,x30,[sp,#-128]!
 	add x29,sp,#0
 	stp x19,x20,[sp,#16]
@@ -1049,7 +1041,7 @@
 	// x30 is popped earlier
 	AARCH64_VALIDATE_LINK_REGISTER
 	ret
-.size __bn_sqr8x_mont,.-__bn_sqr8x_mont
+.size bn_sqr8x_mont,.-bn_sqr8x_mont
 ___
 }
 
@@ -1068,12 +1060,11 @@
 my ($carry,$topmost) = ($rp,"x30");
 
 $code.=<<___;
-.type __bn_mul4x_mont,%function
+.globl bn_mul4x_mont
+.type bn_mul4x_mont,%function
 .align 5
-__bn_mul4x_mont:
-	// Not adding AARCH64_SIGN_LINK_REGISTER here because __bn_mul4x_mont is jumped to
-	// only from bn_mul_mont or __bn_mul8x_mont which have already signed the
-	// return address.
+bn_mul4x_mont:
+	AARCH64_SIGN_LINK_REGISTER
 	stp x29,x30,[sp,#-128]!
 	add x29,sp,#0
 	stp x19,x20,[sp,#16]
@@ -1510,7 +1501,7 @@
 	// x30 is popped earlier
 	AARCH64_VALIDATE_LINK_REGISTER
 	ret
-.size __bn_mul4x_mont,.-__bn_mul4x_mont
+.size bn_mul4x_mont,.-bn_mul4x_mont
 ___
 }
 $code.=<<___;
diff --git a/crypto/fipsmodule/bn/internal.h b/crypto/fipsmodule/bn/internal.h
index f370257ad..b888fce08 100644
--- a/crypto/fipsmodule/bn/internal.h
+++ b/crypto/fipsmodule/bn/internal.h
@@ -208,6 +208,16 @@ static inline void bn_mul_mont_small(
     const BN_ULONG *np, const BN_ULONG *n0, size_t num) {
   bn_mul_mont_nohw(rp, ap, bp, np, n0, num);
 }
+#elif defined(OPENSSL_AARCH64)
+void bn_mul_mont_nohw(BN_ULONG *rp, const BN_ULONG *ap, const BN_ULONG *bp,
+                      const BN_ULONG *np, const BN_ULONG *n0, size_t num);
+static inline void bn_mul_mont_small(
+    BN_ULONG *rp, const BN_ULONG *ap, const BN_ULONG *bp,
+    const BN_ULONG *np, const BN_ULONG *n0, size_t num) {
+  // No point in optimizing for P-256 because P-256 doesn't call into
+  // this on AArch64.
+  bn_mul_mont_nohw(rp, ap, bp, np, n0, num);
+}
 #elif defined(OPENSSL_ARM)
 void bn_mul8x_mont_neon(BN_ULONG *rp, const BN_ULONG *ap, const BN_ULONG *bp,
                         const BN_ULONG *np, const BN_ULONG *n0, size_t num);
diff --git a/src/arithmetic.rs b/src/arithmetic.rs
index ec892c9a9..f810741f0 100644
--- a/src/arithmetic.rs
+++ b/src/arithmetic.rs
@@ -17,6 +17,7 @@ use crate::{error::LenMismatchError, limb::LIMB_BITS};
 
 #[macro_use]
 mod ffi;
+mod aarch64_mont;
 mod x86_64_mont;
 
 mod constant;
diff --git a/src/arithmetic/aarch64_mont.rs b/src/arithmetic/aarch64_mont.rs
new file mode 100644
index 000000000..4e88b303e
--- /dev/null
+++ b/src/arithmetic/aarch64_mont.rs
@@ -0,0 +1,59 @@
+// Copyright 2025 Brian Smith.
+//
+// Permission to use, copy, modify, and/or distribute this software for any
+// purpose with or without fee is hereby granted, provided that the above
+// copyright notice and this permission notice appear in all copies.
+//
+// THE SOFTWARE IS PROVIDED "AS IS" AND THE AUTHORS DISCLAIM ALL WARRANTIES
+// WITH REGARD TO THIS SOFTWARE INCLUDING ALL IMPLIED WARRANTIES OF
+// MERCHANTABILITY AND FITNESS. IN NO EVENT SHALL THE AUTHORS BE LIABLE FOR ANY
+// SPECIAL, DIRECT, INDIRECT, OR CONSEQUENTIAL DAMAGES OR ANY DAMAGES
+// WHATSOEVER RESULTING FROM LOSS OF USE, DATA OR PROFITS, WHETHER IN AN ACTION
+// OF CONTRACT, NEGLIGENCE OR OTHER TORTIOUS ACTION, ARISING OUT OF OR IN
+// CONNECTION WITH THE USE OR PERFORMANCE OF THIS SOFTWARE.
+
+#![cfg(all(target_arch = "aarch64", target_endian = "little"))]
+
+use super::{inout::AliasingSlices3 as _, n0::N0, LimbSliceError, MAX_LIMBS};
+use crate::{
+    c,
+    limb::Limb,
+    polyfill::slice::{AsChunks, AsChunksMut},
+};
+use core::num::NonZeroUsize;
+
+#[inline]
+pub(super) fn sqr_mont5(
+    mut in_out: AsChunksMut<Limb, 8>,
+    n: AsChunks<Limb, 8>,
+    n0: &N0,
+) -> Result<(), LimbSliceError> {
+    prefixed_extern! {
+        // `r` and/or 'a' may alias.
+        // XXX: BoringSSL (kinda, implicitly) declares this to return `int`.
+        // `num` must be a non-zero multiple of 8.
+        fn bn_sqr8x_mont(
+            rp: *mut Limb,
+            ap: *const Limb,
+            ap_again: *const Limb,
+            np: *const Limb,
+            n0: &N0,
+            num: c::NonZero_size_t);
+    }
+
+    let in_out = in_out.as_flattened_mut();
+    let n = n.as_flattened();
+    let num_limbs = NonZeroUsize::new(n.len()).ok_or_else(|| LimbSliceError::too_short(n.len()))?;
+
+    // Avoid stack overflow from the alloca inside.
+    if num_limbs.get() > MAX_LIMBS {
+        return Err(LimbSliceError::too_long(num_limbs.get()));
+    }
+
+    in_out
+        .with_non_dangling_non_null_pointers_rab(num_limbs, |r, a, a_again| {
+            let n = n.as_ptr(); // Non-dangling because num_limbs > 0.
+            unsafe { bn_sqr8x_mont(r, a, a_again, n, n0, num_limbs) };
+        })
+        .map_err(LimbSliceError::len_mismatch)
+}
diff --git a/src/arithmetic/montgomery.rs b/src/arithmetic/montgomery.rs
index 7060e528b..895ef180c 100644
--- a/src/arithmetic/montgomery.rs
+++ b/src/arithmetic/montgomery.rs
@@ -125,9 +125,17 @@ pub(super) fn limbs_mul_mont(
     cfg_if! {
         if #[cfg(all(target_arch = "aarch64", target_endian = "little"))] {
             let _: cpu::Features = cpu;
-            bn_mul_mont_ffi!(in_out, n, n0, (), unsafe {
-                (MIN_LIMBS, MOD_FALLBACK, ()) => bn_mul_mont
-            })
+            const MIN_4X: usize = 4;
+            const MOD_4X: usize = 4;
+            if n.len() >= MIN_4X && n.len() % MOD_4X == 0 {
+                bn_mul_mont_ffi!(in_out, n, n0, (), unsafe {
+                    (MIN_4X, MOD_4X, ()) => bn_mul4x_mont
+                })
+            } else {
+                bn_mul_mont_ffi!(in_out, n, n0, (), unsafe {
+                    (MIN_LIMBS, MOD_FALLBACK, ()) => bn_mul_mont_nohw
+                })
+            }
         } else if #[cfg(all(target_arch = "arm", target_endian = "little"))] {
             const MIN_8X: usize = 8;
             const MOD_8X: usize = 8;
@@ -297,6 +305,15 @@ pub(super) fn limbs_square_mont(
     n0: &N0,
     cpu: cpu::Features,
 ) -> Result<(), LimbSliceError> {
+    #[cfg(all(target_arch = "aarch64", target_endian = "little"))]
+    {
+        use super::aarch64_mont;
+        use crate::polyfill::slice;
+        if let ((r, []), (n, [])) = (slice::as_chunks_mut(r), slice::as_chunks(n)) {
+            return aarch64_mont::sqr_mont5(r, n, n0);
+        }
+    }
+
     #[cfg(target_arch = "x86_64")]
     {
         use super::x86_64_mont;