diff --git a/build.rs b/build.rs index d392cb323..81b317f73 100644 --- a/build.rs +++ b/build.rs @@ -855,8 +855,11 @@ fn prefix_all_symbols(pp: char, prefix_prefix: &str, prefix: &str) -> String { "CRYPTO_poly1305_update", "CRYPTO_poly1305_update_neon", "ChaCha20_ctr32", + "ChaCha20_ctr32_avx2", "ChaCha20_ctr32_neon", "ChaCha20_ctr32_nohw", + "ChaCha20_ctr32_ssse3", + "ChaCha20_ctr32_ssse3_4x", "LIMBS_add_mod", "LIMBS_are_even", "LIMBS_are_zero", diff --git a/crypto/chacha/asm/chacha-x86_64.pl b/crypto/chacha/asm/chacha-x86_64.pl index 7c82c6da6..4b7c9e668 100755 --- a/crypto/chacha/asm/chacha-x86_64.pl +++ b/crypto/chacha/asm/chacha-x86_64.pl @@ -76,8 +76,6 @@ $code.=<<___; .text -.extern OPENSSL_ia32cap_P - .section .rodata .align 64 .Lzero: @@ -226,20 +224,12 @@ sub ROUND { # critical path is 24 cycles per round ######################################################################## # Generic code path that handles all lengths on pre-SSSE3 processors. $code.=<<___; -.globl ChaCha20_ctr32 -.type ChaCha20_ctr32,\@function,5 +.globl ChaCha20_ctr32_nohw +.type ChaCha20_ctr32_nohw,\@function,5 .align 64 -ChaCha20_ctr32: +ChaCha20_ctr32_nohw: .cfi_startproc _CET_ENDBR - cmp \$0,$len - je .Lno_data - mov OPENSSL_ia32cap_P+4(%rip),%r10 -___ -$code.=<<___; - test \$`1<<(41-32)`,%r10d - jnz .LChaCha20_ssse3 - push %rbx .cfi_push rbx push %rbp @@ -411,7 +401,7 @@ sub ROUND { # critical path is 24 cycles per round .Lno_data: ret .cfi_endproc -.size ChaCha20_ctr32,.-ChaCha20_ctr32 +.size ChaCha20_ctr32_nohw,.-ChaCha20_ctr32_nohw ___ ######################################################################## @@ -446,19 +436,16 @@ sub SSSE3ROUND { # critical path is 20 "SIMD ticks" per round my $xframe = $win64 ? 
32+8 : 8; $code.=<<___; -.type ChaCha20_ssse3,\@function,5 +.globl ChaCha20_ctr32_ssse3 +.type ChaCha20_ctr32_ssse3,\@function,5 .align 32 -ChaCha20_ssse3: -.LChaCha20_ssse3: +ChaCha20_ctr32_ssse3: .cfi_startproc + _CET_ENDBR mov %rsp,%r9 # frame pointer .cfi_def_cfa_register r9 ___ $code.=<<___; - cmp \$128,$len # we might throw away some data, - ja .LChaCha20_4x # but overall it won't be slower - -.Ldo_sse3_after_all: sub \$64+$xframe,%rsp ___ $code.=<<___ if ($win64); @@ -568,7 +555,7 @@ sub SSSE3ROUND { # critical path is 20 "SIMD ticks" per round .Lssse3_epilogue: ret .cfi_endproc -.size ChaCha20_ssse3,.-ChaCha20_ssse3 +.size ChaCha20_ctr32_ssse3,.-ChaCha20_ctr32_ssse3 ___ } @@ -706,29 +693,17 @@ sub SSSE3_lane_ROUND { my $xframe = $win64 ? 0xa8 : 8; $code.=<<___; -.type ChaCha20_4x,\@function,5 +.globl ChaCha20_ctr32_ssse3_4x +.type ChaCha20_ctr32_ssse3_4x,\@function,5 .align 32 -ChaCha20_4x: -.LChaCha20_4x: +ChaCha20_ctr32_ssse3_4x: .cfi_startproc + _CET_ENDBR mov %rsp,%r9 # frame pointer .cfi_def_cfa_register r9 mov %r10,%r11 ___ -$code.=<<___ if ($avx>1); - shr \$32,%r10 # OPENSSL_ia32cap_P+8 - test \$`1<<5`,%r10 # test AVX2 - jnz .LChaCha20_8x -___ $code.=<<___; - cmp \$192,$len - ja .Lproceed4x - - and \$`1<<26|1<<22`,%r11 # isolate XSAVE+MOVBE - cmp \$`1<<22`,%r11 # check for MOVBE without XSAVE - je .Ldo_sse3_after_all # to detect Atom - -.Lproceed4x: sub \$0x140+$xframe,%rsp ___ ################ stack layout @@ -1156,7 +1131,7 @@ sub SSSE3_lane_ROUND { .L4x_epilogue: ret .cfi_endproc -.size ChaCha20_4x,.-ChaCha20_4x +.size ChaCha20_ctr32_ssse3_4x,.-ChaCha20_ctr32_ssse3_4x ___ } @@ -1285,11 +1260,12 @@ sub AVX2_lane_ROUND { my $xframe = $win64 ? 
0xa8 : 8; $code.=<<___; -.type ChaCha20_8x,\@function,5 +.globl ChaCha20_ctr32_avx2 +.type ChaCha20_ctr32_avx2,\@function,5 .align 32 -ChaCha20_8x: -.LChaCha20_8x: +ChaCha20_ctr32_avx2: .cfi_startproc + _CET_ENDBR mov %rsp,%r9 # frame register .cfi_def_cfa_register r9 sub \$0x280+$xframe,%rsp @@ -1801,7 +1777,7 @@ sub AVX2_lane_ROUND { .L8x_epilogue: ret .cfi_endproc -.size ChaCha20_8x,.-ChaCha20_8x +.size ChaCha20_ctr32_avx2,.-ChaCha20_ctr32_avx2 ___ } @@ -1985,42 +1961,42 @@ sub AVX2_lane_ROUND { .section .pdata .align 4 - .rva .LSEH_begin_ChaCha20_ctr32 - .rva .LSEH_end_ChaCha20_ctr32 - .rva .LSEH_info_ChaCha20_ctr32 + .rva .LSEH_begin_ChaCha20_ctr32_nohw + .rva .LSEH_end_ChaCha20_ctr32_nohw + .rva .LSEH_info_ChaCha20_ctr32_nohw - .rva .LSEH_begin_ChaCha20_ssse3 - .rva .LSEH_end_ChaCha20_ssse3 - .rva .LSEH_info_ChaCha20_ssse3 + .rva .LSEH_begin_ChaCha20_ctr32_ssse3 + .rva .LSEH_end_ChaCha20_ctr32_ssse3 + .rva .LSEH_info_ChaCha20_ctr32_ssse3 - .rva .LSEH_begin_ChaCha20_4x - .rva .LSEH_end_ChaCha20_4x - .rva .LSEH_info_ChaCha20_4x + .rva .LSEH_begin_ChaCha20_ctr32_ssse3_4x + .rva .LSEH_end_ChaCha20_ctr32_ssse3_4x + .rva .LSEH_info_ChaCha20_ctr32_ssse3_4x ___ $code.=<<___ if ($avx>1); - .rva .LSEH_begin_ChaCha20_8x - .rva .LSEH_end_ChaCha20_8x - .rva .LSEH_info_ChaCha20_8x + .rva .LSEH_begin_ChaCha20_ctr32_avx2 + .rva .LSEH_end_ChaCha20_ctr32_avx2 + .rva .LSEH_info_ChaCha20_ctr32_avx2 ___ $code.=<<___; .section .xdata .align 8 -.LSEH_info_ChaCha20_ctr32: +.LSEH_info_ChaCha20_ctr32_nohw: .byte 9,0,0,0 .rva se_handler -.LSEH_info_ChaCha20_ssse3: +.LSEH_info_ChaCha20_ctr32_ssse3: .byte 9,0,0,0 .rva ssse3_handler .rva .Lssse3_body,.Lssse3_epilogue -.LSEH_info_ChaCha20_4x: +.LSEH_info_ChaCha20_ctr32_ssse3_4x: .byte 9,0,0,0 .rva full_handler .rva .L4x_body,.L4x_epilogue ___ $code.=<<___ if ($avx>1); -.LSEH_info_ChaCha20_8x: +.LSEH_info_ChaCha20_ctr32_avx2: .byte 9,0,0,0 .rva full_handler .rva .L8x_body,.L8x_epilogue # HandlerData[] diff --git a/crypto/cpu_intel.c 
b/crypto/cpu_intel.c index 76b99fd64..ce5c2c0d1 100644 --- a/crypto/cpu_intel.c +++ b/crypto/cpu_intel.c @@ -150,7 +150,8 @@ void OPENSSL_cpuid_setup(void) { // Clear the XSAVE bit on Knights Landing to mimic Silvermont. This enables // some Silvermont-specific codepaths which perform better. See OpenSSL - // commit 64d92d74985ebb3d0be58a9718f9e080a14a8e7f. + // commit 64d92d74985ebb3d0be58a9718f9e080a14a8e7f and + // |CRYPTO_cpu_perf_is_like_silvermont|. if ((eax & 0x0fff0ff0) == 0x00050670 /* Knights Landing */ || (eax & 0x0fff0ff0) == 0x00080650 /* Knights Mill (per SDE) */) { ecx &= ~(1u << 26); @@ -177,7 +178,8 @@ void OPENSSL_cpuid_setup(void) { // Clear AVX2 and AVX512* bits. // // TODO(davidben): Should bits 17 and 26-28 also be cleared? Upstream - // doesn't clear those. + // doesn't clear those. See the comments in + // |CRYPTO_hardware_supports_XSAVE|. extended_features[0] &= ~((1u << 5) | (1u << 16) | (1u << 21) | (1u << 30) | (1u << 31)); } diff --git a/src/aead/chacha.rs b/src/aead/chacha.rs index 77d40bbf3..e46a52294 100644 --- a/src/aead/chacha.rs +++ b/src/aead/chacha.rs @@ -97,8 +97,8 @@ impl Key { } if in_out.len() >= 1 { chacha20_ctr32_ffi!( - unsafe { (1, cpu::Features, Overlapping<'_>) => ChaCha20_ctr32_nohw }, - self, counter, in_out, cpu) + unsafe { (1, (), Overlapping<'_>) => ChaCha20_ctr32_nohw }, + self, counter, in_out, ()) } } else if #[cfg(all(target_arch = "arm", target_endian = "little"))] { use cpu::{GetFeature as _, arm::Neon}; @@ -112,18 +112,45 @@ impl Key { } if in_out.len() >= 1 { chacha20_ctr32_ffi!( - unsafe { (1, cpu::Features, &mut [u8]) => ChaCha20_ctr32_nohw }, - self, counter, in_out.copy_within(), cpu) + unsafe { (1, (), &mut [u8]) => ChaCha20_ctr32_nohw }, + self, counter, in_out.copy_within(), ()) } } else if #[cfg(target_arch = "x86")] { chacha20_ctr32_ffi!( unsafe { (0, cpu::Features, &mut [u8]) => ChaCha20_ctr32 }, self, counter, in_out.copy_within(), cpu) } else if #[cfg(target_arch = "x86_64")] { - 
chacha20_ctr32_ffi!( - unsafe { (0, cpu::Features, Overlapping<'_>) => ChaCha20_ctr32 }, - self, counter, in_out, cpu) + use cpu::{GetFeature, intel::{Avx2, Ssse3}}; + const SSE_MIN_LEN: usize = 128 + 1; // Also AVX2, SSSE3_4X, SSSE3 + if in_out.len() >= SSE_MIN_LEN { + if let Some(cpu) = cpu.get_feature() { + return chacha20_ctr32_ffi!( + unsafe { (SSE_MIN_LEN, Avx2, Overlapping<'_>) => ChaCha20_ctr32_avx2 }, + self, counter, in_out, cpu); + } + if let Some(cpu) = <cpu::Features as GetFeature<Ssse3>>::get_feature(&cpu) { + if in_out.len() >= 192 || !cpu.perf_is_like_silvermont() { + return chacha20_ctr32_ffi!( + unsafe { + (SSE_MIN_LEN, Ssse3, Overlapping<'_>) => + ChaCha20_ctr32_ssse3_4x + }, + self, counter, in_out, cpu) + } + return chacha20_ctr32_ffi!( + unsafe { + (SSE_MIN_LEN, Ssse3, Overlapping<'_>) => ChaCha20_ctr32_ssse3 + }, + self, counter, in_out, cpu) + } + } + if in_out.len() >= 1 { + chacha20_ctr32_ffi!( + unsafe { (1, (), Overlapping<'_>) => ChaCha20_ctr32_nohw }, + self, counter, in_out, ()) + } } else { + let _: cpu::Features = cpu; fallback::ChaCha20_ctr32(self, counter, in_out) } } } diff --git a/src/cpu/intel.rs b/src/cpu/intel.rs index 3c168b575..fe6af6180 100644 --- a/src/cpu/intel.rs +++ b/src/cpu/intel.rs @@ -146,11 +146,23 @@ cfg_if! { mask: 1 << 22, }; + // We intentionally avoid defining an `XSave` accessor function. See + // `Ssse3::perf_is_like_silvermont`. + const XSAVE_BUT_NOT_REALLY: Feature = Feature { + word: 1, + mask: 1 << 26, + }; + pub(crate) const AVX: Feature = Feature { word: 1, mask: 1 << 28, }; + const AVX2: Feature = Feature { + word: 2, + mask: 1 << 5, + }; + const SHA: Feature = Feature { word: 2, mask: 1 << 29, @@ -159,7 +171,36 @@ cfg_if! { impl_get_feature!{ SSE41 => Sse41 } impl_get_feature!{ MOVBE => Movbe } impl_get_feature!{ AVX => Avx } + impl_get_feature!{ AVX2 => Avx2 } impl_get_feature!{ SHA => Sha } + + impl Ssse3 { + /// BoringSSL's counterpart is `CRYPTO_cpu_perf_is_like_silvermont`.
+ /// + /// Returns true if, based on a heuristic, the + /// CPU has Silvermont-like performance characteristics. It is often faster to + /// run different codepaths on these CPUs than the available instructions would + /// otherwise select. See chacha-x86_64.pl. + /// + /// Bonnell, Silvermont's predecessor in the Atom lineup, will also be matched by + /// this. Goldmont (Silvermont's successor in the Atom lineup) added XSAVE so it + /// isn't matched by this. Various sources indicate AMD first implemented MOVBE + /// and XSAVE at the same time in Jaguar, so it seems like AMD chips will not be + /// matched by this. That seems to be the case for other x86(-64) CPUs. + /// + /// WARNING: This MUST NOT be used to guard the execution of the XSAVE + /// instruction. This is the "hardware supports XSAVE" bit, not the OSXSAVE bit + /// that indicates whether we can safely execute XSAVE. This bit may be set + /// even when XSAVE is disabled (by the operating system). See how the users of + /// this bit use it. + /// + /// Historically, the XSAVE bit was artificially cleared on Knights Landing + /// and Knights Mill chips, but as Intel has removed all support from GCC, + /// LLVM, and SDE, we assume they are no longer worth special-casing. + pub fn perf_is_like_silvermont(self) -> bool { + XSAVE_BUT_NOT_REALLY.available(self.0) && MOVBE.available(self.0) + } + } } }