diff --git a/.github/.cspell/project-dictionary.txt b/.github/.cspell/project-dictionary.txt index da0cdbec..6361c3b1 100644 --- a/.github/.cspell/project-dictionary.txt +++ b/.github/.cspell/project-dictionary.txt @@ -72,6 +72,9 @@ lcgr ldar ldaxp ldclrp +ldfadd +ldfmax +ldfmin ldiapp ldrexd ldsetp @@ -188,6 +191,7 @@ versatilepb virt vmlinux vmovdqa +vreg vtable vtables wokwi diff --git a/build.rs b/build.rs index 4a0b8500..bc557a73 100644 --- a/build.rs +++ b/build.rs @@ -47,18 +47,18 @@ fn main() { if version.minor >= 80 { println!( - r#"cargo:rustc-check-cfg=cfg(target_feature,values("experimental-zacas","fast-serialization","load-store-on-cond","distinct-ops","miscellaneous-extensions-3"))"# + r#"cargo:rustc-check-cfg=cfg(target_feature,values("lsfe","experimental-zacas","fast-serialization","load-store-on-cond","distinct-ops","miscellaneous-extensions-3"))"# ); // Custom cfgs set by build script. Not public API. // grep -F 'cargo:rustc-cfg=' build.rs | grep -Ev '^ *//' | sed -E 's/^.*cargo:rustc-cfg=//; s/(=\\)?".*$//' | LC_ALL=C sort -u | tr '\n' ',' | sed -E 's/,$/\n/' println!( - "cargo:rustc-check-cfg=cfg(portable_atomic_disable_fiq,portable_atomic_force_amo,portable_atomic_ll_sc_rmw,portable_atomic_new_atomic_intrinsics,portable_atomic_no_asm,portable_atomic_no_asm_maybe_uninit,portable_atomic_no_atomic_64,portable_atomic_no_atomic_cas,portable_atomic_no_atomic_load_store,portable_atomic_no_atomic_min_max,portable_atomic_no_cfg_target_has_atomic,portable_atomic_no_cmpxchg16b_intrinsic,portable_atomic_no_cmpxchg16b_target_feature,portable_atomic_no_const_mut_refs,portable_atomic_no_const_raw_ptr_deref,portable_atomic_no_const_transmute,portable_atomic_no_core_unwind_safe,portable_atomic_no_diagnostic_namespace,portable_atomic_no_offset_of,portable_atomic_no_strict_provenance,portable_atomic_no_stronger_failure_ordering,portable_atomic_no_track_caller,portable_atomic_no_unsafe_op_in_unsafe_fn,portable_atomic_pre_llvm_15,portable_atomic_pre_llvm_16,portable_atomic_pre_llvm_18,portable_atomic_s_mode,portable_atomic_sanitize_thread,portable_atomic_target_feature,portable_atomic_unsafe_assume_single_core,portable_atomic_unstable_asm,portable_atomic_unstable_asm_experimental_arch,portable_atomic_unstable_cfg_target_has_atomic,portable_atomic_unstable_isa_attribute)" + "cargo:rustc-check-cfg=cfg(portable_atomic_disable_fiq,portable_atomic_force_amo,portable_atomic_ll_sc_rmw,portable_atomic_new_atomic_intrinsics,portable_atomic_no_asm,portable_atomic_no_asm_maybe_uninit,portable_atomic_no_atomic_64,portable_atomic_no_atomic_cas,portable_atomic_no_atomic_load_store,portable_atomic_no_atomic_min_max,portable_atomic_no_cfg_target_has_atomic,portable_atomic_no_cmpxchg16b_intrinsic,portable_atomic_no_cmpxchg16b_target_feature,portable_atomic_no_const_mut_refs,portable_atomic_no_const_raw_ptr_deref,portable_atomic_no_const_transmute,portable_atomic_no_core_unwind_safe,portable_atomic_no_diagnostic_namespace,portable_atomic_no_offset_of,portable_atomic_no_strict_provenance,portable_atomic_no_stronger_failure_ordering,portable_atomic_no_track_caller,portable_atomic_no_unsafe_op_in_unsafe_fn,portable_atomic_pre_llvm_15,portable_atomic_pre_llvm_16,portable_atomic_pre_llvm_18,portable_atomic_pre_llvm_20,portable_atomic_s_mode,portable_atomic_sanitize_thread,portable_atomic_target_feature,portable_atomic_unsafe_assume_single_core,portable_atomic_unstable_asm,portable_atomic_unstable_asm_experimental_arch,portable_atomic_unstable_cfg_target_has_atomic,portable_atomic_unstable_isa_attribute)" ); // TODO: handle multi-line target_feature_fallback // grep -F 'target_feature_fallback("' build.rs | grep -Ev '^ *//' | sed -E 's/^.*target_feature_fallback\(//; s/",.*$/"/' | LC_ALL=C sort -u | tr '\n' ',' | sed -E 's/,$/\n/' println!( - r#"cargo:rustc-check-cfg=cfg(portable_atomic_target_feature,values("cmpxchg16b","distinct-ops","experimental-zacas","fast-serialization","load-store-on-cond","lse","lse128","lse2","mclass","miscellaneous-extensions-3","quadword-atomics","rcpc3","v6","zaamo","zabha"))"# + r#"cargo:rustc-check-cfg=cfg(portable_atomic_target_feature,values("cmpxchg16b","distinct-ops","experimental-zacas","fast-serialization","load-store-on-cond","lse","lse128","lse2","lsfe","mclass","miscellaneous-extensions-3","quadword-atomics","rcpc3","v6","zaamo","zabha"))"# ); } @@ -206,12 +206,15 @@ fn main() { println!("cargo:rustc-cfg=portable_atomic_no_atomic_load_store"); } - if version.llvm < 18 { - println!("cargo:rustc-cfg=portable_atomic_pre_llvm_18"); - if version.llvm < 16 { - println!("cargo:rustc-cfg=portable_atomic_pre_llvm_16"); - if version.llvm < 15 { - println!("cargo:rustc-cfg=portable_atomic_pre_llvm_15"); + if version.llvm < 20 { + println!("cargo:rustc-cfg=portable_atomic_pre_llvm_20"); + if version.llvm < 18 { + println!("cargo:rustc-cfg=portable_atomic_pre_llvm_18"); + if version.llvm < 16 { + println!("cargo:rustc-cfg=portable_atomic_pre_llvm_16"); + if version.llvm < 15 { + println!("cargo:rustc-cfg=portable_atomic_pre_llvm_15"); + } } } } @@ -282,6 +285,9 @@ fn main() { target_feature_fallback("lse", lse); } } + // As of rustc 1.84, target_feature "lsfe" is not available on rustc side: + // https://github.com/rust-lang/rust/blob/1.84.0/compiler/rustc_target/src/target_features.rs + target_feature_fallback("lsfe", false); // As of Apple M1/M1 Pro, on Apple hardware, CAS-loop-based RMW is much slower than // LL/SC-loop-based RMW: https://github.com/taiki-e/portable-atomic/pull/89 diff --git a/src/imp/atomic128/aarch64.rs b/src/imp/atomic128/aarch64.rs index 7cfb7fae..9b2c7a1c 100644 --- a/src/imp/atomic128/aarch64.rs +++ b/src/imp/atomic128/aarch64.rs @@ -435,10 +435,10 @@ macro_rules! atomic_rmw_inst { }; ($op:ident, $order:ident, write = $write:ident) => { match $order { - Ordering::Relaxed => $op!("2", ""), - Ordering::Acquire => $op!("a", ""), - Ordering::Release => $op!("6", ""), - Ordering::AcqRel => $op!("e", ""), + Ordering::Relaxed => $op!("2", ""), // "" + Ordering::Acquire => $op!("a", ""), // "a" + Ordering::Release => $op!("6", ""), // "l" + Ordering::AcqRel => $op!("e", ""), // "al" // In MSVC environments, SeqCst stores/writes needs fences after writes. // https://reviews.llvm.org/D141748 #[cfg(target_env = "msvc")] diff --git a/src/imp/detect/aarch64_aa64reg.rs b/src/imp/detect/aarch64_aa64reg.rs index 127d95d5..ca1f5155 100644 --- a/src/imp/detect/aarch64_aa64reg.rs +++ b/src/imp/detect/aarch64_aa64reg.rs @@ -44,12 +44,20 @@ include!("common.rs"); struct AA64Reg { aa64isar0: u64, aa64isar1: u64, + #[cfg(test)] + aa64isar3: u64, aa64mmfr2: u64, } #[cold] fn _detect(info: &mut CpuInfo) { - let AA64Reg { aa64isar0, aa64isar1, aa64mmfr2 } = imp::aa64reg(); + let AA64Reg { + aa64isar0, + aa64isar1, + #[cfg(test)] + aa64isar3, + aa64mmfr2, + } = imp::aa64reg(); // ID_AA64ISAR0_EL1, AArch64 Instruction Set Attribute Register 0 // https://developer.arm.com/documentation/ddi0601/2024-12/AArch64-Registers/ID-AA64ISAR0-EL1--AArch64-Instruction-Set-Attribute-Register-0 @@ -65,6 +73,12 @@ fn _detect(info: &mut CpuInfo) { if extract(aa64isar1, 23, 20) >= 0b0011 { info.set(CpuInfo::HAS_RCPC3); } + #[cfg(test)] + // ID_AA64ISAR3_EL1, AArch64 Instruction Set Attribute Register 3 + // https://developer.arm.com/documentation/ddi0601/2024-12/AArch64-Registers/ID-AA64ISAR3-EL1--AArch64-Instruction-Set-Attribute-Register-3 + if extract(aa64isar3, 19, 16) >= 0b0001 { + info.set(CpuInfo::HAS_LSFE); + } // ID_AA64MMFR2_EL1, AArch64 Memory Model Feature Register 2 // https://developer.arm.com/documentation/ddi0601/2024-12/AArch64-Registers/ID-AA64MMFR2-EL1--AArch64-Memory-Model-Feature-Register-2 if extract(aa64mmfr2, 35, 32) >= 0b0001 { @@ -102,13 +116,27 @@ mod imp { out(reg) aa64isar1, options(pure, nomem, nostack, preserves_flags), ); + #[cfg(test)] + let aa64isar3: u64; + #[cfg(test)] + asm!( + "mrs {0}, ID_AA64ISAR3_EL1", + out(reg) aa64isar3, + options(pure, nomem, nostack, preserves_flags), + ); let aa64mmfr2: u64; asm!( "mrs {0}, ID_AA64MMFR2_EL1", out(reg) aa64mmfr2, options(pure, nomem, nostack, preserves_flags), ); - AA64Reg { aa64isar0, aa64isar1, aa64mmfr2 } + AA64Reg { + aa64isar0, + aa64isar1, + #[cfg(test)] + aa64isar3, + aa64mmfr2, + } } } } @@ -200,6 +228,8 @@ mod imp { Some(AA64Reg { aa64isar0: buf.ac_aa64isar0, aa64isar1: buf.ac_aa64isar1, + #[cfg(test)] + aa64isar3: 0, aa64mmfr2: buf.ac_aa64mmfr2, }) } @@ -213,7 +243,13 @@ mod imp { // https://github.com/golang/sys/commit/ef9fd89ba245e184bdd308f7f2b4f3c551fa5b0f match sysctl_cpu_id(c!("machdep.cpu0.cpu_id")) { Some(cpu_id) => cpu_id, - None => AA64Reg { aa64isar0: 0, aa64isar1: 0, aa64mmfr2: 0 }, + None => AA64Reg { + aa64isar0: 0, + aa64isar1: 0, + #[cfg(test)] + aa64isar3: 0, + aa64mmfr2: 0, + }, } } } @@ -273,7 +309,13 @@ mod imp { let aa64isar0 = sysctl64(&[ffi::CTL_MACHDEP, ffi::CPU_ID_AA64ISAR0]).unwrap_or(0); let aa64isar1 = sysctl64(&[ffi::CTL_MACHDEP, ffi::CPU_ID_AA64ISAR1]).unwrap_or(0); let aa64mmfr2 = sysctl64(&[ffi::CTL_MACHDEP, ffi::CPU_ID_AA64MMFR2]).unwrap_or(0); - AA64Reg { aa64isar0, aa64isar1, aa64mmfr2 } + AA64Reg { + aa64isar0, + aa64isar1, + #[cfg(test)] + aa64isar3: 0, + aa64mmfr2, + } } fn sysctl64(mib: &[ffi::c_int]) -> Option { @@ -322,9 +364,10 @@ mod tests { #[test] fn test_aa64reg() { - let AA64Reg { aa64isar0, aa64isar1, aa64mmfr2 } = imp::aa64reg(); + let AA64Reg { aa64isar0, aa64isar1, aa64isar3, aa64mmfr2 } = imp::aa64reg(); std::eprintln!("aa64isar0={}", aa64isar0); std::eprintln!("aa64isar1={}", aa64isar1); + std::eprintln!("aa64isar3={}", aa64isar3); std::eprintln!("aa64mmfr2={}", aa64mmfr2); if cfg!(target_os = "openbsd") { let output = Command::new("sysctl").arg("machdep").output().unwrap(); @@ -361,6 +404,12 @@ mod tests { } else { assert!(lrcpc < 0b0011, "{}", lrcpc); } + let lsfe = extract(aa64isar3, 19, 16); + if detect().test(CpuInfo::HAS_LSFE) { + assert_eq!(lsfe, 0b0001); + } else { + assert_eq!(lsfe, 0b0000); + } let at = extract(aa64mmfr2, 35, 32); if detect().test(CpuInfo::HAS_LSE2) { assert_eq!(at, 0b0001); @@ -496,6 +545,7 @@ mod tests { Ok(AA64Reg { aa64isar0: buf.ac_aa64isar0, aa64isar1: buf.ac_aa64isar1, + aa64isar3: 0, aa64mmfr2: buf.ac_aa64mmfr2, }) } diff --git a/src/imp/detect/aarch64_apple.rs b/src/imp/detect/aarch64_apple.rs index 117d4b8a..f1c074f6 100644 --- a/src/imp/detect/aarch64_apple.rs +++ b/src/imp/detect/aarch64_apple.rs @@ -85,6 +85,10 @@ fn _detect(info: &mut CpuInfo) { if sysctlbyname32(c!("hw.optional.arm.FEAT_LSE128")).unwrap_or(0) != 0 { info.set(CpuInfo::HAS_LSE128); } + #[cfg(test)] + if sysctlbyname32(c!("hw.optional.arm.FEAT_LSFE")).unwrap_or(0) != 0 { + info.set(CpuInfo::HAS_LSFE); + } if sysctlbyname32(c!("hw.optional.arm.FEAT_LRCPC3")).unwrap_or(0) != 0 { info.set(CpuInfo::HAS_RCPC3); } @@ -108,6 +112,8 @@ mod tests { assert_eq!(sysctlbyname32(c!("hw.optional.arm.FEAT_LSE2")), Some(1)); assert_eq!(sysctlbyname32(c!("hw.optional.arm.FEAT_LSE128")), None); assert_eq!(std::io::Error::last_os_error().kind(), std::io::ErrorKind::NotFound); + assert_eq!(sysctlbyname32(c!("hw.optional.arm.FEAT_LSFE")), None); + assert_eq!(std::io::Error::last_os_error().kind(), std::io::ErrorKind::NotFound); assert_eq!(sysctlbyname32(c!("hw.optional.arm.FEAT_LRCPC")), Some(1)); assert_eq!(sysctlbyname32(c!("hw.optional.arm.FEAT_LRCPC2")), Some(1)); assert_eq!(sysctlbyname32(c!("hw.optional.arm.FEAT_LRCPC3")), None); @@ -234,6 +240,7 @@ mod tests { c!("hw.optional.arm.FEAT_LSE"), c!("hw.optional.arm.FEAT_LSE2"), c!("hw.optional.arm.FEAT_LSE128"), + c!("hw.optional.arm.FEAT_LSFE"), c!("hw.optional.arm.FEAT_LRCPC"), c!("hw.optional.arm.FEAT_LRCPC2"), c!("hw.optional.arm.FEAT_LRCPC3"), diff --git a/src/imp/detect/common.rs b/src/imp/detect/common.rs index 2ef28bed..06be510f 100644 --- a/src/imp/detect/common.rs +++ b/src/imp/detect/common.rs @@ -104,6 +104,13 @@ flags! { // > If FEAT_LSE128 is implemented, then FEAT_LSE is implemented. #[cfg_attr(not(test), allow(dead_code))] HAS_LSE128(has_lse128, "lse128", any(target_feature, portable_atomic_target_feature)), + // FEAT_LSFE, Large System Float Extension + // https://developer.arm.com/documentation/109697/2024_12/Feature-descriptions/The-Armv9-6-architecture-extension + // > This feature is supported in AArch64 state only. + // > FEAT_LSFE is OPTIONAL from Armv9.3. + // > If FEAT_LSFE is implemented, then FEAT_FP is implemented. + #[cfg(test)] + HAS_LSFE(has_lsfe, "lsfe", any(target_feature, portable_atomic_target_feature)), } #[cfg(target_arch = "powerpc64")] @@ -398,6 +405,11 @@ mod tests_common { assert!(!lse128); } } + if detect().has_lsfe() { + assert!(detect().test(CpuInfo::HAS_LSFE)); + } else { + assert!(!detect().test(CpuInfo::HAS_LSFE)); + } if detect().has_rcpc3() { assert!(detect().test(CpuInfo::HAS_RCPC3)); if let Ok(test_helper::cpuinfo::ProcCpuinfo { rcpc3: Some(rcpc3), .. }) = proc_cpuinfo { diff --git a/src/imp/float/aarch64.rs b/src/imp/float/aarch64.rs new file mode 100644 index 00000000..d32f7224 --- /dev/null +++ b/src/imp/float/aarch64.rs @@ -0,0 +1,263 @@ +// SPDX-License-Identifier: Apache-2.0 OR MIT + +/* +Atomic float implementation based on AArch64 with FEAT_LSFE. + +This module provides atomic float implementations using FEAT_LSFE instructions. +*/ + +#[cfg(not(portable_atomic_no_asm))] +use core::arch::asm; +use core::sync::atomic::Ordering; + +#[cfg(portable_atomic_unstable_f128)] +use super::int::AtomicF128; +#[cfg(portable_atomic_unstable_f16)] +use super::int::AtomicF16; +use super::int::{AtomicF32, AtomicF64}; + +// TODO: optimize no return cases: +// https://developer.arm.com/documentation/ddi0602/2024-12/SIMD-FP-Instructions/STFADD--STFADDL--Floating-point-atomic-add-in-memory--without-return- +// https://developer.arm.com/documentation/ddi0602/2024-12/SIMD-FP-Instructions/STFMAX--STFMAXL--Floating-point-atomic-maximum-in-memory--without-return- +// https://developer.arm.com/documentation/ddi0602/2024-12/SIMD-FP-Instructions/STFMIN--STFMINL--Floating-point-atomic-minimum-in-memory--without-return- + +#[cfg(not(portable_atomic_pre_llvm_20))] +macro_rules! start_lsfe { + () => { + ".arch_extension lsfe" + }; +} + +#[cfg(not(portable_atomic_pre_llvm_20))] +macro_rules! atomic_rmw { + ($op:ident, $order:ident) => { + atomic_rmw!($op, $order, write = $order) + }; + ($op:ident, $order:ident, write = $write:ident) => { + match $order { + Ordering::Relaxed => $op!("", "", ""), + Ordering::Acquire => $op!("a", "", ""), + Ordering::Release => $op!("", "l", ""), + Ordering::AcqRel => $op!("a", "l", ""), + // In MSVC environments, SeqCst stores/writes needs fences after writes. + // https://reviews.llvm.org/D141748 + #[cfg(target_env = "msvc")] + Ordering::SeqCst if $write == Ordering::SeqCst => $op!("a", "l", "dmb ish"), + // AcqRel and SeqCst RMWs are equivalent in non-MSVC environments. + Ordering::SeqCst => $op!("a", "l", ""), + _ => unreachable!(), + } + }; +} +#[cfg(portable_atomic_pre_llvm_20)] +macro_rules! atomic_rmw_inst { + ($op:ident, $order:ident) => { + atomic_rmw_inst!($op, $order, write = $order) + }; + ($op:ident, $order:ident, write = $write:ident) => { + match $order { + Ordering::Relaxed => $op!("2", ""), // "" + Ordering::Acquire => $op!("a", ""), // "a" + Ordering::Release => $op!("6", ""), // "l" + Ordering::AcqRel => $op!("e", ""), // "al" + // In MSVC environments, SeqCst stores/writes needs fences after writes. + // https://reviews.llvm.org/D141748 + #[cfg(target_env = "msvc")] + Ordering::SeqCst if $write == Ordering::SeqCst => $op!("e", "dmb ish"), + // AcqRel and SeqCst RMWs are equivalent in non-MSVC environments. + Ordering::SeqCst => $op!("e", ""), + _ => unreachable!(), + } + }; +} + +macro_rules! atomic_float { + ($atomic_type:ident, $float_type:ident, $modifier:tt, $inst_modifier:tt) => { + impl $atomic_type { + #[inline] + pub(crate) fn fetch_add(&self, val: $float_type, order: Ordering) -> $float_type { + let dst = self.as_ptr(); + let out; + // SAFETY: any data races are prevented by atomic intrinsics and the raw + // pointer passed in is valid because we got it from a reference. + // + // Refs: https://developer.arm.com/documentation/ddi0602/2024-12/SIMD-FP-Instructions/LDFADD--LDFADDA--LDFADDAL--LDFADDL--Floating-point-atomic-add-in-memory- + unsafe { + #[cfg(not(portable_atomic_pre_llvm_20))] + macro_rules! add { + ($acquire:tt, $release:tt, $fence:tt) => { + asm!( + start_lsfe!(), + concat!("ldfadd", $acquire, $release, " {out:", $modifier, "}, {val:", $modifier, "}, [{dst}]"), + $fence, + dst = in(reg) ptr_reg!(dst), + val = in(vreg) val, + out = out(vreg) out, + options(nostack), + ) + }; + } + #[cfg(not(portable_atomic_pre_llvm_20))] + atomic_rmw!(add, order); + // LLVM supports FEAT_LSFE instructions on LLVM 20+, so use .inst directive on old LLVM. + // https://github.com/llvm/llvm-project/commit/67ff5ba9af9754261abe11d762af11532a816126 + #[cfg(portable_atomic_pre_llvm_20)] + macro_rules! add { + ($order:tt, $fence:tt) => { + asm!( + // ldfadd{,a,l,al} {h,s,d}0, {h,s,d}1, [x2] + concat!(".inst 0x", $inst_modifier, "c", $order, "00041"), + $fence, + in("x2") ptr_reg!(dst), + in("v1") val, + out("v0") out, + options(nostack), + ) + }; + } + #[cfg(portable_atomic_pre_llvm_20)] + atomic_rmw_inst!(add, order); + } + out + } + #[inline] + pub(crate) fn fetch_sub(&self, val: $float_type, order: Ordering) -> $float_type { + // There is no atomic sub instruction, so add `-val`. + self.fetch_add(-val, order) + } + #[inline] + pub(crate) fn fetch_max(&self, val: $float_type, order: Ordering) -> $float_type { + let dst = self.as_ptr(); + let out; + // SAFETY: any data races are prevented by atomic intrinsics and the raw + // pointer passed in is valid because we got it from a reference. + // + // Refs: https://developer.arm.com/documentation/ddi0602/2024-12/SIMD-FP-Instructions/LDFMAX--LDFMAXA--LDFMAXAL--LDFMAXL--Floating-point-atomic-maximum-in-memory- + unsafe { + #[cfg(not(portable_atomic_pre_llvm_20))] + macro_rules! max { + ($acquire:tt, $release:tt, $fence:tt) => { + asm!( + start_lsfe!(), + concat!("ldfmax", $acquire, $release, " {out:", $modifier, "}, {val:", $modifier, "}, [{dst}]"), + $fence, + dst = in(reg) ptr_reg!(dst), + val = in(vreg) val, + out = out(vreg) out, + options(nostack), + ) + }; + } + #[cfg(not(portable_atomic_pre_llvm_20))] + atomic_rmw!(max, order); + // LLVM supports FEAT_LSFE instructions on LLVM 20+, so use .inst directive on old LLVM. + // https://github.com/llvm/llvm-project/commit/67ff5ba9af9754261abe11d762af11532a816126 + #[cfg(portable_atomic_pre_llvm_20)] + macro_rules! max { + ($order:tt, $fence:tt) => { + asm!( + // ldfmax{,a,l,al} {h,s,d}0, {h,s,d}1, [x2] + concat!(".inst 0x", $inst_modifier, "c", $order, "04041"), + $fence, + in("x2") ptr_reg!(dst), + in("v1") val, + out("v0") out, + options(nostack), + ) + }; + } + #[cfg(portable_atomic_pre_llvm_20)] + atomic_rmw_inst!(max, order); + } + out + } + #[inline] + pub(crate) fn fetch_min(&self, val: $float_type, order: Ordering) -> $float_type { + let dst = self.as_ptr(); + let out; + // SAFETY: any data races are prevented by atomic intrinsics and the raw + // pointer passed in is valid because we got it from a reference. + // + // Refs: https://developer.arm.com/documentation/ddi0602/2024-12/SIMD-FP-Instructions/LDFMIN--LDFMINA--LDFMINAL--LDFMINL--Floating-point-atomic-minimum-in-memory- + unsafe { + #[cfg(not(portable_atomic_pre_llvm_20))] + macro_rules! min { + ($acquire:tt, $release:tt, $fence:tt) => { + asm!( + start_lsfe!(), + concat!("ldfmin", $acquire, $release, " {out:", $modifier, "}, {val:", $modifier, "}, [{dst}]"), + $fence, + dst = in(reg) ptr_reg!(dst), + val = in(vreg) val, + out = out(vreg) out, + options(nostack), + ) + }; + } + #[cfg(not(portable_atomic_pre_llvm_20))] + atomic_rmw!(min, order); + // LLVM supports FEAT_LSFE instructions on LLVM 20+, so use .inst directive on old LLVM. + // https://github.com/llvm/llvm-project/commit/67ff5ba9af9754261abe11d762af11532a816126 + #[cfg(portable_atomic_pre_llvm_20)] + macro_rules! min { + ($order:tt, $fence:tt) => { + asm!( + // ldfmin{,a,l,al} {h,s,d}0, {h,s,d}1, [x2] + concat!(".inst 0x", $inst_modifier, "c", $order, "05041"), + $fence, + in("x2") ptr_reg!(dst), + in("v1") val, + out("v0") out, + options(nostack), + ) + }; + } + #[cfg(portable_atomic_pre_llvm_20)] + atomic_rmw_inst!(min, order); + } + out + } + } + }; +} + +#[cfg(portable_atomic_unstable_f16)] +atomic_float!(AtomicF16, f16, "h", "7"); +atomic_float!(AtomicF32, f32, "s", "b"); +atomic_float!(AtomicF64, f64, "d", "f"); + +#[cfg(portable_atomic_unstable_f128)] +impl AtomicF128 { + #[inline] + pub(crate) fn fetch_add(&self, val: f128, order: Ordering) -> f128 { + self.fetch_update_(order, |x| x + val) + } + #[inline] + pub(crate) fn fetch_sub(&self, val: f128, order: Ordering) -> f128 { + self.fetch_update_(order, |x| x - val) + } + #[inline] + pub(super) fn fetch_update_(&self, order: Ordering, mut f: F) -> f128 + where + F: FnMut(f128) -> f128, + { + // This is a private function and all instances of `f` only operate on the value + // loaded, so there is no need to synchronize the first load/failed CAS. + let mut prev = self.load(Ordering::Relaxed); + loop { + let next = f(prev); + match self.compare_exchange_weak(prev, next, order, Ordering::Relaxed) { + Ok(x) => return x, + Err(next_prev) => prev = next_prev, + } + } + } + #[inline] + pub(crate) fn fetch_max(&self, val: f128, order: Ordering) -> f128 { + self.fetch_update_(order, |x| x.max(val)) + } + #[inline] + pub(crate) fn fetch_min(&self, val: f128, order: Ordering) -> f128 { + self.fetch_update_(order, |x| x.min(val)) + } +} diff --git a/src/imp/float/int.rs b/src/imp/float/int.rs index bda88364..483b418a 100644 --- a/src/imp/float/int.rs +++ b/src/imp/float/int.rs @@ -9,7 +9,8 @@ Note that most of `fetch_*` operations of atomic floats are implemented using CAS loops, which can be slower than equivalent operations of atomic integers. AArch64 with FEAT_LSFE and GPU targets have atomic instructions for float. -Both will use architecture-specific implementations instead of this implementation in the +See aarch64.rs for AArch64 with FEAT_LSFE. +GPU targets will also use architecture-specific implementations instead of this implementation in the future: https://github.com/taiki-e/portable-atomic/issues/34 / https://github.com/taiki-e/portable-atomic/pull/45 */ @@ -138,18 +139,39 @@ macro_rules! atomic_float { } } + #[cfg(not(all( + any(target_arch = "aarch64", target_arch = "arm64ec"), + any(target_feature = "lsfe", portable_atomic_target_feature = "lsfe"), + target_feature = "neon", // for vreg + not(any(miri, portable_atomic_sanitize_thread)), + any(not(portable_atomic_no_asm), portable_atomic_unstable_asm), + )))] #[inline] #[cfg_attr(miri, track_caller)] // even without panics, this helps for Miri backtraces pub(crate) fn fetch_add(&self, val: $float_type, order: Ordering) -> $float_type { self.fetch_update_(order, |x| x + val) } + #[cfg(not(all( + any(target_arch = "aarch64", target_arch = "arm64ec"), + any(target_feature = "lsfe", portable_atomic_target_feature = "lsfe"), + target_feature = "neon", // for vreg + not(any(miri, portable_atomic_sanitize_thread)), + any(not(portable_atomic_no_asm), portable_atomic_unstable_asm), + )))] #[inline] #[cfg_attr(miri, track_caller)] // even without panics, this helps for Miri backtraces pub(crate) fn fetch_sub(&self, val: $float_type, order: Ordering) -> $float_type { self.fetch_update_(order, |x| x - val) } + #[cfg(not(all( + any(target_arch = "aarch64", target_arch = "arm64ec"), + any(target_feature = "lsfe", portable_atomic_target_feature = "lsfe"), + target_feature = "neon", // for vreg + not(any(miri, portable_atomic_sanitize_thread)), + any(not(portable_atomic_no_asm), portable_atomic_unstable_asm), + )))] #[inline] #[cfg_attr(miri, track_caller)] // even without panics, this helps for Miri backtraces fn fetch_update_(&self, order: Ordering, mut f: F) -> $float_type @@ -168,12 +190,26 @@ macro_rules! atomic_float { } } + #[cfg(not(all( + any(target_arch = "aarch64", target_arch = "arm64ec"), + any(target_feature = "lsfe", portable_atomic_target_feature = "lsfe"), + target_feature = "neon", // for vreg + not(any(miri, portable_atomic_sanitize_thread)), + any(not(portable_atomic_no_asm), portable_atomic_unstable_asm), + )))] #[inline] #[cfg_attr(miri, track_caller)] // even without panics, this helps for Miri backtraces pub(crate) fn fetch_max(&self, val: $float_type, order: Ordering) -> $float_type { self.fetch_update_(order, |x| x.max(val)) } + #[cfg(not(all( + any(target_arch = "aarch64", target_arch = "arm64ec"), + any(target_feature = "lsfe", portable_atomic_target_feature = "lsfe"), + target_feature = "neon", // for vreg + not(any(miri, portable_atomic_sanitize_thread)), + any(not(portable_atomic_no_asm), portable_atomic_unstable_asm), + )))] #[inline] #[cfg_attr(miri, track_caller)] // even without panics, this helps for Miri backtraces pub(crate) fn fetch_min(&self, val: $float_type, order: Ordering) -> $float_type { diff --git a/src/imp/float/mod.rs b/src/imp/float/mod.rs index a36f0983..cfd6433e 100644 --- a/src/imp/float/mod.rs +++ b/src/imp/float/mod.rs @@ -8,6 +8,15 @@ Atomic float implementations mod int; +#[cfg(all( + any(target_arch = "aarch64", target_arch = "arm64ec"), + any(target_feature = "lsfe", portable_atomic_target_feature = "lsfe"), + target_feature = "neon", // for vreg + not(any(miri, portable_atomic_sanitize_thread)), + any(not(portable_atomic_no_asm), portable_atomic_unstable_asm), +))] +mod aarch64; + #[cfg(portable_atomic_unstable_f16)] cfg_has_atomic_16! { pub(crate) use self::int::AtomicF16; diff --git a/tools/build.sh b/tools/build.sh index 669d1103..7f0cfe2f 100755 --- a/tools/build.sh +++ b/tools/build.sh @@ -623,6 +623,9 @@ build() { CARGO_TARGET_DIR="${target_dir}/lse128-rcpc3" \ RUSTFLAGS="${target_rustflags} -C target-feature=+lse2,+lse128,+rcpc3" \ x_cargo "${args[@]}" "$@" + CARGO_TARGET_DIR="${target_dir}/lsfe" \ + RUSTFLAGS="${target_rustflags} -C target-feature=+lsfe" \ + x_cargo "${args[@]}" "$@" ;; powerpc64-*) # powerpc64le- (little-endian) is skipped because it is pwr8 by default