Skip to content
New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

WIP: Implement OpenSSL's AES GCM acceleration for armv8 #16601

Open
wants to merge 1 commit into
base: master
Choose a base branch
from
Open
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
25 changes: 25 additions & 0 deletions config/toolchain-simd.m4
Original file line number Diff line number Diff line change
Expand Up @@ -28,6 +28,11 @@ AC_DEFUN([ZFS_AC_CONFIG_ALWAYS_TOOLCHAIN_SIMD], [
ZFS_AC_CONFIG_TOOLCHAIN_CAN_BUILD_XSAVEOPT
ZFS_AC_CONFIG_TOOLCHAIN_CAN_BUILD_XSAVES
;;

arm64 | aarch64)
ZFS_AC_CONFIG_TOOLCHAIN_CAN_BUILD_ARM_AES
;;

esac
])

Expand Down Expand Up @@ -386,6 +391,26 @@ AC_DEFUN([ZFS_AC_CONFIG_TOOLCHAIN_CAN_BUILD_AES], [
])
])

dnl #
dnl # ZFS_AC_CONFIG_TOOLCHAIN_CAN_BUILD_ARM_AES
dnl #
AC_DEFUN([ZFS_AC_CONFIG_TOOLCHAIN_CAN_BUILD_ARM_AES], [
	AC_MSG_CHECKING([whether host toolchain supports ARM AES Crypto Extensions])

	dnl # Link (not just compile) a minimal program containing an AES
	dnl # instruction so both the compiler and the assembler are probed.
	dnl # Use a standard `int main(void)` -- `void main()` is non-standard
	dnl # and is rejected by strict/-Werror=main toolchains, which would
	dnl # make this check fail even when AES support is present.
	dnl # NOTE(review): depending on the default -march, the assembler may
	dnl # need +crypto enabled for `aese` -- confirm on minimal baselines.
	AC_LINK_IFELSE([AC_LANG_SOURCE([
	[
	int main(void)
	{
		__asm__ __volatile__("aese v0.16b, v1.16b");
		return (0);
	}
	]])], [
		AC_MSG_RESULT([yes])
		AC_DEFINE([HAVE_ARM_AES], 1, [Define if host toolchain supports ARM AES Crypto Extensions])
	], [
		AC_MSG_RESULT([no])
	])
])

dnl #
dnl # ZFS_AC_CONFIG_TOOLCHAIN_CAN_BUILD_PCLMULQDQ
dnl #
Expand Down
20 changes: 20 additions & 0 deletions include/os/freebsd/spl/sys/simd_aarch64.h
Original file line number Diff line number Diff line change
Expand Up @@ -38,6 +38,8 @@
* zfs_neon_available()
* zfs_sha256_available()
* zfs_sha512_available()
* zfs_aes_available()
* zfs_pmull_available()
*/

#ifndef _FREEBSD_SIMD_AARCH64_H
Expand Down Expand Up @@ -91,4 +93,22 @@ zfs_sha512_available(void)
return (elf_hwcap & HWCAP_SHA512);
}

/*
 * Check if the ARMv8 AES Crypto Extension instructions are available.
 *
 * elf_hwcap is the FreeBSD kernel's cached AT_HWCAP bit vector; the
 * HWCAP_AES bit is set when the CPU implements AESE/AESD/AESMC/AESIMC.
 * Returns a truthy nonzero value rather than strictly B_TRUE, matching
 * the other zfs_*_available() helpers in this header.
 */
static inline boolean_t
zfs_aes_available(void)
{
	return (elf_hwcap & HWCAP_AES);
}

/*
 * Check if the ARMv8 polynomial multiply (PMULL/PMULL2) instructions
 * are available.
 *
 * HWCAP_PMULL is set by the kernel when the CPU implements the 64x64 ->
 * 128-bit polynomial multiply used for GHASH acceleration.  Returns a
 * truthy nonzero value, matching the other helpers in this header.
 */
static inline boolean_t
zfs_pmull_available(void)
{
	return (elf_hwcap & HWCAP_PMULL);
}

#endif /* _FREEBSD_SIMD_AARCH64_H */
149 changes: 144 additions & 5 deletions include/os/linux/kernel/linux/simd_aarch64.h
Original file line number Diff line number Diff line change
Expand Up @@ -46,6 +46,8 @@
* zfs_neon_available()
* zfs_sha256_available()
* zfs_sha512_available()
* zfs_aes_available()
* zfs_pmull_available()
*/

#ifndef _LINUX_SIMD_AARCH64_H
Expand All @@ -65,13 +67,128 @@
#define kfpu_allowed() 1
#define kfpu_begin() kernel_neon_begin()
#define kfpu_end() kernel_neon_end()
#else
#define kfpu_allowed() 0
#define kfpu_begin() do {} while (0)
#define kfpu_end() do {} while (0)
#endif
#define kfpu_init() (0)
#define kfpu_fini() do {} while (0)
#else
#ifndef HAVE_KERNEL_FPU_INTERNAL
#error Should have one of HAVE_KERNEL_FPU_INTERNAL or HAVE KERNEL_NEON
#endif
#define kfpu_allowed() 1

extern uint8_t **zfs_kfpu_fpregs;


/*
 * Free the per-CPU buffers used to save the SIMD/FP register state,
 * then the pointer array itself.  Safe to call when initialization
 * never ran or only partially completed (kfpu_init() calls this on
 * its own error path); kfree(NULL) is a no-op, so no per-entry NULL
 * guard is needed.
 */
static inline void
kfpu_fini(void)
{
	int cpu;

	if (zfs_kfpu_fpregs == NULL)
		return;

	for_each_possible_cpu(cpu) {
		kfree(zfs_kfpu_fpregs[cpu]);
		zfs_kfpu_fpregs[cpu] = NULL;
	}

	kfree(zfs_kfpu_fpregs);
	zfs_kfpu_fpregs = NULL;
}

/*
* Alloc buffer to store FPU state.
*/
static inline int
kfpu_init(void)
{
int cpu;

zfs_kfpu_fpregs = kzalloc(num_possible_cpus() * sizeof (uint8_t *),
GFP_KERNEL);

if (zfs_kfpu_fpregs == NULL)
return (-ENOMEM);

for_each_possible_cpu(cpu) {
// 32 vector registers + 2 status registers
zfs_kfpu_fpregs[cpu] = kzalloc((16 * 32) + (2 * 8), GFP_KERNEL);

if (zfs_kfpu_fpregs[cpu] == NULL) {
kfpu_fini();
return (-ENOMEM);
}
}

return (0);
}

/*
 * Save the full SIMD/FP state (v0-v31, then fpsr/fpcr) into `buffer`.
 *
 * Layout: 32 vector registers of 16 bytes each (512 bytes), followed
 * by the two 8-byte status registers at offset 512 -- matching the
 * (16 * 32) + (2 * 8) byte buffer allocated by kfpu_init().
 *
 * The post-indexed st1 forms write back to the base register, so the
 * pointer must be an in/out ("+r") operand: declaring it as a plain
 * input lets the compiler assume the register still holds the original
 * address after the asm, which is undefined behavior.  A local copy
 * keeps the caller's pointer unmodified, and the "memory" clobber
 * tells the compiler the buffer contents were written.
 */
static inline void
store_neon_state(uint8_t *buffer)
{
	uint8_t *buf = buffer;

	asm volatile(
	    "st1 {v0.16b, v1.16b, v2.16b, v3.16b}, [%[buf]], #64\n"
	    "st1 {v4.16b, v5.16b, v6.16b, v7.16b}, [%[buf]], #64\n"
	    "st1 {v8.16b, v9.16b, v10.16b, v11.16b}, [%[buf]], #64\n"
	    "st1 {v12.16b, v13.16b, v14.16b, v15.16b}, [%[buf]], #64\n"
	    "st1 {v16.16b, v17.16b, v18.16b, v19.16b}, [%[buf]], #64\n"
	    "st1 {v20.16b, v21.16b, v22.16b, v23.16b}, [%[buf]], #64\n"
	    "st1 {v24.16b, v25.16b, v26.16b, v27.16b}, [%[buf]], #64\n"
	    "st1 {v28.16b, v29.16b, v30.16b, v31.16b}, [%[buf]], #64\n"
	    "mrs x1, fpsr\n"
	    "mrs x2, fpcr\n"
	    "stp x1, x2, [%[buf]]\n"
	    : [buf] "+r" (buf)
	    : /* no inputs */
	    : "x1", "x2", "memory");
}

/*
 * Restore the SIMD/FP state previously saved by store_neon_state():
 * v0-v31 from the first 512 bytes, then fpsr/fpcr from offset 512.
 *
 * As in store_neon_state(), the post-indexed ld1 forms write back to
 * the base register, so the pointer must be an in/out ("+r") operand
 * rather than a read-only input; a local copy preserves the caller's
 * pointer.  The "memory" clobber orders the asm against surrounding
 * accesses to the buffer.
 */
static inline void
restore_neon_state(const uint8_t *buffer)
{
	const uint8_t *buf = buffer;

	asm volatile(
	    "ld1 {v0.16b, v1.16b, v2.16b, v3.16b}, [%[buf]], #64\n"
	    "ld1 {v4.16b, v5.16b, v6.16b, v7.16b}, [%[buf]], #64\n"
	    "ld1 {v8.16b, v9.16b, v10.16b, v11.16b}, [%[buf]], #64\n"
	    "ld1 {v12.16b, v13.16b, v14.16b, v15.16b}, [%[buf]], #64\n"
	    "ld1 {v16.16b, v17.16b, v18.16b, v19.16b}, [%[buf]], #64\n"
	    "ld1 {v20.16b, v21.16b, v22.16b, v23.16b}, [%[buf]], #64\n"
	    "ld1 {v24.16b, v25.16b, v26.16b, v27.16b}, [%[buf]], #64\n"
	    "ld1 {v28.16b, v29.16b, v30.16b, v31.16b}, [%[buf]], #64\n"
	    "ldp x1, x2, [%[buf]]\n"
	    "msr fpsr, x1\n"
	    "msr fpcr, x2\n"
	    : [buf] "+r" (buf)
	    : /* no inputs */
	    : "x1", "x2", "memory");
}

static inline void
kfpu_begin(void)
{
/*
* Preemption and interrupts must be disabled for the critical
* region where the FPU state is being modified.
*/
preempt_disable();
local_irq_disable();

store_neon_state(zfs_kfpu_fpregs[smp_processor_id()]);
}

static inline void
kfpu_end(void)
{
restore_neon_state(zfs_kfpu_fpregs[smp_processor_id()]);

local_irq_enable();
preempt_enable();
}
#endif


#define get_ftr(id) { \
unsigned long __val; \
Expand Down Expand Up @@ -109,4 +226,26 @@ zfs_sha512_available(void)
return (ftr & 0x2);
}

/*
 * Check if the ARMv8 AES instructions are available.
 *
 * ID_AA64ISAR0_EL1 bits [7:4] form the 4-bit AES field: 0b0001 means
 * AESE/AESD/AESMC/AESIMC are implemented, 0b0010 means AES plus
 * PMULL/PMULL2.  Mask the full 4-bit field (0xf, not 0x3) and use the
 * ARM ARM convention that a feature is present when the unsigned field
 * value meets or exceeds its threshold.
 */
static inline boolean_t
zfs_aes_available(void)
{
	unsigned long ftr = ((get_ftr(ID_AA64ISAR0_EL1)) >> 4) & 0xf;
	return (ftr >= 1);
}

/*
 * Check if the ARMv8 PMULL/PMULL2 instructions are available.
 *
 * The AES field of ID_AA64ISAR0_EL1 (bits [7:4], 4 bits wide) reports
 * 0b0010 when the 64x64 -> 128-bit polynomial multiply is implemented
 * in addition to AES.  Mask the full field width (0xf) and test the
 * field value with >= per the ARM ARM feature-field convention, rather
 * than testing a single bit of a truncated 2-bit mask.
 */
static inline boolean_t
zfs_pmull_available(void)
{
	unsigned long ftr = ((get_ftr(ID_AA64ISAR0_EL1)) >> 4) & 0xf;
	return (ftr >= 2);
}



#endif /* _LINUX_SIMD_AARCH64_H */
6 changes: 5 additions & 1 deletion lib/libicp/Makefile.am
Original file line number Diff line number Diff line change
Expand Up @@ -9,6 +9,7 @@ nodist_libicp_la_SOURCES = \
module/icp/api/kcf_cipher.c \
module/icp/api/kcf_mac.c \
module/icp/algs/aes/aes_impl_aesni.c \
module/icp/algs/aes/aes_impl_armv8_crypto.c \
module/icp/algs/aes/aes_impl_generic.c \
module/icp/algs/aes/aes_impl_x86-64.c \
module/icp/algs/aes/aes_impl.c \
Expand Down Expand Up @@ -43,7 +44,10 @@ nodist_libicp_la_SOURCES += \
module/icp/asm-aarch64/blake3/b3_aarch64_sse2.S \
module/icp/asm-aarch64/blake3/b3_aarch64_sse41.S \
module/icp/asm-aarch64/sha2/sha256-armv8.S \
module/icp/asm-aarch64/sha2/sha512-armv8.S
module/icp/asm-aarch64/sha2/sha512-armv8.S \
module/icp/asm-aarch64/sha2/aes-armv8-crypto.S
module/icp/asm-aarch64/modes/ghashv8-armx.S \
module/icp/asm-aarch64/modes/aes-gcm-armv8_64.S
endif

if TARGET_CPU_ARM
Expand Down
22 changes: 22 additions & 0 deletions lib/libspl/include/sys/simd.h
Original file line number Diff line number Diff line change
Expand Up @@ -516,6 +516,8 @@ zfs_sha256_available(void)
#define kfpu_end() do {} while (0)

#define HWCAP_FP 0x00000001
#define HWCAP_AES 0x00000008
#define HWCAP_PMULL 0x00000010
#define HWCAP_SHA2 0x00000040
#define HWCAP_SHA512 0x00200000

Expand All @@ -529,6 +531,26 @@ zfs_neon_available(void)
return (hwcap & HWCAP_FP);
}

/*
 * Report whether the CPU advertises the ARMv8 AES instructions via the
 * AT_HWCAP auxiliary vector (HWCAP_AES bit).
 */
static inline boolean_t
zfs_aes_available(void)
{
	return (getauxval(AT_HWCAP) & HWCAP_AES);
}

/*
 * Report whether the CPU advertises the ARMv8 polynomial multiply
 * (PMULL/PMULL2) instructions via the AT_HWCAP auxiliary vector
 * (HWCAP_PMULL bit).
 */
static inline boolean_t
zfs_pmull_available(void)
{
	return (getauxval(AT_HWCAP) & HWCAP_PMULL);
}

/*
* Check if SHA2 is available
*/
Expand Down
6 changes: 5 additions & 1 deletion module/Kbuild.in
Original file line number Diff line number Diff line change
Expand Up @@ -151,7 +151,11 @@ ICP_OBJS_ARM64 := \
asm-aarch64/blake3/b3_aarch64_sse2.o \
asm-aarch64/blake3/b3_aarch64_sse41.o \
asm-aarch64/sha2/sha256-armv8.o \
asm-aarch64/sha2/sha512-armv8.o
asm-aarch64/sha2/sha512-armv8.o \
asm-aarch64/aes/aes-armv8-crypto.o \
asm-aarch64/modes/ghashv8-armx.o \
asm-aarch64/modes/aes-gcm-armv8_64.o \
algs/aes/aes_impl_armv8_crypto.o

ICP_OBJS_PPC_PPC64 := \
asm-ppc64/blake3/b3_ppc64le_sse2.o \
Expand Down
5 changes: 5 additions & 0 deletions module/Makefile.bsd
Original file line number Diff line number Diff line change
Expand Up @@ -532,6 +532,11 @@ CFLAGS.zstd_lazy.c+= ${__ZFS_ZSTD_AARCH64_FLAGS}
CFLAGS.zstd_ldm.c+= ${__ZFS_ZSTD_AARCH64_FLAGS}
CFLAGS.zstd_opt.c+= ${__ZFS_ZSTD_AARCH64_FLAGS}

aes-armv8-crypto.o: aes-armv8-crypto.S
${CC} -c ${CFLAGS:N-mgeneral-regs-only} ${WERROR} ${.IMPSRC} \
-o ${.TARGET}
${CTFCONVERT_CMD}

sha256-armv8.o: sha256-armv8.S
${CC} -c ${CFLAGS:N-mgeneral-regs-only} ${WERROR} ${.IMPSRC} \
-o ${.TARGET}
Expand Down
15 changes: 13 additions & 2 deletions module/icp/algs/aes/aes_impl.c
Original file line number Diff line number Diff line change
Expand Up @@ -233,6 +233,9 @@ static const aes_impl_ops_t *aes_all_impl[] = {
#if defined(__x86_64) && defined(HAVE_AES)
&aes_aesni_impl,
#endif
#if defined(__aarch64__) && defined(HAVE_ARM_AES)
&aes_armv8_crypto_impl,
#endif
};

/* Indicate that benchmark has been completed */
Expand Down Expand Up @@ -325,8 +328,16 @@ aes_impl_init(void)
sizeof (aes_fastest_impl));
}
#else
memcpy(&aes_fastest_impl, &aes_generic_impl,
sizeof (aes_fastest_impl));
#if defined(__aarch64__) && defined(HAVE_ARM_AES)
if (aes_armv8_crypto_impl.is_supported()) {
memcpy(&aes_fastest_impl, &aes_armv8_crypto_impl,
sizeof (aes_fastest_impl));
} else
#endif
{
memcpy(&aes_fastest_impl, &aes_generic_impl,
sizeof (aes_fastest_impl));
}
#endif

strlcpy(aes_fastest_impl.name, "fastest", AES_IMPL_NAME_MAX);
Expand Down
Loading
Loading