diff --git a/config/toolchain-simd.m4 b/config/toolchain-simd.m4 index 061576fd94e3..08a93308e985 100644 --- a/config/toolchain-simd.m4 +++ b/config/toolchain-simd.m4 @@ -28,6 +28,11 @@ AC_DEFUN([ZFS_AC_CONFIG_ALWAYS_TOOLCHAIN_SIMD], [ ZFS_AC_CONFIG_TOOLCHAIN_CAN_BUILD_XSAVEOPT ZFS_AC_CONFIG_TOOLCHAIN_CAN_BUILD_XSAVES ;; + + arm64 | aarch64) + ZFS_AC_CONFIG_TOOLCHAIN_CAN_BUILD_ARM_AES + ;; + esac ]) @@ -386,6 +391,26 @@ AC_DEFUN([ZFS_AC_CONFIG_TOOLCHAIN_CAN_BUILD_AES], [ ]) ]) +dnl # +dnl # ZFS_AC_CONFIG_TOOLCHAIN_CAN_BUILD_ARM_AES +dnl # +AC_DEFUN([ZFS_AC_CONFIG_TOOLCHAIN_CAN_BUILD_ARM_AES], [ + AC_MSG_CHECKING([whether host toolchain supports ARM AES Crypto Extensions]) + + AC_LINK_IFELSE([AC_LANG_SOURCE([ + [ + void main() + { + __asm__ __volatile__("aese v0.16b, v1.16b"); + } + ]])], [ + AC_MSG_RESULT([yes]) + AC_DEFINE([HAVE_ARM_AES], 1, [Define if host toolchain supports ARM AES Crypto Extensions]) + ], [ + AC_MSG_RESULT([no]) + ]) +]) + dnl # dnl # ZFS_AC_CONFIG_TOOLCHAIN_CAN_BUILD_PCLMULQDQ dnl # diff --git a/include/os/freebsd/spl/sys/simd_aarch64.h b/include/os/freebsd/spl/sys/simd_aarch64.h index 234f401db791..d33bbaf6198f 100644 --- a/include/os/freebsd/spl/sys/simd_aarch64.h +++ b/include/os/freebsd/spl/sys/simd_aarch64.h @@ -38,6 +38,8 @@ * zfs_neon_available() * zfs_sha256_available() * zfs_sha512_available() + * zfs_aes_available() + * zfs_pmull_available() */ #ifndef _FREEBSD_SIMD_AARCH64_H @@ -91,4 +93,22 @@ zfs_sha512_available(void) return (elf_hwcap & HWCAP_SHA512); } +/* + * Check if AES is available + */ +static inline boolean_t +zfs_aes_available(void) +{ + return (elf_hwcap & HWCAP_AES); +} + +/* + * Check if PMULL is available + */ +static inline boolean_t +zfs_pmull_available(void) +{ + return (elf_hwcap & HWCAP_PMULL); +} + #endif /* _FREEBSD_SIMD_AARCH64_H */ diff --git a/include/os/linux/kernel/linux/simd_aarch64.h b/include/os/linux/kernel/linux/simd_aarch64.h index e580fbe23ea5..bb4cb14c20d4 100644 --- a/include/os/linux/kernel/linux/simd_aarch64.h +++ b/include/os/linux/kernel/linux/simd_aarch64.h @@ -46,6 +46,8 @@ * zfs_neon_available() * zfs_sha256_available() * zfs_sha512_available() + * zfs_aes_available() + * zfs_pmull_available() */ #ifndef _LINUX_SIMD_AARCH64_H @@ -65,13 +67,128 @@ #define kfpu_allowed() 1 #define kfpu_begin() kernel_neon_begin() #define kfpu_end() kernel_neon_end() -#else -#define kfpu_allowed() 0 -#define kfpu_begin() do {} while (0) -#define kfpu_end() do {} while (0) -#endif #define kfpu_init() (0) #define kfpu_fini() do {} while (0) +#else +#ifndef HAVE_KERNEL_FPU_INTERNAL +#error Should have one of HAVE_KERNEL_FPU_INTERNAL or HAVE KERNEL_NEON +#endif +#define kfpu_allowed() 1 + +extern uint8_t **zfs_kfpu_fpregs; + + +/* + * Free buffer to store FPU state. + */ +static inline void +kfpu_fini(void) +{ + int cpu; + + if (zfs_kfpu_fpregs == NULL) + return; + + for_each_possible_cpu(cpu) { + if (zfs_kfpu_fpregs[cpu] != NULL) { + kfree(zfs_kfpu_fpregs[cpu]); + zfs_kfpu_fpregs[cpu] = NULL; + } + } + + kfree(zfs_kfpu_fpregs); + + zfs_kfpu_fpregs = NULL; +} + +/* + * Alloc buffer to store FPU state. 
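+ * One buffer is allocated per possible CPU, sized for the 32 16-byte
+ * NEON/SIMD registers plus the 8-byte fpsr and fpcr, i.e. 528 bytes each.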
+ */ +static inline int +kfpu_init(void) +{ + int cpu; + + zfs_kfpu_fpregs = kzalloc(num_possible_cpus() * sizeof (uint8_t *), + GFP_KERNEL); + + if (zfs_kfpu_fpregs == NULL) + return (-ENOMEM); + + for_each_possible_cpu(cpu) { + // 32 vector registers + 2 status registers + zfs_kfpu_fpregs[cpu] = kzalloc((16 * 32) + (2 * 8), GFP_KERNEL); + + if (zfs_kfpu_fpregs[cpu] == NULL) { + kfpu_fini(); + return (-ENOMEM); + } + } + + return (0); +} + +static inline void +store_neon_state(uint8_t *buffer) { + asm volatile( + "st1 {v0.16b, v1.16b, v2.16b, v3.16b}, [%[buf]], #64\n" + "st1 {v4.16b, v5.16b, v6.16b, v7.16b}, [%[buf]], #64\n" + "st1 {v8.16b, v9.16b, v10.16b, v11.16b}, [%[buf]], #64\n" + "st1 {v12.16b, v13.16b, v14.16b, v15.16b}, [%[buf]], #64\n" + "st1 {v16.16b, v17.16b, v18.16b, v19.16b}, [%[buf]], #64\n" + "st1 {v20.16b, v21.16b, v22.16b, v23.16b}, [%[buf]], #64\n" + "st1 {v24.16b, v25.16b, v26.16b, v27.16b}, [%[buf]], #64\n" + "st1 {v28.16b, v29.16b, v30.16b, v31.16b}, [%[buf]], #64\n" + "mrs x1, fpsr\n" + "mrs x2, fpcr\n" + "stp x1, x2, [%[buf]]\n" + : // no outputs + : [buf] "r" (buffer) + : "x1", "x2"); +} + +static inline void +restore_neon_state(const uint8_t *buffer) { + asm volatile( + "ld1 {v0.16b, v1.16b, v2.16b, v3.16b}, [%[buf]], #64\n" + "ld1 {v4.16b, v5.16b, v6.16b, v7.16b}, [%[buf]], #64\n" + "ld1 {v8.16b, v9.16b, v10.16b, v11.16b}, [%[buf]], #64\n" + "ld1 {v12.16b, v13.16b, v14.16b, v15.16b}, [%[buf]], #64\n" + "ld1 {v16.16b, v17.16b, v18.16b, v19.16b}, [%[buf]], #64\n" + "ld1 {v20.16b, v21.16b, v22.16b, v23.16b}, [%[buf]], #64\n" + "ld1 {v24.16b, v25.16b, v26.16b, v27.16b}, [%[buf]], #64\n" + "ld1 {v28.16b, v29.16b, v30.16b, v31.16b}, [%[buf]], #64\n" + "ldp x1, x2, [%[buf]]\n" + "msr fpsr, x1\n" + "msr fpcr, x2\n" + : // no outputs + : [buf] "r" (buffer) + : "x1", "x2"); +} + +static inline void +kfpu_begin(void) +{ + /* + * Preemption and interrupts must be disabled for the critical + * region where the FPU state is being modified. 
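+	 * The registers are saved into a single per-CPU buffer, so an
+	 * interrupt taken on this CPU in this window could otherwise clobber
+	 * either the live NEON registers or the saved copy before kfpu_end()
+	 * restores it.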
+ */ + preempt_disable(); + local_irq_disable(); + + store_neon_state(zfs_kfpu_fpregs[smp_processor_id()]); +} + +static inline void +kfpu_end(void) +{ + restore_neon_state(zfs_kfpu_fpregs[smp_processor_id()]); + + local_irq_enable(); + preempt_enable(); +} +#endif + #define get_ftr(id) { \ unsigned long __val; \ @@ -109,4 +226,26 @@ zfs_sha512_available(void) return (ftr & 0x2); } +/* + * Check if AES is available + */ +static inline boolean_t +zfs_aes_available(void) +{ + unsigned long ftr = ((get_ftr(ID_AA64ISAR0_EL1)) >> 4) & 0x3; + return (ftr & 0b10 || ftr & 0b01); +} + +/* + * Check if PMULL is available + */ +static inline boolean_t +zfs_pmull_available(void) +{ + unsigned long ftr = ((get_ftr(ID_AA64ISAR0_EL1)) >> 4) & 0x3; + return (ftr & 0b10); +} + + + #endif /* _LINUX_SIMD_AARCH64_H */ diff --git a/lib/libicp/Makefile.am b/lib/libicp/Makefile.am index ce24d13a760f..42742ce4e251 100644 --- a/lib/libicp/Makefile.am +++ b/lib/libicp/Makefile.am @@ -9,6 +9,7 @@ nodist_libicp_la_SOURCES = \ module/icp/api/kcf_cipher.c \ module/icp/api/kcf_mac.c \ module/icp/algs/aes/aes_impl_aesni.c \ + module/icp/algs/aes/aes_impl_armv8_crypto.c \ module/icp/algs/aes/aes_impl_generic.c \ module/icp/algs/aes/aes_impl_x86-64.c \ module/icp/algs/aes/aes_impl.c \ @@ -43,7 +44,10 @@ nodist_libicp_la_SOURCES += \ module/icp/asm-aarch64/blake3/b3_aarch64_sse2.S \ module/icp/asm-aarch64/blake3/b3_aarch64_sse41.S \ module/icp/asm-aarch64/sha2/sha256-armv8.S \ - module/icp/asm-aarch64/sha2/sha512-armv8.S + module/icp/asm-aarch64/sha2/sha512-armv8.S \ + module/icp/asm-aarch64/sha2/aes-armv8-crypto.S + module/icp/asm-aarch64/modes/ghashv8-armx.S \ + module/icp/asm-aarch64/modes/aes-gcm-armv8_64.S endif if TARGET_CPU_ARM diff --git a/lib/libspl/include/sys/simd.h b/lib/libspl/include/sys/simd.h index 2926dc680764..2053118ee667 100644 --- a/lib/libspl/include/sys/simd.h +++ b/lib/libspl/include/sys/simd.h @@ -516,6 +516,8 @@ zfs_sha256_available(void) #define kfpu_end() do {} while (0) #define HWCAP_FP 0x00000001 +#define HWCAP_AES 0x00000008 +#define HWCAP_PMULL 0x00000010 #define HWCAP_SHA2 0x00000040 #define HWCAP_SHA512 0x00200000 @@ -529,6 +531,26 @@ zfs_neon_available(void) return (hwcap & HWCAP_FP); } +/* + * Check if AES is available + */ +static inline boolean_t +zfs_aes_available(void) +{ + unsigned long hwcap = getauxval(AT_HWCAP); + return (hwcap & HWCAP_AES); +} + +/* + * Check if PMULL is available + */ +static inline boolean_t +zfs_pmull_available(void) +{ + unsigned long hwcap = getauxval(AT_HWCAP); + return (hwcap & HWCAP_PMULL); +} + /* * Check if SHA2 is available */ diff --git a/module/Kbuild.in b/module/Kbuild.in index dcbdbc912f6d..c077fbf31fd4 100644 --- a/module/Kbuild.in +++ b/module/Kbuild.in @@ -151,7 +151,11 @@ ICP_OBJS_ARM64 := \ asm-aarch64/blake3/b3_aarch64_sse2.o \ asm-aarch64/blake3/b3_aarch64_sse41.o \ asm-aarch64/sha2/sha256-armv8.o \ - asm-aarch64/sha2/sha512-armv8.o + asm-aarch64/sha2/sha512-armv8.o \ + asm-aarch64/aes/aes-armv8-crypto.o \ + asm-aarch64/modes/ghashv8-armx.o \ + asm-aarch64/modes/aes-gcm-armv8_64.o \ + algs/aes/aes_impl_armv8_crypto.o ICP_OBJS_PPC_PPC64 := \ asm-ppc64/blake3/b3_ppc64le_sse2.o \ diff --git a/module/Makefile.bsd b/module/Makefile.bsd index 61a664c5bf66..df8f6958f3e8 100644 --- a/module/Makefile.bsd +++ b/module/Makefile.bsd @@ -532,6 +532,11 @@ CFLAGS.zstd_lazy.c+= ${__ZFS_ZSTD_AARCH64_FLAGS} CFLAGS.zstd_ldm.c+= ${__ZFS_ZSTD_AARCH64_FLAGS} CFLAGS.zstd_opt.c+= ${__ZFS_ZSTD_AARCH64_FLAGS} +aes-armv8-crypto.o: aes-armv8-crypto.S + ${CC} -c 
${CFLAGS:N-mgeneral-regs-only} ${WERROR} ${.IMPSRC} \ + -o ${.TARGET} + ${CTFCONVERT_CMD} + sha256-armv8.o: sha256-armv8.S ${CC} -c ${CFLAGS:N-mgeneral-regs-only} ${WERROR} ${.IMPSRC} \ -o ${.TARGET} diff --git a/module/icp/algs/aes/aes_impl.c b/module/icp/algs/aes/aes_impl.c index 9daa975226fe..478a21624e05 100644 --- a/module/icp/algs/aes/aes_impl.c +++ b/module/icp/algs/aes/aes_impl.c @@ -233,6 +233,9 @@ static const aes_impl_ops_t *aes_all_impl[] = { #if defined(__x86_64) && defined(HAVE_AES) &aes_aesni_impl, #endif +#if defined(__aarch64__) && defined(HAVE_ARM_AES) + &aes_armv8_crypto_impl, +#endif }; /* Indicate that benchmark has been completed */ @@ -325,8 +328,16 @@ aes_impl_init(void) sizeof (aes_fastest_impl)); } #else - memcpy(&aes_fastest_impl, &aes_generic_impl, - sizeof (aes_fastest_impl)); +#if defined(__aarch64__) && defined(HAVE_ARM_AES) + if (aes_armv8_crypto_impl.is_supported()) { + memcpy(&aes_fastest_impl, &aes_armv8_crypto_impl, + sizeof (aes_fastest_impl)); + } else +#endif + { + memcpy(&aes_fastest_impl, &aes_generic_impl, + sizeof (aes_fastest_impl)); + } #endif strlcpy(aes_fastest_impl.name, "fastest", AES_IMPL_NAME_MAX); diff --git a/module/icp/algs/aes/aes_impl_armv8_crypto.c b/module/icp/algs/aes/aes_impl_armv8_crypto.c new file mode 100644 index 000000000000..d49239ec0236 --- /dev/null +++ b/module/icp/algs/aes/aes_impl_armv8_crypto.c @@ -0,0 +1,121 @@ +/* + * CDDL HEADER START + * + * The contents of this file are subject to the terms of the + * Common Development and Distribution License (the "License"). + * You may not use this file except in compliance with the License. + * + * You can obtain a copy of the license at usr/src/OPENSOLARIS.LICENSE + * or https://opensource.org/licenses/CDDL-1.0. + * See the License for the specific language governing permissions + * and limitations under the License. + * + * When distributing Covered Code, include this CDDL HEADER in each + * file and include the License file at usr/src/OPENSOLARIS.LICENSE. + * If applicable, add the following below this CDDL HEADER, with the + * fields enclosed by brackets "[]" replaced with your own identifying + * information: Portions Copyright [yyyy] [name of copyright owner] + * + * CDDL HEADER END + */ +/* + * Copyright (c) 2003, 2010, Oracle and/or its affiliates. All rights reserved. + */ + +#if defined(__aarch64__) && defined(HAVE_ARM_AES) + +#include +#include +#include + +extern ASMABI int aes_v8_set_encrypt_key(const uint32_t cipherKey[], + const int bits, uint32_t rk[]); +extern ASMABI int aes_v8_set_decrypt_key(const uint32_t cipherKey[], + const int bits, uint32_t rk[]); +extern ASMABI void aes_v8_encrypt(const uint32_t pt[4], uint32_t ct[4], + const uint32_t rk[], int Nr); +extern ASMABI void aes_v8_decrypt(const uint32_t ct[4], uint32_t pt[4], + const uint32_t rk[], int Nr); + +#include + +/* + * Expand the 32-bit AES cipher key array into the encryption and decryption + * key schedules. + * + * Parameters: + * key AES key schedule to be initialized + * keyarr32 User key + * keyBits AES key size (128, 192, or 256 bits) + */ +static void +aes_armv8_crypto_generate(aes_key_t *key, const uint32_t *keyarr32, int keybits) +{ + kfpu_begin(); + aes_v8_set_encrypt_key(keyarr32, keybits, key->encr_ks.ks32); + aes_v8_set_decrypt_key(keyarr32, keybits, key->decr_ks.ks32); + kfpu_end(); +} + +/* + * Encrypt one block of data. 
The block is assumed to be an array + * of four uint32_t values, so copy for alignment (and byte-order + * reversal for little endian systems might be necessary on the + * input and output byte streams. + * The size of the key schedule depends on the number of rounds + * (which can be computed from the size of the key), i.e. 4*(Nr + 1). + * + * Parameters: + * rk Key schedule, of aes_ks_t (60 32-bit integers) + * Nr Number of rounds + * pt Input block (plain text) + * ct Output block (crypto text). Can overlap with pt + */ +static void +aes_armv8_crypto_encrypt(const uint32_t rk[], int Nr, const uint32_t pt[4], + uint32_t ct[4]) +{ + kfpu_begin(); + aes_v8_encrypt(pt, ct, rk, Nr); + kfpu_end(); +} + +/* + * Decrypt one block of data. The block is assumed to be an array + * of four uint32_t values, so copy for alignment (and byte-order + * reversal for little endian systems might be necessary on the + * input and output byte streams. + * The size of the key schedule depends on the number of rounds + * (which can be computed from the size of the key), i.e. 4*(Nr + 1). + * + * Parameters: + * rk Key schedule, of aes_ks_t (60 32-bit integers) + * Nr Number of rounds + * ct Input block (crypto text) + * pt Output block (plain text). Can overlap with pt + */ +static void +aes_armv8_crypto_decrypt(const uint32_t rk[], int Nr, const uint32_t ct[4], + uint32_t pt[4]) +{ + kfpu_begin(); + aes_v8_decrypt(ct, pt, rk, Nr); + kfpu_end(); +} + +static boolean_t +aes_armv8_crypto_will_work(void) +{ + return (kfpu_allowed() && zfs_aes_available()); +} + +const aes_impl_ops_t aes_armv8_crypto_impl = { + .generate = &aes_armv8_crypto_generate, + .encrypt = &aes_armv8_crypto_encrypt, + .decrypt = &aes_armv8_crypto_decrypt, + .is_supported = &aes_armv8_crypto_will_work, + .needs_byteswap = B_FALSE, + .name = "armv8_crypto" +}; + +#endif /* defined(__aarch64__) && defined(HAVE_ARM_AES) */ diff --git a/module/icp/algs/modes/gcm.c b/module/icp/algs/modes/gcm.c index 21f4301d584d..4be1e7756b6d 100644 --- a/module/icp/algs/modes/gcm.c +++ b/module/icp/algs/modes/gcm.c @@ -44,7 +44,7 @@ #define IMPL_FASTEST (UINT32_MAX) #define IMPL_CYCLE (UINT32_MAX-1) #ifdef CAN_USE_GCM_ASM -#define IMPL_AVX (UINT32_MAX-2) +#define IMPL_HARDWARE (UINT32_MAX-2) #endif #define GCM_IMPL_READ(i) (*(volatile uint32_t *) &(i)) static uint32_t icp_gcm_impl = IMPL_FASTEST; @@ -52,27 +52,27 @@ static uint32_t user_sel_impl = IMPL_FASTEST; #ifdef CAN_USE_GCM_ASM /* Does the architecture we run on support the MOVBE instruction? */ -boolean_t gcm_avx_can_use_movbe = B_FALSE; +boolean_t gcm_hardware_can_use_movbe = B_FALSE; /* * Whether to use the optimized openssl gcm and ghash implementations. - * Set to true if module parameter icp_gcm_impl == "avx". + * Set to true if module parameter icp_gcm_impl == "hardware". 
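+ * On x86_64 the hardware path uses the AVX/AES-NI routines, on aarch64 it
+ * uses the ARMv8 Crypto Extensions (AES + PMULL) routines.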
*/ -static boolean_t gcm_use_avx = B_FALSE; -#define GCM_IMPL_USE_AVX (*(volatile boolean_t *)&gcm_use_avx) +static boolean_t gcm_use_hardware = B_FALSE; +#define GCM_IMPL_USE_HARDWARE (*(volatile boolean_t *)&gcm_use_hardware) extern boolean_t ASMABI atomic_toggle_boolean_nv(volatile boolean_t *); -static inline boolean_t gcm_avx_will_work(void); -static inline void gcm_set_avx(boolean_t); -static inline boolean_t gcm_toggle_avx(void); +static inline boolean_t gcm_hardware_will_work(void); +static inline void gcm_set_hardware(boolean_t); +static inline boolean_t gcm_toggle_hardware(void); static inline size_t gcm_simd_get_htab_size(boolean_t); -static int gcm_mode_encrypt_contiguous_blocks_avx(gcm_ctx_t *, char *, size_t, +static int gcm_mode_encrypt_contiguous_blocks_hardware(gcm_ctx_t *, char *, size_t, crypto_data_t *, size_t); -static int gcm_encrypt_final_avx(gcm_ctx_t *, crypto_data_t *, size_t); -static int gcm_decrypt_final_avx(gcm_ctx_t *, crypto_data_t *, size_t); -static int gcm_init_avx(gcm_ctx_t *, const uint8_t *, size_t, const uint8_t *, +static int gcm_encrypt_final_hardware(gcm_ctx_t *, crypto_data_t *, size_t); +static int gcm_decrypt_final_hardware(gcm_ctx_t *, crypto_data_t *, size_t); +static int gcm_init_hardware(gcm_ctx_t *, const uint8_t *, size_t, const uint8_t *, size_t, size_t); #endif /* ifdef CAN_USE_GCM_ASM */ @@ -88,8 +88,8 @@ gcm_mode_encrypt_contiguous_blocks(gcm_ctx_t *ctx, char *data, size_t length, void (*xor_block)(uint8_t *, uint8_t *)) { #ifdef CAN_USE_GCM_ASM - if (ctx->gcm_use_avx == B_TRUE) - return (gcm_mode_encrypt_contiguous_blocks_avx( + if (ctx->gcm_use_hardware == B_TRUE) + return (gcm_mode_encrypt_contiguous_blocks_hardware( ctx, data, length, out, block_size)); #endif @@ -207,8 +207,8 @@ gcm_encrypt_final(gcm_ctx_t *ctx, crypto_data_t *out, size_t block_size, { (void) copy_block; #ifdef CAN_USE_GCM_ASM - if (ctx->gcm_use_avx == B_TRUE) - return (gcm_encrypt_final_avx(ctx, out, block_size)); + if (ctx->gcm_use_hardware == B_TRUE) + return (gcm_encrypt_final_hardware(ctx, out, block_size)); #endif const gcm_impl_ops_t *gops; @@ -373,8 +373,8 @@ gcm_decrypt_final(gcm_ctx_t *ctx, crypto_data_t *out, size_t block_size, void (*xor_block)(uint8_t *, uint8_t *)) { #ifdef CAN_USE_GCM_ASM - if (ctx->gcm_use_avx == B_TRUE) - return (gcm_decrypt_final_avx(ctx, out, block_size)); + if (ctx->gcm_use_hardware == B_TRUE) + return (gcm_decrypt_final_hardware(ctx, out, block_size)); #endif const gcm_impl_ops_t *gops; @@ -586,7 +586,7 @@ gcm_init(gcm_ctx_t *ctx, const uint8_t *iv, size_t iv_len, } /* - * Init the GCM context struct. Handle the cycle and avx implementations here. + * Init the GCM context struct. Handle the cycle and hardware implementations here. */ int gcm_init_ctx(gcm_ctx_t *gcm_ctx, char *param, @@ -630,39 +630,41 @@ gcm_init_ctx(gcm_ctx_t *gcm_ctx, char *param, ((aes_key_t *)gcm_ctx->gcm_keysched)->ops->needs_byteswap; if (GCM_IMPL_READ(icp_gcm_impl) != IMPL_CYCLE) { - gcm_ctx->gcm_use_avx = GCM_IMPL_USE_AVX; + gcm_ctx->gcm_use_hardware = GCM_IMPL_USE_HARDWARE; } else { /* - * Handle the "cycle" implementation by creating avx and - * non-avx contexts alternately. + * Handle the "cycle" implementation by creating hardware and + * non-hardware contexts alternately. */ - gcm_ctx->gcm_use_avx = gcm_toggle_avx(); + gcm_ctx->gcm_use_hardware = gcm_toggle_hardware(); - /* The avx impl. doesn't handle byte swapped key schedules. 
*/ - if (gcm_ctx->gcm_use_avx == B_TRUE && needs_bswap == B_TRUE) { - gcm_ctx->gcm_use_avx = B_FALSE; + /* The hardware impl. doesn't handle byte swapped key schedules. */ + if (gcm_ctx->gcm_use_hardware == B_TRUE && needs_bswap == B_TRUE) { + gcm_ctx->gcm_use_hardware = B_FALSE; } +#if defined(__x86_64__) /* * If this is a GCM context, use the MOVBE and the BSWAP * variants alternately. */ - if (gcm_ctx->gcm_use_avx == B_TRUE && + if (gcm_ctx->gcm_use_hardware == B_TRUE && zfs_movbe_available() == B_TRUE) { (void) atomic_toggle_boolean_nv( - (volatile boolean_t *)&gcm_avx_can_use_movbe); + (volatile boolean_t *)&gcm_hardware_can_use_movbe); } +#endif } /* - * We don't handle byte swapped key schedules in the avx code path, + * We don't handle byte swapped key schedules in the hardware code path, * still they could be created by the aes generic implementation. * Make sure not to use them since we'll corrupt data if we do. */ - if (gcm_ctx->gcm_use_avx == B_TRUE && needs_bswap == B_TRUE) { - gcm_ctx->gcm_use_avx = B_FALSE; + if (gcm_ctx->gcm_use_hardware == B_TRUE && needs_bswap == B_TRUE) { + gcm_ctx->gcm_use_hardware = B_FALSE; cmn_err_once(CE_WARN, "ICP: Can't use the aes generic or cycle implementations " - "in combination with the gcm avx implementation!"); + "in combination with the gcm hardware implementation!"); cmn_err_once(CE_WARN, "ICP: Falling back to a compatible implementation, " "aes-gcm performance will likely be degraded."); @@ -672,8 +674,8 @@ gcm_init_ctx(gcm_ctx_t *gcm_ctx, char *param, } /* Allocate Htab memory as needed. */ - if (gcm_ctx->gcm_use_avx == B_TRUE) { - size_t htab_len = gcm_simd_get_htab_size(gcm_ctx->gcm_use_avx); + if (gcm_ctx->gcm_use_hardware == B_TRUE) { + size_t htab_len = gcm_simd_get_htab_size(gcm_ctx->gcm_use_hardware); if (htab_len == 0) { return (CRYPTO_MECHANISM_PARAM_INVALID); @@ -686,8 +688,8 @@ gcm_init_ctx(gcm_ctx_t *gcm_ctx, char *param, return (CRYPTO_HOST_MEMORY); } } - /* Avx and non avx context initialization differs from here on. */ - if (gcm_ctx->gcm_use_avx == B_FALSE) { + /* hardware and non hardware context initialization differs from here on. */ + if (gcm_ctx->gcm_use_hardware == B_FALSE) { #endif /* ifdef CAN_USE_GCM_ASM */ if (gcm_init(gcm_ctx, iv, iv_len, aad, aad_len, block_size, encrypt_block, copy_block, xor_block) != CRYPTO_SUCCESS) { @@ -695,7 +697,7 @@ gcm_init_ctx(gcm_ctx_t *gcm_ctx, char *param, } #ifdef CAN_USE_GCM_ASM } else { - if (gcm_init_avx(gcm_ctx, iv, iv_len, aad, aad_len, + if (gcm_init_hardware(gcm_ctx, iv, iv_len, aad, aad_len, block_size) != CRYPTO_SUCCESS) { rv = CRYPTO_MECHANISM_PARAM_INVALID; } @@ -765,11 +767,11 @@ gcm_impl_get_ops(void) ops = gcm_supp_impl[idx]; break; #ifdef CAN_USE_GCM_ASM - case IMPL_AVX: + case IMPL_HARDWARE: /* * Make sure that we return a valid implementation while - * switching to the avx implementation since there still - * may be unfinished non-avx contexts around. + * switching to the hardware implementation since there still + * may be unfinished non-hardware contexts around. */ ops = &gcm_generic_impl; break; @@ -824,17 +826,17 @@ gcm_impl_init(void) #ifdef CAN_USE_GCM_ASM /* - * Use the avx implementation if it's available and the implementation + * Use the hardware implementation if it's available and the implementation * hasn't changed from its default value of fastest on module load. 
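+	 * "Available" means AVX, AES-NI and PCLMULQDQ on x86_64, and the AES
+	 * and PMULL Crypto Extensions on aarch64 (see gcm_hardware_will_work()).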
*/ - if (gcm_avx_will_work()) { + if (gcm_hardware_will_work()) { #ifdef HAVE_MOVBE if (zfs_movbe_available() == B_TRUE) { - atomic_swap_32(&gcm_avx_can_use_movbe, B_TRUE); + atomic_swap_32(&gcm_hardware_can_use_movbe, B_TRUE); } #endif if (GCM_IMPL_READ(user_sel_impl) == IMPL_FASTEST) { - gcm_set_avx(B_TRUE); + gcm_set_hardware(B_TRUE); } } #endif @@ -850,7 +852,7 @@ static const struct { { "cycle", IMPL_CYCLE }, { "fastest", IMPL_FASTEST }, #ifdef CAN_USE_GCM_ASM - { "avx", IMPL_AVX }, + { "hardware", IMPL_HARDWARE }, #endif }; @@ -886,8 +888,8 @@ gcm_impl_set(const char *val) /* Check mandatory options */ for (i = 0; i < ARRAY_SIZE(gcm_impl_opts); i++) { #ifdef CAN_USE_GCM_ASM - /* Ignore avx implementation if it won't work. */ - if (gcm_impl_opts[i].sel == IMPL_AVX && !gcm_avx_will_work()) { + /* Ignore hardware implementation if it won't work. */ + if (gcm_impl_opts[i].sel == IMPL_HARDWARE && !gcm_hardware_will_work()) { continue; } #endif @@ -911,14 +913,14 @@ gcm_impl_set(const char *val) } #ifdef CAN_USE_GCM_ASM /* - * Use the avx implementation if available and the requested one is - * avx or fastest. + * Use the hardware implementation if available and the requested one is + * hardware or fastest. */ - if (gcm_avx_will_work() == B_TRUE && - (impl == IMPL_AVX || impl == IMPL_FASTEST)) { - gcm_set_avx(B_TRUE); + if (gcm_hardware_will_work() == B_TRUE && + (impl == IMPL_HARDWARE || impl == IMPL_FASTEST)) { + gcm_set_hardware(B_TRUE); } else { - gcm_set_avx(B_FALSE); + gcm_set_hardware(B_FALSE); } #endif @@ -952,8 +954,8 @@ icp_gcm_impl_get(char *buffer, zfs_kernel_param_t *kp) /* list mandatory options */ for (i = 0; i < ARRAY_SIZE(gcm_impl_opts); i++) { #ifdef CAN_USE_GCM_ASM - /* Ignore avx implementation if it won't work. */ - if (gcm_impl_opts[i].sel == IMPL_AVX && !gcm_avx_will_work()) { + /* Ignore hardware implementation if it won't work. */ + if (gcm_impl_opts[i].sel == IMPL_HARDWARE && !gcm_hardware_will_work()) { continue; } #endif @@ -983,36 +985,38 @@ MODULE_PARM_DESC(icp_gcm_impl, "Select gcm implementation."); * The openssl asm routines are 6x aggregated and need that many bytes * at minimum. */ -#define GCM_AVX_MIN_DECRYPT_BYTES (GCM_BLOCK_LEN * 6) -#define GCM_AVX_MIN_ENCRYPT_BYTES (GCM_BLOCK_LEN * 6 * 3) +#define GCM_HARDWARE_MIN_DECRYPT_BYTES (GCM_BLOCK_LEN * 6) +#define GCM_HARDWARE_MIN_ENCRYPT_BYTES (GCM_BLOCK_LEN * 6 * 3) /* * Ensure the chunk size is reasonable since we are allocating a - * GCM_AVX_MAX_CHUNK_SIZEd buffer and disabling preemption and interrupts. + * GCM_HARDWARE_MAX_CHUNK_SIZEd buffer and disabling preemption and interrupts. */ -#define GCM_AVX_MAX_CHUNK_SIZE \ - (((128*1024)/GCM_AVX_MIN_DECRYPT_BYTES) * GCM_AVX_MIN_DECRYPT_BYTES) +#define GCM_HARDWARE_MAX_CHUNK_SIZE \ + (((128*1024)/GCM_HARDWARE_MIN_DECRYPT_BYTES) * GCM_HARDWARE_MIN_DECRYPT_BYTES) /* Clear the FPU registers since they hold sensitive internal state. */ -#define clear_fpu_regs() clear_fpu_regs_avx() -#define GHASH_AVX(ctx, in, len) \ - gcm_ghash_avx((ctx)->gcm_ghash, (const uint64_t *)(ctx)->gcm_Htable, \ +#define clear_fpu_regs() clear_fpu_regs_hardware() +#define GHASH_HARDWARE(ctx, in, len) \ + gcm_ghash_hardware((ctx)->gcm_ghash, (const uint64_t *)(ctx)->gcm_Htable, \ in, len) #define gcm_incr_counter_block(ctx) gcm_incr_counter_block_by(ctx, 1) /* Get the chunk size module parameter. 
*/ -#define GCM_CHUNK_SIZE_READ *(volatile uint32_t *) &gcm_avx_chunk_size +#define GCM_CHUNK_SIZE_READ *(volatile uint32_t *) &gcm_hardware_chunk_size /* * Module parameter: number of bytes to process at once while owning the FPU. - * Rounded down to the next GCM_AVX_MIN_DECRYPT_BYTES byte boundary and is - * ensured to be greater or equal than GCM_AVX_MIN_DECRYPT_BYTES. + * Rounded down to the next GCM_HARDWARE_MIN_DECRYPT_BYTES byte boundary and is + * ensured to be greater or equal than GCM_HARDWARE_MIN_DECRYPT_BYTES. */ -static uint32_t gcm_avx_chunk_size = - ((32 * 1024) / GCM_AVX_MIN_DECRYPT_BYTES) * GCM_AVX_MIN_DECRYPT_BYTES; +static uint32_t gcm_hardware_chunk_size = + ((32 * 1024) / GCM_HARDWARE_MIN_DECRYPT_BYTES) * GCM_HARDWARE_MIN_DECRYPT_BYTES; +#if defined(__x86_64__) extern void ASMABI clear_fpu_regs_avx(void); extern void ASMABI gcm_xor_avx(const uint8_t *src, uint8_t *dst); + extern void ASMABI aes_encrypt_intel(const uint32_t rk[], int nr, const uint32_t pt[4], uint32_t ct[4]); @@ -1026,28 +1030,128 @@ extern size_t ASMABI aesni_gcm_encrypt(const uint8_t *, uint8_t *, size_t, extern size_t ASMABI aesni_gcm_decrypt(const uint8_t *, uint8_t *, size_t, const void *, uint64_t *, uint64_t *); +#define clear_fpu_regs_hardware clear_fpu_regs_avx +#define gcm_xor_hardware gcm_xor_avx +#define aes_hardware_encrypt_block aes_encrypt_intel +#define aes_hardware_encrypt_chunk aesni_gcm_encrypt +#define aes_hardware_decrypt_chunk aesni_gcm_decrypt +#define gcm_ghash_hardware gcm_ghash_avx +#define gcm_init_htab_hardware gcm_init_htab_avx + +#else // aarch64 +extern void ASMABI clear_fpu_regs_v8(void); +extern void ASMABI gcm_xor_v8(const uint8_t *src, uint8_t *dst); + +extern ASMABI void aes_v8_encrypt(const uint32_t pt[4], uint32_t ct[4], + const uint32_t rk[], int Nr); + +extern size_t ASMABI aes_gcm_enc_128_kernel(const uint8_t *plaintext, + uint64_t plaintext_length, uint8_t *ciphertext, uint64_t *Xi, + uint64_t *ivec, const void *key); + +extern size_t ASMABI aes_gcm_enc_192_kernel(const uint8_t *plaintext, + uint64_t plaintext_length, uint8_t *ciphertext, uint64_t *Xi, + uint64_t *ivec, const void *key); + +extern size_t ASMABI aes_gcm_enc_256_kernel(const uint8_t *plaintext, + uint64_t plaintext_length, uint8_t *ciphertext, uint64_t *Xi, + uint64_t *ivec, const void *key); + +extern size_t ASMABI aes_gcm_dec_128_kernel(const uint8_t *ciphertext, + uint64_t ciphertext_length, uint8_t *plaintext, uint64_t *Xi, + uint64_t *ivec, const void *key); + +extern size_t ASMABI aes_gcm_dec_192_kernel(const uint8_t *ciphertext, + uint64_t ciphertext_length, uint8_t *plaintext, uint64_t *Xi, + uint64_t *ivec, const void *key); + +extern size_t ASMABI aes_gcm_dec_256_kernel(const uint8_t *ciphertext, + uint64_t ciphertext_length, uint8_t *plaintext, uint64_t *Xi, + uint64_t *ivec, const void *key); + + +extern ASMABI void gcm_init_v8(uint64_t *Htable, const uint64_t H[2]); +extern ASMABI void gcm_ghash_v8(uint64_t ghash[2], const uint64_t *Htable, + const uint8_t *in, size_t len); + +#define clear_fpu_regs_hardware clear_fpu_regs_v8 +#define gcm_xor_hardware gcm_xor_v8 + +// Reorder args: +#define aes_hardware_encrypt_block(rk, nr, pt, ct) aes_v8_encrypt(pt, ct, rk, nr) + +static size_t aes_hardware_encrypt_chunk(const uint8_t *in, uint8_t *out, + size_t len, const aes_key_t *key, uint64_t *cb, uint64_t *ghash) +{ + size_t ret = 0; + len = len - (len % 16); + len = len * 8; + switch (key->nr) { + case 10: + ret = aes_gcm_enc_128_kernel(in, len, out, ghash, cb, key); + break; + case 12: + ret 
= aes_gcm_enc_192_kernel(in, len, out, ghash, cb, key); + break; + case 14: + ret = aes_gcm_enc_256_kernel(in, len, out, ghash, cb, key); + break; + } + return (ret); +} + +static size_t aes_hardware_decrypt_chunk(const uint8_t *in, uint8_t *out, + size_t len, const aes_key_t *key, uint64_t *cb, uint64_t *ghash) +{ + size_t ret = 0; + len = len - (len % 16); + len = len * 8; + switch (key->nr) { + case 10: + ret = aes_gcm_dec_128_kernel(in, len, out, ghash, cb, key); + break; + case 12: + ret = aes_gcm_dec_192_kernel(in, len, out, ghash, cb, key); + break; + case 14: + ret = aes_gcm_dec_256_kernel(in, len, out, ghash, cb, key); + break; + } + return (ret); +} + +#define gcm_ghash_hardware gcm_ghash_v8 +#define gcm_init_htab_hardware gcm_init_v8 + +#endif + static inline boolean_t -gcm_avx_will_work(void) +gcm_hardware_will_work(void) { - /* Avx should imply aes-ni and pclmulqdq, but make sure anyhow. */ +#if defined(__x86_64__) + /* hardware should imply aes-ni and pclmulqdq, but make sure anyhow. */ return (kfpu_allowed() && - zfs_avx_available() && zfs_aes_available() && + zfs_hardware_available() && zfs_aes_available() && zfs_pclmulqdq_available()); +#else + return (kfpu_allowed() && zfs_pmull_available() && + zfs_aes_available()); +#endif } static inline void -gcm_set_avx(boolean_t val) +gcm_set_hardware(boolean_t val) { - if (gcm_avx_will_work() == B_TRUE) { - atomic_swap_32(&gcm_use_avx, val); + if (gcm_hardware_will_work() == B_TRUE) { + atomic_swap_32(&gcm_use_hardware, val); } } static inline boolean_t -gcm_toggle_avx(void) +gcm_toggle_hardware(void) { - if (gcm_avx_will_work() == B_TRUE) { - return (atomic_toggle_boolean_nv(&GCM_IMPL_USE_AVX)); + if (gcm_hardware_will_work() == B_TRUE) { + return (atomic_toggle_boolean_nv(&GCM_IMPL_USE_HARDWARE)); } else { return (B_FALSE); } @@ -1080,11 +1184,11 @@ gcm_incr_counter_block_by(gcm_ctx_t *ctx, int n) /* * Encrypt multiple blocks of data in GCM mode. - * This is done in gcm_avx_chunk_size chunks, utilizing AVX assembler routines + * This is done in gcm_hardware_chunk_size chunks, utilizing HARDWARE assembler routines * if possible. While processing a chunk the FPU is "locked". */ static int -gcm_mode_encrypt_contiguous_blocks_avx(gcm_ctx_t *ctx, char *data, +gcm_mode_encrypt_contiguous_blocks_hardware(gcm_ctx_t *ctx, char *data, size_t length, crypto_data_t *out, size_t block_size) { size_t bleft = length; @@ -1128,7 +1232,7 @@ gcm_mode_encrypt_contiguous_blocks_avx(gcm_ctx_t *ctx, char *data, } /* Allocate a buffer to encrypt to if there is enough input. */ - if (bleft >= GCM_AVX_MIN_ENCRYPT_BYTES) { + if (bleft >= GCM_HARDWARE_MIN_ENCRYPT_BYTES) { ct_buf = vmem_alloc(chunk_size, KM_SLEEP); if (ct_buf == NULL) { return (CRYPTO_HOST_MEMORY); @@ -1138,11 +1242,11 @@ gcm_mode_encrypt_contiguous_blocks_avx(gcm_ctx_t *ctx, char *data, /* If we completed an incomplete block, encrypt and write it out. */ if (ctx->gcm_remainder_len > 0) { kfpu_begin(); - aes_encrypt_intel(key->encr_ks.ks32, key->nr, + aes_hardware_encrypt_block(key->encr_ks.ks32, key->nr, (const uint32_t *)cb, (uint32_t *)tmp); - gcm_xor_avx((const uint8_t *) ctx->gcm_remainder, tmp); - GHASH_AVX(ctx, tmp, block_size); + gcm_xor_hardware((const uint8_t *) ctx->gcm_remainder, tmp); + GHASH_HARDWARE(ctx, tmp, block_size); clear_fpu_regs(); kfpu_end(); rv = crypto_put_output_data(tmp, out, block_size); @@ -1157,7 +1261,7 @@ gcm_mode_encrypt_contiguous_blocks_avx(gcm_ctx_t *ctx, char *data, /* Do the bulk encryption in chunk_size blocks. 
*/ for (; bleft >= chunk_size; bleft -= chunk_size) { kfpu_begin(); - done = aesni_gcm_encrypt( + done = aes_hardware_encrypt_chunk( datap, ct_buf, chunk_size, key, cb, ghash); clear_fpu_regs(); @@ -1180,8 +1284,8 @@ gcm_mode_encrypt_contiguous_blocks_avx(gcm_ctx_t *ctx, char *data, } /* Bulk encrypt the remaining data. */ kfpu_begin(); - if (bleft >= GCM_AVX_MIN_ENCRYPT_BYTES) { - done = aesni_gcm_encrypt(datap, ct_buf, bleft, key, cb, ghash); + if (bleft >= GCM_HARDWARE_MIN_ENCRYPT_BYTES) { + done = aes_hardware_encrypt_chunk(datap, ct_buf, bleft, key, cb, ghash); if (done == 0) { rv = CRYPTO_FAILED; goto out; @@ -1196,7 +1300,7 @@ gcm_mode_encrypt_contiguous_blocks_avx(gcm_ctx_t *ctx, char *data, bleft -= done; } - /* Less than GCM_AVX_MIN_ENCRYPT_BYTES remain, operate on blocks. */ + /* Less than GCM_HARDWARE_MIN_ENCRYPT_BYTES remain, operate on blocks. */ while (bleft > 0) { if (bleft < block_size) { memcpy(ctx->gcm_remainder, datap, bleft); @@ -1205,11 +1309,11 @@ gcm_mode_encrypt_contiguous_blocks_avx(gcm_ctx_t *ctx, char *data, goto out; } /* Encrypt, hash and write out. */ - aes_encrypt_intel(key->encr_ks.ks32, key->nr, + aes_hardware_encrypt_block(key->encr_ks.ks32, key->nr, (const uint32_t *)cb, (uint32_t *)tmp); - gcm_xor_avx(datap, tmp); - GHASH_AVX(ctx, tmp, block_size); + gcm_xor_hardware(datap, tmp); + GHASH_HARDWARE(ctx, tmp, block_size); rv = crypto_put_output_data(tmp, out, block_size); if (rv != CRYPTO_SUCCESS) { goto out; @@ -1235,7 +1339,7 @@ gcm_mode_encrypt_contiguous_blocks_avx(gcm_ctx_t *ctx, char *data, * incomplete last block. Encrypt the ICB. Calculate the tag and write it out. */ static int -gcm_encrypt_final_avx(gcm_ctx_t *ctx, crypto_data_t *out, size_t block_size) +gcm_encrypt_final_hardware(gcm_ctx_t *ctx, crypto_data_t *out, size_t block_size) { uint8_t *ghash = (uint8_t *)ctx->gcm_ghash; uint32_t *J0 = (uint32_t *)ctx->gcm_J0; @@ -1259,22 +1363,22 @@ gcm_encrypt_final_avx(gcm_ctx_t *ctx, crypto_data_t *out, size_t block_size) uint8_t *tmp = (uint8_t *)ctx->gcm_tmp; const uint32_t *cb = (uint32_t *)ctx->gcm_cb; - aes_encrypt_intel(keysched, aes_rounds, cb, (uint32_t *)tmp); + aes_hardware_encrypt_block(keysched, aes_rounds, cb, (uint32_t *)tmp); memset(remainder + rem_len, 0, block_size - rem_len); for (int i = 0; i < rem_len; i++) { remainder[i] ^= tmp[i]; } - GHASH_AVX(ctx, remainder, block_size); + GHASH_HARDWARE(ctx, remainder, block_size); ctx->gcm_processed_data_len += rem_len; /* No need to increment counter_block, it's the last block. */ } /* Finish tag. */ ctx->gcm_len_a_len_c[1] = htonll(CRYPTO_BYTES2BITS(ctx->gcm_processed_data_len)); - GHASH_AVX(ctx, (const uint8_t *)ctx->gcm_len_a_len_c, block_size); - aes_encrypt_intel(keysched, aes_rounds, J0, J0); + GHASH_HARDWARE(ctx, (const uint8_t *)ctx->gcm_len_a_len_c, block_size); + aes_hardware_encrypt_block(keysched, aes_rounds, J0, J0); - gcm_xor_avx((uint8_t *)J0, ghash); + gcm_xor_hardware((uint8_t *)J0, ghash); clear_fpu_regs(); kfpu_end(); @@ -1299,7 +1403,7 @@ gcm_encrypt_final_avx(gcm_ctx_t *ctx, crypto_data_t *out, size_t block_size) * decrypt it here inplace. 
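+ * The ciphertext has been accumulated in ctx->gcm_pt_buf and is decrypted
+ * in place, in chunks of at most gcm_hardware_chunk_size bytes so the FPU
+ * is not held across overly long critical sections.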
*/ static int -gcm_decrypt_final_avx(gcm_ctx_t *ctx, crypto_data_t *out, size_t block_size) +gcm_decrypt_final_hardware(gcm_ctx_t *ctx, crypto_data_t *out, size_t block_size) { ASSERT3U(ctx->gcm_processed_data_len, ==, ctx->gcm_pt_buf_len); ASSERT3U(block_size, ==, 16); @@ -1317,13 +1421,13 @@ gcm_decrypt_final_avx(gcm_ctx_t *ctx, crypto_data_t *out, size_t block_size) size_t bleft, done; /* - * Decrypt in chunks of gcm_avx_chunk_size, which is asserted to be - * greater or equal than GCM_AVX_MIN_ENCRYPT_BYTES, and a multiple of - * GCM_AVX_MIN_DECRYPT_BYTES. + * Decrypt in chunks of gcm_hardware_chunk_size, which is asserted to be + * greater or equal than GCM_HARDWARE_MIN_ENCRYPT_BYTES, and a multiple of + * GCM_HARDWARE_MIN_DECRYPT_BYTES. */ for (bleft = pt_len; bleft >= chunk_size; bleft -= chunk_size) { kfpu_begin(); - done = aesni_gcm_decrypt(datap, datap, chunk_size, + done = aes_hardware_decrypt_chunk(datap, datap, chunk_size, (const void *)key, ctx->gcm_cb, ghash); clear_fpu_regs(); kfpu_end(); @@ -1334,8 +1438,8 @@ gcm_decrypt_final_avx(gcm_ctx_t *ctx, crypto_data_t *out, size_t block_size) } /* Decrypt remainder, which is less than chunk size, in one go. */ kfpu_begin(); - if (bleft >= GCM_AVX_MIN_DECRYPT_BYTES) { - done = aesni_gcm_decrypt(datap, datap, bleft, + if (bleft >= GCM_HARDWARE_MIN_DECRYPT_BYTES) { + done = aes_hardware_decrypt_chunk(datap, datap, bleft, (const void *)key, ctx->gcm_cb, ghash); if (done == 0) { clear_fpu_regs(); @@ -1345,10 +1449,10 @@ gcm_decrypt_final_avx(gcm_ctx_t *ctx, crypto_data_t *out, size_t block_size) datap += done; bleft -= done; } - ASSERT(bleft < GCM_AVX_MIN_DECRYPT_BYTES); + ASSERT(bleft < GCM_HARDWARE_MIN_DECRYPT_BYTES); /* - * Now less than GCM_AVX_MIN_DECRYPT_BYTES bytes remain, + * Now less than GCM_HARDWARE_MIN_DECRYPT_BYTES bytes remain, * decrypt them block by block. */ while (bleft > 0) { @@ -1359,17 +1463,17 @@ gcm_decrypt_final_avx(gcm_ctx_t *ctx, crypto_data_t *out, size_t block_size) memset(lastb, 0, block_size); memcpy(lastb, datap, bleft); /* The GCM processing. */ - GHASH_AVX(ctx, lastb, block_size); - aes_encrypt_intel(key->encr_ks.ks32, key->nr, cb, tmp); + GHASH_HARDWARE(ctx, lastb, block_size); + aes_hardware_encrypt_block(key->encr_ks.ks32, key->nr, cb, tmp); for (size_t i = 0; i < bleft; i++) { datap[i] = lastb[i] ^ ((uint8_t *)tmp)[i]; } break; } /* The GCM processing. */ - GHASH_AVX(ctx, datap, block_size); - aes_encrypt_intel(key->encr_ks.ks32, key->nr, cb, tmp); - gcm_xor_avx((uint8_t *)tmp, datap); + GHASH_HARDWARE(ctx, datap, block_size); + aes_hardware_encrypt_block(key->encr_ks.ks32, key->nr, cb, tmp); + gcm_xor_hardware((uint8_t *)tmp, datap); gcm_incr_counter_block(ctx); datap += block_size; @@ -1382,11 +1486,11 @@ gcm_decrypt_final_avx(gcm_ctx_t *ctx, crypto_data_t *out, size_t block_size) } /* Decryption done, finish the tag. */ ctx->gcm_len_a_len_c[1] = htonll(CRYPTO_BYTES2BITS(pt_len)); - GHASH_AVX(ctx, (uint8_t *)ctx->gcm_len_a_len_c, block_size); - aes_encrypt_intel(key->encr_ks.ks32, key->nr, (uint32_t *)ctx->gcm_J0, + GHASH_HARDWARE(ctx, (uint8_t *)ctx->gcm_len_a_len_c, block_size); + aes_hardware_encrypt_block(key->encr_ks.ks32, key->nr, (uint32_t *)ctx->gcm_J0, (uint32_t *)ctx->gcm_J0); - gcm_xor_avx((uint8_t *)ctx->gcm_J0, (uint8_t *)ghash); + gcm_xor_hardware((uint8_t *)ctx->gcm_J0, (uint8_t *)ghash); /* We are done with the FPU, restore its state. */ clear_fpu_regs(); @@ -1410,7 +1514,7 @@ gcm_decrypt_final_avx(gcm_ctx_t *ctx, crypto_data_t *out, size_t block_size) * initial counter block. 
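+ * H is obtained by encrypting the all-zero block with
+ * aes_hardware_encrypt_block(), and the hash table is built by
+ * gcm_init_htab_hardware(), which maps to gcm_init_htab_avx() on x86_64
+ * and to the OpenSSL gcm_init_v8() PMULL routine on aarch64.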
*/ static int -gcm_init_avx(gcm_ctx_t *ctx, const uint8_t *iv, size_t iv_len, +gcm_init_hardware(gcm_ctx_t *ctx, const uint8_t *iv, size_t iv_len, const uint8_t *auth_data, size_t auth_data_len, size_t block_size) { uint8_t *cb = (uint8_t *)ctx->gcm_cb; @@ -1429,10 +1533,10 @@ gcm_init_avx(gcm_ctx_t *ctx, const uint8_t *iv, size_t iv_len, memset(ctx->gcm_ghash, 0, sizeof (ctx->gcm_ghash)); memset(H, 0, sizeof (ctx->gcm_H)); kfpu_begin(); - aes_encrypt_intel(keysched, aes_rounds, + aes_hardware_encrypt_block(keysched, aes_rounds, (const uint32_t *)H, (uint32_t *)H); - gcm_init_htab_avx(ctx->gcm_Htable, H); + gcm_init_htab_hardware(ctx->gcm_Htable, H); if (iv_len == 12) { memcpy(cb, iv, 12); @@ -1460,7 +1564,7 @@ gcm_init_avx(gcm_ctx_t *ctx, const uint8_t *iv, size_t iv_len, /* Ghash AAD in chunk_size blocks. */ for (bleft = auth_data_len; bleft >= chunk_size; bleft -= chunk_size) { - GHASH_AVX(ctx, datap, chunk_size); + GHASH_HARDWARE(ctx, datap, chunk_size); datap += chunk_size; clear_fpu_regs(); kfpu_end(); @@ -1472,7 +1576,7 @@ gcm_init_avx(gcm_ctx_t *ctx, const uint8_t *iv, size_t iv_len, bleft -= incomp; if (bleft > 0) { - GHASH_AVX(ctx, datap, bleft); + GHASH_HARDWARE(ctx, datap, bleft); datap += bleft; } if (incomp > 0) { @@ -1481,7 +1585,7 @@ gcm_init_avx(gcm_ctx_t *ctx, const uint8_t *iv, size_t iv_len, memset(authp, 0, block_size); memcpy(authp, datap, incomp); - GHASH_AVX(ctx, authp, block_size); + GHASH_HARDWARE(ctx, authp, block_size); } } clear_fpu_regs(); @@ -1491,7 +1595,7 @@ gcm_init_avx(gcm_ctx_t *ctx, const uint8_t *iv, size_t iv_len, #if defined(_KERNEL) static int -icp_gcm_avx_set_chunk_size(const char *buf, zfs_kernel_param_t *kp) +icp_gcm_hardware_set_chunk_size(const char *buf, zfs_kernel_param_t *kp) { unsigned long val; char val_rounded[16]; @@ -1501,9 +1605,9 @@ icp_gcm_avx_set_chunk_size(const char *buf, zfs_kernel_param_t *kp) if (error) return (error); - val = (val / GCM_AVX_MIN_DECRYPT_BYTES) * GCM_AVX_MIN_DECRYPT_BYTES; + val = (val / GCM_HARDWARE_MIN_DECRYPT_BYTES) * GCM_HARDWARE_MIN_DECRYPT_BYTES; - if (val < GCM_AVX_MIN_ENCRYPT_BYTES || val > GCM_AVX_MAX_CHUNK_SIZE) + if (val < GCM_HARDWARE_MIN_ENCRYPT_BYTES || val > GCM_HARDWARE_MAX_CHUNK_SIZE) return (-EINVAL); snprintf(val_rounded, 16, "%u", (uint32_t)val); @@ -1511,10 +1615,10 @@ icp_gcm_avx_set_chunk_size(const char *buf, zfs_kernel_param_t *kp) return (error); } -module_param_call(icp_gcm_avx_chunk_size, icp_gcm_avx_set_chunk_size, - param_get_uint, &gcm_avx_chunk_size, 0644); +module_param_call(icp_gcm_hardware_chunk_size, icp_gcm_hardware_set_chunk_size, + param_get_uint, &gcm_hardware_chunk_size, 0644); -MODULE_PARM_DESC(icp_gcm_avx_chunk_size, +MODULE_PARM_DESC(icp_gcm_hardware_chunk_size, "How many bytes to process while owning the FPU"); #endif /* defined(__KERNEL) */ diff --git a/module/icp/algs/modes/modes.c b/module/icp/algs/modes/modes.c index 786a89f10c90..b1f7f448c90a 100644 --- a/module/icp/algs/modes/modes.c +++ b/module/icp/algs/modes/modes.c @@ -170,7 +170,7 @@ gcm_clear_ctx(gcm_ctx_t *ctx) explicit_memset(ctx->gcm_remainder, 0, sizeof (ctx->gcm_remainder)); explicit_memset(ctx->gcm_H, 0, sizeof (ctx->gcm_H)); #if defined(CAN_USE_GCM_ASM) - if (ctx->gcm_use_avx == B_TRUE) { + if (ctx->gcm_use_hardware == B_TRUE) { ASSERT3P(ctx->gcm_Htable, !=, NULL); memset(ctx->gcm_Htable, 0, ctx->gcm_htab_len); kmem_free(ctx->gcm_Htable, ctx->gcm_htab_len); diff --git a/module/icp/asm-aarch64/aes/aes-armv8-crypto.S b/module/icp/asm-aarch64/aes/aes-armv8-crypto.S new file mode 100644 index 
000000000000..92bc566820d6 --- /dev/null +++ b/module/icp/asm-aarch64/aes/aes-armv8-crypto.S @@ -0,0 +1,508 @@ +/* + * Copyright 2011-2024 The OpenSSL Project Authors. All Rights Reserved. + * + * Licensed under the Apache License 2.0 (the "License"). You may not use + * this file except in compliance with the License. You can obtain a copy + * in the file LICENSE in the source distribution or at + * https://www.openssl.org/source/license.html + */ + +#if defined(__aarch64__) +#ifndef OSSL_CRYPTO_ARM_ARCH_H +# define OSSL_CRYPTO_ARM_ARCH_H + +# if !defined(__ARM_ARCH__) +# if defined(__CC_ARM) +# define __ARM_ARCH__ __TARGET_ARCH_ARM +# if defined(__BIG_ENDIAN) +# define __ARMEB__ +# else +# define __ARMEL__ +# endif +# elif defined(__GNUC__) +# if defined(__aarch64__) +# define __ARM_ARCH__ 8 + /* + * Why doesn't gcc define __ARM_ARCH__? Instead it defines + * bunch of below macros. See all_architectures[] table in + * gcc/config/arm/arm.c. On a side note it defines + * __ARMEL__/__ARMEB__ for little-/big-endian. + */ +# elif defined(__ARM_ARCH) +# define __ARM_ARCH__ __ARM_ARCH +# elif defined(__ARM_ARCH_8A__) +# define __ARM_ARCH__ 8 +# elif defined(__ARM_ARCH_7__) || defined(__ARM_ARCH_7A__) || \ + defined(__ARM_ARCH_7R__)|| defined(__ARM_ARCH_7M__) || \ + defined(__ARM_ARCH_7EM__) +# define __ARM_ARCH__ 7 +# elif defined(__ARM_ARCH_6__) || defined(__ARM_ARCH_6J__) || \ + defined(__ARM_ARCH_6K__)|| defined(__ARM_ARCH_6M__) || \ + defined(__ARM_ARCH_6Z__)|| defined(__ARM_ARCH_6ZK__) || \ + defined(__ARM_ARCH_6T2__) +# define __ARM_ARCH__ 6 +# elif defined(__ARM_ARCH_5__) || defined(__ARM_ARCH_5T__) || \ + defined(__ARM_ARCH_5E__)|| defined(__ARM_ARCH_5TE__) || \ + defined(__ARM_ARCH_5TEJ__) +# define __ARM_ARCH__ 5 +# elif defined(__ARM_ARCH_4__) || defined(__ARM_ARCH_4T__) +# define __ARM_ARCH__ 4 +# else +# error "unsupported ARM architecture" +# endif +# elif defined(__ARM_ARCH) +# define __ARM_ARCH__ __ARM_ARCH +# endif +# endif + +# if !defined(__ARM_MAX_ARCH__) +# define __ARM_MAX_ARCH__ __ARM_ARCH__ +# endif + +# if __ARM_MAX_ARCH__<__ARM_ARCH__ +# error "__ARM_MAX_ARCH__ can't be less than __ARM_ARCH__" +# elif __ARM_MAX_ARCH__!=__ARM_ARCH__ +# if __ARM_ARCH__<7 && __ARM_MAX_ARCH__>=7 && defined(__ARMEB__) +# error "can't build universal big-endian binary" +# endif +# endif + +# ifndef __ASSEMBLER__ +extern unsigned int OPENSSL_armcap_P; +extern unsigned int OPENSSL_arm_midr; +extern unsigned int OPENSSL_armv8_rsa_neonized; +# endif + +# define ARMV7_NEON (1<<0) +# define ARMV7_TICK (1<<1) +# define ARMV8_AES (1<<2) +# define ARMV8_SHA1 (1<<3) +# define ARMV8_SHA256 (1<<4) +# define ARMV8_PMULL (1<<5) +# define ARMV8_SHA512 (1<<6) +# define ARMV8_CPUID (1<<7) +# define ARMV8_RNG (1<<8) +# define ARMV8_SM3 (1<<9) +# define ARMV8_SM4 (1<<10) +# define ARMV8_SHA3 (1<<11) +# define ARMV8_UNROLL8_EOR3 (1<<12) +# define ARMV8_SVE (1<<13) +# define ARMV8_SVE2 (1<<14) +# define ARMV8_HAVE_SHA3_AND_WORTH_USING (1<<15) +# define ARMV8_UNROLL12_EOR3 (1<<16) + +/* + * MIDR_EL1 system register + * + * 63___ _ ___32_31___ _ ___24_23_____20_19_____16_15__ _ __4_3_______0 + * | | | | | | | + * |RES0 | Implementer | Variant | Arch | PartNum |Revision| + * |____ _ _____|_____ _ _____|_________|_______ _|____ _ ___|________| + * + */ + +# define ARM_CPU_IMP_ARM 0x41 +# define HISI_CPU_IMP 0x48 +# define ARM_CPU_IMP_APPLE 0x61 +# define ARM_CPU_IMP_MICROSOFT 0x6D +# define ARM_CPU_IMP_AMPERE 0xC0 + +# define ARM_CPU_PART_CORTEX_A72 0xD08 +# define ARM_CPU_PART_N1 0xD0C +# define ARM_CPU_PART_V1 
0xD40 +# define ARM_CPU_PART_N2 0xD49 +# define HISI_CPU_PART_KP920 0xD01 +# define ARM_CPU_PART_V2 0xD4F + +# define APPLE_CPU_PART_M1_ICESTORM 0x022 +# define APPLE_CPU_PART_M1_FIRESTORM 0x023 +# define APPLE_CPU_PART_M1_ICESTORM_PRO 0x024 +# define APPLE_CPU_PART_M1_FIRESTORM_PRO 0x025 +# define APPLE_CPU_PART_M1_ICESTORM_MAX 0x028 +# define APPLE_CPU_PART_M1_FIRESTORM_MAX 0x029 +# define APPLE_CPU_PART_M2_BLIZZARD 0x032 +# define APPLE_CPU_PART_M2_AVALANCHE 0x033 +# define APPLE_CPU_PART_M2_BLIZZARD_PRO 0x034 +# define APPLE_CPU_PART_M2_AVALANCHE_PRO 0x035 +# define APPLE_CPU_PART_M2_BLIZZARD_MAX 0x038 +# define APPLE_CPU_PART_M2_AVALANCHE_MAX 0x039 + +# define MICROSOFT_CPU_PART_COBALT_100 0xD49 + +# define MIDR_PARTNUM_SHIFT 4 +# define MIDR_PARTNUM_MASK (0xfffU << MIDR_PARTNUM_SHIFT) +# define MIDR_PARTNUM(midr) \ + (((midr) & MIDR_PARTNUM_MASK) >> MIDR_PARTNUM_SHIFT) + +# define MIDR_IMPLEMENTER_SHIFT 24 +# define MIDR_IMPLEMENTER_MASK (0xffU << MIDR_IMPLEMENTER_SHIFT) +# define MIDR_IMPLEMENTER(midr) \ + (((midr) & MIDR_IMPLEMENTER_MASK) >> MIDR_IMPLEMENTER_SHIFT) + +# define MIDR_ARCHITECTURE_SHIFT 16 +# define MIDR_ARCHITECTURE_MASK (0xfU << MIDR_ARCHITECTURE_SHIFT) +# define MIDR_ARCHITECTURE(midr) \ + (((midr) & MIDR_ARCHITECTURE_MASK) >> MIDR_ARCHITECTURE_SHIFT) + +# define MIDR_CPU_MODEL_MASK \ + (MIDR_IMPLEMENTER_MASK | \ + MIDR_PARTNUM_MASK | \ + MIDR_ARCHITECTURE_MASK) + +# define MIDR_CPU_MODEL(imp, partnum) \ + (((imp) << MIDR_IMPLEMENTER_SHIFT) | \ + (0xfU << MIDR_ARCHITECTURE_SHIFT) | \ + ((partnum) << MIDR_PARTNUM_SHIFT)) + +# define MIDR_IS_CPU_MODEL(midr, imp, partnum) \ + (((midr) & MIDR_CPU_MODEL_MASK) == MIDR_CPU_MODEL(imp, partnum)) + +#if defined(__ASSEMBLER__) + + /* + * Support macros for + * - Armv8.3-A Pointer Authentication and + * - Armv8.5-A Branch Target Identification + * features which require emitting a .note.gnu.property section with the + * appropriate architecture-dependent feature bits set. 
+ * Read more: "ELF for the Arm® 64-bit Architecture" + */ + +# if defined(__ARM_FEATURE_BTI_DEFAULT) && __ARM_FEATURE_BTI_DEFAULT == 1 +# define GNU_PROPERTY_AARCH64_BTI (1 << 0) /* Has Branch Target Identification */ +# define AARCH64_VALID_CALL_TARGET hint #34 /* BTI 'c' */ +# else +# define GNU_PROPERTY_AARCH64_BTI 0 /* No Branch Target Identification */ +# define AARCH64_VALID_CALL_TARGET +# endif + +# if defined(__ARM_FEATURE_PAC_DEFAULT) && \ + (__ARM_FEATURE_PAC_DEFAULT & 1) == 1 /* Signed with A-key */ +# define GNU_PROPERTY_AARCH64_POINTER_AUTH \ + (1 << 1) /* Has Pointer Authentication */ +# define AARCH64_SIGN_LINK_REGISTER hint #25 /* PACIASP */ +# define AARCH64_VALIDATE_LINK_REGISTER hint #29 /* AUTIASP */ +# elif defined(__ARM_FEATURE_PAC_DEFAULT) && \ + (__ARM_FEATURE_PAC_DEFAULT & 2) == 2 /* Signed with B-key */ +# define GNU_PROPERTY_AARCH64_POINTER_AUTH \ + (1 << 1) /* Has Pointer Authentication */ +# define AARCH64_SIGN_LINK_REGISTER hint #27 /* PACIBSP */ +# define AARCH64_VALIDATE_LINK_REGISTER hint #31 /* AUTIBSP */ +# else +# define GNU_PROPERTY_AARCH64_POINTER_AUTH 0 /* No Pointer Authentication */ +# if GNU_PROPERTY_AARCH64_BTI != 0 +# define AARCH64_SIGN_LINK_REGISTER AARCH64_VALID_CALL_TARGET +# else +# define AARCH64_SIGN_LINK_REGISTER +# endif +# define AARCH64_VALIDATE_LINK_REGISTER +# endif + +# if GNU_PROPERTY_AARCH64_POINTER_AUTH != 0 || GNU_PROPERTY_AARCH64_BTI != 0 + .pushsection .note.gnu.property, "a"; + .balign 8; + .long 4; + .long 0x10; + .long 0x5; + .asciz "GNU"; + .long 0xc0000000; /* GNU_PROPERTY_AARCH64_FEATURE_1_AND */ + .long 4; + .long (GNU_PROPERTY_AARCH64_POINTER_AUTH | GNU_PROPERTY_AARCH64_BTI); + .long 0; + .popsection; +# endif + +# endif /* defined __ASSEMBLER__ */ + +# define IS_CPU_SUPPORT_UNROLL8_EOR3() \ + (OPENSSL_armcap_P & ARMV8_UNROLL8_EOR3) + +#endif + +.arch armv8-a+crypto +.text +.align 5 +.Lrcon: +.long 0x01,0x01,0x01,0x01 +.long 0x0c0f0e0d,0x0c0f0e0d,0x0c0f0e0d,0x0c0f0e0d // rotate-n-splat +.long 0x1b,0x1b,0x1b,0x1b + +.globl aes_v8_set_encrypt_key +.type aes_v8_set_encrypt_key,%function +.align 5 +aes_v8_set_encrypt_key: +.Lenc_key: + AARCH64_VALID_CALL_TARGET + // Armv8.3-A PAuth: even though x30 is pushed to stack it is not popped later. + stp x29,x30,[sp,#-16]! 
+ add x29,sp,#0 + mov x3,#-1 + cmp x0,#0 + b.eq .Lenc_key_abort + cmp x2,#0 + b.eq .Lenc_key_abort + mov x3,#-2 + cmp w1,#128 + b.lt .Lenc_key_abort + cmp w1,#256 + b.gt .Lenc_key_abort + tst w1,#0x3f + b.ne .Lenc_key_abort + + adr x3,.Lrcon + cmp w1,#192 + + eor v0.16b,v0.16b,v0.16b + ld1 {v3.16b},[x0],#16 + mov w1,#8 // reuse w1 + ld1 {v1.4s,v2.4s},[x3],#32 + + b.lt .Loop128 + b.eq .L192 + b .L256 + +.align 4 +.Loop128: + tbl v6.16b,{v3.16b},v2.16b + ext v5.16b,v0.16b,v3.16b,#12 + st1 {v3.4s},[x2],#16 + aese v6.16b,v0.16b + subs w1,w1,#1 + + eor v3.16b,v3.16b,v5.16b + ext v5.16b,v0.16b,v5.16b,#12 + eor v3.16b,v3.16b,v5.16b + ext v5.16b,v0.16b,v5.16b,#12 + eor v6.16b,v6.16b,v1.16b + eor v3.16b,v3.16b,v5.16b + shl v1.16b,v1.16b,#1 + eor v3.16b,v3.16b,v6.16b + b.ne .Loop128 + + ld1 {v1.4s},[x3] + + tbl v6.16b,{v3.16b},v2.16b + ext v5.16b,v0.16b,v3.16b,#12 + st1 {v3.4s},[x2],#16 + aese v6.16b,v0.16b + + eor v3.16b,v3.16b,v5.16b + ext v5.16b,v0.16b,v5.16b,#12 + eor v3.16b,v3.16b,v5.16b + ext v5.16b,v0.16b,v5.16b,#12 + eor v6.16b,v6.16b,v1.16b + eor v3.16b,v3.16b,v5.16b + shl v1.16b,v1.16b,#1 + eor v3.16b,v3.16b,v6.16b + + tbl v6.16b,{v3.16b},v2.16b + ext v5.16b,v0.16b,v3.16b,#12 + st1 {v3.4s},[x2],#16 + aese v6.16b,v0.16b + + eor v3.16b,v3.16b,v5.16b + ext v5.16b,v0.16b,v5.16b,#12 + eor v3.16b,v3.16b,v5.16b + ext v5.16b,v0.16b,v5.16b,#12 + eor v6.16b,v6.16b,v1.16b + eor v3.16b,v3.16b,v5.16b + eor v3.16b,v3.16b,v6.16b + st1 {v3.4s},[x2] + add x2,x2,#0x50 + + mov w12,#10 + b .Ldone + +.align 4 +.L192: + ld1 {v4.8b},[x0],#8 + movi v6.16b,#8 // borrow v6.16b + st1 {v3.4s},[x2],#16 + sub v2.16b,v2.16b,v6.16b // adjust the mask + +.Loop192: + tbl v6.16b,{v4.16b},v2.16b + ext v5.16b,v0.16b,v3.16b,#12 +#ifdef __AARCH64EB__ + st1 {v4.4s},[x2],#16 + sub x2,x2,#8 +#else + st1 {v4.8b},[x2],#8 +#endif + aese v6.16b,v0.16b + subs w1,w1,#1 + + eor v3.16b,v3.16b,v5.16b + ext v5.16b,v0.16b,v5.16b,#12 + eor v3.16b,v3.16b,v5.16b + ext v5.16b,v0.16b,v5.16b,#12 + eor v3.16b,v3.16b,v5.16b + + dup v5.4s,v3.s[3] + eor v5.16b,v5.16b,v4.16b + eor v6.16b,v6.16b,v1.16b + ext v4.16b,v0.16b,v4.16b,#12 + shl v1.16b,v1.16b,#1 + eor v4.16b,v4.16b,v5.16b + eor v3.16b,v3.16b,v6.16b + eor v4.16b,v4.16b,v6.16b + st1 {v3.4s},[x2],#16 + b.ne .Loop192 + + mov w12,#12 + add x2,x2,#0x20 + b .Ldone + +.align 4 +.L256: + ld1 {v4.16b},[x0] + mov w1,#7 + mov w12,#14 + st1 {v3.4s},[x2],#16 + +.Loop256: + tbl v6.16b,{v4.16b},v2.16b + ext v5.16b,v0.16b,v3.16b,#12 + st1 {v4.4s},[x2],#16 + aese v6.16b,v0.16b + subs w1,w1,#1 + + eor v3.16b,v3.16b,v5.16b + ext v5.16b,v0.16b,v5.16b,#12 + eor v3.16b,v3.16b,v5.16b + ext v5.16b,v0.16b,v5.16b,#12 + eor v6.16b,v6.16b,v1.16b + eor v3.16b,v3.16b,v5.16b + shl v1.16b,v1.16b,#1 + eor v3.16b,v3.16b,v6.16b + st1 {v3.4s},[x2],#16 + b.eq .Ldone + + dup v6.4s,v3.s[3] // just splat + ext v5.16b,v0.16b,v4.16b,#12 + aese v6.16b,v0.16b + + eor v4.16b,v4.16b,v5.16b + ext v5.16b,v0.16b,v5.16b,#12 + eor v4.16b,v4.16b,v5.16b + ext v5.16b,v0.16b,v5.16b,#12 + eor v4.16b,v4.16b,v5.16b + + eor v4.16b,v4.16b,v6.16b + b .Loop256 + +.Ldone: + // str w12,[x2] // don't store, there is the next key in ICP + mov x3,#0 + +.Lenc_key_abort: + mov x0,x3 // return value + ldr x29,[sp],#16 + ret +.size aes_v8_set_encrypt_key,.-aes_v8_set_encrypt_key + +.globl aes_v8_set_decrypt_key +.type aes_v8_set_decrypt_key,%function +.align 5 +aes_v8_set_decrypt_key: + AARCH64_SIGN_LINK_REGISTER + stp x29,x30,[sp,#-16]! 
+ add x29,sp,#0 + bl .Lenc_key + + cmp x0,#0 + b.ne .Ldec_key_abort + + sub x2,x2,#240 // restore original x2 + mov x4,#-16 + add x0,x2,x12,lsl#4 // end of key schedule + + ld1 {v0.4s},[x2] + ld1 {v1.4s},[x0] + st1 {v0.4s},[x0],x4 + st1 {v1.4s},[x2],#16 + +.Loop_imc: + ld1 {v0.4s},[x2] + ld1 {v1.4s},[x0] + aesimc v0.16b,v0.16b + aesimc v1.16b,v1.16b + st1 {v0.4s},[x0],x4 + st1 {v1.4s},[x2],#16 + cmp x0,x2 + b.hi .Loop_imc + + ld1 {v0.4s},[x2] + aesimc v0.16b,v0.16b + st1 {v0.4s},[x0] + + eor x0,x0,x0 // return value +.Ldec_key_abort: + ldp x29,x30,[sp],#16 + AARCH64_VALIDATE_LINK_REGISTER + ret +.size aes_v8_set_decrypt_key,.-aes_v8_set_decrypt_key + +.globl aes_v8_encrypt +.type aes_v8_encrypt,%function +.align 5 +aes_v8_encrypt: + AARCH64_VALID_CALL_TARGET + // ldr w3,[x2,#240] not required in ICP interface + ld1 {v0.4s},[x2],#16 + ld1 {v2.16b},[x0] + sub w3,w3,#2 + ld1 {v1.4s},[x2],#16 + +.Loop_enc: + aese v2.16b,v0.16b + aesmc v2.16b,v2.16b + ld1 {v0.4s},[x2],#16 + subs w3,w3,#2 + aese v2.16b,v1.16b + aesmc v2.16b,v2.16b + ld1 {v1.4s},[x2],#16 + b.gt .Loop_enc + + aese v2.16b,v0.16b + aesmc v2.16b,v2.16b + ld1 {v0.4s},[x2] + aese v2.16b,v1.16b + eor v2.16b,v2.16b,v0.16b + + st1 {v2.16b},[x1] + ret +.size aes_v8_encrypt,.-aes_v8_encrypt + +.globl aes_v8_decrypt +.type aes_v8_decrypt,%function +.align 5 +aes_v8_decrypt: + AARCH64_VALID_CALL_TARGET + // ldr w3,[x2,#240] not required in ICP interface + ld1 {v0.4s},[x2],#16 + ld1 {v2.16b},[x0] + sub w3,w3,#2 + ld1 {v1.4s},[x2],#16 + +.Loop_dec: + aesd v2.16b,v0.16b + aesimc v2.16b,v2.16b + ld1 {v0.4s},[x2],#16 + subs w3,w3,#2 + aesd v2.16b,v1.16b + aesimc v2.16b,v2.16b + ld1 {v1.4s},[x2],#16 + b.gt .Loop_dec + + aesd v2.16b,v0.16b + aesimc v2.16b,v2.16b + ld1 {v0.4s},[x2] + aesd v2.16b,v1.16b + eor v2.16b,v2.16b,v0.16b + + st1 {v2.16b},[x1] + ret +.size aes_v8_decrypt,.-aes_v8_decrypt + +#endif \ No newline at end of file diff --git a/module/icp/asm-aarch64/modes/aes-gcm-armv8_64.S b/module/icp/asm-aarch64/modes/aes-gcm-armv8_64.S new file mode 100644 index 000000000000..71845082414f --- /dev/null +++ b/module/icp/asm-aarch64/modes/aes-gcm-armv8_64.S @@ -0,0 +1,6451 @@ +/* + * Copyright 2011-2024 The OpenSSL Project Authors. All Rights Reserved. + * + * Licensed under the Apache License 2.0 (the "License"). You may not use + * this file except in compliance with the License. You can obtain a copy + * in the file LICENSE in the source distribution or at + * https://www.openssl.org/source/license.html + */ + +#if defined(__aarch64__) +#include "arm_arch.h" + +.arch armv8.1-a+crypto +.text +.globl aes_gcm_enc_128_kernel +.type aes_gcm_enc_128_kernel,%function +.align 4 +aes_gcm_enc_128_kernel: + AARCH64_VALID_CALL_TARGET + cbz x1, .L128_enc_ret + stp x19, x20, [sp, #-112]! 
+ mov x16, x4 + mov x8, x5 + stp x21, x22, [sp, #16] + stp x23, x24, [sp, #32] + stp d8, d9, [sp, #48] + stp d10, d11, [sp, #64] + stp d12, d13, [sp, #80] + stp d14, d15, [sp, #96] + + ldp x10, x11, [x16] //ctr96_b64, ctr96_t32 +#ifdef __AARCH64EB__ + rev x10, x10 + rev x11, x11 +#endif + ldp x13, x14, [x8, #160] //load rk10 +#ifdef __AARCH64EB__ + ror x13, x13, #32 + ror x14, x14, #32 +#endif + ld1 {v11.16b}, [x3] + ext v11.16b, v11.16b, v11.16b, #8 + rev64 v11.16b, v11.16b + lsr x5, x1, #3 //byte_len + mov x15, x5 + + ld1 {v18.4s}, [x8], #16 //load rk0 + add x4, x0, x1, lsr #3 //end_input_ptr + sub x5, x5, #1 //byte_len - 1 + + lsr x12, x11, #32 + ldr q15, [x3, #112] //load h4l | h4h +#ifndef __AARCH64EB__ + ext v15.16b, v15.16b, v15.16b, #8 +#endif + fmov d1, x10 //CTR block 1 + rev w12, w12 //rev_ctr32 + + add w12, w12, #1 //increment rev_ctr32 + orr w11, w11, w11 + ld1 {v19.4s}, [x8], #16 //load rk1 + + rev w9, w12 //CTR block 1 + add w12, w12, #1 //CTR block 1 + fmov d3, x10 //CTR block 3 + + orr x9, x11, x9, lsl #32 //CTR block 1 + ld1 { v0.16b}, [x16] //special case vector load initial counter so we can start first AES block as quickly as possible + + fmov v1.d[1], x9 //CTR block 1 + rev w9, w12 //CTR block 2 + + fmov d2, x10 //CTR block 2 + orr x9, x11, x9, lsl #32 //CTR block 2 + add w12, w12, #1 //CTR block 2 + + fmov v2.d[1], x9 //CTR block 2 + rev w9, w12 //CTR block 3 + + orr x9, x11, x9, lsl #32 //CTR block 3 + ld1 {v20.4s}, [x8], #16 //load rk2 + + add w12, w12, #1 //CTR block 3 + fmov v3.d[1], x9 //CTR block 3 + + ldr q14, [x3, #80] //load h3l | h3h +#ifndef __AARCH64EB__ + ext v14.16b, v14.16b, v14.16b, #8 +#endif + aese v1.16b, v18.16b + aesmc v1.16b, v1.16b //AES block 1 - round 0 + ld1 {v21.4s}, [x8], #16 //load rk3 + + aese v2.16b, v18.16b + aesmc v2.16b, v2.16b //AES block 2 - round 0 + ldr q12, [x3, #32] //load h1l | h1h +#ifndef __AARCH64EB__ + ext v12.16b, v12.16b, v12.16b, #8 +#endif + + aese v0.16b, v18.16b + aesmc v0.16b, v0.16b //AES block 0 - round 0 + ld1 {v22.4s}, [x8], #16 //load rk4 + + aese v3.16b, v18.16b + aesmc v3.16b, v3.16b //AES block 3 - round 0 + ld1 {v23.4s}, [x8], #16 //load rk5 + + aese v2.16b, v19.16b + aesmc v2.16b, v2.16b //AES block 2 - round 1 + trn2 v17.2d, v14.2d, v15.2d //h4l | h3l + + aese v0.16b, v19.16b + aesmc v0.16b, v0.16b //AES block 0 - round 1 + ld1 {v24.4s}, [x8], #16 //load rk6 + + aese v1.16b, v19.16b + aesmc v1.16b, v1.16b //AES block 1 - round 1 + ld1 {v25.4s}, [x8], #16 //load rk7 + + aese v3.16b, v19.16b + aesmc v3.16b, v3.16b //AES block 3 - round 1 + trn1 v9.2d, v14.2d, v15.2d //h4h | h3h + + aese v0.16b, v20.16b + aesmc v0.16b, v0.16b //AES block 0 - round 2 + ld1 {v26.4s}, [x8], #16 //load rk8 + + aese v1.16b, v20.16b + aesmc v1.16b, v1.16b //AES block 1 - round 2 + ldr q13, [x3, #64] //load h2l | h2h +#ifndef __AARCH64EB__ + ext v13.16b, v13.16b, v13.16b, #8 +#endif + + aese v3.16b, v20.16b + aesmc v3.16b, v3.16b //AES block 3 - round 2 + + aese v2.16b, v20.16b + aesmc v2.16b, v2.16b //AES block 2 - round 2 + eor v17.16b, v17.16b, v9.16b //h4k | h3k + + aese v0.16b, v21.16b + aesmc v0.16b, v0.16b //AES block 0 - round 3 + + aese v1.16b, v21.16b + aesmc v1.16b, v1.16b //AES block 1 - round 3 + + aese v2.16b, v21.16b + aesmc v2.16b, v2.16b //AES block 2 - round 3 + ld1 {v27.4s}, [x8], #16 //load rk9 + + aese v3.16b, v21.16b + aesmc v3.16b, v3.16b //AES block 3 - round 3 + + and x5, x5, #0xffffffffffffffc0 //number of bytes to be processed in main loop (at least 1 byte must be handled by tail) + trn2 v16.2d, v12.2d, 
v13.2d //h2l | h1l + + aese v3.16b, v22.16b + aesmc v3.16b, v3.16b //AES block 3 - round 4 + add x5, x5, x0 + + aese v2.16b, v22.16b + aesmc v2.16b, v2.16b //AES block 2 - round 4 + cmp x0, x5 //check if we have <= 4 blocks + + aese v0.16b, v22.16b + aesmc v0.16b, v0.16b //AES block 0 - round 4 + + aese v3.16b, v23.16b + aesmc v3.16b, v3.16b //AES block 3 - round 5 + + aese v2.16b, v23.16b + aesmc v2.16b, v2.16b //AES block 2 - round 5 + + aese v0.16b, v23.16b + aesmc v0.16b, v0.16b //AES block 0 - round 5 + + aese v3.16b, v24.16b + aesmc v3.16b, v3.16b //AES block 3 - round 6 + + aese v1.16b, v22.16b + aesmc v1.16b, v1.16b //AES block 1 - round 4 + + aese v2.16b, v24.16b + aesmc v2.16b, v2.16b //AES block 2 - round 6 + trn1 v8.2d, v12.2d, v13.2d //h2h | h1h + + aese v0.16b, v24.16b + aesmc v0.16b, v0.16b //AES block 0 - round 6 + + aese v1.16b, v23.16b + aesmc v1.16b, v1.16b //AES block 1 - round 5 + + aese v3.16b, v25.16b + aesmc v3.16b, v3.16b //AES block 3 - round 7 + + aese v0.16b, v25.16b + aesmc v0.16b, v0.16b //AES block 0 - round 7 + + aese v1.16b, v24.16b + aesmc v1.16b, v1.16b //AES block 1 - round 6 + + aese v2.16b, v25.16b + aesmc v2.16b, v2.16b //AES block 2 - round 7 + + aese v0.16b, v26.16b + aesmc v0.16b, v0.16b //AES block 0 - round 8 + + aese v1.16b, v25.16b + aesmc v1.16b, v1.16b //AES block 1 - round 7 + + aese v2.16b, v26.16b + aesmc v2.16b, v2.16b //AES block 2 - round 8 + + aese v3.16b, v26.16b + aesmc v3.16b, v3.16b //AES block 3 - round 8 + + aese v1.16b, v26.16b + aesmc v1.16b, v1.16b //AES block 1 - round 8 + + aese v2.16b, v27.16b //AES block 2 - round 9 + + aese v0.16b, v27.16b //AES block 0 - round 9 + + eor v16.16b, v16.16b, v8.16b //h2k | h1k + + aese v1.16b, v27.16b //AES block 1 - round 9 + + aese v3.16b, v27.16b //AES block 3 - round 9 + b.ge .L128_enc_tail //handle tail + + ldp x6, x7, [x0, #0] //AES block 0 - load plaintext +#ifdef __AARCH64EB__ + rev x6, x6 + rev x7, x7 +#endif + ldp x21, x22, [x0, #32] //AES block 2 - load plaintext +#ifdef __AARCH64EB__ + rev x21, x21 + rev x22, x22 +#endif + ldp x19, x20, [x0, #16] //AES block 1 - load plaintext +#ifdef __AARCH64EB__ + rev x19, x19 + rev x20, x20 +#endif + ldp x23, x24, [x0, #48] //AES block 3 - load plaintext +#ifdef __AARCH64EB__ + rev x23, x23 + rev x24, x24 +#endif + eor x6, x6, x13 //AES block 0 - round 10 low + eor x7, x7, x14 //AES block 0 - round 10 high + + eor x21, x21, x13 //AES block 2 - round 10 low + fmov d4, x6 //AES block 0 - mov low + + eor x19, x19, x13 //AES block 1 - round 10 low + eor x22, x22, x14 //AES block 2 - round 10 high + fmov v4.d[1], x7 //AES block 0 - mov high + + fmov d5, x19 //AES block 1 - mov low + eor x20, x20, x14 //AES block 1 - round 10 high + + eor x23, x23, x13 //AES block 3 - round 10 low + fmov v5.d[1], x20 //AES block 1 - mov high + + fmov d6, x21 //AES block 2 - mov low + eor x24, x24, x14 //AES block 3 - round 10 high + rev w9, w12 //CTR block 4 + + fmov v6.d[1], x22 //AES block 2 - mov high + orr x9, x11, x9, lsl #32 //CTR block 4 + + eor v4.16b, v4.16b, v0.16b //AES block 0 - result + fmov d0, x10 //CTR block 4 + add w12, w12, #1 //CTR block 4 + + fmov v0.d[1], x9 //CTR block 4 + rev w9, w12 //CTR block 5 + + eor v5.16b, v5.16b, v1.16b //AES block 1 - result + fmov d1, x10 //CTR block 5 + orr x9, x11, x9, lsl #32 //CTR block 5 + + add w12, w12, #1 //CTR block 5 + add x0, x0, #64 //AES input_ptr update + fmov v1.d[1], x9 //CTR block 5 + + fmov d7, x23 //AES block 3 - mov low + rev w9, w12 //CTR block 6 + st1 { v4.16b}, [x2], #16 //AES block 0 - store 
result + + fmov v7.d[1], x24 //AES block 3 - mov high + orr x9, x11, x9, lsl #32 //CTR block 6 + + add w12, w12, #1 //CTR block 6 + eor v6.16b, v6.16b, v2.16b //AES block 2 - result + st1 { v5.16b}, [x2], #16 //AES block 1 - store result + + fmov d2, x10 //CTR block 6 + cmp x0, x5 //check if we have <= 8 blocks + + fmov v2.d[1], x9 //CTR block 6 + rev w9, w12 //CTR block 7 + st1 { v6.16b}, [x2], #16 //AES block 2 - store result + + orr x9, x11, x9, lsl #32 //CTR block 7 + + eor v7.16b, v7.16b, v3.16b //AES block 3 - result + st1 { v7.16b}, [x2], #16 //AES block 3 - store result + b.ge .L128_enc_prepretail //do prepretail + +.L128_enc_main_loop: //main loop start + ldp x23, x24, [x0, #48] //AES block 4k+3 - load plaintext +#ifdef __AARCH64EB__ + rev x23, x23 + rev x24, x24 +#endif + rev64 v4.16b, v4.16b //GHASH block 4k (only t0 is free) + rev64 v6.16b, v6.16b //GHASH block 4k+2 (t0, t1, and t2 free) + + aese v2.16b, v18.16b + aesmc v2.16b, v2.16b //AES block 4k+6 - round 0 + fmov d3, x10 //CTR block 4k+3 + + ext v11.16b, v11.16b, v11.16b, #8 //PRE 0 + rev64 v5.16b, v5.16b //GHASH block 4k+1 (t0 and t1 free) + + aese v1.16b, v18.16b + aesmc v1.16b, v1.16b //AES block 4k+5 - round 0 + add w12, w12, #1 //CTR block 4k+3 + fmov v3.d[1], x9 //CTR block 4k+3 + + aese v0.16b, v18.16b + aesmc v0.16b, v0.16b //AES block 4k+4 - round 0 + mov d31, v6.d[1] //GHASH block 4k+2 - mid + + aese v2.16b, v19.16b + aesmc v2.16b, v2.16b //AES block 4k+6 - round 1 + mov d30, v5.d[1] //GHASH block 4k+1 - mid + + aese v1.16b, v19.16b + aesmc v1.16b, v1.16b //AES block 4k+5 - round 1 + eor v4.16b, v4.16b, v11.16b //PRE 1 + + aese v3.16b, v18.16b + aesmc v3.16b, v3.16b //AES block 4k+7 - round 0 + eor x24, x24, x14 //AES block 4k+3 - round 10 high + + pmull2 v28.1q, v5.2d, v14.2d //GHASH block 4k+1 - high + eor v31.8b, v31.8b, v6.8b //GHASH block 4k+2 - mid + ldp x6, x7, [x0, #0] //AES block 4k+4 - load plaintext +#ifdef __AARCH64EB__ + rev x6, x6 + rev x7, x7 +#endif + aese v0.16b, v19.16b + aesmc v0.16b, v0.16b //AES block 4k+4 - round 1 + rev w9, w12 //CTR block 4k+8 + + eor v30.8b, v30.8b, v5.8b //GHASH block 4k+1 - mid + mov d8, v4.d[1] //GHASH block 4k - mid + orr x9, x11, x9, lsl #32 //CTR block 4k+8 + + pmull2 v9.1q, v4.2d, v15.2d //GHASH block 4k - high + add w12, w12, #1 //CTR block 4k+8 + mov d10, v17.d[1] //GHASH block 4k - mid + + aese v0.16b, v20.16b + aesmc v0.16b, v0.16b //AES block 4k+4 - round 2 + + pmull v11.1q, v4.1d, v15.1d //GHASH block 4k - low + eor v8.8b, v8.8b, v4.8b //GHASH block 4k - mid + + aese v1.16b, v20.16b + aesmc v1.16b, v1.16b //AES block 4k+5 - round 2 + + aese v0.16b, v21.16b + aesmc v0.16b, v0.16b //AES block 4k+4 - round 3 + eor v9.16b, v9.16b, v28.16b //GHASH block 4k+1 - high + + pmull v28.1q, v6.1d, v13.1d //GHASH block 4k+2 - low + + pmull v10.1q, v8.1d, v10.1d //GHASH block 4k - mid + rev64 v7.16b, v7.16b //GHASH block 4k+3 (t0, t1, t2 and t3 free) + + pmull v30.1q, v30.1d, v17.1d //GHASH block 4k+1 - mid + + pmull v29.1q, v5.1d, v14.1d //GHASH block 4k+1 - low + ins v31.d[1], v31.d[0] //GHASH block 4k+2 - mid + + pmull2 v8.1q, v6.2d, v13.2d //GHASH block 4k+2 - high + eor x7, x7, x14 //AES block 4k+4 - round 10 high + + eor v10.16b, v10.16b, v30.16b //GHASH block 4k+1 - mid + mov d30, v7.d[1] //GHASH block 4k+3 - mid + + aese v3.16b, v19.16b + aesmc v3.16b, v3.16b //AES block 4k+7 - round 1 + eor v11.16b, v11.16b, v29.16b //GHASH block 4k+1 - low + + aese v2.16b, v20.16b + aesmc v2.16b, v2.16b //AES block 4k+6 - round 2 + eor x6, x6, x13 //AES block 4k+4 - round 10 low + 
+ aese v1.16b, v21.16b + aesmc v1.16b, v1.16b //AES block 4k+5 - round 3 + eor v30.8b, v30.8b, v7.8b //GHASH block 4k+3 - mid + + pmull2 v4.1q, v7.2d, v12.2d //GHASH block 4k+3 - high + + aese v2.16b, v21.16b + aesmc v2.16b, v2.16b //AES block 4k+6 - round 3 + eor v9.16b, v9.16b, v8.16b //GHASH block 4k+2 - high + + pmull2 v31.1q, v31.2d, v16.2d //GHASH block 4k+2 - mid + + pmull v29.1q, v7.1d, v12.1d //GHASH block 4k+3 - low + movi v8.8b, #0xc2 + + pmull v30.1q, v30.1d, v16.1d //GHASH block 4k+3 - mid + eor v11.16b, v11.16b, v28.16b //GHASH block 4k+2 - low + + aese v1.16b, v22.16b + aesmc v1.16b, v1.16b //AES block 4k+5 - round 4 + + aese v3.16b, v20.16b + aesmc v3.16b, v3.16b //AES block 4k+7 - round 2 + shl d8, d8, #56 //mod_constant + + aese v0.16b, v22.16b + aesmc v0.16b, v0.16b //AES block 4k+4 - round 4 + eor v9.16b, v9.16b, v4.16b //GHASH block 4k+3 - high + + aese v1.16b, v23.16b + aesmc v1.16b, v1.16b //AES block 4k+5 - round 5 + ldp x19, x20, [x0, #16] //AES block 4k+5 - load plaintext +#ifdef __AARCH64EB__ + rev x19, x19 + rev x20, x20 +#endif + aese v3.16b, v21.16b + aesmc v3.16b, v3.16b //AES block 4k+7 - round 3 + eor v10.16b, v10.16b, v31.16b //GHASH block 4k+2 - mid + + aese v0.16b, v23.16b + aesmc v0.16b, v0.16b //AES block 4k+4 - round 5 + ldp x21, x22, [x0, #32] //AES block 4k+6 - load plaintext +#ifdef __AARCH64EB__ + rev x21, x21 + rev x22, x22 +#endif + pmull v31.1q, v9.1d, v8.1d //MODULO - top 64b align with mid + eor v11.16b, v11.16b, v29.16b //GHASH block 4k+3 - low + + aese v2.16b, v22.16b + aesmc v2.16b, v2.16b //AES block 4k+6 - round 4 + eor x19, x19, x13 //AES block 4k+5 - round 10 low + + aese v3.16b, v22.16b + aesmc v3.16b, v3.16b //AES block 4k+7 - round 4 + eor v10.16b, v10.16b, v30.16b //GHASH block 4k+3 - mid + + aese v1.16b, v24.16b + aesmc v1.16b, v1.16b //AES block 4k+5 - round 6 + eor x23, x23, x13 //AES block 4k+3 - round 10 low + + aese v2.16b, v23.16b + aesmc v2.16b, v2.16b //AES block 4k+6 - round 5 + eor v30.16b, v11.16b, v9.16b //MODULO - karatsuba tidy up + + fmov d4, x6 //AES block 4k+4 - mov low + aese v0.16b, v24.16b + aesmc v0.16b, v0.16b //AES block 4k+4 - round 6 + fmov v4.d[1], x7 //AES block 4k+4 - mov high + + add x0, x0, #64 //AES input_ptr update + fmov d7, x23 //AES block 4k+3 - mov low + ext v9.16b, v9.16b, v9.16b, #8 //MODULO - other top alignment + + aese v3.16b, v23.16b + aesmc v3.16b, v3.16b //AES block 4k+7 - round 5 + fmov d5, x19 //AES block 4k+5 - mov low + + aese v0.16b, v25.16b + aesmc v0.16b, v0.16b //AES block 4k+4 - round 7 + eor v10.16b, v10.16b, v30.16b //MODULO - karatsuba tidy up + + aese v2.16b, v24.16b + aesmc v2.16b, v2.16b //AES block 4k+6 - round 6 + eor x20, x20, x14 //AES block 4k+5 - round 10 high + + aese v1.16b, v25.16b + aesmc v1.16b, v1.16b //AES block 4k+5 - round 7 + fmov v5.d[1], x20 //AES block 4k+5 - mov high + + aese v0.16b, v26.16b + aesmc v0.16b, v0.16b //AES block 4k+4 - round 8 + fmov v7.d[1], x24 //AES block 4k+3 - mov high + + aese v3.16b, v24.16b + aesmc v3.16b, v3.16b //AES block 4k+7 - round 6 + cmp x0, x5 //.LOOP CONTROL + + aese v1.16b, v26.16b + aesmc v1.16b, v1.16b //AES block 4k+5 - round 8 + eor v10.16b, v10.16b, v31.16b //MODULO - fold into mid + + aese v0.16b, v27.16b //AES block 4k+4 - round 9 + eor x21, x21, x13 //AES block 4k+6 - round 10 low + eor x22, x22, x14 //AES block 4k+6 - round 10 high + + aese v3.16b, v25.16b + aesmc v3.16b, v3.16b //AES block 4k+7 - round 7 + fmov d6, x21 //AES block 4k+6 - mov low + + aese v1.16b, v27.16b //AES block 4k+5 - round 9 + fmov 
v6.d[1], x22 //AES block 4k+6 - mov high + + aese v2.16b, v25.16b + aesmc v2.16b, v2.16b //AES block 4k+6 - round 7 + eor v4.16b, v4.16b, v0.16b //AES block 4k+4 - result + + fmov d0, x10 //CTR block 4k+8 + aese v3.16b, v26.16b + aesmc v3.16b, v3.16b //AES block 4k+7 - round 8 + + fmov v0.d[1], x9 //CTR block 4k+8 + rev w9, w12 //CTR block 4k+9 + eor v10.16b, v10.16b, v9.16b //MODULO - fold into mid + + aese v2.16b, v26.16b + aesmc v2.16b, v2.16b //AES block 4k+6 - round 8 + eor v5.16b, v5.16b, v1.16b //AES block 4k+5 - result + + add w12, w12, #1 //CTR block 4k+9 + orr x9, x11, x9, lsl #32 //CTR block 4k+9 + fmov d1, x10 //CTR block 4k+9 + + pmull v9.1q, v10.1d, v8.1d //MODULO - mid 64b align with low + fmov v1.d[1], x9 //CTR block 4k+9 + rev w9, w12 //CTR block 4k+10 + + aese v2.16b, v27.16b //AES block 4k+6 - round 9 + st1 { v4.16b}, [x2], #16 //AES block 4k+4 - store result + eor v6.16b, v6.16b, v2.16b //AES block 4k+6 - result + orr x9, x11, x9, lsl #32 //CTR block 4k+10 + + aese v3.16b, v27.16b //AES block 4k+7 - round 9 + add w12, w12, #1 //CTR block 4k+10 + ext v10.16b, v10.16b, v10.16b, #8 //MODULO - other mid alignment + fmov d2, x10 //CTR block 4k+10 + + eor v11.16b, v11.16b, v9.16b //MODULO - fold into low + st1 { v5.16b}, [x2], #16 //AES block 4k+5 - store result + + fmov v2.d[1], x9 //CTR block 4k+10 + st1 { v6.16b}, [x2], #16 //AES block 4k+6 - store result + rev w9, w12 //CTR block 4k+11 + + orr x9, x11, x9, lsl #32 //CTR block 4k+11 + eor v7.16b, v7.16b, v3.16b //AES block 4k+3 - result + + eor v11.16b, v11.16b, v10.16b //MODULO - fold into low + st1 { v7.16b}, [x2], #16 //AES block 4k+3 - store result + b.lt .L128_enc_main_loop + +.L128_enc_prepretail: //PREPRETAIL + rev64 v4.16b, v4.16b //GHASH block 4k (only t0 is free) + fmov d3, x10 //CTR block 4k+3 + rev64 v5.16b, v5.16b //GHASH block 4k+1 (t0 and t1 free) + + ext v11.16b, v11.16b, v11.16b, #8 //PRE 0 + add w12, w12, #1 //CTR block 4k+3 + fmov v3.d[1], x9 //CTR block 4k+3 + + aese v1.16b, v18.16b + aesmc v1.16b, v1.16b //AES block 4k+5 - round 0 + rev64 v6.16b, v6.16b //GHASH block 4k+2 (t0, t1, and t2 free) + + pmull v29.1q, v5.1d, v14.1d //GHASH block 4k+1 - low + + rev64 v7.16b, v7.16b //GHASH block 4k+3 (t0, t1, t2 and t3 free) + eor v4.16b, v4.16b, v11.16b //PRE 1 + + pmull2 v28.1q, v5.2d, v14.2d //GHASH block 4k+1 - high + + aese v3.16b, v18.16b + aesmc v3.16b, v3.16b //AES block 4k+7 - round 0 + mov d30, v5.d[1] //GHASH block 4k+1 - mid + + pmull v11.1q, v4.1d, v15.1d //GHASH block 4k - low + mov d8, v4.d[1] //GHASH block 4k - mid + + mov d31, v6.d[1] //GHASH block 4k+2 - mid + mov d10, v17.d[1] //GHASH block 4k - mid + + aese v1.16b, v19.16b + aesmc v1.16b, v1.16b //AES block 4k+5 - round 1 + eor v30.8b, v30.8b, v5.8b //GHASH block 4k+1 - mid + + eor v8.8b, v8.8b, v4.8b //GHASH block 4k - mid + + pmull2 v9.1q, v4.2d, v15.2d //GHASH block 4k - high + eor v31.8b, v31.8b, v6.8b //GHASH block 4k+2 - mid + + aese v3.16b, v19.16b + aesmc v3.16b, v3.16b //AES block 4k+7 - round 1 + + pmull v30.1q, v30.1d, v17.1d //GHASH block 4k+1 - mid + eor v11.16b, v11.16b, v29.16b //GHASH block 4k+1 - low + + pmull v10.1q, v8.1d, v10.1d //GHASH block 4k - mid + + aese v0.16b, v18.16b + aesmc v0.16b, v0.16b //AES block 4k+4 - round 0 + ins v31.d[1], v31.d[0] //GHASH block 4k+2 - mid + + aese v2.16b, v18.16b + aesmc v2.16b, v2.16b //AES block 4k+6 - round 0 + + eor v10.16b, v10.16b, v30.16b //GHASH block 4k+1 - mid + mov d30, v7.d[1] //GHASH block 4k+3 - mid + + aese v0.16b, v19.16b + aesmc v0.16b, v0.16b //AES block 4k+4 - round 
1 + eor v9.16b, v9.16b, v28.16b //GHASH block 4k+1 - high + + pmull2 v31.1q, v31.2d, v16.2d //GHASH block 4k+2 - mid + + pmull2 v8.1q, v6.2d, v13.2d //GHASH block 4k+2 - high + eor v30.8b, v30.8b, v7.8b //GHASH block 4k+3 - mid + + pmull2 v4.1q, v7.2d, v12.2d //GHASH block 4k+3 - high + + pmull v28.1q, v6.1d, v13.1d //GHASH block 4k+2 - low + + aese v2.16b, v19.16b + aesmc v2.16b, v2.16b //AES block 4k+6 - round 1 + eor v9.16b, v9.16b, v8.16b //GHASH block 4k+2 - high + + aese v0.16b, v20.16b + aesmc v0.16b, v0.16b //AES block 4k+4 - round 2 + + pmull v29.1q, v7.1d, v12.1d //GHASH block 4k+3 - low + movi v8.8b, #0xc2 + + aese v2.16b, v20.16b + aesmc v2.16b, v2.16b //AES block 4k+6 - round 2 + eor v11.16b, v11.16b, v28.16b //GHASH block 4k+2 - low + + aese v3.16b, v20.16b + aesmc v3.16b, v3.16b //AES block 4k+7 - round 2 + + pmull v30.1q, v30.1d, v16.1d //GHASH block 4k+3 - mid + eor v10.16b, v10.16b, v31.16b //GHASH block 4k+2 - mid + + aese v2.16b, v21.16b + aesmc v2.16b, v2.16b //AES block 4k+6 - round 3 + + aese v1.16b, v20.16b + aesmc v1.16b, v1.16b //AES block 4k+5 - round 2 + eor v9.16b, v9.16b, v4.16b //GHASH block 4k+3 - high + + aese v0.16b, v21.16b + aesmc v0.16b, v0.16b //AES block 4k+4 - round 3 + + eor v10.16b, v10.16b, v30.16b //GHASH block 4k+3 - mid + shl d8, d8, #56 //mod_constant + + aese v1.16b, v21.16b + aesmc v1.16b, v1.16b //AES block 4k+5 - round 3 + eor v11.16b, v11.16b, v29.16b //GHASH block 4k+3 - low + + aese v0.16b, v22.16b + aesmc v0.16b, v0.16b //AES block 4k+4 - round 4 + + pmull v28.1q, v9.1d, v8.1d + eor v10.16b, v10.16b, v9.16b //karatsuba tidy up + + aese v1.16b, v22.16b + aesmc v1.16b, v1.16b //AES block 4k+5 - round 4 + + aese v0.16b, v23.16b + aesmc v0.16b, v0.16b //AES block 4k+4 - round 5 + ext v9.16b, v9.16b, v9.16b, #8 + + aese v3.16b, v21.16b + aesmc v3.16b, v3.16b //AES block 4k+7 - round 3 + + aese v2.16b, v22.16b + aesmc v2.16b, v2.16b //AES block 4k+6 - round 4 + eor v10.16b, v10.16b, v11.16b + + aese v0.16b, v24.16b + aesmc v0.16b, v0.16b //AES block 4k+4 - round 6 + + aese v3.16b, v22.16b + aesmc v3.16b, v3.16b //AES block 4k+7 - round 4 + + aese v1.16b, v23.16b + aesmc v1.16b, v1.16b //AES block 4k+5 - round 5 + + aese v2.16b, v23.16b + aesmc v2.16b, v2.16b //AES block 4k+6 - round 5 + eor v10.16b, v10.16b, v28.16b + + aese v3.16b, v23.16b + aesmc v3.16b, v3.16b //AES block 4k+7 - round 5 + + aese v1.16b, v24.16b + aesmc v1.16b, v1.16b //AES block 4k+5 - round 6 + + aese v2.16b, v24.16b + aesmc v2.16b, v2.16b //AES block 4k+6 - round 6 + + aese v3.16b, v24.16b + aesmc v3.16b, v3.16b //AES block 4k+7 - round 6 + eor v10.16b, v10.16b, v9.16b + + aese v0.16b, v25.16b + aesmc v0.16b, v0.16b //AES block 4k+4 - round 7 + + aese v2.16b, v25.16b + aesmc v2.16b, v2.16b //AES block 4k+6 - round 7 + + aese v3.16b, v25.16b + aesmc v3.16b, v3.16b //AES block 4k+7 - round 7 + + pmull v28.1q, v10.1d, v8.1d + + aese v1.16b, v25.16b + aesmc v1.16b, v1.16b //AES block 4k+5 - round 7 + ext v10.16b, v10.16b, v10.16b, #8 + + aese v3.16b, v26.16b + aesmc v3.16b, v3.16b //AES block 4k+7 - round 8 + + aese v0.16b, v26.16b + aesmc v0.16b, v0.16b //AES block 4k+4 - round 8 + eor v11.16b, v11.16b, v28.16b + + aese v1.16b, v26.16b + aesmc v1.16b, v1.16b //AES block 4k+5 - round 8 + + aese v3.16b, v27.16b //AES block 4k+7 - round 9 + + aese v2.16b, v26.16b + aesmc v2.16b, v2.16b //AES block 4k+6 - round 8 + + aese v0.16b, v27.16b //AES block 4k+4 - round 9 + + aese v1.16b, v27.16b //AES block 4k+5 - round 9 + eor v11.16b, v11.16b, v10.16b + + aese v2.16b, v27.16b //AES 
block 4k+6 - round 9 +.L128_enc_tail: //TAIL + + sub x5, x4, x0 //main_end_input_ptr is number of bytes left to process + ldp x6, x7, [x0], #16 //AES block 4k+4 - load plaintext +#ifdef __AARCH64EB__ + rev x6, x6 + rev x7, x7 +#endif + cmp x5, #48 + + ext v8.16b, v11.16b, v11.16b, #8 //prepare final partial tag + eor x6, x6, x13 //AES block 4k+4 - round 10 low + eor x7, x7, x14 //AES block 4k+4 - round 10 high + + fmov d4, x6 //AES block 4k+4 - mov low + + fmov v4.d[1], x7 //AES block 4k+4 - mov high + + eor v5.16b, v4.16b, v0.16b //AES block 4k+4 - result + + b.gt .L128_enc_blocks_more_than_3 + + sub w12, w12, #1 + movi v11.8b, #0 + mov v3.16b, v2.16b + + cmp x5, #32 + mov v2.16b, v1.16b + movi v9.8b, #0 + + movi v10.8b, #0 + b.gt .L128_enc_blocks_more_than_2 + + mov v3.16b, v1.16b + cmp x5, #16 + + sub w12, w12, #1 + b.gt .L128_enc_blocks_more_than_1 + + sub w12, w12, #1 + b .L128_enc_blocks_less_than_1 +.L128_enc_blocks_more_than_3: //blocks left > 3 + st1 { v5.16b}, [x2], #16 //AES final-3 block - store result + + ldp x6, x7, [x0], #16 //AES final-2 block - load input low & high +#ifdef __AARCH64EB__ + rev x6, x6 + rev x7, x7 +#endif + rev64 v4.16b, v5.16b //GHASH final-3 block + + eor v4.16b, v4.16b, v8.16b //feed in partial tag + eor x7, x7, x14 //AES final-2 block - round 10 high + eor x6, x6, x13 //AES final-2 block - round 10 low + + fmov d5, x6 //AES final-2 block - mov low + + movi v8.8b, #0 //suppress further partial tag feed in + fmov v5.d[1], x7 //AES final-2 block - mov high + + pmull v11.1q, v4.1d, v15.1d //GHASH final-3 block - low + mov d22, v4.d[1] //GHASH final-3 block - mid + + pmull2 v9.1q, v4.2d, v15.2d //GHASH final-3 block - high + + mov d10, v17.d[1] //GHASH final-3 block - mid + + eor v5.16b, v5.16b, v1.16b //AES final-2 block - result + eor v22.8b, v22.8b, v4.8b //GHASH final-3 block - mid + + pmull v10.1q, v22.1d, v10.1d //GHASH final-3 block - mid +.L128_enc_blocks_more_than_2: //blocks left > 2 + + st1 { v5.16b}, [x2], #16 //AES final-2 block - store result + + rev64 v4.16b, v5.16b //GHASH final-2 block + ldp x6, x7, [x0], #16 //AES final-1 block - load input low & high +#ifdef __AARCH64EB__ + rev x6, x6 + rev x7, x7 +#endif + eor v4.16b, v4.16b, v8.16b //feed in partial tag + + eor x6, x6, x13 //AES final-1 block - round 10 low + + fmov d5, x6 //AES final-1 block - mov low + eor x7, x7, x14 //AES final-1 block - round 10 high + + pmull2 v20.1q, v4.2d, v14.2d //GHASH final-2 block - high + fmov v5.d[1], x7 //AES final-1 block - mov high + + mov d22, v4.d[1] //GHASH final-2 block - mid + + pmull v21.1q, v4.1d, v14.1d //GHASH final-2 block - low + + eor v9.16b, v9.16b, v20.16b //GHASH final-2 block - high + + eor v22.8b, v22.8b, v4.8b //GHASH final-2 block - mid + + eor v5.16b, v5.16b, v2.16b //AES final-1 block - result + + eor v11.16b, v11.16b, v21.16b //GHASH final-2 block - low + + pmull v22.1q, v22.1d, v17.1d //GHASH final-2 block - mid + + movi v8.8b, #0 //suppress further partial tag feed in + + eor v10.16b, v10.16b, v22.16b //GHASH final-2 block - mid +.L128_enc_blocks_more_than_1: //blocks left > 1 + + st1 { v5.16b}, [x2], #16 //AES final-1 block - store result + + rev64 v4.16b, v5.16b //GHASH final-1 block + ldp x6, x7, [x0], #16 //AES final block - load input low & high +#ifdef __AARCH64EB__ + rev x6, x6 + rev x7, x7 +#endif + eor v4.16b, v4.16b, v8.16b //feed in partial tag + + eor x7, x7, x14 //AES final block - round 10 high + eor x6, x6, x13 //AES final block - round 10 low + + fmov d5, x6 //AES final block - mov low + + pmull2 v20.1q, v4.2d, 
v13.2d //GHASH final-1 block - high + fmov v5.d[1], x7 //AES final block - mov high + + mov d22, v4.d[1] //GHASH final-1 block - mid + + pmull v21.1q, v4.1d, v13.1d //GHASH final-1 block - low + + eor v22.8b, v22.8b, v4.8b //GHASH final-1 block - mid + + eor v5.16b, v5.16b, v3.16b //AES final block - result + + ins v22.d[1], v22.d[0] //GHASH final-1 block - mid + + pmull2 v22.1q, v22.2d, v16.2d //GHASH final-1 block - mid + + eor v11.16b, v11.16b, v21.16b //GHASH final-1 block - low + + eor v9.16b, v9.16b, v20.16b //GHASH final-1 block - high + + eor v10.16b, v10.16b, v22.16b //GHASH final-1 block - mid + movi v8.8b, #0 //suppress further partial tag feed in +.L128_enc_blocks_less_than_1: //blocks left <= 1 + + and x1, x1, #127 //bit_length %= 128 + mvn x13, xzr //rk10_l = 0xffffffffffffffff + + mvn x14, xzr //rk10_h = 0xffffffffffffffff + sub x1, x1, #128 //bit_length -= 128 + + neg x1, x1 //bit_length = 128 - #bits in input (in range [1,128]) + + and x1, x1, #127 //bit_length %= 128 + + lsr x14, x14, x1 //rk10_h is mask for top 64b of last block + cmp x1, #64 + + csel x6, x13, x14, lt + csel x7, x14, xzr, lt + + fmov d0, x6 //ctr0b is mask for last block + + fmov v0.d[1], x7 + + and v5.16b, v5.16b, v0.16b //possibly partial last block has zeroes in highest bits + + rev64 v4.16b, v5.16b //GHASH final block + + eor v4.16b, v4.16b, v8.16b //feed in partial tag + + mov d8, v4.d[1] //GHASH final block - mid + + pmull v21.1q, v4.1d, v12.1d //GHASH final block - low + ld1 { v18.16b}, [x2] //load existing bytes where the possibly partial last block is to be stored + + eor v8.8b, v8.8b, v4.8b //GHASH final block - mid +#ifndef __AARCH64EB__ + rev w9, w12 +#else + mov w9, w12 +#endif + pmull2 v20.1q, v4.2d, v12.2d //GHASH final block - high + + pmull v8.1q, v8.1d, v16.1d //GHASH final block - mid + + eor v11.16b, v11.16b, v21.16b //GHASH final block - low + + eor v9.16b, v9.16b, v20.16b //GHASH final block - high + + eor v10.16b, v10.16b, v8.16b //GHASH final block - mid + movi v8.8b, #0xc2 + + eor v30.16b, v11.16b, v9.16b //MODULO - karatsuba tidy up + + shl d8, d8, #56 //mod_constant + + eor v10.16b, v10.16b, v30.16b //MODULO - karatsuba tidy up + + pmull v31.1q, v9.1d, v8.1d //MODULO - top 64b align with mid + + ext v9.16b, v9.16b, v9.16b, #8 //MODULO - other top alignment + + eor v10.16b, v10.16b, v31.16b //MODULO - fold into mid + + eor v10.16b, v10.16b, v9.16b //MODULO - fold into mid + + pmull v9.1q, v10.1d, v8.1d //MODULO - mid 64b align with low + + ext v10.16b, v10.16b, v10.16b, #8 //MODULO - other mid alignment + + bif v5.16b, v18.16b, v0.16b //insert existing bytes in top end of result before storing + + eor v11.16b, v11.16b, v9.16b //MODULO - fold into low + st1 { v5.16b}, [x2] //store all 16B + + str w9, [x16, #12] //store the updated counter + + eor v11.16b, v11.16b, v10.16b //MODULO - fold into low + ext v11.16b, v11.16b, v11.16b, #8 + rev64 v11.16b, v11.16b + mov x0, x15 + st1 { v11.16b }, [x3] + ldp x21, x22, [sp, #16] + ldp x23, x24, [sp, #32] + ldp d8, d9, [sp, #48] + ldp d10, d11, [sp, #64] + ldp d12, d13, [sp, #80] + ldp d14, d15, [sp, #96] + ldp x19, x20, [sp], #112 + ret + +.L128_enc_ret: + mov w0, #0x0 + ret +.size aes_gcm_enc_128_kernel,.-aes_gcm_enc_128_kernel +.globl aes_gcm_dec_128_kernel +.type aes_gcm_dec_128_kernel,%function +.align 4 +aes_gcm_dec_128_kernel: + AARCH64_VALID_CALL_TARGET + cbz x1, .L128_dec_ret + stp x19, x20, [sp, #-112]! 
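+	// Decrypt variant of the kernel above: the same four-block CTR and
+	// GHASH interleave, except GHASH is taken over the incoming
+	// ciphertext and the keystream XOR yields plaintext.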
+ mov x16, x4 + mov x8, x5 + stp x21, x22, [sp, #16] + stp x23, x24, [sp, #32] + stp d8, d9, [sp, #48] + stp d10, d11, [sp, #64] + stp d12, d13, [sp, #80] + stp d14, d15, [sp, #96] + + lsr x5, x1, #3 //byte_len + mov x15, x5 + ldp x10, x11, [x16] //ctr96_b64, ctr96_t32 +#ifdef __AARCH64EB__ + rev x10, x10 + rev x11, x11 +#endif + ldp x13, x14, [x8, #160] //load rk10 +#ifdef __AARCH64EB__ + ror x14, x14, 32 + ror x13, x13, 32 +#endif + sub x5, x5, #1 //byte_len - 1 + ld1 {v18.4s}, [x8], #16 //load rk0 + + and x5, x5, #0xffffffffffffffc0 //number of bytes to be processed in main loop (at least 1 byte must be handled by tail) + ld1 { v0.16b}, [x16] //special case vector load initial counter so we can start first AES block as quickly as possible + + ldr q13, [x3, #64] //load h2l | h2h +#ifndef __AARCH64EB__ + ext v13.16b, v13.16b, v13.16b, #8 +#endif + lsr x12, x11, #32 + fmov d2, x10 //CTR block 2 + + ld1 {v19.4s}, [x8], #16 //load rk1 + orr w11, w11, w11 + rev w12, w12 //rev_ctr32 + + fmov d1, x10 //CTR block 1 + add w12, w12, #1 //increment rev_ctr32 + + aese v0.16b, v18.16b + aesmc v0.16b, v0.16b //AES block 0 - round 0 + rev w9, w12 //CTR block 1 + + orr x9, x11, x9, lsl #32 //CTR block 1 + ld1 {v20.4s}, [x8], #16 //load rk2 + add w12, w12, #1 //CTR block 1 + + fmov v1.d[1], x9 //CTR block 1 + rev w9, w12 //CTR block 2 + add w12, w12, #1 //CTR block 2 + + aese v0.16b, v19.16b + aesmc v0.16b, v0.16b //AES block 0 - round 1 + orr x9, x11, x9, lsl #32 //CTR block 2 + + fmov v2.d[1], x9 //CTR block 2 + rev w9, w12 //CTR block 3 + + fmov d3, x10 //CTR block 3 + orr x9, x11, x9, lsl #32 //CTR block 3 + add w12, w12, #1 //CTR block 3 + + fmov v3.d[1], x9 //CTR block 3 + add x4, x0, x1, lsr #3 //end_input_ptr + + aese v1.16b, v18.16b + aesmc v1.16b, v1.16b //AES block 1 - round 0 + ld1 {v21.4s}, [x8], #16 //load rk3 + + aese v0.16b, v20.16b + aesmc v0.16b, v0.16b //AES block 0 - round 2 + ld1 {v22.4s}, [x8], #16 //load rk4 + + aese v2.16b, v18.16b + aesmc v2.16b, v2.16b //AES block 2 - round 0 + ld1 {v23.4s}, [x8], #16 //load rk5 + + aese v1.16b, v19.16b + aesmc v1.16b, v1.16b //AES block 1 - round 1 + ld1 {v24.4s}, [x8], #16 //load rk6 + + aese v3.16b, v18.16b + aesmc v3.16b, v3.16b //AES block 3 - round 0 + + aese v2.16b, v19.16b + aesmc v2.16b, v2.16b //AES block 2 - round 1 + + aese v1.16b, v20.16b + aesmc v1.16b, v1.16b //AES block 1 - round 2 + + aese v3.16b, v19.16b + aesmc v3.16b, v3.16b //AES block 3 - round 1 + ld1 { v11.16b}, [x3] + ext v11.16b, v11.16b, v11.16b, #8 + rev64 v11.16b, v11.16b + + aese v0.16b, v21.16b + aesmc v0.16b, v0.16b //AES block 0 - round 3 + ld1 {v25.4s}, [x8], #16 //load rk7 + + aese v1.16b, v21.16b + aesmc v1.16b, v1.16b //AES block 1 - round 3 + + aese v3.16b, v20.16b + aesmc v3.16b, v3.16b //AES block 3 - round 2 + + aese v2.16b, v20.16b + aesmc v2.16b, v2.16b //AES block 2 - round 2 + ld1 {v26.4s}, [x8], #16 //load rk8 + + aese v1.16b, v22.16b + aesmc v1.16b, v1.16b //AES block 1 - round 4 + + aese v3.16b, v21.16b + aesmc v3.16b, v3.16b //AES block 3 - round 3 + + aese v2.16b, v21.16b + aesmc v2.16b, v2.16b //AES block 2 - round 3 + ldr q14, [x3, #80] //load h3l | h3h +#ifndef __AARCH64EB__ + ext v14.16b, v14.16b, v14.16b, #8 +#endif + aese v0.16b, v22.16b + aesmc v0.16b, v0.16b //AES block 0 - round 4 + ld1 {v27.4s}, [x8], #16 //load rk9 + + aese v1.16b, v23.16b + aesmc v1.16b, v1.16b //AES block 1 - round 5 + + aese v2.16b, v22.16b + aesmc v2.16b, v2.16b //AES block 2 - round 4 + + aese v3.16b, v22.16b + aesmc v3.16b, v3.16b //AES block 3 - round 4 + + aese 
v0.16b, v23.16b + aesmc v0.16b, v0.16b //AES block 0 - round 5 + + aese v2.16b, v23.16b + aesmc v2.16b, v2.16b //AES block 2 - round 5 + ldr q12, [x3, #32] //load h1l | h1h +#ifndef __AARCH64EB__ + ext v12.16b, v12.16b, v12.16b, #8 +#endif + aese v3.16b, v23.16b + aesmc v3.16b, v3.16b //AES block 3 - round 5 + + aese v0.16b, v24.16b + aesmc v0.16b, v0.16b //AES block 0 - round 6 + + aese v1.16b, v24.16b + aesmc v1.16b, v1.16b //AES block 1 - round 6 + + aese v3.16b, v24.16b + aesmc v3.16b, v3.16b //AES block 3 - round 6 + + aese v2.16b, v24.16b + aesmc v2.16b, v2.16b //AES block 2 - round 6 + trn1 v8.2d, v12.2d, v13.2d //h2h | h1h + + ldr q15, [x3, #112] //load h4l | h4h +#ifndef __AARCH64EB__ + ext v15.16b, v15.16b, v15.16b, #8 +#endif + trn2 v16.2d, v12.2d, v13.2d //h2l | h1l + add x5, x5, x0 + + aese v1.16b, v25.16b + aesmc v1.16b, v1.16b //AES block 1 - round 7 + + aese v2.16b, v25.16b + aesmc v2.16b, v2.16b //AES block 2 - round 7 + + aese v0.16b, v25.16b + aesmc v0.16b, v0.16b //AES block 0 - round 7 + eor v16.16b, v16.16b, v8.16b //h2k | h1k + + aese v3.16b, v25.16b + aesmc v3.16b, v3.16b //AES block 3 - round 7 + + aese v1.16b, v26.16b + aesmc v1.16b, v1.16b //AES block 1 - round 8 + trn2 v17.2d, v14.2d, v15.2d //h4l | h3l + + aese v2.16b, v26.16b + aesmc v2.16b, v2.16b //AES block 2 - round 8 + + aese v3.16b, v26.16b + aesmc v3.16b, v3.16b //AES block 3 - round 8 + + aese v0.16b, v26.16b + aesmc v0.16b, v0.16b //AES block 0 - round 8 + trn1 v9.2d, v14.2d, v15.2d //h4h | h3h + + aese v2.16b, v27.16b //AES block 2 - round 9 + + aese v3.16b, v27.16b //AES block 3 - round 9 + + aese v0.16b, v27.16b //AES block 0 - round 9 + cmp x0, x5 //check if we have <= 4 blocks + + aese v1.16b, v27.16b //AES block 1 - round 9 + eor v17.16b, v17.16b, v9.16b //h4k | h3k + b.ge .L128_dec_tail //handle tail + + ld1 {v4.16b, v5.16b}, [x0], #32 //AES block 0 - load ciphertext; AES block 1 - load ciphertext + + eor v1.16b, v5.16b, v1.16b //AES block 1 - result + ld1 {v6.16b}, [x0], #16 //AES block 2 - load ciphertext + + eor v0.16b, v4.16b, v0.16b //AES block 0 - result + rev64 v4.16b, v4.16b //GHASH block 0 + rev w9, w12 //CTR block 4 + + orr x9, x11, x9, lsl #32 //CTR block 4 + add w12, w12, #1 //CTR block 4 + ld1 {v7.16b}, [x0], #16 //AES block 3 - load ciphertext + + rev64 v5.16b, v5.16b //GHASH block 1 + mov x19, v1.d[0] //AES block 1 - mov low + + mov x20, v1.d[1] //AES block 1 - mov high + + mov x6, v0.d[0] //AES block 0 - mov low + cmp x0, x5 //check if we have <= 8 blocks + + mov x7, v0.d[1] //AES block 0 - mov high + + fmov d0, x10 //CTR block 4 + + fmov v0.d[1], x9 //CTR block 4 + rev w9, w12 //CTR block 5 + eor x19, x19, x13 //AES block 1 - round 10 low +#ifdef __AARCH64EB__ + rev x19, x19 +#endif + fmov d1, x10 //CTR block 5 + add w12, w12, #1 //CTR block 5 + orr x9, x11, x9, lsl #32 //CTR block 5 + + fmov v1.d[1], x9 //CTR block 5 + rev w9, w12 //CTR block 6 + add w12, w12, #1 //CTR block 6 + + orr x9, x11, x9, lsl #32 //CTR block 6 + + eor x20, x20, x14 //AES block 1 - round 10 high +#ifdef __AARCH64EB__ + rev x20, x20 +#endif + eor x6, x6, x13 //AES block 0 - round 10 low +#ifdef __AARCH64EB__ + rev x6, x6 +#endif + eor v2.16b, v6.16b, v2.16b //AES block 2 - result + + eor x7, x7, x14 //AES block 0 - round 10 high +#ifdef __AARCH64EB__ + rev x7, x7 +#endif + stp x6, x7, [x2], #16 //AES block 0 - store result + + stp x19, x20, [x2], #16 //AES block 1 - store result + b.ge .L128_dec_prepretail //do prepretail + +.L128_dec_main_loop: //main loop start + eor v3.16b, v7.16b, v3.16b //AES block 
4k+3 - result + ext v11.16b, v11.16b, v11.16b, #8 //PRE 0 + mov x21, v2.d[0] //AES block 4k+2 - mov low + + pmull2 v28.1q, v5.2d, v14.2d //GHASH block 4k+1 - high + mov x22, v2.d[1] //AES block 4k+2 - mov high + + aese v1.16b, v18.16b + aesmc v1.16b, v1.16b //AES block 4k+5 - round 0 + fmov d2, x10 //CTR block 4k+6 + + rev64 v6.16b, v6.16b //GHASH block 4k+2 + fmov v2.d[1], x9 //CTR block 4k+6 + rev w9, w12 //CTR block 4k+7 + + mov x23, v3.d[0] //AES block 4k+3 - mov low + eor v4.16b, v4.16b, v11.16b //PRE 1 + mov d30, v5.d[1] //GHASH block 4k+1 - mid + + aese v1.16b, v19.16b + aesmc v1.16b, v1.16b //AES block 4k+5 - round 1 + rev64 v7.16b, v7.16b //GHASH block 4k+3 + + pmull v29.1q, v5.1d, v14.1d //GHASH block 4k+1 - low + mov x24, v3.d[1] //AES block 4k+3 - mov high + orr x9, x11, x9, lsl #32 //CTR block 4k+7 + + pmull v11.1q, v4.1d, v15.1d //GHASH block 4k - low + fmov d3, x10 //CTR block 4k+7 + eor v30.8b, v30.8b, v5.8b //GHASH block 4k+1 - mid + + aese v1.16b, v20.16b + aesmc v1.16b, v1.16b //AES block 4k+5 - round 2 + fmov v3.d[1], x9 //CTR block 4k+7 + + aese v2.16b, v18.16b + aesmc v2.16b, v2.16b //AES block 4k+6 - round 0 + mov d10, v17.d[1] //GHASH block 4k - mid + + pmull2 v9.1q, v4.2d, v15.2d //GHASH block 4k - high + eor v11.16b, v11.16b, v29.16b //GHASH block 4k+1 - low + + pmull v29.1q, v7.1d, v12.1d //GHASH block 4k+3 - low + + aese v1.16b, v21.16b + aesmc v1.16b, v1.16b //AES block 4k+5 - round 3 + mov d8, v4.d[1] //GHASH block 4k - mid + + aese v3.16b, v18.16b + aesmc v3.16b, v3.16b //AES block 4k+7 - round 0 + eor v9.16b, v9.16b, v28.16b //GHASH block 4k+1 - high + + aese v0.16b, v18.16b + aesmc v0.16b, v0.16b //AES block 4k+4 - round 0 + + pmull v28.1q, v6.1d, v13.1d //GHASH block 4k+2 - low + eor v8.8b, v8.8b, v4.8b //GHASH block 4k - mid + + aese v3.16b, v19.16b + aesmc v3.16b, v3.16b //AES block 4k+7 - round 1 + eor x23, x23, x13 //AES block 4k+3 - round 10 low +#ifdef __AARCH64EB__ + rev x23, x23 +#endif + pmull v30.1q, v30.1d, v17.1d //GHASH block 4k+1 - mid + eor x22, x22, x14 //AES block 4k+2 - round 10 high +#ifdef __AARCH64EB__ + rev x22, x22 +#endif + mov d31, v6.d[1] //GHASH block 4k+2 - mid + + aese v0.16b, v19.16b + aesmc v0.16b, v0.16b //AES block 4k+4 - round 1 + eor v11.16b, v11.16b, v28.16b //GHASH block 4k+2 - low + + pmull v10.1q, v8.1d, v10.1d //GHASH block 4k - mid + + aese v3.16b, v20.16b + aesmc v3.16b, v3.16b //AES block 4k+7 - round 2 + eor v31.8b, v31.8b, v6.8b //GHASH block 4k+2 - mid + + aese v0.16b, v20.16b + aesmc v0.16b, v0.16b //AES block 4k+4 - round 2 + + aese v1.16b, v22.16b + aesmc v1.16b, v1.16b //AES block 4k+5 - round 4 + eor v10.16b, v10.16b, v30.16b //GHASH block 4k+1 - mid + + pmull2 v8.1q, v6.2d, v13.2d //GHASH block 4k+2 - high + + aese v0.16b, v21.16b + aesmc v0.16b, v0.16b //AES block 4k+4 - round 3 + ins v31.d[1], v31.d[0] //GHASH block 4k+2 - mid + + pmull2 v4.1q, v7.2d, v12.2d //GHASH block 4k+3 - high + + aese v2.16b, v19.16b + aesmc v2.16b, v2.16b //AES block 4k+6 - round 1 + mov d30, v7.d[1] //GHASH block 4k+3 - mid + + aese v0.16b, v22.16b + aesmc v0.16b, v0.16b //AES block 4k+4 - round 4 + eor v9.16b, v9.16b, v8.16b //GHASH block 4k+2 - high + + pmull2 v31.1q, v31.2d, v16.2d //GHASH block 4k+2 - mid + eor x24, x24, x14 //AES block 4k+3 - round 10 high +#ifdef __AARCH64EB__ + rev x24, x24 +#endif + aese v2.16b, v20.16b + aesmc v2.16b, v2.16b //AES block 4k+6 - round 2 + eor v30.8b, v30.8b, v7.8b //GHASH block 4k+3 - mid + + aese v1.16b, v23.16b + aesmc v1.16b, v1.16b //AES block 4k+5 - round 5 + eor x21, x21, x13 
//AES block 4k+2 - round 10 low +#ifdef __AARCH64EB__ + rev x21, x21 +#endif + aese v0.16b, v23.16b + aesmc v0.16b, v0.16b //AES block 4k+4 - round 5 + movi v8.8b, #0xc2 + + aese v2.16b, v21.16b + aesmc v2.16b, v2.16b //AES block 4k+6 - round 3 + eor v11.16b, v11.16b, v29.16b //GHASH block 4k+3 - low + + aese v1.16b, v24.16b + aesmc v1.16b, v1.16b //AES block 4k+5 - round 6 + + aese v0.16b, v24.16b + aesmc v0.16b, v0.16b //AES block 4k+4 - round 6 + eor v10.16b, v10.16b, v31.16b //GHASH block 4k+2 - mid + + aese v2.16b, v22.16b + aesmc v2.16b, v2.16b //AES block 4k+6 - round 4 + stp x21, x22, [x2], #16 //AES block 4k+2 - store result + + pmull v30.1q, v30.1d, v16.1d //GHASH block 4k+3 - mid + eor v9.16b, v9.16b, v4.16b //GHASH block 4k+3 - high + ld1 {v4.16b}, [x0], #16 //AES block 4k+3 - load ciphertext + + aese v1.16b, v25.16b + aesmc v1.16b, v1.16b //AES block 4k+5 - round 7 + add w12, w12, #1 //CTR block 4k+7 + + aese v0.16b, v25.16b + aesmc v0.16b, v0.16b //AES block 4k+4 - round 7 + shl d8, d8, #56 //mod_constant + + aese v2.16b, v23.16b + aesmc v2.16b, v2.16b //AES block 4k+6 - round 5 + eor v10.16b, v10.16b, v30.16b //GHASH block 4k+3 - mid + + aese v1.16b, v26.16b + aesmc v1.16b, v1.16b //AES block 4k+5 - round 8 + stp x23, x24, [x2], #16 //AES block 4k+3 - store result + + aese v0.16b, v26.16b + aesmc v0.16b, v0.16b //AES block 4k+4 - round 8 + eor v30.16b, v11.16b, v9.16b //MODULO - karatsuba tidy up + + aese v3.16b, v21.16b + aesmc v3.16b, v3.16b //AES block 4k+7 - round 3 + rev w9, w12 //CTR block 4k+8 + + pmull v31.1q, v9.1d, v8.1d //MODULO - top 64b align with mid + ld1 {v5.16b}, [x0], #16 //AES block 4k+4 - load ciphertext + ext v9.16b, v9.16b, v9.16b, #8 //MODULO - other top alignment + + aese v0.16b, v27.16b //AES block 4k+4 - round 9 + orr x9, x11, x9, lsl #32 //CTR block 4k+8 + + aese v3.16b, v22.16b + aesmc v3.16b, v3.16b //AES block 4k+7 - round 4 + eor v10.16b, v10.16b, v30.16b //MODULO - karatsuba tidy up + + aese v1.16b, v27.16b //AES block 4k+5 - round 9 + + aese v2.16b, v24.16b + aesmc v2.16b, v2.16b //AES block 4k+6 - round 6 + eor v0.16b, v4.16b, v0.16b //AES block 4k+4 - result + + aese v3.16b, v23.16b + aesmc v3.16b, v3.16b //AES block 4k+7 - round 5 + ld1 {v6.16b}, [x0], #16 //AES block 4k+5 - load ciphertext + + add w12, w12, #1 //CTR block 4k+8 + eor v10.16b, v10.16b, v31.16b //MODULO - fold into mid + eor v1.16b, v5.16b, v1.16b //AES block 4k+5 - result + + aese v2.16b, v25.16b + aesmc v2.16b, v2.16b //AES block 4k+6 - round 7 + ld1 {v7.16b}, [x0], #16 //AES block 4k+6 - load ciphertext + + aese v3.16b, v24.16b + aesmc v3.16b, v3.16b //AES block 4k+7 - round 6 + + rev64 v5.16b, v5.16b //GHASH block 4k+5 + eor v10.16b, v10.16b, v9.16b //MODULO - fold into mid + mov x7, v0.d[1] //AES block 4k+4 - mov high + + aese v2.16b, v26.16b + aesmc v2.16b, v2.16b //AES block 4k+6 - round 8 + mov x6, v0.d[0] //AES block 4k+4 - mov low + + aese v3.16b, v25.16b + aesmc v3.16b, v3.16b //AES block 4k+7 - round 7 + fmov d0, x10 //CTR block 4k+8 + + pmull v8.1q, v10.1d, v8.1d //MODULO - mid 64b align with low + fmov v0.d[1], x9 //CTR block 4k+8 + rev w9, w12 //CTR block 4k+9 + + aese v2.16b, v27.16b //AES block 4k+6 - round 9 + orr x9, x11, x9, lsl #32 //CTR block 4k+9 + ext v10.16b, v10.16b, v10.16b, #8 //MODULO - other mid alignment + + aese v3.16b, v26.16b + aesmc v3.16b, v3.16b //AES block 4k+7 - round 8 + eor x7, x7, x14 //AES block 4k+4 - round 10 high +#ifdef __AARCH64EB__ + rev x7, x7 +#endif + eor v11.16b, v11.16b, v8.16b //MODULO - fold into low + mov x20, v1.d[1] 
//AES block 4k+5 - mov high + eor x6, x6, x13 //AES block 4k+4 - round 10 low +#ifdef __AARCH64EB__ + rev x6, x6 +#endif + eor v2.16b, v6.16b, v2.16b //AES block 4k+6 - result + mov x19, v1.d[0] //AES block 4k+5 - mov low + add w12, w12, #1 //CTR block 4k+9 + + aese v3.16b, v27.16b //AES block 4k+7 - round 9 + fmov d1, x10 //CTR block 4k+9 + cmp x0, x5 //.LOOP CONTROL + + rev64 v4.16b, v4.16b //GHASH block 4k+4 + eor v11.16b, v11.16b, v10.16b //MODULO - fold into low + fmov v1.d[1], x9 //CTR block 4k+9 + + rev w9, w12 //CTR block 4k+10 + add w12, w12, #1 //CTR block 4k+10 + + eor x20, x20, x14 //AES block 4k+5 - round 10 high +#ifdef __AARCH64EB__ + rev x20, x20 +#endif + stp x6, x7, [x2], #16 //AES block 4k+4 - store result + + eor x19, x19, x13 //AES block 4k+5 - round 10 low +#ifdef __AARCH64EB__ + rev x19, x19 +#endif + stp x19, x20, [x2], #16 //AES block 4k+5 - store result + + orr x9, x11, x9, lsl #32 //CTR block 4k+10 + b.lt .L128_dec_main_loop + +.L128_dec_prepretail: //PREPRETAIL + ext v11.16b, v11.16b, v11.16b, #8 //PRE 0 + mov x21, v2.d[0] //AES block 4k+2 - mov low + mov d30, v5.d[1] //GHASH block 4k+1 - mid + + aese v0.16b, v18.16b + aesmc v0.16b, v0.16b //AES block 4k+4 - round 0 + eor v3.16b, v7.16b, v3.16b //AES block 4k+3 - result + + aese v1.16b, v18.16b + aesmc v1.16b, v1.16b //AES block 4k+5 - round 0 + mov x22, v2.d[1] //AES block 4k+2 - mov high + + eor v4.16b, v4.16b, v11.16b //PRE 1 + fmov d2, x10 //CTR block 4k+6 + rev64 v6.16b, v6.16b //GHASH block 4k+2 + + aese v0.16b, v19.16b + aesmc v0.16b, v0.16b //AES block 4k+4 - round 1 + fmov v2.d[1], x9 //CTR block 4k+6 + + rev w9, w12 //CTR block 4k+7 + mov x23, v3.d[0] //AES block 4k+3 - mov low + eor v30.8b, v30.8b, v5.8b //GHASH block 4k+1 - mid + + pmull v11.1q, v4.1d, v15.1d //GHASH block 4k - low + mov d10, v17.d[1] //GHASH block 4k - mid + mov x24, v3.d[1] //AES block 4k+3 - mov high + + aese v1.16b, v19.16b + aesmc v1.16b, v1.16b //AES block 4k+5 - round 1 + mov d31, v6.d[1] //GHASH block 4k+2 - mid + + aese v0.16b, v20.16b + aesmc v0.16b, v0.16b //AES block 4k+4 - round 2 + orr x9, x11, x9, lsl #32 //CTR block 4k+7 + + pmull v29.1q, v5.1d, v14.1d //GHASH block 4k+1 - low + mov d8, v4.d[1] //GHASH block 4k - mid + fmov d3, x10 //CTR block 4k+7 + + aese v2.16b, v18.16b + aesmc v2.16b, v2.16b //AES block 4k+6 - round 0 + fmov v3.d[1], x9 //CTR block 4k+7 + + pmull v30.1q, v30.1d, v17.1d //GHASH block 4k+1 - mid + eor v31.8b, v31.8b, v6.8b //GHASH block 4k+2 - mid + + rev64 v7.16b, v7.16b //GHASH block 4k+3 + + aese v2.16b, v19.16b + aesmc v2.16b, v2.16b //AES block 4k+6 - round 1 + eor v8.8b, v8.8b, v4.8b //GHASH block 4k - mid + + pmull2 v9.1q, v4.2d, v15.2d //GHASH block 4k - high + + aese v3.16b, v18.16b + aesmc v3.16b, v3.16b //AES block 4k+7 - round 0 + ins v31.d[1], v31.d[0] //GHASH block 4k+2 - mid + + pmull2 v28.1q, v5.2d, v14.2d //GHASH block 4k+1 - high + + pmull v10.1q, v8.1d, v10.1d //GHASH block 4k - mid + eor v11.16b, v11.16b, v29.16b //GHASH block 4k+1 - low + + pmull v29.1q, v7.1d, v12.1d //GHASH block 4k+3 - low + + pmull2 v31.1q, v31.2d, v16.2d //GHASH block 4k+2 - mid + eor v9.16b, v9.16b, v28.16b //GHASH block 4k+1 - high + + eor v10.16b, v10.16b, v30.16b //GHASH block 4k+1 - mid + + pmull2 v4.1q, v7.2d, v12.2d //GHASH block 4k+3 - high + + pmull2 v8.1q, v6.2d, v13.2d //GHASH block 4k+2 - high + mov d30, v7.d[1] //GHASH block 4k+3 - mid + + aese v1.16b, v20.16b + aesmc v1.16b, v1.16b //AES block 4k+5 - round 2 + eor v10.16b, v10.16b, v31.16b //GHASH block 4k+2 - mid + + pmull v28.1q, v6.1d, 
v13.1d //GHASH block 4k+2 - low + + eor v9.16b, v9.16b, v8.16b //GHASH block 4k+2 - high + movi v8.8b, #0xc2 + + aese v3.16b, v19.16b + aesmc v3.16b, v3.16b //AES block 4k+7 - round 1 + eor v30.8b, v30.8b, v7.8b //GHASH block 4k+3 - mid + + eor v11.16b, v11.16b, v28.16b //GHASH block 4k+2 - low + + aese v2.16b, v20.16b + aesmc v2.16b, v2.16b //AES block 4k+6 - round 2 + eor v9.16b, v9.16b, v4.16b //GHASH block 4k+3 - high + + aese v3.16b, v20.16b + aesmc v3.16b, v3.16b //AES block 4k+7 - round 2 + eor x23, x23, x13 //AES block 4k+3 - round 10 low +#ifdef __AARCH64EB__ + rev x23, x23 +#endif + pmull v30.1q, v30.1d, v16.1d //GHASH block 4k+3 - mid + eor x21, x21, x13 //AES block 4k+2 - round 10 low +#ifdef __AARCH64EB__ + rev x21, x21 +#endif + eor v11.16b, v11.16b, v29.16b //GHASH block 4k+3 - low + + aese v2.16b, v21.16b + aesmc v2.16b, v2.16b //AES block 4k+6 - round 3 + + aese v1.16b, v21.16b + aesmc v1.16b, v1.16b //AES block 4k+5 - round 3 + shl d8, d8, #56 //mod_constant + + aese v0.16b, v21.16b + aesmc v0.16b, v0.16b //AES block 4k+4 - round 3 + + aese v2.16b, v22.16b + aesmc v2.16b, v2.16b //AES block 4k+6 - round 4 + eor v10.16b, v10.16b, v30.16b //GHASH block 4k+3 - mid + + aese v1.16b, v22.16b + aesmc v1.16b, v1.16b //AES block 4k+5 - round 4 + + aese v3.16b, v21.16b + aesmc v3.16b, v3.16b //AES block 4k+7 - round 3 + eor v30.16b, v11.16b, v9.16b //MODULO - karatsuba tidy up + + aese v2.16b, v23.16b + aesmc v2.16b, v2.16b //AES block 4k+6 - round 5 + + aese v1.16b, v23.16b + aesmc v1.16b, v1.16b //AES block 4k+5 - round 5 + + aese v3.16b, v22.16b + aesmc v3.16b, v3.16b //AES block 4k+7 - round 4 + + aese v0.16b, v22.16b + aesmc v0.16b, v0.16b //AES block 4k+4 - round 4 + eor v10.16b, v10.16b, v30.16b //MODULO - karatsuba tidy up + + pmull v31.1q, v9.1d, v8.1d //MODULO - top 64b align with mid + + aese v1.16b, v24.16b + aesmc v1.16b, v1.16b //AES block 4k+5 - round 6 + ext v9.16b, v9.16b, v9.16b, #8 //MODULO - other top alignment + + aese v3.16b, v23.16b + aesmc v3.16b, v3.16b //AES block 4k+7 - round 5 + + aese v0.16b, v23.16b + aesmc v0.16b, v0.16b //AES block 4k+4 - round 5 + eor v10.16b, v10.16b, v31.16b //MODULO - fold into mid + + aese v1.16b, v25.16b + aesmc v1.16b, v1.16b //AES block 4k+5 - round 7 + + aese v2.16b, v24.16b + aesmc v2.16b, v2.16b //AES block 4k+6 - round 6 + + aese v0.16b, v24.16b + aesmc v0.16b, v0.16b //AES block 4k+4 - round 6 + + aese v1.16b, v26.16b + aesmc v1.16b, v1.16b //AES block 4k+5 - round 8 + eor v10.16b, v10.16b, v9.16b //MODULO - fold into mid + + aese v3.16b, v24.16b + aesmc v3.16b, v3.16b //AES block 4k+7 - round 6 + + aese v0.16b, v25.16b + aesmc v0.16b, v0.16b //AES block 4k+4 - round 7 + + aese v1.16b, v27.16b //AES block 4k+5 - round 9 + + pmull v8.1q, v10.1d, v8.1d //MODULO - mid 64b align with low + eor x24, x24, x14 //AES block 4k+3 - round 10 high +#ifdef __AARCH64EB__ + rev x24, x24 +#endif + aese v2.16b, v25.16b + aesmc v2.16b, v2.16b //AES block 4k+6 - round 7 + ext v10.16b, v10.16b, v10.16b, #8 //MODULO - other mid alignment + + aese v3.16b, v25.16b + aesmc v3.16b, v3.16b //AES block 4k+7 - round 7 + + aese v0.16b, v26.16b + aesmc v0.16b, v0.16b //AES block 4k+4 - round 8 + eor v11.16b, v11.16b, v8.16b //MODULO - fold into low + + aese v2.16b, v26.16b + aesmc v2.16b, v2.16b //AES block 4k+6 - round 8 + + aese v3.16b, v26.16b + aesmc v3.16b, v3.16b //AES block 4k+7 - round 8 + eor x22, x22, x14 //AES block 4k+2 - round 10 high +#ifdef __AARCH64EB__ + rev x22, x22 +#endif + aese v0.16b, v27.16b //AES block 4k+4 - round 9 + stp 
x21, x22, [x2], #16 //AES block 4k+2 - store result + + aese v2.16b, v27.16b //AES block 4k+6 - round 9 + add w12, w12, #1 //CTR block 4k+7 + stp x23, x24, [x2], #16 //AES block 4k+3 - store result + + aese v3.16b, v27.16b //AES block 4k+7 - round 9 + eor v11.16b, v11.16b, v10.16b //MODULO - fold into low +.L128_dec_tail: //TAIL + + sub x5, x4, x0 //main_end_input_ptr is number of bytes left to process + ld1 { v5.16b}, [x0], #16 //AES block 4k+4 - load ciphertext + + eor v0.16b, v5.16b, v0.16b //AES block 4k+4 - result + + mov x7, v0.d[1] //AES block 4k+4 - mov high + + mov x6, v0.d[0] //AES block 4k+4 - mov low + + cmp x5, #48 + + eor x7, x7, x14 //AES block 4k+4 - round 10 high +#ifdef __AARCH64EB__ + rev x7, x7 +#endif + ext v8.16b, v11.16b, v11.16b, #8 //prepare final partial tag + eor x6, x6, x13 //AES block 4k+4 - round 10 low +#ifdef __AARCH64EB__ + rev x6, x6 +#endif + b.gt .L128_dec_blocks_more_than_3 + + mov v3.16b, v2.16b + sub w12, w12, #1 + movi v11.8b, #0 + + movi v9.8b, #0 + mov v2.16b, v1.16b + + movi v10.8b, #0 + cmp x5, #32 + b.gt .L128_dec_blocks_more_than_2 + + cmp x5, #16 + + mov v3.16b, v1.16b + sub w12, w12, #1 + b.gt .L128_dec_blocks_more_than_1 + + sub w12, w12, #1 + b .L128_dec_blocks_less_than_1 +.L128_dec_blocks_more_than_3: //blocks left > 3 + rev64 v4.16b, v5.16b //GHASH final-3 block + ld1 { v5.16b}, [x0], #16 //AES final-2 block - load ciphertext + + eor v4.16b, v4.16b, v8.16b //feed in partial tag + + mov d10, v17.d[1] //GHASH final-3 block - mid + stp x6, x7, [x2], #16 //AES final-3 block - store result + eor v0.16b, v5.16b, v1.16b //AES final-2 block - result + + mov d22, v4.d[1] //GHASH final-3 block - mid + mov x7, v0.d[1] //AES final-2 block - mov high + + pmull v11.1q, v4.1d, v15.1d //GHASH final-3 block - low + mov x6, v0.d[0] //AES final-2 block - mov low + + pmull2 v9.1q, v4.2d, v15.2d //GHASH final-3 block - high + + eor v22.8b, v22.8b, v4.8b //GHASH final-3 block - mid + + movi v8.8b, #0 //suppress further partial tag feed in + eor x7, x7, x14 //AES final-2 block - round 10 high +#ifdef __AARCH64EB__ + rev x7, x7 +#endif + pmull v10.1q, v22.1d, v10.1d //GHASH final-3 block - mid + eor x6, x6, x13 //AES final-2 block - round 10 low +#ifdef __AARCH64EB__ + rev x6, x6 +#endif +.L128_dec_blocks_more_than_2: //blocks left > 2 + + rev64 v4.16b, v5.16b //GHASH final-2 block + ld1 { v5.16b}, [x0], #16 //AES final-1 block - load ciphertext + + eor v4.16b, v4.16b, v8.16b //feed in partial tag + + eor v0.16b, v5.16b, v2.16b //AES final-1 block - result + stp x6, x7, [x2], #16 //AES final-2 block - store result + + mov d22, v4.d[1] //GHASH final-2 block - mid + + pmull v21.1q, v4.1d, v14.1d //GHASH final-2 block - low + + pmull2 v20.1q, v4.2d, v14.2d //GHASH final-2 block - high + mov x6, v0.d[0] //AES final-1 block - mov low + + mov x7, v0.d[1] //AES final-1 block - mov high + eor v22.8b, v22.8b, v4.8b //GHASH final-2 block - mid + + movi v8.8b, #0 //suppress further partial tag feed in + + pmull v22.1q, v22.1d, v17.1d //GHASH final-2 block - mid + + eor x6, x6, x13 //AES final-1 block - round 10 low +#ifdef __AARCH64EB__ + rev x6, x6 +#endif + eor v11.16b, v11.16b, v21.16b //GHASH final-2 block - low + + eor v9.16b, v9.16b, v20.16b //GHASH final-2 block - high + + eor v10.16b, v10.16b, v22.16b //GHASH final-2 block - mid + eor x7, x7, x14 //AES final-1 block - round 10 high +#ifdef __AARCH64EB__ + rev x7, x7 +#endif +.L128_dec_blocks_more_than_1: //blocks left > 1 + + rev64 v4.16b, v5.16b //GHASH final-1 block + + ld1 { v5.16b}, [x0], #16 //AES final block 
- load ciphertext + eor v4.16b, v4.16b, v8.16b //feed in partial tag + + mov d22, v4.d[1] //GHASH final-1 block - mid + + eor v0.16b, v5.16b, v3.16b //AES final block - result + + eor v22.8b, v22.8b, v4.8b //GHASH final-1 block - mid + + stp x6, x7, [x2], #16 //AES final-1 block - store result + mov x6, v0.d[0] //AES final block - mov low + + mov x7, v0.d[1] //AES final block - mov high + ins v22.d[1], v22.d[0] //GHASH final-1 block - mid + + pmull v21.1q, v4.1d, v13.1d //GHASH final-1 block - low + + pmull2 v20.1q, v4.2d, v13.2d //GHASH final-1 block - high + + pmull2 v22.1q, v22.2d, v16.2d //GHASH final-1 block - mid + movi v8.8b, #0 //suppress further partial tag feed in + + eor v11.16b, v11.16b, v21.16b //GHASH final-1 block - low + + eor v9.16b, v9.16b, v20.16b //GHASH final-1 block - high + eor x7, x7, x14 //AES final block - round 10 high +#ifdef __AARCH64EB__ + rev x7, x7 +#endif + eor x6, x6, x13 //AES final block - round 10 low +#ifdef __AARCH64EB__ + rev x6, x6 +#endif + eor v10.16b, v10.16b, v22.16b //GHASH final-1 block - mid +.L128_dec_blocks_less_than_1: //blocks left <= 1 + + mvn x14, xzr //rk10_h = 0xffffffffffffffff + and x1, x1, #127 //bit_length %= 128 + + mvn x13, xzr //rk10_l = 0xffffffffffffffff + sub x1, x1, #128 //bit_length -= 128 + + neg x1, x1 //bit_length = 128 - #bits in input (in range [1,128]) + + and x1, x1, #127 //bit_length %= 128 + + lsr x14, x14, x1 //rk10_h is mask for top 64b of last block + cmp x1, #64 + + csel x10, x14, xzr, lt + csel x9, x13, x14, lt + + fmov d0, x9 //ctr0b is mask for last block + + mov v0.d[1], x10 + + and v5.16b, v5.16b, v0.16b //possibly partial last block has zeroes in highest bits + + rev64 v4.16b, v5.16b //GHASH final block + + eor v4.16b, v4.16b, v8.16b //feed in partial tag + + ldp x4, x5, [x2] //load existing bytes we need to not overwrite + + and x7, x7, x10 + + pmull2 v20.1q, v4.2d, v12.2d //GHASH final block - high + mov d8, v4.d[1] //GHASH final block - mid + + eor v8.8b, v8.8b, v4.8b //GHASH final block - mid + eor v9.16b, v9.16b, v20.16b //GHASH final block - high + + pmull v8.1q, v8.1d, v16.1d //GHASH final block - mid + + pmull v21.1q, v4.1d, v12.1d //GHASH final block - low + bic x4, x4, x9 //mask out low existing bytes + and x6, x6, x9 + +#ifndef __AARCH64EB__ + rev w9, w12 +#else + mov w9, w12 +#endif + + eor v10.16b, v10.16b, v8.16b //GHASH final block - mid + movi v8.8b, #0xc2 + + eor v11.16b, v11.16b, v21.16b //GHASH final block - low + + bic x5, x5, x10 //mask out high existing bytes + shl d8, d8, #56 //mod_constant + + eor v30.16b, v11.16b, v9.16b //MODULO - karatsuba tidy up + + pmull v31.1q, v9.1d, v8.1d //MODULO - top 64b align with mid + + eor v10.16b, v10.16b, v30.16b //MODULO - karatsuba tidy up + + orr x6, x6, x4 + str w9, [x16, #12] //store the updated counter + + orr x7, x7, x5 + stp x6, x7, [x2] + ext v9.16b, v9.16b, v9.16b, #8 //MODULO - other top alignment + + eor v10.16b, v10.16b, v31.16b //MODULO - fold into mid + + eor v10.16b, v10.16b, v9.16b //MODULO - fold into mid + + pmull v8.1q, v10.1d, v8.1d //MODULO - mid 64b align with low + ext v10.16b, v10.16b, v10.16b, #8 //MODULO - other mid alignment + + eor v11.16b, v11.16b, v8.16b //MODULO - fold into low + + eor v11.16b, v11.16b, v10.16b //MODULO - fold into low + ext v11.16b, v11.16b, v11.16b, #8 + rev64 v11.16b, v11.16b + mov x0, x15 + st1 { v11.16b }, [x3] + + ldp x21, x22, [sp, #16] + ldp x23, x24, [sp, #32] + ldp d8, d9, [sp, #48] + ldp d10, d11, [sp, #64] + ldp d12, d13, [sp, #80] + ldp d14, d15, [sp, #96] + ldp x19, x20, [sp], #112 + 
ret + +.L128_dec_ret: + mov w0, #0x0 + ret +.size aes_gcm_dec_128_kernel,.-aes_gcm_dec_128_kernel +.globl aes_gcm_enc_192_kernel +.type aes_gcm_enc_192_kernel,%function +.align 4 +aes_gcm_enc_192_kernel: + AARCH64_VALID_CALL_TARGET + cbz x1, .L192_enc_ret + stp x19, x20, [sp, #-112]! + mov x16, x4 + mov x8, x5 + stp x21, x22, [sp, #16] + stp x23, x24, [sp, #32] + stp d8, d9, [sp, #48] + stp d10, d11, [sp, #64] + stp d12, d13, [sp, #80] + stp d14, d15, [sp, #96] + + ldp x10, x11, [x16] //ctr96_b64, ctr96_t32 +#ifdef __AARCH64EB__ + rev x10, x10 + rev x11, x11 +#endif + ldp x13, x14, [x8, #192] //load rk12 +#ifdef __AARCH64EB__ + ror x13, x13, #32 + ror x14, x14, #32 +#endif + ld1 {v18.4s}, [x8], #16 //load rk0 + + ld1 {v19.4s}, [x8], #16 //load rk1 + + ld1 {v20.4s}, [x8], #16 //load rk2 + + lsr x12, x11, #32 + ld1 {v21.4s}, [x8], #16 //load rk3 + orr w11, w11, w11 + + ld1 {v22.4s}, [x8], #16 //load rk4 + rev w12, w12 //rev_ctr32 + + add w12, w12, #1 //increment rev_ctr32 + fmov d3, x10 //CTR block 3 + + rev w9, w12 //CTR block 1 + add w12, w12, #1 //CTR block 1 + fmov d1, x10 //CTR block 1 + + orr x9, x11, x9, lsl #32 //CTR block 1 + ld1 { v0.16b}, [x16] //special case vector load initial counter so we can start first AES block as quickly as possible + + fmov v1.d[1], x9 //CTR block 1 + rev w9, w12 //CTR block 2 + add w12, w12, #1 //CTR block 2 + + fmov d2, x10 //CTR block 2 + orr x9, x11, x9, lsl #32 //CTR block 2 + + fmov v2.d[1], x9 //CTR block 2 + rev w9, w12 //CTR block 3 + + orr x9, x11, x9, lsl #32 //CTR block 3 + ld1 {v23.4s}, [x8], #16 //load rk5 + + fmov v3.d[1], x9 //CTR block 3 + + ld1 {v24.4s}, [x8], #16 //load rk6 + + ld1 {v25.4s}, [x8], #16 //load rk7 + + aese v0.16b, v18.16b + aesmc v0.16b, v0.16b //AES block 0 - round 0 + ld1 { v11.16b}, [x3] + ext v11.16b, v11.16b, v11.16b, #8 + rev64 v11.16b, v11.16b + + aese v3.16b, v18.16b + aesmc v3.16b, v3.16b //AES block 3 - round 0 + ld1 {v26.4s}, [x8], #16 //load rk8 + + aese v1.16b, v18.16b + aesmc v1.16b, v1.16b //AES block 1 - round 0 + ldr q15, [x3, #112] //load h4l | h4h +#ifndef __AARCH64EB__ + ext v15.16b, v15.16b, v15.16b, #8 +#endif + aese v2.16b, v18.16b + aesmc v2.16b, v2.16b //AES block 2 - round 0 + ld1 {v27.4s}, [x8], #16 //load rk9 + + aese v0.16b, v19.16b + aesmc v0.16b, v0.16b //AES block 0 - round 1 + ld1 {v28.4s}, [x8], #16 //load rk10 + + aese v1.16b, v19.16b + aesmc v1.16b, v1.16b //AES block 1 - round 1 + ldr q12, [x3, #32] //load h1l | h1h +#ifndef __AARCH64EB__ + ext v12.16b, v12.16b, v12.16b, #8 +#endif + aese v2.16b, v19.16b + aesmc v2.16b, v2.16b //AES block 2 - round 1 + ld1 {v29.4s}, [x8], #16 //load rk11 + + aese v3.16b, v19.16b + aesmc v3.16b, v3.16b //AES block 3 - round 1 + ldr q14, [x3, #80] //load h3l | h3h +#ifndef __AARCH64EB__ + ext v14.16b, v14.16b, v14.16b, #8 +#endif + aese v0.16b, v20.16b + aesmc v0.16b, v0.16b //AES block 0 - round 2 + + aese v2.16b, v20.16b + aesmc v2.16b, v2.16b //AES block 2 - round 2 + + aese v3.16b, v20.16b + aesmc v3.16b, v3.16b //AES block 3 - round 2 + + aese v0.16b, v21.16b + aesmc v0.16b, v0.16b //AES block 0 - round 3 + trn1 v9.2d, v14.2d, v15.2d //h4h | h3h + + aese v2.16b, v21.16b + aesmc v2.16b, v2.16b //AES block 2 - round 3 + + aese v1.16b, v20.16b + aesmc v1.16b, v1.16b //AES block 1 - round 2 + trn2 v17.2d, v14.2d, v15.2d //h4l | h3l + + aese v0.16b, v22.16b + aesmc v0.16b, v0.16b //AES block 0 - round 4 + + aese v3.16b, v21.16b + aesmc v3.16b, v3.16b //AES block 3 - round 3 + + aese v1.16b, v21.16b + aesmc v1.16b, v1.16b //AES block 1 - round 3 + + aese 
v0.16b, v23.16b + aesmc v0.16b, v0.16b //AES block 0 - round 5 + + aese v2.16b, v22.16b + aesmc v2.16b, v2.16b //AES block 2 - round 4 + + aese v1.16b, v22.16b + aesmc v1.16b, v1.16b //AES block 1 - round 4 + + aese v0.16b, v24.16b + aesmc v0.16b, v0.16b //AES block 0 - round 6 + + aese v3.16b, v22.16b + aesmc v3.16b, v3.16b //AES block 3 - round 4 + + aese v2.16b, v23.16b + aesmc v2.16b, v2.16b //AES block 2 - round 5 + + aese v1.16b, v23.16b + aesmc v1.16b, v1.16b //AES block 1 - round 5 + + aese v3.16b, v23.16b + aesmc v3.16b, v3.16b //AES block 3 - round 5 + + aese v2.16b, v24.16b + aesmc v2.16b, v2.16b //AES block 2 - round 6 + ldr q13, [x3, #64] //load h2l | h2h +#ifndef __AARCH64EB__ + ext v13.16b, v13.16b, v13.16b, #8 +#endif + aese v1.16b, v24.16b + aesmc v1.16b, v1.16b //AES block 1 - round 6 + + aese v3.16b, v24.16b + aesmc v3.16b, v3.16b //AES block 3 - round 6 + + aese v0.16b, v25.16b + aesmc v0.16b, v0.16b //AES block 0 - round 7 + + aese v1.16b, v25.16b + aesmc v1.16b, v1.16b //AES block 1 - round 7 + trn2 v16.2d, v12.2d, v13.2d //h2l | h1l + + aese v3.16b, v25.16b + aesmc v3.16b, v3.16b //AES block 3 - round 7 + + aese v0.16b, v26.16b + aesmc v0.16b, v0.16b //AES block 0 - round 8 + + aese v2.16b, v25.16b + aesmc v2.16b, v2.16b //AES block 2 - round 7 + trn1 v8.2d, v12.2d, v13.2d //h2h | h1h + + aese v1.16b, v26.16b + aesmc v1.16b, v1.16b //AES block 1 - round 8 + + aese v3.16b, v26.16b + aesmc v3.16b, v3.16b //AES block 3 - round 8 + + aese v2.16b, v26.16b + aesmc v2.16b, v2.16b //AES block 2 - round 8 + + aese v0.16b, v27.16b + aesmc v0.16b, v0.16b //AES block 0 - round 9 + + aese v3.16b, v27.16b + aesmc v3.16b, v3.16b //AES block 3 - round 9 + + aese v2.16b, v27.16b + aesmc v2.16b, v2.16b //AES block 2 - round 9 + + aese v1.16b, v27.16b + aesmc v1.16b, v1.16b //AES block 1 - round 9 + + aese v0.16b, v28.16b + aesmc v0.16b, v0.16b //AES block 0 - round 10 + + aese v2.16b, v28.16b + aesmc v2.16b, v2.16b //AES block 2 - round 10 + + aese v1.16b, v28.16b + aesmc v1.16b, v1.16b //AES block 1 - round 10 + lsr x5, x1, #3 //byte_len + mov x15, x5 + + aese v3.16b, v28.16b + aesmc v3.16b, v3.16b //AES block 3 - round 10 + sub x5, x5, #1 //byte_len - 1 + + eor v16.16b, v16.16b, v8.16b //h2k | h1k + and x5, x5, #0xffffffffffffffc0 //number of bytes to be processed in main loop (at least 1 byte must be handled by tail) + + eor v17.16b, v17.16b, v9.16b //h4k | h3k + + aese v2.16b, v29.16b //AES block 2 - round 11 + add x4, x0, x1, lsr #3 //end_input_ptr + add x5, x5, x0 + + aese v1.16b, v29.16b //AES block 1 - round 11 + cmp x0, x5 //check if we have <= 4 blocks + + aese v0.16b, v29.16b //AES block 0 - round 11 + add w12, w12, #1 //CTR block 3 + + aese v3.16b, v29.16b //AES block 3 - round 11 + b.ge .L192_enc_tail //handle tail + + rev w9, w12 //CTR block 4 + ldp x6, x7, [x0, #0] //AES block 0 - load plaintext +#ifdef __AARCH64EB__ + rev x6, x6 + rev x7, x7 +#endif + orr x9, x11, x9, lsl #32 //CTR block 4 + ldp x21, x22, [x0, #32] //AES block 2 - load plaintext +#ifdef __AARCH64EB__ + rev x21, x21 + rev x22, x22 +#endif + ldp x23, x24, [x0, #48] //AES block 3 - load plaintext +#ifdef __AARCH64EB__ + rev x23, x23 + rev x24, x24 +#endif + ldp x19, x20, [x0, #16] //AES block 1 - load plaintext +#ifdef __AARCH64EB__ + rev x19, x19 + rev x20, x20 +#endif + add x0, x0, #64 //AES input_ptr update + cmp x0, x5 //check if we have <= 8 blocks + + eor x6, x6, x13 //AES block 0 - round 12 low + + eor x7, x7, x14 //AES block 0 - round 12 high + eor x22, x22, x14 //AES block 2 - round 12 high + 
fmov d4, x6 //AES block 0 - mov low + + eor x24, x24, x14 //AES block 3 - round 12 high + fmov v4.d[1], x7 //AES block 0 - mov high + + eor x21, x21, x13 //AES block 2 - round 12 low + eor x19, x19, x13 //AES block 1 - round 12 low + + fmov d5, x19 //AES block 1 - mov low + eor x20, x20, x14 //AES block 1 - round 12 high + + fmov v5.d[1], x20 //AES block 1 - mov high + + eor x23, x23, x13 //AES block 3 - round 12 low + fmov d6, x21 //AES block 2 - mov low + + add w12, w12, #1 //CTR block 4 + eor v4.16b, v4.16b, v0.16b //AES block 0 - result + fmov d0, x10 //CTR block 4 + + fmov v0.d[1], x9 //CTR block 4 + rev w9, w12 //CTR block 5 + + orr x9, x11, x9, lsl #32 //CTR block 5 + add w12, w12, #1 //CTR block 5 + + fmov d7, x23 //AES block 3 - mov low + st1 { v4.16b}, [x2], #16 //AES block 0 - store result + + fmov v6.d[1], x22 //AES block 2 - mov high + + eor v5.16b, v5.16b, v1.16b //AES block 1 - result + fmov d1, x10 //CTR block 5 + st1 { v5.16b}, [x2], #16 //AES block 1 - store result + + fmov v7.d[1], x24 //AES block 3 - mov high + + fmov v1.d[1], x9 //CTR block 5 + rev w9, w12 //CTR block 6 + + orr x9, x11, x9, lsl #32 //CTR block 6 + + add w12, w12, #1 //CTR block 6 + eor v6.16b, v6.16b, v2.16b //AES block 2 - result + fmov d2, x10 //CTR block 6 + + fmov v2.d[1], x9 //CTR block 6 + rev w9, w12 //CTR block 7 + + orr x9, x11, x9, lsl #32 //CTR block 7 + st1 { v6.16b}, [x2], #16 //AES block 2 - store result + + eor v7.16b, v7.16b, v3.16b //AES block 3 - result + st1 { v7.16b}, [x2], #16 //AES block 3 - store result + b.ge .L192_enc_prepretail //do prepretail + +.L192_enc_main_loop: //main loop start + aese v2.16b, v18.16b + aesmc v2.16b, v2.16b //AES block 4k+6 - round 0 + rev64 v5.16b, v5.16b //GHASH block 4k+1 (t0 and t1 free) + + aese v1.16b, v18.16b + aesmc v1.16b, v1.16b //AES block 4k+5 - round 0 + ldp x19, x20, [x0, #16] //AES block 4k+5 - load plaintext +#ifdef __AARCH64EB__ + rev x19, x19 + rev x20, x20 +#endif + ext v11.16b, v11.16b, v11.16b, #8 //PRE 0 + fmov d3, x10 //CTR block 4k+3 + rev64 v4.16b, v4.16b //GHASH block 4k (only t0 is free) + + aese v2.16b, v19.16b + aesmc v2.16b, v2.16b //AES block 4k+6 - round 1 + fmov v3.d[1], x9 //CTR block 4k+3 + + pmull2 v30.1q, v5.2d, v14.2d //GHASH block 4k+1 - high + rev64 v7.16b, v7.16b //GHASH block 4k+3 (t0, t1, t2 and t3 free) + ldp x21, x22, [x0, #32] //AES block 4k+6 - load plaintext +#ifdef __AARCH64EB__ + rev x21, x21 + rev x22, x22 +#endif + aese v0.16b, v18.16b + aesmc v0.16b, v0.16b //AES block 4k+4 - round 0 + ldp x23, x24, [x0, #48] //AES block 4k+3 - load plaintext +#ifdef __AARCH64EB__ + rev x23, x23 + rev x24, x24 +#endif + pmull v31.1q, v5.1d, v14.1d //GHASH block 4k+1 - low + eor v4.16b, v4.16b, v11.16b //PRE 1 + + aese v1.16b, v19.16b + aesmc v1.16b, v1.16b //AES block 4k+5 - round 1 + + aese v0.16b, v19.16b + aesmc v0.16b, v0.16b //AES block 4k+4 - round 1 + rev64 v6.16b, v6.16b //GHASH block 4k+2 (t0, t1, and t2 free) + + aese v3.16b, v18.16b + aesmc v3.16b, v3.16b //AES block 4k+7 - round 0 + eor x24, x24, x14 //AES block 4k+3 - round 12 high + + pmull v11.1q, v4.1d, v15.1d //GHASH block 4k - low + mov d8, v4.d[1] //GHASH block 4k - mid + + aese v0.16b, v20.16b + aesmc v0.16b, v0.16b //AES block 4k+4 - round 2 + + aese v3.16b, v19.16b + aesmc v3.16b, v3.16b //AES block 4k+7 - round 1 + eor x21, x21, x13 //AES block 4k+6 - round 12 low + + eor v8.8b, v8.8b, v4.8b //GHASH block 4k - mid + eor v11.16b, v11.16b, v31.16b //GHASH block 4k+1 - low + + aese v0.16b, v21.16b + aesmc v0.16b, v0.16b //AES block 4k+4 - round 3 + 
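+
+ // Each iteration of .L192_enc_main_loop runs the twelve AES rounds for
+ // counter blocks 4k+4..4k+7 while hashing the four blocks produced by the
+ // previous iteration: every block contributes low and high partial products
+ // (pmull/pmull2 against the hash-key powers H..H^4 held in v12-v15) plus a
+ // Karatsuba middle term, so four blocks share a single modular reduction.
+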
eor x19, x19, x13 //AES block 4k+5 - round 12 low + + aese v1.16b, v20.16b + aesmc v1.16b, v1.16b //AES block 4k+5 - round 2 + mov d31, v6.d[1] //GHASH block 4k+2 - mid + + pmull2 v9.1q, v4.2d, v15.2d //GHASH block 4k - high + mov d4, v5.d[1] //GHASH block 4k+1 - mid + + aese v2.16b, v20.16b + aesmc v2.16b, v2.16b //AES block 4k+6 - round 2 + + aese v1.16b, v21.16b + aesmc v1.16b, v1.16b //AES block 4k+5 - round 3 + + mov d10, v17.d[1] //GHASH block 4k - mid + eor v9.16b, v9.16b, v30.16b //GHASH block 4k+1 - high + + aese v3.16b, v20.16b + aesmc v3.16b, v3.16b //AES block 4k+7 - round 2 + eor v31.8b, v31.8b, v6.8b //GHASH block 4k+2 - mid + + pmull2 v30.1q, v6.2d, v13.2d //GHASH block 4k+2 - high + + aese v0.16b, v22.16b + aesmc v0.16b, v0.16b //AES block 4k+4 - round 4 + eor v4.8b, v4.8b, v5.8b //GHASH block 4k+1 - mid + + aese v3.16b, v21.16b + aesmc v3.16b, v3.16b //AES block 4k+7 - round 3 + + pmull2 v5.1q, v7.2d, v12.2d //GHASH block 4k+3 - high + eor x20, x20, x14 //AES block 4k+5 - round 12 high + ins v31.d[1], v31.d[0] //GHASH block 4k+2 - mid + + aese v0.16b, v23.16b + aesmc v0.16b, v0.16b //AES block 4k+4 - round 5 + add w12, w12, #1 //CTR block 4k+3 + + aese v3.16b, v22.16b + aesmc v3.16b, v3.16b //AES block 4k+7 - round 4 + eor v9.16b, v9.16b, v30.16b //GHASH block 4k+2 - high + + pmull v4.1q, v4.1d, v17.1d //GHASH block 4k+1 - mid + eor x22, x22, x14 //AES block 4k+6 - round 12 high + + pmull2 v31.1q, v31.2d, v16.2d //GHASH block 4k+2 - mid + eor x23, x23, x13 //AES block 4k+3 - round 12 low + mov d30, v7.d[1] //GHASH block 4k+3 - mid + + pmull v10.1q, v8.1d, v10.1d //GHASH block 4k - mid + rev w9, w12 //CTR block 4k+8 + + pmull v8.1q, v6.1d, v13.1d //GHASH block 4k+2 - low + orr x9, x11, x9, lsl #32 //CTR block 4k+8 + + aese v2.16b, v21.16b + aesmc v2.16b, v2.16b //AES block 4k+6 - round 3 + eor v30.8b, v30.8b, v7.8b //GHASH block 4k+3 - mid + + aese v1.16b, v22.16b + aesmc v1.16b, v1.16b //AES block 4k+5 - round 4 + ldp x6, x7, [x0, #0] //AES block 4k+4 - load plaintext +#ifdef __AARCH64EB__ + rev x6, x6 + rev x7, x7 +#endif + aese v0.16b, v24.16b + aesmc v0.16b, v0.16b //AES block 4k+4 - round 6 + eor v11.16b, v11.16b, v8.16b //GHASH block 4k+2 - low + + aese v2.16b, v22.16b + aesmc v2.16b, v2.16b //AES block 4k+6 - round 4 + add x0, x0, #64 //AES input_ptr update + + aese v1.16b, v23.16b + aesmc v1.16b, v1.16b //AES block 4k+5 - round 5 + movi v8.8b, #0xc2 + + pmull v6.1q, v7.1d, v12.1d //GHASH block 4k+3 - low + eor x7, x7, x14 //AES block 4k+4 - round 12 high + eor v10.16b, v10.16b, v4.16b //GHASH block 4k+1 - mid + + aese v2.16b, v23.16b + aesmc v2.16b, v2.16b //AES block 4k+6 - round 5 + eor x6, x6, x13 //AES block 4k+4 - round 12 low + + aese v1.16b, v24.16b + aesmc v1.16b, v1.16b //AES block 4k+5 - round 6 + shl d8, d8, #56 //mod_constant + + aese v3.16b, v23.16b + aesmc v3.16b, v3.16b //AES block 4k+7 - round 5 + eor v9.16b, v9.16b, v5.16b //GHASH block 4k+3 - high + + aese v0.16b, v25.16b + aesmc v0.16b, v0.16b //AES block 4k+4 - round 7 + fmov d5, x19 //AES block 4k+5 - mov low + + aese v1.16b, v25.16b + aesmc v1.16b, v1.16b //AES block 4k+5 - round 7 + eor v10.16b, v10.16b, v31.16b //GHASH block 4k+2 - mid + + aese v3.16b, v24.16b + aesmc v3.16b, v3.16b //AES block 4k+7 - round 6 + fmov v5.d[1], x20 //AES block 4k+5 - mov high + + aese v0.16b, v26.16b + aesmc v0.16b, v0.16b //AES block 4k+4 - round 8 + eor v11.16b, v11.16b, v6.16b //GHASH block 4k+3 - low + + pmull v30.1q, v30.1d, v16.1d //GHASH block 4k+3 - mid + cmp x0, x5 //.LOOP CONTROL + fmov d4, x6 //AES 
block 4k+4 - mov low + + aese v2.16b, v24.16b + aesmc v2.16b, v2.16b //AES block 4k+6 - round 6 + fmov v4.d[1], x7 //AES block 4k+4 - mov high + + aese v1.16b, v26.16b + aesmc v1.16b, v1.16b //AES block 4k+5 - round 8 + fmov d7, x23 //AES block 4k+3 - mov low + + eor v10.16b, v10.16b, v30.16b //GHASH block 4k+3 - mid + eor v30.16b, v11.16b, v9.16b //MODULO - karatsuba tidy up + add w12, w12, #1 //CTR block 4k+8 + + aese v2.16b, v25.16b + aesmc v2.16b, v2.16b //AES block 4k+6 - round 7 + fmov v7.d[1], x24 //AES block 4k+3 - mov high + + pmull v31.1q, v9.1d, v8.1d //MODULO - top 64b align with mid + ext v9.16b, v9.16b, v9.16b, #8 //MODULO - other top alignment + fmov d6, x21 //AES block 4k+6 - mov low + + aese v3.16b, v25.16b + aesmc v3.16b, v3.16b //AES block 4k+7 - round 7 + + aese v0.16b, v27.16b + aesmc v0.16b, v0.16b //AES block 4k+4 - round 9 + eor v10.16b, v10.16b, v30.16b //MODULO - karatsuba tidy up + + aese v2.16b, v26.16b + aesmc v2.16b, v2.16b //AES block 4k+6 - round 8 + + aese v3.16b, v26.16b + aesmc v3.16b, v3.16b //AES block 4k+7 - round 8 + + aese v1.16b, v27.16b + aesmc v1.16b, v1.16b //AES block 4k+5 - round 9 + + aese v0.16b, v28.16b + aesmc v0.16b, v0.16b //AES block 4k+4 - round 10 + eor v10.16b, v10.16b, v31.16b //MODULO - fold into mid + + aese v3.16b, v27.16b + aesmc v3.16b, v3.16b //AES block 4k+7 - round 9 + + aese v2.16b, v27.16b + aesmc v2.16b, v2.16b //AES block 4k+6 - round 9 + + aese v0.16b, v29.16b //AES block 4k+4 - round 11 + + aese v1.16b, v28.16b + aesmc v1.16b, v1.16b //AES block 4k+5 - round 10 + eor v10.16b, v10.16b, v9.16b //MODULO - fold into mid + + aese v2.16b, v28.16b + aesmc v2.16b, v2.16b //AES block 4k+6 - round 10 + + eor v4.16b, v4.16b, v0.16b //AES block 4k+4 - result + fmov d0, x10 //CTR block 4k+8 + + aese v1.16b, v29.16b //AES block 4k+5 - round 11 + fmov v0.d[1], x9 //CTR block 4k+8 + rev w9, w12 //CTR block 4k+9 + + pmull v9.1q, v10.1d, v8.1d //MODULO - mid 64b align with low + fmov v6.d[1], x22 //AES block 4k+6 - mov high + st1 { v4.16b}, [x2], #16 //AES block 4k+4 - store result + + aese v3.16b, v28.16b + aesmc v3.16b, v3.16b //AES block 4k+7 - round 10 + orr x9, x11, x9, lsl #32 //CTR block 4k+9 + + eor v5.16b, v5.16b, v1.16b //AES block 4k+5 - result + add w12, w12, #1 //CTR block 4k+9 + fmov d1, x10 //CTR block 4k+9 + + aese v2.16b, v29.16b //AES block 4k+6 - round 11 + fmov v1.d[1], x9 //CTR block 4k+9 + rev w9, w12 //CTR block 4k+10 + + add w12, w12, #1 //CTR block 4k+10 + ext v10.16b, v10.16b, v10.16b, #8 //MODULO - other mid alignment + orr x9, x11, x9, lsl #32 //CTR block 4k+10 + + st1 { v5.16b}, [x2], #16 //AES block 4k+5 - store result + eor v11.16b, v11.16b, v9.16b //MODULO - fold into low + + aese v3.16b, v29.16b //AES block 4k+7 - round 11 + eor v6.16b, v6.16b, v2.16b //AES block 4k+6 - result + fmov d2, x10 //CTR block 4k+10 + + st1 { v6.16b}, [x2], #16 //AES block 4k+6 - store result + fmov v2.d[1], x9 //CTR block 4k+10 + rev w9, w12 //CTR block 4k+11 + + eor v11.16b, v11.16b, v10.16b //MODULO - fold into low + orr x9, x11, x9, lsl #32 //CTR block 4k+11 + + eor v7.16b, v7.16b, v3.16b //AES block 4k+3 - result + st1 { v7.16b}, [x2], #16 //AES block 4k+3 - store result + b.lt .L192_enc_main_loop + +.L192_enc_prepretail: //PREPRETAIL + aese v0.16b, v18.16b + aesmc v0.16b, v0.16b //AES block 4k+4 - round 0 + rev64 v4.16b, v4.16b //GHASH block 4k (only t0 is free) + + fmov d3, x10 //CTR block 4k+3 + ext v11.16b, v11.16b, v11.16b, #8 //PRE 0 + add w12, w12, #1 //CTR block 4k+3 + + aese v1.16b, v18.16b + aesmc v1.16b, v1.16b 
//AES block 4k+5 - round 0 + rev64 v5.16b, v5.16b //GHASH block 4k+1 (t0 and t1 free) + + aese v2.16b, v18.16b + aesmc v2.16b, v2.16b //AES block 4k+6 - round 0 + + fmov v3.d[1], x9 //CTR block 4k+3 + eor v4.16b, v4.16b, v11.16b //PRE 1 + mov d10, v17.d[1] //GHASH block 4k - mid + + aese v1.16b, v19.16b + aesmc v1.16b, v1.16b //AES block 4k+5 - round 1 + rev64 v6.16b, v6.16b //GHASH block 4k+2 (t0, t1, and t2 free) + + pmull2 v30.1q, v5.2d, v14.2d //GHASH block 4k+1 - high + + pmull v11.1q, v4.1d, v15.1d //GHASH block 4k - low + mov d8, v4.d[1] //GHASH block 4k - mid + + pmull v31.1q, v5.1d, v14.1d //GHASH block 4k+1 - low + rev64 v7.16b, v7.16b //GHASH block 4k+3 (t0, t1, t2 and t3 free) + + pmull2 v9.1q, v4.2d, v15.2d //GHASH block 4k - high + + eor v8.8b, v8.8b, v4.8b //GHASH block 4k - mid + mov d4, v5.d[1] //GHASH block 4k+1 - mid + + eor v11.16b, v11.16b, v31.16b //GHASH block 4k+1 - low + mov d31, v6.d[1] //GHASH block 4k+2 - mid + + aese v3.16b, v18.16b + aesmc v3.16b, v3.16b //AES block 4k+7 - round 0 + eor v9.16b, v9.16b, v30.16b //GHASH block 4k+1 - high + + pmull2 v30.1q, v6.2d, v13.2d //GHASH block 4k+2 - high + + eor v4.8b, v4.8b, v5.8b //GHASH block 4k+1 - mid + eor v31.8b, v31.8b, v6.8b //GHASH block 4k+2 - mid + + aese v3.16b, v19.16b + aesmc v3.16b, v3.16b //AES block 4k+7 - round 1 + + aese v2.16b, v19.16b + aesmc v2.16b, v2.16b //AES block 4k+6 - round 1 + eor v9.16b, v9.16b, v30.16b //GHASH block 4k+2 - high + + aese v0.16b, v19.16b + aesmc v0.16b, v0.16b //AES block 4k+4 - round 1 + + aese v1.16b, v20.16b + aesmc v1.16b, v1.16b //AES block 4k+5 - round 2 + mov d30, v7.d[1] //GHASH block 4k+3 - mid + + pmull2 v5.1q, v7.2d, v12.2d //GHASH block 4k+3 - high + ins v31.d[1], v31.d[0] //GHASH block 4k+2 - mid + + aese v0.16b, v20.16b + aesmc v0.16b, v0.16b //AES block 4k+4 - round 2 + + pmull v10.1q, v8.1d, v10.1d //GHASH block 4k - mid + eor v30.8b, v30.8b, v7.8b //GHASH block 4k+3 - mid + + aese v1.16b, v21.16b + aesmc v1.16b, v1.16b //AES block 4k+5 - round 3 + + pmull2 v31.1q, v31.2d, v16.2d //GHASH block 4k+2 - mid + + pmull v4.1q, v4.1d, v17.1d //GHASH block 4k+1 - mid + + pmull v30.1q, v30.1d, v16.1d //GHASH block 4k+3 - mid + eor v9.16b, v9.16b, v5.16b //GHASH block 4k+3 - high + + pmull v8.1q, v6.1d, v13.1d //GHASH block 4k+2 - low + + aese v0.16b, v21.16b + aesmc v0.16b, v0.16b //AES block 4k+4 - round 3 + eor v10.16b, v10.16b, v4.16b //GHASH block 4k+1 - mid + + aese v3.16b, v20.16b + aesmc v3.16b, v3.16b //AES block 4k+7 - round 2 + + aese v2.16b, v20.16b + aesmc v2.16b, v2.16b //AES block 4k+6 - round 2 + eor v11.16b, v11.16b, v8.16b //GHASH block 4k+2 - low + + aese v0.16b, v22.16b + aesmc v0.16b, v0.16b //AES block 4k+4 - round 4 + + aese v3.16b, v21.16b + aesmc v3.16b, v3.16b //AES block 4k+7 - round 3 + eor v10.16b, v10.16b, v31.16b //GHASH block 4k+2 - mid + + aese v2.16b, v21.16b + aesmc v2.16b, v2.16b //AES block 4k+6 - round 3 + + pmull v6.1q, v7.1d, v12.1d //GHASH block 4k+3 - low + movi v8.8b, #0xc2 + + aese v3.16b, v22.16b + aesmc v3.16b, v3.16b //AES block 4k+7 - round 4 + + aese v2.16b, v22.16b + aesmc v2.16b, v2.16b //AES block 4k+6 - round 4 + + aese v1.16b, v22.16b + aesmc v1.16b, v1.16b //AES block 4k+5 - round 4 + eor v10.16b, v10.16b, v30.16b //GHASH block 4k+3 - mid + + aese v3.16b, v23.16b + aesmc v3.16b, v3.16b //AES block 4k+7 - round 5 + + aese v2.16b, v23.16b + aesmc v2.16b, v2.16b //AES block 4k+6 - round 5 + + aese v1.16b, v23.16b + aesmc v1.16b, v1.16b //AES block 4k+5 - round 5 + eor v11.16b, v11.16b, v6.16b //GHASH block 4k+3 - low 
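+
+ // At this point the aggregated 256-bit GHASH product sits in v9 (high),
+ // v10 (mid) and v11 (low).  The reduction below folds it back to 128 bits
+ // using 0xc2 shifted into the top byte, the bit-reflected encoding of the
+ // GCM polynomial x^128 + x^7 + x^2 + x + 1: one pmull against it folds the
+ // high half into the mid term and a second folds the mid term into the low.
+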
+ + aese v0.16b, v23.16b + aesmc v0.16b, v0.16b //AES block 4k+4 - round 5 + + aese v3.16b, v24.16b + aesmc v3.16b, v3.16b //AES block 4k+7 - round 6 + eor v10.16b, v10.16b, v9.16b //karatsuba tidy up + + aese v1.16b, v24.16b + aesmc v1.16b, v1.16b //AES block 4k+5 - round 6 + + aese v0.16b, v24.16b + aesmc v0.16b, v0.16b //AES block 4k+4 - round 6 + shl d8, d8, #56 //mod_constant + + aese v3.16b, v25.16b + aesmc v3.16b, v3.16b //AES block 4k+7 - round 7 + + aese v1.16b, v25.16b + aesmc v1.16b, v1.16b //AES block 4k+5 - round 7 + eor v10.16b, v10.16b, v11.16b + + aese v0.16b, v25.16b + aesmc v0.16b, v0.16b //AES block 4k+4 - round 7 + + pmull v30.1q, v9.1d, v8.1d + + aese v2.16b, v24.16b + aesmc v2.16b, v2.16b //AES block 4k+6 - round 6 + ext v9.16b, v9.16b, v9.16b, #8 + + aese v0.16b, v26.16b + aesmc v0.16b, v0.16b //AES block 4k+4 - round 8 + + aese v1.16b, v26.16b + aesmc v1.16b, v1.16b //AES block 4k+5 - round 8 + eor v10.16b, v10.16b, v30.16b + + aese v2.16b, v25.16b + aesmc v2.16b, v2.16b //AES block 4k+6 - round 7 + + aese v3.16b, v26.16b + aesmc v3.16b, v3.16b //AES block 4k+7 - round 8 + + aese v0.16b, v27.16b + aesmc v0.16b, v0.16b //AES block 4k+4 - round 9 + + aese v2.16b, v26.16b + aesmc v2.16b, v2.16b //AES block 4k+6 - round 8 + eor v10.16b, v10.16b, v9.16b + + aese v3.16b, v27.16b + aesmc v3.16b, v3.16b //AES block 4k+7 - round 9 + + aese v1.16b, v27.16b + aesmc v1.16b, v1.16b //AES block 4k+5 - round 9 + + aese v2.16b, v27.16b + aesmc v2.16b, v2.16b //AES block 4k+6 - round 9 + + pmull v30.1q, v10.1d, v8.1d + + ext v10.16b, v10.16b, v10.16b, #8 + + aese v3.16b, v28.16b + aesmc v3.16b, v3.16b //AES block 4k+7 - round 10 + + aese v0.16b, v28.16b + aesmc v0.16b, v0.16b //AES block 4k+4 - round 10 + + aese v2.16b, v28.16b + aesmc v2.16b, v2.16b //AES block 4k+6 - round 10 + + aese v1.16b, v28.16b + aesmc v1.16b, v1.16b //AES block 4k+5 - round 10 + eor v11.16b, v11.16b, v30.16b + + aese v0.16b, v29.16b //AES block 4k+4 - round 11 + + aese v3.16b, v29.16b //AES block 4k+7 - round 11 + + aese v2.16b, v29.16b //AES block 4k+6 - round 11 + + aese v1.16b, v29.16b //AES block 4k+5 - round 11 + eor v11.16b, v11.16b, v10.16b +.L192_enc_tail: //TAIL + + sub x5, x4, x0 //main_end_input_ptr is number of bytes left to process + ldp x6, x7, [x0], #16 //AES block 4k+4 - load plaintext +#ifdef __AARCH64EB__ + rev x6, x6 + rev x7, x7 +#endif + eor x6, x6, x13 //AES block 4k+4 - round 12 low + eor x7, x7, x14 //AES block 4k+4 - round 12 high + + fmov d4, x6 //AES block 4k+4 - mov low + + fmov v4.d[1], x7 //AES block 4k+4 - mov high + cmp x5, #48 + + eor v5.16b, v4.16b, v0.16b //AES block 4k+4 - result + + ext v8.16b, v11.16b, v11.16b, #8 //prepare final partial tag + b.gt .L192_enc_blocks_more_than_3 + + sub w12, w12, #1 + movi v10.8b, #0 + + mov v3.16b, v2.16b + movi v9.8b, #0 + cmp x5, #32 + + mov v2.16b, v1.16b + movi v11.8b, #0 + b.gt .L192_enc_blocks_more_than_2 + + sub w12, w12, #1 + + mov v3.16b, v1.16b + cmp x5, #16 + b.gt .L192_enc_blocks_more_than_1 + + sub w12, w12, #1 + b .L192_enc_blocks_less_than_1 +.L192_enc_blocks_more_than_3: //blocks left > 3 + st1 { v5.16b}, [x2], #16 //AES final-3 block - store result + + ldp x6, x7, [x0], #16 //AES final-2 block - load input low & high +#ifdef __AARCH64EB__ + rev x6, x6 + rev x7, x7 +#endif + rev64 v4.16b, v5.16b //GHASH final-3 block + + eor x6, x6, x13 //AES final-2 block - round 12 low + eor v4.16b, v4.16b, v8.16b //feed in partial tag + + eor x7, x7, x14 //AES final-2 block - round 12 high + fmov d5, x6 //AES final-2 block - mov low + 
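+
+ // Tail handling: .L192_enc_tail dispatches on how many 16-byte blocks remain
+ // (at most four).  The running tag is fed into the first remaining block via
+ // v8 and v8 is then zeroed ("suppress further partial tag feed in"); the
+ // last, possibly partial block is masked to the real length and merged with
+ // the bytes already at the destination before being stored.
+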
+ fmov v5.d[1], x7 //AES final-2 block - mov high + + mov d22, v4.d[1] //GHASH final-3 block - mid + + pmull v11.1q, v4.1d, v15.1d //GHASH final-3 block - low + + mov d10, v17.d[1] //GHASH final-3 block - mid + + eor v22.8b, v22.8b, v4.8b //GHASH final-3 block - mid + + movi v8.8b, #0 //suppress further partial tag feed in + + pmull2 v9.1q, v4.2d, v15.2d //GHASH final-3 block - high + + pmull v10.1q, v22.1d, v10.1d //GHASH final-3 block - mid + eor v5.16b, v5.16b, v1.16b //AES final-2 block - result +.L192_enc_blocks_more_than_2: //blocks left > 2 + + st1 { v5.16b}, [x2], #16 //AES final-2 block - store result + + rev64 v4.16b, v5.16b //GHASH final-2 block + ldp x6, x7, [x0], #16 //AES final-1 block - load input low & high +#ifdef __AARCH64EB__ + rev x6, x6 + rev x7, x7 +#endif + eor v4.16b, v4.16b, v8.16b //feed in partial tag + + eor x7, x7, x14 //AES final-1 block - round 12 high + + pmull2 v20.1q, v4.2d, v14.2d //GHASH final-2 block - high + mov d22, v4.d[1] //GHASH final-2 block - mid + + pmull v21.1q, v4.1d, v14.1d //GHASH final-2 block - low + eor x6, x6, x13 //AES final-1 block - round 12 low + + fmov d5, x6 //AES final-1 block - mov low + + fmov v5.d[1], x7 //AES final-1 block - mov high + eor v9.16b, v9.16b, v20.16b //GHASH final-2 block - high + eor v22.8b, v22.8b, v4.8b //GHASH final-2 block - mid + + eor v11.16b, v11.16b, v21.16b //GHASH final-2 block - low + + pmull v22.1q, v22.1d, v17.1d //GHASH final-2 block - mid + + movi v8.8b, #0 //suppress further partial tag feed in + + eor v5.16b, v5.16b, v2.16b //AES final-1 block - result + + eor v10.16b, v10.16b, v22.16b //GHASH final-2 block - mid +.L192_enc_blocks_more_than_1: //blocks left > 1 + + st1 { v5.16b}, [x2], #16 //AES final-1 block - store result + + ldp x6, x7, [x0], #16 //AES final block - load input low & high +#ifdef __AARCH64EB__ + rev x6, x6 + rev x7, x7 +#endif + rev64 v4.16b, v5.16b //GHASH final-1 block + + eor x6, x6, x13 //AES final block - round 12 low + eor v4.16b, v4.16b, v8.16b //feed in partial tag + movi v8.8b, #0 //suppress further partial tag feed in + + mov d22, v4.d[1] //GHASH final-1 block - mid + + eor v22.8b, v22.8b, v4.8b //GHASH final-1 block - mid + eor x7, x7, x14 //AES final block - round 12 high + fmov d5, x6 //AES final block - mov low + + pmull2 v20.1q, v4.2d, v13.2d //GHASH final-1 block - high + fmov v5.d[1], x7 //AES final block - mov high + + ins v22.d[1], v22.d[0] //GHASH final-1 block - mid + + eor v9.16b, v9.16b, v20.16b //GHASH final-1 block - high + + pmull v21.1q, v4.1d, v13.1d //GHASH final-1 block - low + + pmull2 v22.1q, v22.2d, v16.2d //GHASH final-1 block - mid + + eor v5.16b, v5.16b, v3.16b //AES final block - result + + eor v11.16b, v11.16b, v21.16b //GHASH final-1 block - low + + eor v10.16b, v10.16b, v22.16b //GHASH final-1 block - mid +.L192_enc_blocks_less_than_1: //blocks left <= 1 + + ld1 { v18.16b}, [x2] //load existing bytes where the possibly partial last block is to be stored +#ifndef __AARCH64EB__ + rev w9, w12 +#else + mov w9, w12 +#endif + and x1, x1, #127 //bit_length %= 128 + + sub x1, x1, #128 //bit_length -= 128 + mvn x14, xzr //rk12_h = 0xffffffffffffffff + + neg x1, x1 //bit_length = 128 - #bits in input (in range [1,128]) + mvn x13, xzr //rk12_l = 0xffffffffffffffff + + and x1, x1, #127 //bit_length %= 128 + + lsr x14, x14, x1 //rk12_h is mask for top 64b of last block + cmp x1, #64 + + csel x6, x13, x14, lt + csel x7, x14, xzr, lt + + fmov d0, x6 //ctr0b is mask for last block + + fmov v0.d[1], x7 + + and v5.16b, v5.16b, v0.16b //possibly partial 
last block has zeroes in highest bits + + rev64 v4.16b, v5.16b //GHASH final block + + eor v4.16b, v4.16b, v8.16b //feed in partial tag + + mov d8, v4.d[1] //GHASH final block - mid + + pmull v21.1q, v4.1d, v12.1d //GHASH final block - low + + pmull2 v20.1q, v4.2d, v12.2d //GHASH final block - high + + eor v8.8b, v8.8b, v4.8b //GHASH final block - mid + + eor v11.16b, v11.16b, v21.16b //GHASH final block - low + + eor v9.16b, v9.16b, v20.16b //GHASH final block - high + + pmull v8.1q, v8.1d, v16.1d //GHASH final block - mid + + eor v10.16b, v10.16b, v8.16b //GHASH final block - mid + movi v8.8b, #0xc2 + + eor v30.16b, v11.16b, v9.16b //MODULO - karatsuba tidy up + + shl d8, d8, #56 //mod_constant + + bif v5.16b, v18.16b, v0.16b //insert existing bytes in top end of result before storing + + eor v10.16b, v10.16b, v30.16b //MODULO - karatsuba tidy up + + pmull v31.1q, v9.1d, v8.1d //MODULO - top 64b align with mid + + ext v9.16b, v9.16b, v9.16b, #8 //MODULO - other top alignment + + eor v10.16b, v10.16b, v31.16b //MODULO - fold into mid + + eor v10.16b, v10.16b, v9.16b //MODULO - fold into mid + + pmull v9.1q, v10.1d, v8.1d //MODULO - mid 64b align with low + + ext v10.16b, v10.16b, v10.16b, #8 //MODULO - other mid alignment + + eor v11.16b, v11.16b, v9.16b //MODULO - fold into low + str w9, [x16, #12] //store the updated counter + + st1 { v5.16b}, [x2] //store all 16B + + eor v11.16b, v11.16b, v10.16b //MODULO - fold into low + ext v11.16b, v11.16b, v11.16b, #8 + rev64 v11.16b, v11.16b + mov x0, x15 + st1 { v11.16b }, [x3] + + ldp x21, x22, [sp, #16] + ldp x23, x24, [sp, #32] + ldp d8, d9, [sp, #48] + ldp d10, d11, [sp, #64] + ldp d12, d13, [sp, #80] + ldp d14, d15, [sp, #96] + ldp x19, x20, [sp], #112 + ret + +.L192_enc_ret: + mov w0, #0x0 + ret +.size aes_gcm_enc_192_kernel,.-aes_gcm_enc_192_kernel +.globl aes_gcm_dec_192_kernel +.type aes_gcm_dec_192_kernel,%function +.align 4 +aes_gcm_dec_192_kernel: + AARCH64_VALID_CALL_TARGET + cbz x1, .L192_dec_ret + stp x19, x20, [sp, #-112]! 
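+
+ // As in the encrypt kernels, a 112-byte frame saves the callee-saved
+ // x19-x24 and d8-d15, a zero bit length (cbz x1) returns 0 immediately, and
+ // the normal path returns the processed byte count kept in x15.  Judging
+ // from the register usage, x0/x2 are the in/out pointers, x1 the length in
+ // bits, x3 the GHASH state plus hash-key table, x4 the counter block and
+ // x5 the expanded key schedule.
+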
+ mov x16, x4 + mov x8, x5 + stp x21, x22, [sp, #16] + stp x23, x24, [sp, #32] + stp d8, d9, [sp, #48] + stp d10, d11, [sp, #64] + stp d12, d13, [sp, #80] + stp d14, d15, [sp, #96] + + add x4, x0, x1, lsr #3 //end_input_ptr + ldp x10, x11, [x16] //ctr96_b64, ctr96_t32 +#ifdef __AARCH64EB__ + rev x10, x10 + rev x11, x11 +#endif + ldp x13, x14, [x8, #192] //load rk12 +#ifdef __AARCH64EB__ + ror x13, x13, #32 + ror x14, x14, #32 +#endif + ld1 { v0.16b}, [x16] //special case vector load initial counter so we can start first AES block as quickly as possible + + ld1 {v18.4s}, [x8], #16 //load rk0 + + lsr x5, x1, #3 //byte_len + mov x15, x5 + ld1 {v19.4s}, [x8], #16 //load rk1 + + lsr x12, x11, #32 + orr w11, w11, w11 + fmov d3, x10 //CTR block 3 + + rev w12, w12 //rev_ctr32 + fmov d1, x10 //CTR block 1 + + add w12, w12, #1 //increment rev_ctr32 + ld1 {v20.4s}, [x8], #16 //load rk2 + + aese v0.16b, v18.16b + aesmc v0.16b, v0.16b //AES block 0 - round 0 + rev w9, w12 //CTR block 1 + + add w12, w12, #1 //CTR block 1 + orr x9, x11, x9, lsl #32 //CTR block 1 + ld1 {v21.4s}, [x8], #16 //load rk3 + + fmov v1.d[1], x9 //CTR block 1 + rev w9, w12 //CTR block 2 + add w12, w12, #1 //CTR block 2 + + fmov d2, x10 //CTR block 2 + orr x9, x11, x9, lsl #32 //CTR block 2 + + fmov v2.d[1], x9 //CTR block 2 + rev w9, w12 //CTR block 3 + + aese v0.16b, v19.16b + aesmc v0.16b, v0.16b //AES block 0 - round 1 + orr x9, x11, x9, lsl #32 //CTR block 3 + + fmov v3.d[1], x9 //CTR block 3 + + ld1 {v22.4s}, [x8], #16 //load rk4 + + aese v0.16b, v20.16b + aesmc v0.16b, v0.16b //AES block 0 - round 2 + + aese v2.16b, v18.16b + aesmc v2.16b, v2.16b //AES block 2 - round 0 + ld1 {v23.4s}, [x8], #16 //load rk5 + + aese v1.16b, v18.16b + aesmc v1.16b, v1.16b //AES block 1 - round 0 + ldr q15, [x3, #112] //load h4l | h4h +#ifndef __AARCH64EB__ + ext v15.16b, v15.16b, v15.16b, #8 +#endif + aese v3.16b, v18.16b + aesmc v3.16b, v3.16b //AES block 3 - round 0 + ldr q13, [x3, #64] //load h2l | h2h +#ifndef __AARCH64EB__ + ext v13.16b, v13.16b, v13.16b, #8 +#endif + aese v2.16b, v19.16b + aesmc v2.16b, v2.16b //AES block 2 - round 1 + ldr q14, [x3, #80] //load h3l | h3h +#ifndef __AARCH64EB__ + ext v14.16b, v14.16b, v14.16b, #8 +#endif + aese v1.16b, v19.16b + aesmc v1.16b, v1.16b //AES block 1 - round 1 + + aese v3.16b, v19.16b + aesmc v3.16b, v3.16b //AES block 3 - round 1 + ldr q12, [x3, #32] //load h1l | h1h +#ifndef __AARCH64EB__ + ext v12.16b, v12.16b, v12.16b, #8 +#endif + aese v2.16b, v20.16b + aesmc v2.16b, v2.16b //AES block 2 - round 2 + ld1 {v24.4s}, [x8], #16 //load rk6 + + aese v0.16b, v21.16b + aesmc v0.16b, v0.16b //AES block 0 - round 3 + ld1 {v25.4s}, [x8], #16 //load rk7 + + aese v1.16b, v20.16b + aesmc v1.16b, v1.16b //AES block 1 - round 2 + ld1 {v26.4s}, [x8], #16 //load rk8 + + aese v3.16b, v20.16b + aesmc v3.16b, v3.16b //AES block 3 - round 2 + ld1 {v27.4s}, [x8], #16 //load rk9 + + aese v2.16b, v21.16b + aesmc v2.16b, v2.16b //AES block 2 - round 3 + ld1 { v11.16b}, [x3] + ext v11.16b, v11.16b, v11.16b, #8 + rev64 v11.16b, v11.16b + + aese v1.16b, v21.16b + aesmc v1.16b, v1.16b //AES block 1 - round 3 + add w12, w12, #1 //CTR block 3 + + aese v3.16b, v21.16b + aesmc v3.16b, v3.16b //AES block 3 - round 3 + trn1 v9.2d, v14.2d, v15.2d //h4h | h3h + + aese v0.16b, v22.16b + aesmc v0.16b, v0.16b //AES block 0 - round 4 + ld1 {v28.4s}, [x8], #16 //load rk10 + + aese v1.16b, v22.16b + aesmc v1.16b, v1.16b //AES block 1 - round 4 + trn2 v17.2d, v14.2d, v15.2d //h4l | h3l + + aese v2.16b, v22.16b + aesmc v2.16b, v2.16b 
//AES block 2 - round 4 + + aese v3.16b, v22.16b + aesmc v3.16b, v3.16b //AES block 3 - round 4 + trn2 v16.2d, v12.2d, v13.2d //h2l | h1l + + aese v0.16b, v23.16b + aesmc v0.16b, v0.16b //AES block 0 - round 5 + ld1 {v29.4s}, [x8], #16 //load rk11 + + aese v1.16b, v23.16b + aesmc v1.16b, v1.16b //AES block 1 - round 5 + + aese v2.16b, v23.16b + aesmc v2.16b, v2.16b //AES block 2 - round 5 + + aese v3.16b, v23.16b + aesmc v3.16b, v3.16b //AES block 3 - round 5 + + aese v0.16b, v24.16b + aesmc v0.16b, v0.16b //AES block 0 - round 6 + + aese v2.16b, v24.16b + aesmc v2.16b, v2.16b //AES block 2 - round 6 + + aese v3.16b, v24.16b + aesmc v3.16b, v3.16b //AES block 3 - round 6 + + aese v0.16b, v25.16b + aesmc v0.16b, v0.16b //AES block 0 - round 7 + + aese v2.16b, v25.16b + aesmc v2.16b, v2.16b //AES block 2 - round 7 + + aese v3.16b, v25.16b + aesmc v3.16b, v3.16b //AES block 3 - round 7 + + aese v1.16b, v24.16b + aesmc v1.16b, v1.16b //AES block 1 - round 6 + + aese v2.16b, v26.16b + aesmc v2.16b, v2.16b //AES block 2 - round 8 + + aese v3.16b, v26.16b + aesmc v3.16b, v3.16b //AES block 3 - round 8 + + aese v1.16b, v25.16b + aesmc v1.16b, v1.16b //AES block 1 - round 7 + + aese v2.16b, v27.16b + aesmc v2.16b, v2.16b //AES block 2 - round 9 + + aese v3.16b, v27.16b + aesmc v3.16b, v3.16b //AES block 3 - round 9 + + aese v1.16b, v26.16b + aesmc v1.16b, v1.16b //AES block 1 - round 8 + sub x5, x5, #1 //byte_len - 1 + + aese v0.16b, v26.16b + aesmc v0.16b, v0.16b //AES block 0 - round 8 + and x5, x5, #0xffffffffffffffc0 //number of bytes to be processed in main loop (at least 1 byte must be handled by tail) + + aese v3.16b, v28.16b + aesmc v3.16b, v3.16b //AES block 3 - round 10 + add x5, x5, x0 + + aese v1.16b, v27.16b + aesmc v1.16b, v1.16b //AES block 1 - round 9 + cmp x0, x5 //check if we have <= 4 blocks + + aese v0.16b, v27.16b + aesmc v0.16b, v0.16b //AES block 0 - round 9 + trn1 v8.2d, v12.2d, v13.2d //h2h | h1h + + aese v3.16b, v29.16b //AES block 3 - round 11 + + aese v2.16b, v28.16b + aesmc v2.16b, v2.16b //AES block 2 - round 10 + + aese v1.16b, v28.16b + aesmc v1.16b, v1.16b //AES block 1 - round 10 + + aese v0.16b, v28.16b + aesmc v0.16b, v0.16b //AES block 0 - round 10 + eor v16.16b, v16.16b, v8.16b //h2k | h1k + + aese v2.16b, v29.16b //AES block 2 - round 11 + + aese v1.16b, v29.16b //AES block 1 - round 11 + eor v17.16b, v17.16b, v9.16b //h4k | h3k + + aese v0.16b, v29.16b //AES block 0 - round 11 + b.ge .L192_dec_tail //handle tail + + ld1 {v4.16b, v5.16b}, [x0], #32 //AES block 0,1 - load ciphertext + + eor v1.16b, v5.16b, v1.16b //AES block 1 - result + + eor v0.16b, v4.16b, v0.16b //AES block 0 - result + rev w9, w12 //CTR block 4 + ld1 {v6.16b, v7.16b}, [x0], #32 //AES block 2,3 - load ciphertext + + mov x19, v1.d[0] //AES block 1 - mov low + + mov x20, v1.d[1] //AES block 1 - mov high + + mov x6, v0.d[0] //AES block 0 - mov low + orr x9, x11, x9, lsl #32 //CTR block 4 + add w12, w12, #1 //CTR block 4 + + mov x7, v0.d[1] //AES block 0 - mov high + rev64 v4.16b, v4.16b //GHASH block 0 + + fmov d0, x10 //CTR block 4 + rev64 v5.16b, v5.16b //GHASH block 1 + cmp x0, x5 //check if we have <= 8 blocks + + eor x19, x19, x13 //AES block 1 - round 12 low +#ifdef __AARCH64EB__ + rev x19, x19 +#endif + fmov v0.d[1], x9 //CTR block 4 + rev w9, w12 //CTR block 5 + + orr x9, x11, x9, lsl #32 //CTR block 5 + fmov d1, x10 //CTR block 5 + eor x20, x20, x14 //AES block 1 - round 12 high +#ifdef __AARCH64EB__ + rev x20, x20 +#endif + add w12, w12, #1 //CTR block 5 + fmov v1.d[1], x9 //CTR 
block 5 + eor x6, x6, x13 //AES block 0 - round 12 low +#ifdef __AARCH64EB__ + rev x6, x6 +#endif + rev w9, w12 //CTR block 6 + eor x7, x7, x14 //AES block 0 - round 12 high +#ifdef __AARCH64EB__ + rev x7, x7 +#endif + stp x6, x7, [x2], #16 //AES block 0 - store result + orr x9, x11, x9, lsl #32 //CTR block 6 + + stp x19, x20, [x2], #16 //AES block 1 - store result + + add w12, w12, #1 //CTR block 6 + eor v2.16b, v6.16b, v2.16b //AES block 2 - result + b.ge .L192_dec_prepretail //do prepretail + +.L192_dec_main_loop: //main loop start + aese v1.16b, v18.16b + aesmc v1.16b, v1.16b //AES block 4k+5 - round 0 + ext v11.16b, v11.16b, v11.16b, #8 //PRE 0 + + pmull v31.1q, v5.1d, v14.1d //GHASH block 4k+1 - low + mov x21, v2.d[0] //AES block 4k+2 - mov low + + mov x22, v2.d[1] //AES block 4k+2 - mov high + eor v3.16b, v7.16b, v3.16b //AES block 4k+3 - result + rev64 v7.16b, v7.16b //GHASH block 4k+3 + + aese v1.16b, v19.16b + aesmc v1.16b, v1.16b //AES block 4k+5 - round 1 + fmov d2, x10 //CTR block 4k+6 + + aese v0.16b, v18.16b + aesmc v0.16b, v0.16b //AES block 4k+4 - round 0 + eor v4.16b, v4.16b, v11.16b //PRE 1 + + pmull2 v30.1q, v5.2d, v14.2d //GHASH block 4k+1 - high + fmov v2.d[1], x9 //CTR block 4k+6 + + aese v1.16b, v20.16b + aesmc v1.16b, v1.16b //AES block 4k+5 - round 2 + mov x24, v3.d[1] //AES block 4k+3 - mov high + + aese v0.16b, v19.16b + aesmc v0.16b, v0.16b //AES block 4k+4 - round 1 + mov x23, v3.d[0] //AES block 4k+3 - mov low + + pmull2 v9.1q, v4.2d, v15.2d //GHASH block 4k - high + fmov d3, x10 //CTR block 4k+7 + mov d8, v4.d[1] //GHASH block 4k - mid + + pmull v11.1q, v4.1d, v15.1d //GHASH block 4k - low + mov d10, v17.d[1] //GHASH block 4k - mid + rev w9, w12 //CTR block 4k+7 + + aese v2.16b, v18.16b + aesmc v2.16b, v2.16b //AES block 4k+6 - round 0 + orr x9, x11, x9, lsl #32 //CTR block 4k+7 + + fmov v3.d[1], x9 //CTR block 4k+7 + eor v8.8b, v8.8b, v4.8b //GHASH block 4k - mid + mov d4, v5.d[1] //GHASH block 4k+1 - mid + + aese v1.16b, v21.16b + aesmc v1.16b, v1.16b //AES block 4k+5 - round 3 + + aese v0.16b, v20.16b + aesmc v0.16b, v0.16b //AES block 4k+4 - round 2 + eor x22, x22, x14 //AES block 4k+2 - round 12 high +#ifdef __AARCH64EB__ + rev x22, x22 +#endif + aese v2.16b, v19.16b + aesmc v2.16b, v2.16b //AES block 4k+6 - round 1 + eor v4.8b, v4.8b, v5.8b //GHASH block 4k+1 - mid + + pmull v10.1q, v8.1d, v10.1d //GHASH block 4k - mid + + aese v3.16b, v18.16b + aesmc v3.16b, v3.16b //AES block 4k+7 - round 0 + rev64 v6.16b, v6.16b //GHASH block 4k+2 + + aese v2.16b, v20.16b + aesmc v2.16b, v2.16b //AES block 4k+6 - round 2 + + pmull v4.1q, v4.1d, v17.1d //GHASH block 4k+1 - mid + eor v11.16b, v11.16b, v31.16b //GHASH block 4k+1 - low + eor x21, x21, x13 //AES block 4k+2 - round 12 low +#ifdef __AARCH64EB__ + rev x21, x21 +#endif + aese v1.16b, v22.16b + aesmc v1.16b, v1.16b //AES block 4k+5 - round 4 + + aese v0.16b, v21.16b + aesmc v0.16b, v0.16b //AES block 4k+4 - round 3 + + eor v10.16b, v10.16b, v4.16b //GHASH block 4k+1 - mid + mov d31, v6.d[1] //GHASH block 4k+2 - mid + + aese v3.16b, v19.16b + aesmc v3.16b, v3.16b //AES block 4k+7 - round 1 + eor v9.16b, v9.16b, v30.16b //GHASH block 4k+1 - high + + aese v0.16b, v22.16b + aesmc v0.16b, v0.16b //AES block 4k+4 - round 4 + + pmull2 v30.1q, v6.2d, v13.2d //GHASH block 4k+2 - high + eor v31.8b, v31.8b, v6.8b //GHASH block 4k+2 - mid + + pmull v8.1q, v6.1d, v13.1d //GHASH block 4k+2 - low + + aese v0.16b, v23.16b + aesmc v0.16b, v0.16b //AES block 4k+4 - round 5 + + eor v9.16b, v9.16b, v30.16b //GHASH block 4k+2 - 
high + mov d30, v7.d[1] //GHASH block 4k+3 - mid + + aese v1.16b, v23.16b + aesmc v1.16b, v1.16b //AES block 4k+5 - round 5 + + pmull2 v5.1q, v7.2d, v12.2d //GHASH block 4k+3 - high + + aese v3.16b, v20.16b + aesmc v3.16b, v3.16b //AES block 4k+7 - round 2 + eor v30.8b, v30.8b, v7.8b //GHASH block 4k+3 - mid + + aese v1.16b, v24.16b + aesmc v1.16b, v1.16b //AES block 4k+5 - round 6 + + aese v0.16b, v24.16b + aesmc v0.16b, v0.16b //AES block 4k+4 - round 6 + ins v31.d[1], v31.d[0] //GHASH block 4k+2 - mid + + aese v3.16b, v21.16b + aesmc v3.16b, v3.16b //AES block 4k+7 - round 3 + + pmull v30.1q, v30.1d, v16.1d //GHASH block 4k+3 - mid + eor v11.16b, v11.16b, v8.16b //GHASH block 4k+2 - low + + aese v0.16b, v25.16b + aesmc v0.16b, v0.16b //AES block 4k+4 - round 7 + + pmull2 v31.1q, v31.2d, v16.2d //GHASH block 4k+2 - mid + eor v9.16b, v9.16b, v5.16b //GHASH block 4k+3 - high + + aese v1.16b, v25.16b + aesmc v1.16b, v1.16b //AES block 4k+5 - round 7 + + aese v0.16b, v26.16b + aesmc v0.16b, v0.16b //AES block 4k+4 - round 8 + movi v8.8b, #0xc2 + + pmull v6.1q, v7.1d, v12.1d //GHASH block 4k+3 - low + + aese v1.16b, v26.16b + aesmc v1.16b, v1.16b //AES block 4k+5 - round 8 + eor v10.16b, v10.16b, v31.16b //GHASH block 4k+2 - mid + + aese v2.16b, v21.16b + aesmc v2.16b, v2.16b //AES block 4k+6 - round 3 + + aese v0.16b, v27.16b + aesmc v0.16b, v0.16b //AES block 4k+4 - round 9 + eor v11.16b, v11.16b, v6.16b //GHASH block 4k+3 - low + + aese v3.16b, v22.16b + aesmc v3.16b, v3.16b //AES block 4k+7 - round 4 + + aese v2.16b, v22.16b + aesmc v2.16b, v2.16b //AES block 4k+6 - round 4 + eor v10.16b, v10.16b, v30.16b //GHASH block 4k+3 - mid + + aese v0.16b, v28.16b + aesmc v0.16b, v0.16b //AES block 4k+4 - round 10 + + aese v1.16b, v27.16b + aesmc v1.16b, v1.16b //AES block 4k+5 - round 9 + eor v30.16b, v11.16b, v9.16b //MODULO - karatsuba tidy up + + aese v2.16b, v23.16b + aesmc v2.16b, v2.16b //AES block 4k+6 - round 5 + + aese v3.16b, v23.16b + aesmc v3.16b, v3.16b //AES block 4k+7 - round 5 + shl d8, d8, #56 //mod_constant + + aese v1.16b, v28.16b + aesmc v1.16b, v1.16b //AES block 4k+5 - round 10 + + aese v2.16b, v24.16b + aesmc v2.16b, v2.16b //AES block 4k+6 - round 6 + ld1 {v4.16b}, [x0], #16 //AES block 4k+4 - load ciphertext + + aese v3.16b, v24.16b + aesmc v3.16b, v3.16b //AES block 4k+7 - round 6 + eor v10.16b, v10.16b, v30.16b //MODULO - karatsuba tidy up + + pmull v31.1q, v9.1d, v8.1d //MODULO - top 64b align with mid + ld1 {v5.16b}, [x0], #16 //AES block 4k+5 - load ciphertext + eor x23, x23, x13 //AES block 4k+3 - round 12 low +#ifdef __AARCH64EB__ + rev x23, x23 +#endif + aese v2.16b, v25.16b + aesmc v2.16b, v2.16b //AES block 4k+6 - round 7 + ext v9.16b, v9.16b, v9.16b, #8 //MODULO - other top alignment + + aese v0.16b, v29.16b //AES block 4k+4 - round 11 + add w12, w12, #1 //CTR block 4k+7 + + aese v3.16b, v25.16b + aesmc v3.16b, v3.16b //AES block 4k+7 - round 7 + eor v10.16b, v10.16b, v31.16b //MODULO - fold into mid + + aese v2.16b, v26.16b + aesmc v2.16b, v2.16b //AES block 4k+6 - round 8 + ld1 {v6.16b}, [x0], #16 //AES block 4k+6 - load ciphertext + + aese v1.16b, v29.16b //AES block 4k+5 - round 11 + ld1 {v7.16b}, [x0], #16 //AES block 4k+7 - load ciphertext + rev w9, w12 //CTR block 4k+8 + + aese v3.16b, v26.16b + aesmc v3.16b, v3.16b //AES block 4k+7 - round 8 + stp x21, x22, [x2], #16 //AES block 4k+2 - store result + + aese v2.16b, v27.16b + aesmc v2.16b, v2.16b //AES block 4k+6 - round 9 + eor v10.16b, v10.16b, v9.16b //MODULO - fold into mid + + cmp x0, x5 //.LOOP 
CONTROL + + eor v0.16b, v4.16b, v0.16b //AES block 4k+4 - result + eor x24, x24, x14 //AES block 4k+3 - round 12 high +#ifdef __AARCH64EB__ + rev x24, x24 +#endif + eor v1.16b, v5.16b, v1.16b //AES block 4k+5 - result + + aese v2.16b, v28.16b + aesmc v2.16b, v2.16b //AES block 4k+6 - round 10 + orr x9, x11, x9, lsl #32 //CTR block 4k+8 + + aese v3.16b, v27.16b + aesmc v3.16b, v3.16b //AES block 4k+7 - round 9 + + pmull v8.1q, v10.1d, v8.1d //MODULO - mid 64b align with low + mov x19, v1.d[0] //AES block 4k+5 - mov low + + mov x6, v0.d[0] //AES block 4k+4 - mov low + stp x23, x24, [x2], #16 //AES block 4k+3 - store result + rev64 v5.16b, v5.16b //GHASH block 4k+5 + + aese v2.16b, v29.16b //AES block 4k+6 - round 11 + mov x7, v0.d[1] //AES block 4k+4 - mov high + + aese v3.16b, v28.16b + aesmc v3.16b, v3.16b //AES block 4k+7 - round 10 + mov x20, v1.d[1] //AES block 4k+5 - mov high + + fmov d0, x10 //CTR block 4k+8 + add w12, w12, #1 //CTR block 4k+8 + ext v10.16b, v10.16b, v10.16b, #8 //MODULO - other mid alignment + + eor v2.16b, v6.16b, v2.16b //AES block 4k+6 - result + fmov v0.d[1], x9 //CTR block 4k+8 + rev w9, w12 //CTR block 4k+9 + + eor x6, x6, x13 //AES block 4k+4 - round 12 low +#ifdef __AARCH64EB__ + rev x6, x6 +#endif + orr x9, x11, x9, lsl #32 //CTR block 4k+9 + eor v11.16b, v11.16b, v8.16b //MODULO - fold into low + + fmov d1, x10 //CTR block 4k+9 + add w12, w12, #1 //CTR block 4k+9 + eor x19, x19, x13 //AES block 4k+5 - round 12 low +#ifdef __AARCH64EB__ + rev x19, x19 +#endif + fmov v1.d[1], x9 //CTR block 4k+9 + rev w9, w12 //CTR block 4k+10 + eor x20, x20, x14 //AES block 4k+5 - round 12 high +#ifdef __AARCH64EB__ + rev x20, x20 +#endif + eor x7, x7, x14 //AES block 4k+4 - round 12 high +#ifdef __AARCH64EB__ + rev x7, x7 +#endif + stp x6, x7, [x2], #16 //AES block 4k+4 - store result + eor v11.16b, v11.16b, v10.16b //MODULO - fold into low + + add w12, w12, #1 //CTR block 4k+10 + rev64 v4.16b, v4.16b //GHASH block 4k+4 + orr x9, x11, x9, lsl #32 //CTR block 4k+10 + + aese v3.16b, v29.16b //AES block 4k+7 - round 11 + stp x19, x20, [x2], #16 //AES block 4k+5 - store result + b.lt .L192_dec_main_loop + +.L192_dec_prepretail: //PREPRETAIL + mov x22, v2.d[1] //AES block 4k+2 - mov high + ext v11.16b, v11.16b, v11.16b, #8 //PRE 0 + eor v3.16b, v7.16b, v3.16b //AES block 4k+3 - result + + aese v1.16b, v18.16b + aesmc v1.16b, v1.16b //AES block 4k+5 - round 0 + mov x21, v2.d[0] //AES block 4k+2 - mov low + + aese v0.16b, v18.16b + aesmc v0.16b, v0.16b //AES block 4k+4 - round 0 + mov d10, v17.d[1] //GHASH block 4k - mid + + eor v4.16b, v4.16b, v11.16b //PRE 1 + fmov d2, x10 //CTR block 4k+6 + + aese v1.16b, v19.16b + aesmc v1.16b, v1.16b //AES block 4k+5 - round 1 + mov x23, v3.d[0] //AES block 4k+3 - mov low + + aese v0.16b, v19.16b + aesmc v0.16b, v0.16b //AES block 4k+4 - round 1 + mov x24, v3.d[1] //AES block 4k+3 - mov high + + pmull v11.1q, v4.1d, v15.1d //GHASH block 4k - low + mov d8, v4.d[1] //GHASH block 4k - mid + fmov d3, x10 //CTR block 4k+7 + + aese v1.16b, v20.16b + aesmc v1.16b, v1.16b //AES block 4k+5 - round 2 + rev64 v6.16b, v6.16b //GHASH block 4k+2 + + pmull2 v9.1q, v4.2d, v15.2d //GHASH block 4k - high + fmov v2.d[1], x9 //CTR block 4k+6 + rev w9, w12 //CTR block 4k+7 + + orr x9, x11, x9, lsl #32 //CTR block 4k+7 + eor v8.8b, v8.8b, v4.8b //GHASH block 4k - mid + mov d4, v5.d[1] //GHASH block 4k+1 - mid + + pmull v31.1q, v5.1d, v14.1d //GHASH block 4k+1 - low + eor x24, x24, x14 //AES block 4k+3 - round 12 high +#ifdef __AARCH64EB__ + rev x24, x24 +#endif + 
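+
+ // In the decrypt kernels GHASH runs over the ciphertext exactly as loaded:
+ // the rev64'd input vectors feed the pmulls while the next counter blocks
+ // are still being encrypted.  The final AddRoundKey (rk12 in x13/x14) is
+ // applied to the scalar halves of each decrypted block just before the stp,
+ // which is why the __AARCH64EB__ byte swaps here sit beside the stores
+ // rather than beside the loads.
+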
fmov v3.d[1], x9 //CTR block 4k+7 + + aese v0.16b, v20.16b + aesmc v0.16b, v0.16b //AES block 4k+4 - round 2 + eor x21, x21, x13 //AES block 4k+2 - round 12 low +#ifdef __AARCH64EB__ + rev x21, x21 +#endif + pmull2 v30.1q, v5.2d, v14.2d //GHASH block 4k+1 - high + eor x22, x22, x14 //AES block 4k+2 - round 12 high +#ifdef __AARCH64EB__ + rev x22, x22 +#endif + eor v4.8b, v4.8b, v5.8b //GHASH block 4k+1 - mid + + pmull v10.1q, v8.1d, v10.1d //GHASH block 4k - mid + eor x23, x23, x13 //AES block 4k+3 - round 12 low +#ifdef __AARCH64EB__ + rev x23, x23 +#endif + stp x21, x22, [x2], #16 //AES block 4k+2 - store result + + rev64 v7.16b, v7.16b //GHASH block 4k+3 + stp x23, x24, [x2], #16 //AES block 4k+3 - store result + + aese v3.16b, v18.16b + aesmc v3.16b, v3.16b //AES block 4k+7 - round 0 + eor v9.16b, v9.16b, v30.16b //GHASH block 4k+1 - high + + pmull v4.1q, v4.1d, v17.1d //GHASH block 4k+1 - mid + add w12, w12, #1 //CTR block 4k+7 + + pmull2 v30.1q, v6.2d, v13.2d //GHASH block 4k+2 - high + eor v11.16b, v11.16b, v31.16b //GHASH block 4k+1 - low + + aese v2.16b, v18.16b + aesmc v2.16b, v2.16b //AES block 4k+6 - round 0 + + eor v10.16b, v10.16b, v4.16b //GHASH block 4k+1 - mid + mov d31, v6.d[1] //GHASH block 4k+2 - mid + + aese v3.16b, v19.16b + aesmc v3.16b, v3.16b //AES block 4k+7 - round 1 + + aese v2.16b, v19.16b + aesmc v2.16b, v2.16b //AES block 4k+6 - round 1 + eor v9.16b, v9.16b, v30.16b //GHASH block 4k+2 - high + + eor v31.8b, v31.8b, v6.8b //GHASH block 4k+2 - mid + + pmull v8.1q, v6.1d, v13.1d //GHASH block 4k+2 - low + + aese v2.16b, v20.16b + aesmc v2.16b, v2.16b //AES block 4k+6 - round 2 + mov d30, v7.d[1] //GHASH block 4k+3 - mid + + aese v3.16b, v20.16b + aesmc v3.16b, v3.16b //AES block 4k+7 - round 2 + ins v31.d[1], v31.d[0] //GHASH block 4k+2 - mid + + pmull v6.1q, v7.1d, v12.1d //GHASH block 4k+3 - low + + aese v0.16b, v21.16b + aesmc v0.16b, v0.16b //AES block 4k+4 - round 3 + eor v30.8b, v30.8b, v7.8b //GHASH block 4k+3 - mid + + aese v1.16b, v21.16b + aesmc v1.16b, v1.16b //AES block 4k+5 - round 3 + + pmull2 v31.1q, v31.2d, v16.2d //GHASH block 4k+2 - mid + eor v11.16b, v11.16b, v8.16b //GHASH block 4k+2 - low + + aese v0.16b, v22.16b + aesmc v0.16b, v0.16b //AES block 4k+4 - round 4 + + pmull2 v5.1q, v7.2d, v12.2d //GHASH block 4k+3 - high + movi v8.8b, #0xc2 + + pmull v30.1q, v30.1d, v16.1d //GHASH block 4k+3 - mid + + aese v2.16b, v21.16b + aesmc v2.16b, v2.16b //AES block 4k+6 - round 3 + + shl d8, d8, #56 //mod_constant + eor v9.16b, v9.16b, v5.16b //GHASH block 4k+3 - high + + aese v0.16b, v23.16b + aesmc v0.16b, v0.16b //AES block 4k+4 - round 5 + eor v10.16b, v10.16b, v31.16b //GHASH block 4k+2 - mid + + aese v2.16b, v22.16b + aesmc v2.16b, v2.16b //AES block 4k+6 - round 4 + + pmull v31.1q, v9.1d, v8.1d //MODULO - top 64b align with mid + eor v11.16b, v11.16b, v6.16b //GHASH block 4k+3 - low + + aese v0.16b, v24.16b + aesmc v0.16b, v0.16b //AES block 4k+4 - round 6 + + aese v3.16b, v21.16b + aesmc v3.16b, v3.16b //AES block 4k+7 - round 3 + eor v10.16b, v10.16b, v30.16b //GHASH block 4k+3 - mid + + aese v2.16b, v23.16b + aesmc v2.16b, v2.16b //AES block 4k+6 - round 5 + + aese v0.16b, v25.16b + aesmc v0.16b, v0.16b //AES block 4k+4 - round 7 + eor v30.16b, v11.16b, v9.16b //MODULO - karatsuba tidy up + + aese v3.16b, v22.16b + aesmc v3.16b, v3.16b //AES block 4k+7 - round 4 + + aese v2.16b, v24.16b + aesmc v2.16b, v2.16b //AES block 4k+6 - round 6 + ext v9.16b, v9.16b, v9.16b, #8 //MODULO - other top alignment + + aese v0.16b, v26.16b + aesmc v0.16b, 
v0.16b //AES block 4k+4 - round 8 + + aese v3.16b, v23.16b + aesmc v3.16b, v3.16b //AES block 4k+7 - round 5 + eor v10.16b, v10.16b, v30.16b //MODULO - karatsuba tidy up + + aese v1.16b, v22.16b + aesmc v1.16b, v1.16b //AES block 4k+5 - round 4 + + aese v2.16b, v25.16b + aesmc v2.16b, v2.16b //AES block 4k+6 - round 7 + + aese v0.16b, v27.16b + aesmc v0.16b, v0.16b //AES block 4k+4 - round 9 + + aese v1.16b, v23.16b + aesmc v1.16b, v1.16b //AES block 4k+5 - round 5 + + aese v3.16b, v24.16b + aesmc v3.16b, v3.16b //AES block 4k+7 - round 6 + eor v10.16b, v10.16b, v31.16b //MODULO - fold into mid + + aese v0.16b, v28.16b + aesmc v0.16b, v0.16b //AES block 4k+4 - round 10 + + aese v1.16b, v24.16b + aesmc v1.16b, v1.16b //AES block 4k+5 - round 6 + + aese v3.16b, v25.16b + aesmc v3.16b, v3.16b //AES block 4k+7 - round 7 + + aese v2.16b, v26.16b + aesmc v2.16b, v2.16b //AES block 4k+6 - round 8 + eor v10.16b, v10.16b, v9.16b //MODULO - fold into mid + + aese v1.16b, v25.16b + aesmc v1.16b, v1.16b //AES block 4k+5 - round 7 + + aese v3.16b, v26.16b + aesmc v3.16b, v3.16b //AES block 4k+7 - round 8 + + aese v2.16b, v27.16b + aesmc v2.16b, v2.16b //AES block 4k+6 - round 9 + + aese v1.16b, v26.16b + aesmc v1.16b, v1.16b //AES block 4k+5 - round 8 + + aese v3.16b, v27.16b + aesmc v3.16b, v3.16b //AES block 4k+7 - round 9 + + pmull v8.1q, v10.1d, v8.1d //MODULO - mid 64b align with low + + aese v1.16b, v27.16b + aesmc v1.16b, v1.16b //AES block 4k+5 - round 9 + + aese v2.16b, v28.16b + aesmc v2.16b, v2.16b //AES block 4k+6 - round 10 + + aese v3.16b, v28.16b + aesmc v3.16b, v3.16b //AES block 4k+7 - round 10 + ext v10.16b, v10.16b, v10.16b, #8 //MODULO - other mid alignment + + aese v1.16b, v28.16b + aesmc v1.16b, v1.16b //AES block 4k+5 - round 10 + + aese v0.16b, v29.16b + eor v11.16b, v11.16b, v8.16b //MODULO - fold into low + + aese v2.16b, v29.16b + + aese v1.16b, v29.16b + + aese v3.16b, v29.16b + + eor v11.16b, v11.16b, v10.16b //MODULO - fold into low +.L192_dec_tail: //TAIL + + sub x5, x4, x0 //main_end_input_ptr is number of bytes left to process + ld1 { v5.16b}, [x0], #16 //AES block 4k+4 - load ciphertext + + eor v0.16b, v5.16b, v0.16b //AES block 4k+4 - result + + mov x7, v0.d[1] //AES block 4k+4 - mov high + + mov x6, v0.d[0] //AES block 4k+4 - mov low + + ext v8.16b, v11.16b, v11.16b, #8 //prepare final partial tag + + cmp x5, #48 + + eor x7, x7, x14 //AES block 4k+4 - round 12 high +#ifdef __AARCH64EB__ + rev x7, x7 +#endif + eor x6, x6, x13 //AES block 4k+4 - round 12 low +#ifdef __AARCH64EB__ + rev x6, x6 +#endif + b.gt .L192_dec_blocks_more_than_3 + + movi v11.8b, #0 + movi v9.8b, #0 + + mov v3.16b, v2.16b + mov v2.16b, v1.16b + sub w12, w12, #1 + + movi v10.8b, #0 + cmp x5, #32 + b.gt .L192_dec_blocks_more_than_2 + + mov v3.16b, v1.16b + cmp x5, #16 + sub w12, w12, #1 + + b.gt .L192_dec_blocks_more_than_1 + + sub w12, w12, #1 + b .L192_dec_blocks_less_than_1 +.L192_dec_blocks_more_than_3: //blocks left > 3 + rev64 v4.16b, v5.16b //GHASH final-3 block + ld1 { v5.16b}, [x0], #16 //AES final-2 block - load ciphertext + + stp x6, x7, [x2], #16 //AES final-3 block - store result + + eor v4.16b, v4.16b, v8.16b //feed in partial tag + + eor v0.16b, v5.16b, v1.16b //AES final-2 block - result + + pmull v11.1q, v4.1d, v15.1d //GHASH final-3 block - low + mov x6, v0.d[0] //AES final-2 block - mov low + mov d22, v4.d[1] //GHASH final-3 block - mid + + mov x7, v0.d[1] //AES final-2 block - mov high + + mov d10, v17.d[1] //GHASH final-3 block - mid + eor v22.8b, v22.8b, v4.8b //GHASH final-3 
block - mid + + pmull2 v9.1q, v4.2d, v15.2d //GHASH final-3 block - high + + eor x6, x6, x13 //AES final-2 block - round 12 low +#ifdef __AARCH64EB__ + rev x6, x6 +#endif + movi v8.8b, #0 //suppress further partial tag feed in + + pmull v10.1q, v22.1d, v10.1d //GHASH final-3 block - mid + eor x7, x7, x14 //AES final-2 block - round 12 high +#ifdef __AARCH64EB__ + rev x7, x7 +#endif +.L192_dec_blocks_more_than_2: //blocks left > 2 + + rev64 v4.16b, v5.16b //GHASH final-2 block + ld1 { v5.16b}, [x0], #16 //AES final-1 block - load ciphertext + + eor v4.16b, v4.16b, v8.16b //feed in partial tag + + movi v8.8b, #0 //suppress further partial tag feed in + + eor v0.16b, v5.16b, v2.16b //AES final-1 block - result + + mov d22, v4.d[1] //GHASH final-2 block - mid + + pmull v21.1q, v4.1d, v14.1d //GHASH final-2 block - low + + stp x6, x7, [x2], #16 //AES final-2 block - store result + + eor v22.8b, v22.8b, v4.8b //GHASH final-2 block - mid + mov x7, v0.d[1] //AES final-1 block - mov high + + eor v11.16b, v11.16b, v21.16b //GHASH final-2 block - low + mov x6, v0.d[0] //AES final-1 block - mov low + + pmull2 v20.1q, v4.2d, v14.2d //GHASH final-2 block - high + + pmull v22.1q, v22.1d, v17.1d //GHASH final-2 block - mid + + eor v9.16b, v9.16b, v20.16b //GHASH final-2 block - high + eor x7, x7, x14 //AES final-1 block - round 12 high +#ifdef __AARCH64EB__ + rev x7, x7 +#endif + eor x6, x6, x13 //AES final-1 block - round 12 low +#ifdef __AARCH64EB__ + rev x6, x6 +#endif + eor v10.16b, v10.16b, v22.16b //GHASH final-2 block - mid +.L192_dec_blocks_more_than_1: //blocks left > 1 + + rev64 v4.16b, v5.16b //GHASH final-1 block + + eor v4.16b, v4.16b, v8.16b //feed in partial tag + ld1 { v5.16b}, [x0], #16 //AES final block - load ciphertext + + mov d22, v4.d[1] //GHASH final-1 block - mid + + pmull2 v20.1q, v4.2d, v13.2d //GHASH final-1 block - high + + eor v0.16b, v5.16b, v3.16b //AES final block - result + stp x6, x7, [x2], #16 //AES final-1 block - store result + + eor v22.8b, v22.8b, v4.8b //GHASH final-1 block - mid + + eor v9.16b, v9.16b, v20.16b //GHASH final-1 block - high + + pmull v21.1q, v4.1d, v13.1d //GHASH final-1 block - low + mov x7, v0.d[1] //AES final block - mov high + + ins v22.d[1], v22.d[0] //GHASH final-1 block - mid + mov x6, v0.d[0] //AES final block - mov low + + pmull2 v22.1q, v22.2d, v16.2d //GHASH final-1 block - mid + + movi v8.8b, #0 //suppress further partial tag feed in + eor v11.16b, v11.16b, v21.16b //GHASH final-1 block - low + eor x7, x7, x14 //AES final block - round 12 high +#ifdef __AARCH64EB__ + rev x7, x7 +#endif + eor x6, x6, x13 //AES final block - round 12 low +#ifdef __AARCH64EB__ + rev x6, x6 +#endif + eor v10.16b, v10.16b, v22.16b //GHASH final-1 block - mid +.L192_dec_blocks_less_than_1: //blocks left <= 1 + + mvn x13, xzr //rk12_l = 0xffffffffffffffff + ldp x4, x5, [x2] //load existing bytes we need to not overwrite + and x1, x1, #127 //bit_length %= 128 + + sub x1, x1, #128 //bit_length -= 128 + + neg x1, x1 //bit_length = 128 - #bits in input (in range [1,128]) + + and x1, x1, #127 //bit_length %= 128 + mvn x14, xzr //rk12_h = 0xffffffffffffffff + + lsr x14, x14, x1 //rk12_h is mask for top 64b of last block + cmp x1, #64 + + csel x9, x13, x14, lt + csel x10, x14, xzr, lt + + fmov d0, x9 //ctr0b is mask for last block + and x6, x6, x9 + bic x4, x4, x9 //mask out low existing bytes + + orr x6, x6, x4 + mov v0.d[1], x10 +#ifndef __AARCH64EB__ + rev w9, w12 +#else + mov w9, w12 +#endif + + and v5.16b, v5.16b, v0.16b //possibly partial last block has zeroes in 
highest bits + str w9, [x16, #12] //store the updated counter + + rev64 v4.16b, v5.16b //GHASH final block + + eor v4.16b, v4.16b, v8.16b //feed in partial tag + bic x5, x5, x10 //mask out high existing bytes + + and x7, x7, x10 + + pmull2 v20.1q, v4.2d, v12.2d //GHASH final block - high + mov d8, v4.d[1] //GHASH final block - mid + + pmull v21.1q, v4.1d, v12.1d //GHASH final block - low + + eor v8.8b, v8.8b, v4.8b //GHASH final block - mid + + eor v9.16b, v9.16b, v20.16b //GHASH final block - high + + pmull v8.1q, v8.1d, v16.1d //GHASH final block - mid + + eor v11.16b, v11.16b, v21.16b //GHASH final block - low + + eor v10.16b, v10.16b, v8.16b //GHASH final block - mid + movi v8.8b, #0xc2 + + eor v30.16b, v11.16b, v9.16b //MODULO - karatsuba tidy up + + shl d8, d8, #56 //mod_constant + + eor v10.16b, v10.16b, v30.16b //MODULO - karatsuba tidy up + + pmull v31.1q, v9.1d, v8.1d //MODULO - top 64b align with mid + orr x7, x7, x5 + stp x6, x7, [x2] + + ext v9.16b, v9.16b, v9.16b, #8 //MODULO - other top alignment + + eor v10.16b, v10.16b, v31.16b //MODULO - fold into mid + + eor v10.16b, v10.16b, v9.16b //MODULO - fold into mid + + pmull v8.1q, v10.1d, v8.1d //MODULO - mid 64b align with low + + eor v11.16b, v11.16b, v8.16b //MODULO - fold into low + + ext v10.16b, v10.16b, v10.16b, #8 //MODULO - other mid alignment + + eor v11.16b, v11.16b, v10.16b //MODULO - fold into low + ext v11.16b, v11.16b, v11.16b, #8 + rev64 v11.16b, v11.16b + mov x0, x15 + st1 { v11.16b }, [x3] + + ldp x21, x22, [sp, #16] + ldp x23, x24, [sp, #32] + ldp d8, d9, [sp, #48] + ldp d10, d11, [sp, #64] + ldp d12, d13, [sp, #80] + ldp d14, d15, [sp, #96] + ldp x19, x20, [sp], #112 + ret + +.L192_dec_ret: + mov w0, #0x0 + ret +.size aes_gcm_dec_192_kernel,.-aes_gcm_dec_192_kernel +.globl aes_gcm_enc_256_kernel +.type aes_gcm_enc_256_kernel,%function +.align 4 +aes_gcm_enc_256_kernel: + AARCH64_VALID_CALL_TARGET + cbz x1, .L256_enc_ret + stp x19, x20, [sp, #-112]! 
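+
+ // The 256-bit kernels follow the same pattern with two extra rounds:
+ // rk0-rk12 are applied with aese/aesmc, rk13 with a bare aese, and rk14
+ // (loaded into x13/x14 below) is folded into the plaintext XOR.  This
+ // variant also reaches its hash-key table through the pointer stored at
+ // [x3, #32] rather than at fixed offsets from x3.
+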
+ mov x16, x4 + mov x8, x5 + ldr x6, [x3, #32] // Htable ptr + stp x21, x22, [sp, #16] + stp x23, x24, [sp, #32] + stp d8, d9, [sp, #48] + stp d10, d11, [sp, #64] + stp d12, d13, [sp, #80] + stp d14, d15, [sp, #96] + + add x4, x0, x1, lsr #3 //end_input_ptr + lsr x5, x1, #3 //byte_len + mov x15, x5 + ldp x10, x11, [x16] //ctr96_b64, ctr96_t32 +#ifdef __AARCH64EB__ + rev x10, x10 + rev x11, x11 +#endif + ldp x13, x14, [x8, #224] //load rk14 +#ifdef __AARCH64EB__ + ror x13, x13, #32 + ror x14, x14, #32 +#endif + ld1 { v0.16b}, [x16] //special case vector load initial counter so we can start first AES block as quickly as possible + sub x5, x5, #1 //byte_len - 1 + + ld1 {v18.4s}, [x8], #16 //load rk0 + and x5, x5, #0xffffffffffffffc0 //number of bytes to be processed in main loop (at least 1 byte must be handled by tail) + + ld1 {v19.4s}, [x8], #16 //load rk1 + add x5, x5, x0 + + lsr x12, x11, #32 + fmov d2, x10 //CTR block 2 + orr w11, w11, w11 + + rev w12, w12 //rev_ctr32 + cmp x0, x5 //check if we have <= 4 blocks + fmov d1, x10 //CTR block 1 + + aese v0.16b, v18.16b + aesmc v0.16b, v0.16b //AES block 0 - round 0 + add w12, w12, #1 //increment rev_ctr32 + + rev w9, w12 //CTR block 1 + fmov d3, x10 //CTR block 3 + + orr x9, x11, x9, lsl #32 //CTR block 1 + add w12, w12, #1 //CTR block 1 + ld1 {v20.4s}, [x8], #16 //load rk2 + + fmov v1.d[1], x9 //CTR block 1 + rev w9, w12 //CTR block 2 + add w12, w12, #1 //CTR block 2 + + orr x9, x11, x9, lsl #32 //CTR block 2 + ld1 {v21.4s}, [x8], #16 //load rk3 + + fmov v2.d[1], x9 //CTR block 2 + rev w9, w12 //CTR block 3 + + aese v0.16b, v19.16b + aesmc v0.16b, v0.16b //AES block 0 - round 1 + orr x9, x11, x9, lsl #32 //CTR block 3 + + fmov v3.d[1], x9 //CTR block 3 + + aese v1.16b, v18.16b + aesmc v1.16b, v1.16b //AES block 1 - round 0 + ld1 {v22.4s}, [x8], #16 //load rk4 + + aese v0.16b, v20.16b + aesmc v0.16b, v0.16b //AES block 0 - round 2 + ld1 {v23.4s}, [x8], #16 //load rk5 + + aese v2.16b, v18.16b + aesmc v2.16b, v2.16b //AES block 2 - round 0 + ld1 {v24.4s}, [x8], #16 //load rk6 + + aese v1.16b, v19.16b + aesmc v1.16b, v1.16b //AES block 1 - round 1 + ldr q14, [x6, #48] //load h3l | h3h +#ifndef __AARCH64EB__ + ext v14.16b, v14.16b, v14.16b, #8 +#endif + aese v3.16b, v18.16b + aesmc v3.16b, v3.16b //AES block 3 - round 0 + ld1 {v25.4s}, [x8], #16 //load rk7 + + aese v2.16b, v19.16b + aesmc v2.16b, v2.16b //AES block 2 - round 1 + ld1 {v26.4s}, [x8], #16 //load rk8 + + aese v1.16b, v20.16b + aesmc v1.16b, v1.16b //AES block 1 - round 2 + ldr q13, [x6, #32] //load h2l | h2h +#ifndef __AARCH64EB__ + ext v13.16b, v13.16b, v13.16b, #8 +#endif + aese v3.16b, v19.16b + aesmc v3.16b, v3.16b //AES block 3 - round 1 + ld1 {v27.4s}, [x8], #16 //load rk9 + + aese v2.16b, v20.16b + aesmc v2.16b, v2.16b //AES block 2 - round 2 + ldr q15, [x6, #80] //load h4l | h4h +#ifndef __AARCH64EB__ + ext v15.16b, v15.16b, v15.16b, #8 +#endif + aese v1.16b, v21.16b + aesmc v1.16b, v1.16b //AES block 1 - round 3 + ld1 {v28.4s}, [x8], #16 //load rk10 + + aese v3.16b, v20.16b + aesmc v3.16b, v3.16b //AES block 3 - round 2 + ld1 {v29.4s}, [x8], #16 //load rk11 + + aese v2.16b, v21.16b + aesmc v2.16b, v2.16b //AES block 2 - round 3 + add w12, w12, #1 //CTR block 3 + + aese v0.16b, v21.16b + aesmc v0.16b, v0.16b //AES block 0 - round 3 + + aese v3.16b, v21.16b + aesmc v3.16b, v3.16b //AES block 3 - round 3 + ld1 { v11.16b}, [x3] + ext v11.16b, v11.16b, v11.16b, #8 + rev64 v11.16b, v11.16b + + aese v2.16b, v22.16b + aesmc v2.16b, v2.16b //AES block 2 - round 4 + + aese v0.16b, 
v22.16b + aesmc v0.16b, v0.16b //AES block 0 - round 4 + + aese v1.16b, v22.16b + aesmc v1.16b, v1.16b //AES block 1 - round 4 + + aese v3.16b, v22.16b + aesmc v3.16b, v3.16b //AES block 3 - round 4 + + aese v0.16b, v23.16b + aesmc v0.16b, v0.16b //AES block 0 - round 5 + + aese v1.16b, v23.16b + aesmc v1.16b, v1.16b //AES block 1 - round 5 + + aese v3.16b, v23.16b + aesmc v3.16b, v3.16b //AES block 3 - round 5 + + aese v2.16b, v23.16b + aesmc v2.16b, v2.16b //AES block 2 - round 5 + + aese v1.16b, v24.16b + aesmc v1.16b, v1.16b //AES block 1 - round 6 + trn2 v17.2d, v14.2d, v15.2d //h4l | h3l + + aese v3.16b, v24.16b + aesmc v3.16b, v3.16b //AES block 3 - round 6 + ld1 {v30.4s}, [x8], #16 //load rk12 + + aese v0.16b, v24.16b + aesmc v0.16b, v0.16b //AES block 0 - round 6 + ldr q12, [x6] //load h1l | h1h +#ifndef __AARCH64EB__ + ext v12.16b, v12.16b, v12.16b, #8 +#endif + aese v2.16b, v24.16b + aesmc v2.16b, v2.16b //AES block 2 - round 6 + ld1 {v31.4s}, [x8], #16 //load rk13 + + aese v1.16b, v25.16b + aesmc v1.16b, v1.16b //AES block 1 - round 7 + trn1 v9.2d, v14.2d, v15.2d //h4h | h3h + + aese v0.16b, v25.16b + aesmc v0.16b, v0.16b //AES block 0 - round 7 + + aese v2.16b, v25.16b + aesmc v2.16b, v2.16b //AES block 2 - round 7 + + aese v3.16b, v25.16b + aesmc v3.16b, v3.16b //AES block 3 - round 7 + trn2 v16.2d, v12.2d, v13.2d //h2l | h1l + + aese v1.16b, v26.16b + aesmc v1.16b, v1.16b //AES block 1 - round 8 + + aese v2.16b, v26.16b + aesmc v2.16b, v2.16b //AES block 2 - round 8 + + aese v3.16b, v26.16b + aesmc v3.16b, v3.16b //AES block 3 - round 8 + + aese v1.16b, v27.16b + aesmc v1.16b, v1.16b //AES block 1 - round 9 + + aese v2.16b, v27.16b + aesmc v2.16b, v2.16b //AES block 2 - round 9 + + aese v0.16b, v26.16b + aesmc v0.16b, v0.16b //AES block 0 - round 8 + + aese v1.16b, v28.16b + aesmc v1.16b, v1.16b //AES block 1 - round 10 + + aese v3.16b, v27.16b + aesmc v3.16b, v3.16b //AES block 3 - round 9 + + aese v0.16b, v27.16b + aesmc v0.16b, v0.16b //AES block 0 - round 9 + + aese v2.16b, v28.16b + aesmc v2.16b, v2.16b //AES block 2 - round 10 + + aese v3.16b, v28.16b + aesmc v3.16b, v3.16b //AES block 3 - round 10 + + aese v1.16b, v29.16b + aesmc v1.16b, v1.16b //AES block 1 - round 11 + + aese v2.16b, v29.16b + aesmc v2.16b, v2.16b //AES block 2 - round 11 + + aese v0.16b, v28.16b + aesmc v0.16b, v0.16b //AES block 0 - round 10 + + aese v1.16b, v30.16b + aesmc v1.16b, v1.16b //AES block 1 - round 12 + + aese v2.16b, v30.16b + aesmc v2.16b, v2.16b //AES block 2 - round 12 + + aese v0.16b, v29.16b + aesmc v0.16b, v0.16b //AES block 0 - round 11 + eor v17.16b, v17.16b, v9.16b //h4k | h3k + + aese v3.16b, v29.16b + aesmc v3.16b, v3.16b //AES block 3 - round 11 + + aese v2.16b, v31.16b //AES block 2 - round 13 + trn1 v8.2d, v12.2d, v13.2d //h2h | h1h + + aese v0.16b, v30.16b + aesmc v0.16b, v0.16b //AES block 0 - round 12 + + aese v3.16b, v30.16b + aesmc v3.16b, v3.16b //AES block 3 - round 12 + + aese v1.16b, v31.16b //AES block 1 - round 13 + + aese v0.16b, v31.16b //AES block 0 - round 13 + + aese v3.16b, v31.16b //AES block 3 - round 13 + eor v16.16b, v16.16b, v8.16b //h2k | h1k + b.ge .L256_enc_tail //handle tail + + ldp x19, x20, [x0, #16] //AES block 1 - load plaintext +#ifdef __AARCH64EB__ + rev x19, x19 + rev x20, x20 +#endif + rev w9, w12 //CTR block 4 + ldp x6, x7, [x0, #0] //AES block 0 - load plaintext +#ifdef __AARCH64EB__ + rev x6, x6 + rev x7, x7 +#endif + ldp x23, x24, [x0, #48] //AES block 3 - load plaintext +#ifdef __AARCH64EB__ + rev x23, x23 + rev x24, x24 +#endif + 
ldp x21, x22, [x0, #32] //AES block 2 - load plaintext +#ifdef __AARCH64EB__ + rev x21, x21 + rev x22, x22 +#endif + add x0, x0, #64 //AES input_ptr update + + eor x19, x19, x13 //AES block 1 - round 14 low + eor x20, x20, x14 //AES block 1 - round 14 high + + fmov d5, x19 //AES block 1 - mov low + eor x6, x6, x13 //AES block 0 - round 14 low + + eor x7, x7, x14 //AES block 0 - round 14 high + eor x24, x24, x14 //AES block 3 - round 14 high + fmov d4, x6 //AES block 0 - mov low + + cmp x0, x5 //check if we have <= 8 blocks + fmov v4.d[1], x7 //AES block 0 - mov high + eor x23, x23, x13 //AES block 3 - round 14 low + + eor x21, x21, x13 //AES block 2 - round 14 low + fmov v5.d[1], x20 //AES block 1 - mov high + + fmov d6, x21 //AES block 2 - mov low + add w12, w12, #1 //CTR block 4 + + orr x9, x11, x9, lsl #32 //CTR block 4 + fmov d7, x23 //AES block 3 - mov low + eor x22, x22, x14 //AES block 2 - round 14 high + + fmov v6.d[1], x22 //AES block 2 - mov high + + eor v4.16b, v4.16b, v0.16b //AES block 0 - result + fmov d0, x10 //CTR block 4 + + fmov v0.d[1], x9 //CTR block 4 + rev w9, w12 //CTR block 5 + add w12, w12, #1 //CTR block 5 + + eor v5.16b, v5.16b, v1.16b //AES block 1 - result + fmov d1, x10 //CTR block 5 + orr x9, x11, x9, lsl #32 //CTR block 5 + + fmov v1.d[1], x9 //CTR block 5 + rev w9, w12 //CTR block 6 + st1 { v4.16b}, [x2], #16 //AES block 0 - store result + + fmov v7.d[1], x24 //AES block 3 - mov high + orr x9, x11, x9, lsl #32 //CTR block 6 + eor v6.16b, v6.16b, v2.16b //AES block 2 - result + + st1 { v5.16b}, [x2], #16 //AES block 1 - store result + + add w12, w12, #1 //CTR block 6 + fmov d2, x10 //CTR block 6 + + fmov v2.d[1], x9 //CTR block 6 + st1 { v6.16b}, [x2], #16 //AES block 2 - store result + rev w9, w12 //CTR block 7 + + orr x9, x11, x9, lsl #32 //CTR block 7 + + eor v7.16b, v7.16b, v3.16b //AES block 3 - result + st1 { v7.16b}, [x2], #16 //AES block 3 - store result + b.ge .L256_enc_prepretail //do prepretail + +.L256_enc_main_loop: //main loop start + aese v0.16b, v18.16b + aesmc v0.16b, v0.16b //AES block 4k+4 - round 0 + rev64 v4.16b, v4.16b //GHASH block 4k (only t0 is free) + + aese v1.16b, v18.16b + aesmc v1.16b, v1.16b //AES block 4k+5 - round 0 + fmov d3, x10 //CTR block 4k+3 + + aese v2.16b, v18.16b + aesmc v2.16b, v2.16b //AES block 4k+6 - round 0 + ext v11.16b, v11.16b, v11.16b, #8 //PRE 0 + + aese v0.16b, v19.16b + aesmc v0.16b, v0.16b //AES block 4k+4 - round 1 + fmov v3.d[1], x9 //CTR block 4k+3 + + aese v1.16b, v19.16b + aesmc v1.16b, v1.16b //AES block 4k+5 - round 1 + ldp x23, x24, [x0, #48] //AES block 4k+7 - load plaintext +#ifdef __AARCH64EB__ + rev x23, x23 + rev x24, x24 +#endif + aese v2.16b, v19.16b + aesmc v2.16b, v2.16b //AES block 4k+6 - round 1 + ldp x21, x22, [x0, #32] //AES block 4k+6 - load plaintext +#ifdef __AARCH64EB__ + rev x21, x21 + rev x22, x22 +#endif + aese v0.16b, v20.16b + aesmc v0.16b, v0.16b //AES block 4k+4 - round 2 + eor v4.16b, v4.16b, v11.16b //PRE 1 + + aese v1.16b, v20.16b + aesmc v1.16b, v1.16b //AES block 4k+5 - round 2 + + aese v3.16b, v18.16b + aesmc v3.16b, v3.16b //AES block 4k+7 - round 0 + eor x23, x23, x13 //AES block 4k+7 - round 14 low + + aese v0.16b, v21.16b + aesmc v0.16b, v0.16b //AES block 4k+4 - round 3 + mov d10, v17.d[1] //GHASH block 4k - mid + + pmull2 v9.1q, v4.2d, v15.2d //GHASH block 4k - high + eor x22, x22, x14 //AES block 4k+6 - round 14 high + mov d8, v4.d[1] //GHASH block 4k - mid + + aese v3.16b, v19.16b + aesmc v3.16b, v3.16b //AES block 4k+7 - round 1 + rev64 v5.16b, v5.16b 
//GHASH block 4k+1 (t0 and t1 free) + + aese v0.16b, v22.16b + aesmc v0.16b, v0.16b //AES block 4k+4 - round 4 + + pmull v11.1q, v4.1d, v15.1d //GHASH block 4k - low + eor v8.8b, v8.8b, v4.8b //GHASH block 4k - mid + + aese v2.16b, v20.16b + aesmc v2.16b, v2.16b //AES block 4k+6 - round 2 + + aese v0.16b, v23.16b + aesmc v0.16b, v0.16b //AES block 4k+4 - round 5 + rev64 v7.16b, v7.16b //GHASH block 4k+3 (t0, t1, t2 and t3 free) + + pmull2 v4.1q, v5.2d, v14.2d //GHASH block 4k+1 - high + + pmull v10.1q, v8.1d, v10.1d //GHASH block 4k - mid + rev64 v6.16b, v6.16b //GHASH block 4k+2 (t0, t1, and t2 free) + + pmull v8.1q, v5.1d, v14.1d //GHASH block 4k+1 - low + + eor v9.16b, v9.16b, v4.16b //GHASH block 4k+1 - high + mov d4, v5.d[1] //GHASH block 4k+1 - mid + + aese v1.16b, v21.16b + aesmc v1.16b, v1.16b //AES block 4k+5 - round 3 + + aese v3.16b, v20.16b + aesmc v3.16b, v3.16b //AES block 4k+7 - round 2 + eor v11.16b, v11.16b, v8.16b //GHASH block 4k+1 - low + + aese v2.16b, v21.16b + aesmc v2.16b, v2.16b //AES block 4k+6 - round 3 + + aese v1.16b, v22.16b + aesmc v1.16b, v1.16b //AES block 4k+5 - round 4 + mov d8, v6.d[1] //GHASH block 4k+2 - mid + + aese v3.16b, v21.16b + aesmc v3.16b, v3.16b //AES block 4k+7 - round 3 + eor v4.8b, v4.8b, v5.8b //GHASH block 4k+1 - mid + + aese v2.16b, v22.16b + aesmc v2.16b, v2.16b //AES block 4k+6 - round 4 + + aese v0.16b, v24.16b + aesmc v0.16b, v0.16b //AES block 4k+4 - round 6 + eor v8.8b, v8.8b, v6.8b //GHASH block 4k+2 - mid + + aese v3.16b, v22.16b + aesmc v3.16b, v3.16b //AES block 4k+7 - round 4 + + pmull v4.1q, v4.1d, v17.1d //GHASH block 4k+1 - mid + + aese v0.16b, v25.16b + aesmc v0.16b, v0.16b //AES block 4k+4 - round 7 + + aese v3.16b, v23.16b + aesmc v3.16b, v3.16b //AES block 4k+7 - round 5 + ins v8.d[1], v8.d[0] //GHASH block 4k+2 - mid + + aese v1.16b, v23.16b + aesmc v1.16b, v1.16b //AES block 4k+5 - round 5 + + aese v0.16b, v26.16b + aesmc v0.16b, v0.16b //AES block 4k+4 - round 8 + + aese v2.16b, v23.16b + aesmc v2.16b, v2.16b //AES block 4k+6 - round 5 + + aese v1.16b, v24.16b + aesmc v1.16b, v1.16b //AES block 4k+5 - round 6 + eor v10.16b, v10.16b, v4.16b //GHASH block 4k+1 - mid + + pmull2 v4.1q, v6.2d, v13.2d //GHASH block 4k+2 - high + + pmull v5.1q, v6.1d, v13.1d //GHASH block 4k+2 - low + + aese v1.16b, v25.16b + aesmc v1.16b, v1.16b //AES block 4k+5 - round 7 + + pmull v6.1q, v7.1d, v12.1d //GHASH block 4k+3 - low + eor v9.16b, v9.16b, v4.16b //GHASH block 4k+2 - high + + aese v3.16b, v24.16b + aesmc v3.16b, v3.16b //AES block 4k+7 - round 6 + ldp x19, x20, [x0, #16] //AES block 4k+5 - load plaintext +#ifdef __AARCH64EB__ + rev x19, x19 + rev x20, x20 +#endif + aese v1.16b, v26.16b + aesmc v1.16b, v1.16b //AES block 4k+5 - round 8 + mov d4, v7.d[1] //GHASH block 4k+3 - mid + + aese v2.16b, v24.16b + aesmc v2.16b, v2.16b //AES block 4k+6 - round 6 + eor v11.16b, v11.16b, v5.16b //GHASH block 4k+2 - low + + pmull2 v8.1q, v8.2d, v16.2d //GHASH block 4k+2 - mid + + pmull2 v5.1q, v7.2d, v12.2d //GHASH block 4k+3 - high + eor v4.8b, v4.8b, v7.8b //GHASH block 4k+3 - mid + + aese v2.16b, v25.16b + aesmc v2.16b, v2.16b //AES block 4k+6 - round 7 + eor x19, x19, x13 //AES block 4k+5 - round 14 low + + aese v1.16b, v27.16b + aesmc v1.16b, v1.16b //AES block 4k+5 - round 9 + eor v10.16b, v10.16b, v8.16b //GHASH block 4k+2 - mid + + aese v3.16b, v25.16b + aesmc v3.16b, v3.16b //AES block 4k+7 - round 7 + eor x21, x21, x13 //AES block 4k+6 - round 14 low + + aese v0.16b, v27.16b + aesmc v0.16b, v0.16b //AES block 4k+4 - round 9 + movi 
v8.8b, #0xc2 + + pmull v4.1q, v4.1d, v16.1d //GHASH block 4k+3 - mid + eor v9.16b, v9.16b, v5.16b //GHASH block 4k+3 - high + fmov d5, x19 //AES block 4k+5 - mov low + + aese v2.16b, v26.16b + aesmc v2.16b, v2.16b //AES block 4k+6 - round 8 + ldp x6, x7, [x0, #0] //AES block 4k+4 - load plaintext +#ifdef __AARCH64EB__ + rev x6, x6 + rev x7, x7 +#endif + aese v0.16b, v28.16b + aesmc v0.16b, v0.16b //AES block 4k+4 - round 10 + shl d8, d8, #56 //mod_constant + + aese v3.16b, v26.16b + aesmc v3.16b, v3.16b //AES block 4k+7 - round 8 + eor v11.16b, v11.16b, v6.16b //GHASH block 4k+3 - low + + aese v2.16b, v27.16b + aesmc v2.16b, v2.16b //AES block 4k+6 - round 9 + + aese v1.16b, v28.16b + aesmc v1.16b, v1.16b //AES block 4k+5 - round 10 + eor v10.16b, v10.16b, v4.16b //GHASH block 4k+3 - mid + + aese v3.16b, v27.16b + aesmc v3.16b, v3.16b //AES block 4k+7 - round 9 + add w12, w12, #1 //CTR block 4k+3 + + aese v0.16b, v29.16b + aesmc v0.16b, v0.16b //AES block 4k+4 - round 11 + eor v4.16b, v11.16b, v9.16b //MODULO - karatsuba tidy up + + aese v1.16b, v29.16b + aesmc v1.16b, v1.16b //AES block 4k+5 - round 11 + add x0, x0, #64 //AES input_ptr update + + pmull v7.1q, v9.1d, v8.1d //MODULO - top 64b align with mid + rev w9, w12 //CTR block 4k+8 + ext v9.16b, v9.16b, v9.16b, #8 //MODULO - other top alignment + + aese v2.16b, v28.16b + aesmc v2.16b, v2.16b //AES block 4k+6 - round 10 + eor x6, x6, x13 //AES block 4k+4 - round 14 low + + aese v1.16b, v30.16b + aesmc v1.16b, v1.16b //AES block 4k+5 - round 12 + eor v10.16b, v10.16b, v4.16b //MODULO - karatsuba tidy up + + aese v3.16b, v28.16b + aesmc v3.16b, v3.16b //AES block 4k+7 - round 10 + eor x7, x7, x14 //AES block 4k+4 - round 14 high + + fmov d4, x6 //AES block 4k+4 - mov low + orr x9, x11, x9, lsl #32 //CTR block 4k+8 + eor v7.16b, v9.16b, v7.16b //MODULO - fold into mid + + aese v0.16b, v30.16b + aesmc v0.16b, v0.16b //AES block 4k+4 - round 12 + eor x20, x20, x14 //AES block 4k+5 - round 14 high + + aese v2.16b, v29.16b + aesmc v2.16b, v2.16b //AES block 4k+6 - round 11 + eor x24, x24, x14 //AES block 4k+7 - round 14 high + + aese v3.16b, v29.16b + aesmc v3.16b, v3.16b //AES block 4k+7 - round 11 + add w12, w12, #1 //CTR block 4k+8 + + aese v0.16b, v31.16b //AES block 4k+4 - round 13 + fmov v4.d[1], x7 //AES block 4k+4 - mov high + eor v10.16b, v10.16b, v7.16b //MODULO - fold into mid + + aese v2.16b, v30.16b + aesmc v2.16b, v2.16b //AES block 4k+6 - round 12 + fmov d7, x23 //AES block 4k+7 - mov low + + aese v1.16b, v31.16b //AES block 4k+5 - round 13 + fmov v5.d[1], x20 //AES block 4k+5 - mov high + + fmov d6, x21 //AES block 4k+6 - mov low + cmp x0, x5 //.LOOP CONTROL + + fmov v6.d[1], x22 //AES block 4k+6 - mov high + + pmull v9.1q, v10.1d, v8.1d //MODULO - mid 64b align with low + eor v4.16b, v4.16b, v0.16b //AES block 4k+4 - result + fmov d0, x10 //CTR block 4k+8 + + fmov v0.d[1], x9 //CTR block 4k+8 + rev w9, w12 //CTR block 4k+9 + add w12, w12, #1 //CTR block 4k+9 + + eor v5.16b, v5.16b, v1.16b //AES block 4k+5 - result + fmov d1, x10 //CTR block 4k+9 + orr x9, x11, x9, lsl #32 //CTR block 4k+9 + + aese v3.16b, v30.16b + aesmc v3.16b, v3.16b //AES block 4k+7 - round 12 + fmov v1.d[1], x9 //CTR block 4k+9 + + aese v2.16b, v31.16b //AES block 4k+6 - round 13 + rev w9, w12 //CTR block 4k+10 + st1 { v4.16b}, [x2], #16 //AES block 4k+4 - store result + + orr x9, x11, x9, lsl #32 //CTR block 4k+10 + eor v11.16b, v11.16b, v9.16b //MODULO - fold into low + fmov v7.d[1], x24 //AES block 4k+7 - mov high + + ext v10.16b, v10.16b, v10.16b, #8 
//MODULO - other mid alignment + st1 { v5.16b}, [x2], #16 //AES block 4k+5 - store result + add w12, w12, #1 //CTR block 4k+10 + + aese v3.16b, v31.16b //AES block 4k+7 - round 13 + eor v6.16b, v6.16b, v2.16b //AES block 4k+6 - result + fmov d2, x10 //CTR block 4k+10 + + st1 { v6.16b}, [x2], #16 //AES block 4k+6 - store result + fmov v2.d[1], x9 //CTR block 4k+10 + rev w9, w12 //CTR block 4k+11 + + eor v11.16b, v11.16b, v10.16b //MODULO - fold into low + orr x9, x11, x9, lsl #32 //CTR block 4k+11 + + eor v7.16b, v7.16b, v3.16b //AES block 4k+7 - result + st1 { v7.16b}, [x2], #16 //AES block 4k+7 - store result + b.lt .L256_enc_main_loop + +.L256_enc_prepretail: //PREPRETAIL + aese v1.16b, v18.16b + aesmc v1.16b, v1.16b //AES block 4k+5 - round 0 + rev64 v6.16b, v6.16b //GHASH block 4k+2 (t0, t1, and t2 free) + + aese v2.16b, v18.16b + aesmc v2.16b, v2.16b //AES block 4k+6 - round 0 + fmov d3, x10 //CTR block 4k+3 + + aese v0.16b, v18.16b + aesmc v0.16b, v0.16b //AES block 4k+4 - round 0 + rev64 v4.16b, v4.16b //GHASH block 4k (only t0 is free) + + fmov v3.d[1], x9 //CTR block 4k+3 + ext v11.16b, v11.16b, v11.16b, #8 //PRE 0 + + aese v2.16b, v19.16b + aesmc v2.16b, v2.16b //AES block 4k+6 - round 1 + + aese v0.16b, v19.16b + aesmc v0.16b, v0.16b //AES block 4k+4 - round 1 + + eor v4.16b, v4.16b, v11.16b //PRE 1 + rev64 v5.16b, v5.16b //GHASH block 4k+1 (t0 and t1 free) + + aese v2.16b, v20.16b + aesmc v2.16b, v2.16b //AES block 4k+6 - round 2 + + aese v3.16b, v18.16b + aesmc v3.16b, v3.16b //AES block 4k+7 - round 0 + mov d10, v17.d[1] //GHASH block 4k - mid + + aese v1.16b, v19.16b + aesmc v1.16b, v1.16b //AES block 4k+5 - round 1 + + pmull v11.1q, v4.1d, v15.1d //GHASH block 4k - low + mov d8, v4.d[1] //GHASH block 4k - mid + + pmull2 v9.1q, v4.2d, v15.2d //GHASH block 4k - high + + aese v2.16b, v21.16b + aesmc v2.16b, v2.16b //AES block 4k+6 - round 3 + + aese v1.16b, v20.16b + aesmc v1.16b, v1.16b //AES block 4k+5 - round 2 + eor v8.8b, v8.8b, v4.8b //GHASH block 4k - mid + + aese v0.16b, v20.16b + aesmc v0.16b, v0.16b //AES block 4k+4 - round 2 + + aese v3.16b, v19.16b + aesmc v3.16b, v3.16b //AES block 4k+7 - round 1 + + aese v1.16b, v21.16b + aesmc v1.16b, v1.16b //AES block 4k+5 - round 3 + + pmull v10.1q, v8.1d, v10.1d //GHASH block 4k - mid + + pmull2 v4.1q, v5.2d, v14.2d //GHASH block 4k+1 - high + + pmull v8.1q, v5.1d, v14.1d //GHASH block 4k+1 - low + + aese v3.16b, v20.16b + aesmc v3.16b, v3.16b //AES block 4k+7 - round 2 + + eor v9.16b, v9.16b, v4.16b //GHASH block 4k+1 - high + mov d4, v5.d[1] //GHASH block 4k+1 - mid + + aese v0.16b, v21.16b + aesmc v0.16b, v0.16b //AES block 4k+4 - round 3 + eor v11.16b, v11.16b, v8.16b //GHASH block 4k+1 - low + + aese v3.16b, v21.16b + aesmc v3.16b, v3.16b //AES block 4k+7 - round 3 + + eor v4.8b, v4.8b, v5.8b //GHASH block 4k+1 - mid + mov d8, v6.d[1] //GHASH block 4k+2 - mid + + aese v0.16b, v22.16b + aesmc v0.16b, v0.16b //AES block 4k+4 - round 4 + rev64 v7.16b, v7.16b //GHASH block 4k+3 (t0, t1, t2 and t3 free) + + aese v3.16b, v22.16b + aesmc v3.16b, v3.16b //AES block 4k+7 - round 4 + + pmull v4.1q, v4.1d, v17.1d //GHASH block 4k+1 - mid + eor v8.8b, v8.8b, v6.8b //GHASH block 4k+2 - mid + add w12, w12, #1 //CTR block 4k+3 + + pmull v5.1q, v6.1d, v13.1d //GHASH block 4k+2 - low + + aese v3.16b, v23.16b + aesmc v3.16b, v3.16b //AES block 4k+7 - round 5 + + aese v2.16b, v22.16b + aesmc v2.16b, v2.16b //AES block 4k+6 - round 4 + eor v10.16b, v10.16b, v4.16b //GHASH block 4k+1 - mid + + pmull2 v4.1q, v6.2d, v13.2d //GHASH block 4k+2 
- high + + eor v11.16b, v11.16b, v5.16b //GHASH block 4k+2 - low + ins v8.d[1], v8.d[0] //GHASH block 4k+2 - mid + + aese v2.16b, v23.16b + aesmc v2.16b, v2.16b //AES block 4k+6 - round 5 + + eor v9.16b, v9.16b, v4.16b //GHASH block 4k+2 - high + mov d4, v7.d[1] //GHASH block 4k+3 - mid + + aese v1.16b, v22.16b + aesmc v1.16b, v1.16b //AES block 4k+5 - round 4 + + pmull2 v8.1q, v8.2d, v16.2d //GHASH block 4k+2 - mid + + eor v4.8b, v4.8b, v7.8b //GHASH block 4k+3 - mid + + pmull2 v5.1q, v7.2d, v12.2d //GHASH block 4k+3 - high + + aese v1.16b, v23.16b + aesmc v1.16b, v1.16b //AES block 4k+5 - round 5 + + pmull v4.1q, v4.1d, v16.1d //GHASH block 4k+3 - mid + eor v10.16b, v10.16b, v8.16b //GHASH block 4k+2 - mid + + aese v0.16b, v23.16b + aesmc v0.16b, v0.16b //AES block 4k+4 - round 5 + + aese v1.16b, v24.16b + aesmc v1.16b, v1.16b //AES block 4k+5 - round 6 + + aese v2.16b, v24.16b + aesmc v2.16b, v2.16b //AES block 4k+6 - round 6 + + aese v0.16b, v24.16b + aesmc v0.16b, v0.16b //AES block 4k+4 - round 6 + movi v8.8b, #0xc2 + + aese v3.16b, v24.16b + aesmc v3.16b, v3.16b //AES block 4k+7 - round 6 + + aese v1.16b, v25.16b + aesmc v1.16b, v1.16b //AES block 4k+5 - round 7 + eor v9.16b, v9.16b, v5.16b //GHASH block 4k+3 - high + + aese v0.16b, v25.16b + aesmc v0.16b, v0.16b //AES block 4k+4 - round 7 + + aese v3.16b, v25.16b + aesmc v3.16b, v3.16b //AES block 4k+7 - round 7 + shl d8, d8, #56 //mod_constant + + aese v1.16b, v26.16b + aesmc v1.16b, v1.16b //AES block 4k+5 - round 8 + eor v10.16b, v10.16b, v4.16b //GHASH block 4k+3 - mid + + pmull v6.1q, v7.1d, v12.1d //GHASH block 4k+3 - low + + aese v3.16b, v26.16b + aesmc v3.16b, v3.16b //AES block 4k+7 - round 8 + + aese v1.16b, v27.16b + aesmc v1.16b, v1.16b //AES block 4k+5 - round 9 + + aese v0.16b, v26.16b + aesmc v0.16b, v0.16b //AES block 4k+4 - round 8 + eor v11.16b, v11.16b, v6.16b //GHASH block 4k+3 - low + + aese v3.16b, v27.16b + aesmc v3.16b, v3.16b //AES block 4k+7 - round 9 + + eor v10.16b, v10.16b, v9.16b //karatsuba tidy up + + pmull v4.1q, v9.1d, v8.1d + ext v9.16b, v9.16b, v9.16b, #8 + + aese v3.16b, v28.16b + aesmc v3.16b, v3.16b //AES block 4k+7 - round 10 + + aese v2.16b, v25.16b + aesmc v2.16b, v2.16b //AES block 4k+6 - round 7 + eor v10.16b, v10.16b, v11.16b + + aese v1.16b, v28.16b + aesmc v1.16b, v1.16b //AES block 4k+5 - round 10 + + aese v0.16b, v27.16b + aesmc v0.16b, v0.16b //AES block 4k+4 - round 9 + + aese v2.16b, v26.16b + aesmc v2.16b, v2.16b //AES block 4k+6 - round 8 + + aese v1.16b, v29.16b + aesmc v1.16b, v1.16b //AES block 4k+5 - round 11 + eor v10.16b, v10.16b, v4.16b + + aese v0.16b, v28.16b + aesmc v0.16b, v0.16b //AES block 4k+4 - round 10 + + aese v2.16b, v27.16b + aesmc v2.16b, v2.16b //AES block 4k+6 - round 9 + + aese v1.16b, v30.16b + aesmc v1.16b, v1.16b //AES block 4k+5 - round 12 + + aese v0.16b, v29.16b + aesmc v0.16b, v0.16b //AES block 4k+4 - round 11 + eor v10.16b, v10.16b, v9.16b + + aese v3.16b, v29.16b + aesmc v3.16b, v3.16b //AES block 4k+7 - round 11 + + aese v2.16b, v28.16b + aesmc v2.16b, v2.16b //AES block 4k+6 - round 10 + + aese v0.16b, v30.16b + aesmc v0.16b, v0.16b //AES block 4k+4 - round 12 + + pmull v4.1q, v10.1d, v8.1d + + aese v2.16b, v29.16b + aesmc v2.16b, v2.16b //AES block 4k+6 - round 11 + ext v10.16b, v10.16b, v10.16b, #8 + + aese v3.16b, v30.16b + aesmc v3.16b, v3.16b //AES block 4k+7 - round 12 + + aese v1.16b, v31.16b //AES block 4k+5 - round 13 + eor v11.16b, v11.16b, v4.16b + + aese v2.16b, v30.16b + aesmc v2.16b, v2.16b //AES block 4k+6 - round 12 + + aese 
v3.16b, v31.16b //AES block 4k+7 - round 13 + + aese v0.16b, v31.16b //AES block 4k+4 - round 13 + + aese v2.16b, v31.16b //AES block 4k+6 - round 13 + eor v11.16b, v11.16b, v10.16b +.L256_enc_tail: //TAIL + + ext v8.16b, v11.16b, v11.16b, #8 //prepare final partial tag + sub x5, x4, x0 //main_end_input_ptr is number of bytes left to process + ldp x6, x7, [x0], #16 //AES block 4k+4 - load plaintext +#ifdef __AARCH64EB__ + rev x6, x6 + rev x7, x7 +#endif + eor x6, x6, x13 //AES block 4k+4 - round 14 low + eor x7, x7, x14 //AES block 4k+4 - round 14 high + + cmp x5, #48 + fmov d4, x6 //AES block 4k+4 - mov low + + fmov v4.d[1], x7 //AES block 4k+4 - mov high + + eor v5.16b, v4.16b, v0.16b //AES block 4k+4 - result + b.gt .L256_enc_blocks_more_than_3 + + cmp x5, #32 + mov v3.16b, v2.16b + movi v11.8b, #0 + + movi v9.8b, #0 + sub w12, w12, #1 + + mov v2.16b, v1.16b + movi v10.8b, #0 + b.gt .L256_enc_blocks_more_than_2 + + mov v3.16b, v1.16b + sub w12, w12, #1 + cmp x5, #16 + + b.gt .L256_enc_blocks_more_than_1 + + sub w12, w12, #1 + b .L256_enc_blocks_less_than_1 +.L256_enc_blocks_more_than_3: //blocks left > 3 + st1 { v5.16b}, [x2], #16 //AES final-3 block - store result + + ldp x6, x7, [x0], #16 //AES final-2 block - load input low & high +#ifdef __AARCH64EB__ + rev x6, x6 + rev x7, x7 +#endif + rev64 v4.16b, v5.16b //GHASH final-3 block + + eor x6, x6, x13 //AES final-2 block - round 14 low + eor v4.16b, v4.16b, v8.16b //feed in partial tag + + eor x7, x7, x14 //AES final-2 block - round 14 high + + mov d22, v4.d[1] //GHASH final-3 block - mid + fmov d5, x6 //AES final-2 block - mov low + + fmov v5.d[1], x7 //AES final-2 block - mov high + + eor v22.8b, v22.8b, v4.8b //GHASH final-3 block - mid + movi v8.8b, #0 //suppress further partial tag feed in + + mov d10, v17.d[1] //GHASH final-3 block - mid + + pmull v11.1q, v4.1d, v15.1d //GHASH final-3 block - low + + pmull2 v9.1q, v4.2d, v15.2d //GHASH final-3 block - high + + pmull v10.1q, v22.1d, v10.1d //GHASH final-3 block - mid + eor v5.16b, v5.16b, v1.16b //AES final-2 block - result +.L256_enc_blocks_more_than_2: //blocks left > 2 + + st1 { v5.16b}, [x2], #16 //AES final-2 block - store result + + ldp x6, x7, [x0], #16 //AES final-1 block - load input low & high +#ifdef __AARCH64EB__ + rev x6, x6 + rev x7, x7 +#endif + rev64 v4.16b, v5.16b //GHASH final-2 block + + eor x6, x6, x13 //AES final-1 block - round 14 low + eor v4.16b, v4.16b, v8.16b //feed in partial tag + + fmov d5, x6 //AES final-1 block - mov low + eor x7, x7, x14 //AES final-1 block - round 14 high + + fmov v5.d[1], x7 //AES final-1 block - mov high + + movi v8.8b, #0 //suppress further partial tag feed in + + pmull2 v20.1q, v4.2d, v14.2d //GHASH final-2 block - high + mov d22, v4.d[1] //GHASH final-2 block - mid + + pmull v21.1q, v4.1d, v14.1d //GHASH final-2 block - low + + eor v22.8b, v22.8b, v4.8b //GHASH final-2 block - mid + + eor v5.16b, v5.16b, v2.16b //AES final-1 block - result + + eor v9.16b, v9.16b, v20.16b //GHASH final-2 block - high + + pmull v22.1q, v22.1d, v17.1d //GHASH final-2 block - mid + + eor v11.16b, v11.16b, v21.16b //GHASH final-2 block - low + + eor v10.16b, v10.16b, v22.16b //GHASH final-2 block - mid +.L256_enc_blocks_more_than_1: //blocks left > 1 + + st1 { v5.16b}, [x2], #16 //AES final-1 block - store result + + rev64 v4.16b, v5.16b //GHASH final-1 block + + ldp x6, x7, [x0], #16 //AES final block - load input low & high +#ifdef __AARCH64EB__ + rev x6, x6 + rev x7, x7 +#endif + eor v4.16b, v4.16b, v8.16b //feed in partial tag + + movi v8.8b, 
#0 //suppress further partial tag feed in + + eor x6, x6, x13 //AES final block - round 14 low + mov d22, v4.d[1] //GHASH final-1 block - mid + + pmull2 v20.1q, v4.2d, v13.2d //GHASH final-1 block - high + eor x7, x7, x14 //AES final block - round 14 high + + eor v22.8b, v22.8b, v4.8b //GHASH final-1 block - mid + + eor v9.16b, v9.16b, v20.16b //GHASH final-1 block - high + + ins v22.d[1], v22.d[0] //GHASH final-1 block - mid + fmov d5, x6 //AES final block - mov low + + fmov v5.d[1], x7 //AES final block - mov high + + pmull2 v22.1q, v22.2d, v16.2d //GHASH final-1 block - mid + + pmull v21.1q, v4.1d, v13.1d //GHASH final-1 block - low + + eor v5.16b, v5.16b, v3.16b //AES final block - result + eor v10.16b, v10.16b, v22.16b //GHASH final-1 block - mid + + eor v11.16b, v11.16b, v21.16b //GHASH final-1 block - low +.L256_enc_blocks_less_than_1: //blocks left <= 1 + + and x1, x1, #127 //bit_length %= 128 + + mvn x13, xzr //rk14_l = 0xffffffffffffffff + sub x1, x1, #128 //bit_length -= 128 + + neg x1, x1 //bit_length = 128 - #bits in input (in range [1,128]) + ld1 { v18.16b}, [x2] //load existing bytes where the possibly partial last block is to be stored + + mvn x14, xzr //rk14_h = 0xffffffffffffffff + and x1, x1, #127 //bit_length %= 128 + + lsr x14, x14, x1 //rk14_h is mask for top 64b of last block + cmp x1, #64 + + csel x6, x13, x14, lt + csel x7, x14, xzr, lt + + fmov d0, x6 //ctr0b is mask for last block + + fmov v0.d[1], x7 + + and v5.16b, v5.16b, v0.16b //possibly partial last block has zeroes in highest bits + + rev64 v4.16b, v5.16b //GHASH final block + + eor v4.16b, v4.16b, v8.16b //feed in partial tag + + bif v5.16b, v18.16b, v0.16b //insert existing bytes in top end of result before storing + + pmull2 v20.1q, v4.2d, v12.2d //GHASH final block - high + mov d8, v4.d[1] //GHASH final block - mid +#ifndef __AARCH64EB__ + rev w9, w12 +#else + mov w9, w12 +#endif + + pmull v21.1q, v4.1d, v12.1d //GHASH final block - low + + eor v9.16b, v9.16b, v20.16b //GHASH final block - high + eor v8.8b, v8.8b, v4.8b //GHASH final block - mid + + pmull v8.1q, v8.1d, v16.1d //GHASH final block - mid + + eor v11.16b, v11.16b, v21.16b //GHASH final block - low + + eor v10.16b, v10.16b, v8.16b //GHASH final block - mid + movi v8.8b, #0xc2 + + eor v4.16b, v11.16b, v9.16b //MODULO - karatsuba tidy up + + shl d8, d8, #56 //mod_constant + + eor v10.16b, v10.16b, v4.16b //MODULO - karatsuba tidy up + + pmull v7.1q, v9.1d, v8.1d //MODULO - top 64b align with mid + + ext v9.16b, v9.16b, v9.16b, #8 //MODULO - other top alignment + + eor v10.16b, v10.16b, v7.16b //MODULO - fold into mid + + eor v10.16b, v10.16b, v9.16b //MODULO - fold into mid + + pmull v9.1q, v10.1d, v8.1d //MODULO - mid 64b align with low + + ext v10.16b, v10.16b, v10.16b, #8 //MODULO - other mid alignment + + str w9, [x16, #12] //store the updated counter + + st1 { v5.16b}, [x2] //store all 16B + eor v11.16b, v11.16b, v9.16b //MODULO - fold into low + + eor v11.16b, v11.16b, v10.16b //MODULO - fold into low + ext v11.16b, v11.16b, v11.16b, #8 + rev64 v11.16b, v11.16b + mov x0, x15 + st1 { v11.16b }, [x3] + + ldp x21, x22, [sp, #16] + ldp x23, x24, [sp, #32] + ldp d8, d9, [sp, #48] + ldp d10, d11, [sp, #64] + ldp d12, d13, [sp, #80] + ldp d14, d15, [sp, #96] + ldp x19, x20, [sp], #112 + ret + +.L256_enc_ret: + mov w0, #0x0 + ret +.size aes_gcm_enc_256_kernel,.-aes_gcm_enc_256_kernel +.globl aes_gcm_dec_256_kernel +.type aes_gcm_dec_256_kernel,%function +.align 4 +aes_gcm_dec_256_kernel: + AARCH64_VALID_CALL_TARGET + cbz x1, .L256_dec_ret + stp 
x19, x20, [sp, #-112]! + mov x16, x4 + mov x8, x5 + ldr x6, [x3, #32] // load Htable ptr + stp x21, x22, [sp, #16] + stp x23, x24, [sp, #32] + stp d8, d9, [sp, #48] + stp d10, d11, [sp, #64] + stp d12, d13, [sp, #80] + stp d14, d15, [sp, #96] + + lsr x5, x1, #3 //byte_len + mov x15, x5 + ldp x10, x11, [x16] //ctr96_b64, ctr96_t32 +#ifdef __AARCH64EB__ + rev x10, x10 + rev x11, x11 +#endif + ldp x13, x14, [x8, #224] //load rk14 +#ifdef __AARCH64EB__ + ror x14, x14, #32 + ror x13, x13, #32 +#endif + ld1 {v18.4s}, [x8], #16 //load rk0 + sub x5, x5, #1 //byte_len - 1 + + ld1 {v19.4s}, [x8], #16 //load rk1 + and x5, x5, #0xffffffffffffffc0 //number of bytes to be processed in main loop (at least 1 byte must be handled by tail) + + add x4, x0, x1, lsr #3 //end_input_ptr + ld1 {v20.4s}, [x8], #16 //load rk2 + + lsr x12, x11, #32 + ld1 {v21.4s}, [x8], #16 //load rk3 + orr w11, w11, w11 + + ld1 {v22.4s}, [x8], #16 //load rk4 + add x5, x5, x0 + rev w12, w12 //rev_ctr32 + + add w12, w12, #1 //increment rev_ctr32 + fmov d3, x10 //CTR block 3 + + rev w9, w12 //CTR block 1 + add w12, w12, #1 //CTR block 1 + fmov d1, x10 //CTR block 1 + + orr x9, x11, x9, lsl #32 //CTR block 1 + ld1 { v0.16b}, [x16] //special case vector load initial counter so we can start first AES block as quickly as possible + + fmov v1.d[1], x9 //CTR block 1 + rev w9, w12 //CTR block 2 + add w12, w12, #1 //CTR block 2 + + fmov d2, x10 //CTR block 2 + orr x9, x11, x9, lsl #32 //CTR block 2 + + fmov v2.d[1], x9 //CTR block 2 + rev w9, w12 //CTR block 3 + + orr x9, x11, x9, lsl #32 //CTR block 3 + ld1 {v23.4s}, [x8], #16 //load rk5 + + fmov v3.d[1], x9 //CTR block 3 + add w12, w12, #1 //CTR block 3 + + ld1 {v24.4s}, [x8], #16 //load rk6 + + ld1 {v25.4s}, [x8], #16 //load rk7 + + ld1 {v26.4s}, [x8], #16 //load rk8 + + aese v0.16b, v18.16b + aesmc v0.16b, v0.16b //AES block 0 - round 0 + ldr q14, [x6, #48] //load h3l | h3h +#ifndef __AARCH64EB__ + ext v14.16b, v14.16b, v14.16b, #8 +#endif + + aese v3.16b, v18.16b + aesmc v3.16b, v3.16b //AES block 3 - round 0 + ldr q15, [x6, #80] //load h4l | h4h +#ifndef __AARCH64EB__ + ext v15.16b, v15.16b, v15.16b, #8 +#endif + + aese v1.16b, v18.16b + aesmc v1.16b, v1.16b //AES block 1 - round 0 + ldr q13, [x6, #32] //load h2l | h2h +#ifndef __AARCH64EB__ + ext v13.16b, v13.16b, v13.16b, #8 +#endif + + aese v2.16b, v18.16b + aesmc v2.16b, v2.16b //AES block 2 - round 0 + ld1 {v27.4s}, [x8], #16 //load rk9 + + aese v0.16b, v19.16b + aesmc v0.16b, v0.16b //AES block 0 - round 1 + + aese v1.16b, v19.16b + aesmc v1.16b, v1.16b //AES block 1 - round 1 + ld1 { v11.16b}, [x3] + ext v11.16b, v11.16b, v11.16b, #8 + rev64 v11.16b, v11.16b + + aese v2.16b, v19.16b + aesmc v2.16b, v2.16b //AES block 2 - round 1 + ld1 {v28.4s}, [x8], #16 //load rk10 + + aese v3.16b, v19.16b + aesmc v3.16b, v3.16b //AES block 3 - round 1 + ld1 {v29.4s}, [x8], #16 //load rk11 + + aese v0.16b, v20.16b + aesmc v0.16b, v0.16b //AES block 0 - round 2 + ldr q12, [x6] //load h1l | h1h +#ifndef __AARCH64EB__ + ext v12.16b, v12.16b, v12.16b, #8 +#endif + aese v2.16b, v20.16b + aesmc v2.16b, v2.16b //AES block 2 - round 2 + ld1 {v30.4s}, [x8], #16 //load rk12 + + aese v3.16b, v20.16b + aesmc v3.16b, v3.16b //AES block 3 - round 2 + + aese v0.16b, v21.16b + aesmc v0.16b, v0.16b //AES block 0 - round 3 + + aese v1.16b, v20.16b + aesmc v1.16b, v1.16b //AES block 1 - round 2 + + aese v3.16b, v21.16b + aesmc v3.16b, v3.16b //AES block 3 - round 3 + + aese v0.16b, v22.16b + aesmc v0.16b, v0.16b //AES block 0 - round 4 + cmp x0, x5 //check if we 
have <= 4 blocks + + aese v2.16b, v21.16b + aesmc v2.16b, v2.16b //AES block 2 - round 3 + + aese v1.16b, v21.16b + aesmc v1.16b, v1.16b //AES block 1 - round 3 + + aese v3.16b, v22.16b + aesmc v3.16b, v3.16b //AES block 3 - round 4 + + aese v2.16b, v22.16b + aesmc v2.16b, v2.16b //AES block 2 - round 4 + + aese v1.16b, v22.16b + aesmc v1.16b, v1.16b //AES block 1 - round 4 + + aese v3.16b, v23.16b + aesmc v3.16b, v3.16b //AES block 3 - round 5 + + aese v0.16b, v23.16b + aesmc v0.16b, v0.16b //AES block 0 - round 5 + + aese v1.16b, v23.16b + aesmc v1.16b, v1.16b //AES block 1 - round 5 + + aese v2.16b, v23.16b + aesmc v2.16b, v2.16b //AES block 2 - round 5 + + aese v0.16b, v24.16b + aesmc v0.16b, v0.16b //AES block 0 - round 6 + + aese v3.16b, v24.16b + aesmc v3.16b, v3.16b //AES block 3 - round 6 + + aese v1.16b, v24.16b + aesmc v1.16b, v1.16b //AES block 1 - round 6 + + aese v2.16b, v24.16b + aesmc v2.16b, v2.16b //AES block 2 - round 6 + + aese v0.16b, v25.16b + aesmc v0.16b, v0.16b //AES block 0 - round 7 + + aese v1.16b, v25.16b + aesmc v1.16b, v1.16b //AES block 1 - round 7 + + aese v3.16b, v25.16b + aesmc v3.16b, v3.16b //AES block 3 - round 7 + + aese v0.16b, v26.16b + aesmc v0.16b, v0.16b //AES block 0 - round 8 + + aese v2.16b, v25.16b + aesmc v2.16b, v2.16b //AES block 2 - round 7 + + aese v3.16b, v26.16b + aesmc v3.16b, v3.16b //AES block 3 - round 8 + + aese v1.16b, v26.16b + aesmc v1.16b, v1.16b //AES block 1 - round 8 + + aese v0.16b, v27.16b + aesmc v0.16b, v0.16b //AES block 0 - round 9 + + aese v2.16b, v26.16b + aesmc v2.16b, v2.16b //AES block 2 - round 8 + ld1 {v31.4s}, [x8], #16 //load rk13 + + aese v1.16b, v27.16b + aesmc v1.16b, v1.16b //AES block 1 - round 9 + + aese v0.16b, v28.16b + aesmc v0.16b, v0.16b //AES block 0 - round 10 + + aese v3.16b, v27.16b + aesmc v3.16b, v3.16b //AES block 3 - round 9 + + aese v1.16b, v28.16b + aesmc v1.16b, v1.16b //AES block 1 - round 10 + + aese v2.16b, v27.16b + aesmc v2.16b, v2.16b //AES block 2 - round 9 + + aese v3.16b, v28.16b + aesmc v3.16b, v3.16b //AES block 3 - round 10 + + aese v0.16b, v29.16b + aesmc v0.16b, v0.16b //AES block 0 - round 11 + + aese v2.16b, v28.16b + aesmc v2.16b, v2.16b //AES block 2 - round 10 + + aese v3.16b, v29.16b + aesmc v3.16b, v3.16b //AES block 3 - round 11 + + aese v1.16b, v29.16b + aesmc v1.16b, v1.16b //AES block 1 - round 11 + + aese v2.16b, v29.16b + aesmc v2.16b, v2.16b //AES block 2 - round 11 + + trn1 v9.2d, v14.2d, v15.2d //h4h | h3h + + trn2 v17.2d, v14.2d, v15.2d //h4l | h3l + + trn1 v8.2d, v12.2d, v13.2d //h2h | h1h + trn2 v16.2d, v12.2d, v13.2d //h2l | h1l + + aese v1.16b, v30.16b + aesmc v1.16b, v1.16b //AES block 1 - round 12 + + aese v0.16b, v30.16b + aesmc v0.16b, v0.16b //AES block 0 - round 12 + + aese v2.16b, v30.16b + aesmc v2.16b, v2.16b //AES block 2 - round 12 + + aese v3.16b, v30.16b + aesmc v3.16b, v3.16b //AES block 3 - round 12 + eor v17.16b, v17.16b, v9.16b //h4k | h3k + + aese v1.16b, v31.16b //AES block 1 - round 13 + + aese v2.16b, v31.16b //AES block 2 - round 13 + eor v16.16b, v16.16b, v8.16b //h2k | h1k + + aese v3.16b, v31.16b //AES block 3 - round 13 + + aese v0.16b, v31.16b //AES block 0 - round 13 + b.ge .L256_dec_tail //handle tail + + ld1 {v4.16b, v5.16b}, [x0], #32 //AES block 0,1 - load ciphertext + + rev w9, w12 //CTR block 4 + + eor v0.16b, v4.16b, v0.16b //AES block 0 - result + + eor v1.16b, v5.16b, v1.16b //AES block 1 - result + rev64 v5.16b, v5.16b //GHASH block 1 + ld1 {v6.16b}, [x0], #16 //AES block 2 - load ciphertext + + mov x7, v0.d[1] 
//AES block 0 - mov high + + mov x6, v0.d[0] //AES block 0 - mov low + rev64 v4.16b, v4.16b //GHASH block 0 + add w12, w12, #1 //CTR block 4 + + fmov d0, x10 //CTR block 4 + orr x9, x11, x9, lsl #32 //CTR block 4 + + fmov v0.d[1], x9 //CTR block 4 + rev w9, w12 //CTR block 5 + add w12, w12, #1 //CTR block 5 + + mov x19, v1.d[0] //AES block 1 - mov low + + orr x9, x11, x9, lsl #32 //CTR block 5 + mov x20, v1.d[1] //AES block 1 - mov high + eor x7, x7, x14 //AES block 0 - round 14 high +#ifdef __AARCH64EB__ + rev x7, x7 +#endif + eor x6, x6, x13 //AES block 0 - round 14 low +#ifdef __AARCH64EB__ + rev x6, x6 +#endif + stp x6, x7, [x2], #16 //AES block 0 - store result + fmov d1, x10 //CTR block 5 + + ld1 {v7.16b}, [x0], #16 //AES block 3 - load ciphertext + + fmov v1.d[1], x9 //CTR block 5 + rev w9, w12 //CTR block 6 + add w12, w12, #1 //CTR block 6 + + eor x19, x19, x13 //AES block 1 - round 14 low +#ifdef __AARCH64EB__ + rev x19, x19 +#endif + orr x9, x11, x9, lsl #32 //CTR block 6 + + eor x20, x20, x14 //AES block 1 - round 14 high +#ifdef __AARCH64EB__ + rev x20, x20 +#endif + stp x19, x20, [x2], #16 //AES block 1 - store result + + eor v2.16b, v6.16b, v2.16b //AES block 2 - result + cmp x0, x5 //check if we have <= 8 blocks + b.ge .L256_dec_prepretail //do prepretail + +.L256_dec_main_loop: //main loop start + mov x21, v2.d[0] //AES block 4k+2 - mov low + ext v11.16b, v11.16b, v11.16b, #8 //PRE 0 + eor v3.16b, v7.16b, v3.16b //AES block 4k+3 - result + + aese v0.16b, v18.16b + aesmc v0.16b, v0.16b //AES block 4k+4 - round 0 + mov x22, v2.d[1] //AES block 4k+2 - mov high + + aese v1.16b, v18.16b + aesmc v1.16b, v1.16b //AES block 4k+5 - round 0 + fmov d2, x10 //CTR block 4k+6 + + fmov v2.d[1], x9 //CTR block 4k+6 + eor v4.16b, v4.16b, v11.16b //PRE 1 + rev w9, w12 //CTR block 4k+7 + + aese v0.16b, v19.16b + aesmc v0.16b, v0.16b //AES block 4k+4 - round 1 + mov x24, v3.d[1] //AES block 4k+3 - mov high + + aese v1.16b, v19.16b + aesmc v1.16b, v1.16b //AES block 4k+5 - round 1 + mov x23, v3.d[0] //AES block 4k+3 - mov low + + pmull2 v9.1q, v4.2d, v15.2d //GHASH block 4k - high + mov d8, v4.d[1] //GHASH block 4k - mid + fmov d3, x10 //CTR block 4k+7 + + aese v0.16b, v20.16b + aesmc v0.16b, v0.16b //AES block 4k+4 - round 2 + orr x9, x11, x9, lsl #32 //CTR block 4k+7 + + aese v2.16b, v18.16b + aesmc v2.16b, v2.16b //AES block 4k+6 - round 0 + fmov v3.d[1], x9 //CTR block 4k+7 + + aese v1.16b, v20.16b + aesmc v1.16b, v1.16b //AES block 4k+5 - round 2 + eor v8.8b, v8.8b, v4.8b //GHASH block 4k - mid + + aese v0.16b, v21.16b + aesmc v0.16b, v0.16b //AES block 4k+4 - round 3 + eor x22, x22, x14 //AES block 4k+2 - round 14 high +#ifdef __AARCH64EB__ + rev x22, x22 +#endif + aese v2.16b, v19.16b + aesmc v2.16b, v2.16b //AES block 4k+6 - round 1 + mov d10, v17.d[1] //GHASH block 4k - mid + + aese v1.16b, v21.16b + aesmc v1.16b, v1.16b //AES block 4k+5 - round 3 + rev64 v6.16b, v6.16b //GHASH block 4k+2 + + aese v3.16b, v18.16b + aesmc v3.16b, v3.16b //AES block 4k+7 - round 0 + eor x21, x21, x13 //AES block 4k+2 - round 14 low +#ifdef __AARCH64EB__ + rev x21, x21 +#endif + aese v2.16b, v20.16b + aesmc v2.16b, v2.16b //AES block 4k+6 - round 2 + stp x21, x22, [x2], #16 //AES block 4k+2 - store result + + pmull v11.1q, v4.1d, v15.1d //GHASH block 4k - low + + pmull2 v4.1q, v5.2d, v14.2d //GHASH block 4k+1 - high + + aese v2.16b, v21.16b + aesmc v2.16b, v2.16b //AES block 4k+6 - round 3 + rev64 v7.16b, v7.16b //GHASH block 4k+3 + + pmull v10.1q, v8.1d, v10.1d //GHASH block 4k - mid + eor x23, x23, x13 
//AES block 4k+3 - round 14 low +#ifdef __AARCH64EB__ + rev x23, x23 +#endif + pmull v8.1q, v5.1d, v14.1d //GHASH block 4k+1 - low + eor x24, x24, x14 //AES block 4k+3 - round 14 high +#ifdef __AARCH64EB__ + rev x24, x24 +#endif + eor v9.16b, v9.16b, v4.16b //GHASH block 4k+1 - high + + aese v2.16b, v22.16b + aesmc v2.16b, v2.16b //AES block 4k+6 - round 4 + + aese v3.16b, v19.16b + aesmc v3.16b, v3.16b //AES block 4k+7 - round 1 + mov d4, v5.d[1] //GHASH block 4k+1 - mid + + aese v0.16b, v22.16b + aesmc v0.16b, v0.16b //AES block 4k+4 - round 4 + eor v11.16b, v11.16b, v8.16b //GHASH block 4k+1 - low + + aese v2.16b, v23.16b + aesmc v2.16b, v2.16b //AES block 4k+6 - round 5 + add w12, w12, #1 //CTR block 4k+7 + + aese v3.16b, v20.16b + aesmc v3.16b, v3.16b //AES block 4k+7 - round 2 + mov d8, v6.d[1] //GHASH block 4k+2 - mid + + aese v1.16b, v22.16b + aesmc v1.16b, v1.16b //AES block 4k+5 - round 4 + eor v4.8b, v4.8b, v5.8b //GHASH block 4k+1 - mid + + pmull v5.1q, v6.1d, v13.1d //GHASH block 4k+2 - low + + aese v3.16b, v21.16b + aesmc v3.16b, v3.16b //AES block 4k+7 - round 3 + eor v8.8b, v8.8b, v6.8b //GHASH block 4k+2 - mid + + aese v1.16b, v23.16b + aesmc v1.16b, v1.16b //AES block 4k+5 - round 5 + + aese v0.16b, v23.16b + aesmc v0.16b, v0.16b //AES block 4k+4 - round 5 + eor v11.16b, v11.16b, v5.16b //GHASH block 4k+2 - low + + pmull v4.1q, v4.1d, v17.1d //GHASH block 4k+1 - mid + rev w9, w12 //CTR block 4k+8 + + aese v1.16b, v24.16b + aesmc v1.16b, v1.16b //AES block 4k+5 - round 6 + ins v8.d[1], v8.d[0] //GHASH block 4k+2 - mid + + aese v0.16b, v24.16b + aesmc v0.16b, v0.16b //AES block 4k+4 - round 6 + add w12, w12, #1 //CTR block 4k+8 + + aese v3.16b, v22.16b + aesmc v3.16b, v3.16b //AES block 4k+7 - round 4 + + aese v1.16b, v25.16b + aesmc v1.16b, v1.16b //AES block 4k+5 - round 7 + eor v10.16b, v10.16b, v4.16b //GHASH block 4k+1 - mid + + aese v0.16b, v25.16b + aesmc v0.16b, v0.16b //AES block 4k+4 - round 7 + + pmull2 v4.1q, v6.2d, v13.2d //GHASH block 4k+2 - high + mov d6, v7.d[1] //GHASH block 4k+3 - mid + + aese v3.16b, v23.16b + aesmc v3.16b, v3.16b //AES block 4k+7 - round 5 + + pmull2 v8.1q, v8.2d, v16.2d //GHASH block 4k+2 - mid + + aese v0.16b, v26.16b + aesmc v0.16b, v0.16b //AES block 4k+4 - round 8 + eor v9.16b, v9.16b, v4.16b //GHASH block 4k+2 - high + + aese v3.16b, v24.16b + aesmc v3.16b, v3.16b //AES block 4k+7 - round 6 + + pmull v4.1q, v7.1d, v12.1d //GHASH block 4k+3 - low + orr x9, x11, x9, lsl #32 //CTR block 4k+8 + eor v10.16b, v10.16b, v8.16b //GHASH block 4k+2 - mid + + pmull2 v5.1q, v7.2d, v12.2d //GHASH block 4k+3 - high + + aese v0.16b, v27.16b + aesmc v0.16b, v0.16b //AES block 4k+4 - round 9 + eor v6.8b, v6.8b, v7.8b //GHASH block 4k+3 - mid + + aese v1.16b, v26.16b + aesmc v1.16b, v1.16b //AES block 4k+5 - round 8 + + aese v2.16b, v24.16b + aesmc v2.16b, v2.16b //AES block 4k+6 - round 6 + eor v9.16b, v9.16b, v5.16b //GHASH block 4k+3 - high + + aese v0.16b, v28.16b + aesmc v0.16b, v0.16b //AES block 4k+4 - round 10 + + pmull v6.1q, v6.1d, v16.1d //GHASH block 4k+3 - mid + movi v8.8b, #0xc2 + + aese v2.16b, v25.16b + aesmc v2.16b, v2.16b //AES block 4k+6 - round 7 + eor v11.16b, v11.16b, v4.16b //GHASH block 4k+3 - low + + aese v0.16b, v29.16b + aesmc v0.16b, v0.16b //AES block 4k+4 - round 11 + + aese v3.16b, v25.16b + aesmc v3.16b, v3.16b //AES block 4k+7 - round 7 + shl d8, d8, #56 //mod_constant + + aese v2.16b, v26.16b + aesmc v2.16b, v2.16b //AES block 4k+6 - round 8 + eor v10.16b, v10.16b, v6.16b //GHASH block 4k+3 - mid + + aese v0.16b, 
v30.16b + aesmc v0.16b, v0.16b //AES block 4k+4 - round 12 + + pmull v7.1q, v9.1d, v8.1d //MODULO - top 64b align with mid + eor v6.16b, v11.16b, v9.16b //MODULO - karatsuba tidy up + + aese v1.16b, v27.16b + aesmc v1.16b, v1.16b //AES block 4k+5 - round 9 + ld1 {v4.16b}, [x0], #16 //AES block 4k+4 - load ciphertext + + aese v0.16b, v31.16b //AES block 4k+4 - round 13 + ext v9.16b, v9.16b, v9.16b, #8 //MODULO - other top alignment + + aese v1.16b, v28.16b + aesmc v1.16b, v1.16b //AES block 4k+5 - round 10 + eor v10.16b, v10.16b, v6.16b //MODULO - karatsuba tidy up + + aese v2.16b, v27.16b + aesmc v2.16b, v2.16b //AES block 4k+6 - round 9 + ld1 {v5.16b}, [x0], #16 //AES block 4k+5 - load ciphertext + + aese v3.16b, v26.16b + aesmc v3.16b, v3.16b //AES block 4k+7 - round 8 + eor v0.16b, v4.16b, v0.16b //AES block 4k+4 - result + + aese v1.16b, v29.16b + aesmc v1.16b, v1.16b //AES block 4k+5 - round 11 + stp x23, x24, [x2], #16 //AES block 4k+3 - store result + + aese v2.16b, v28.16b + aesmc v2.16b, v2.16b //AES block 4k+6 - round 10 + eor v10.16b, v10.16b, v7.16b //MODULO - fold into mid + + aese v3.16b, v27.16b + aesmc v3.16b, v3.16b //AES block 4k+7 - round 9 + ld1 {v6.16b}, [x0], #16 //AES block 4k+6 - load ciphertext + + aese v1.16b, v30.16b + aesmc v1.16b, v1.16b //AES block 4k+5 - round 12 + ld1 {v7.16b}, [x0], #16 //AES block 4k+7 - load ciphertext + + aese v2.16b, v29.16b + aesmc v2.16b, v2.16b //AES block 4k+6 - round 11 + mov x7, v0.d[1] //AES block 4k+4 - mov high + + aese v3.16b, v28.16b + aesmc v3.16b, v3.16b //AES block 4k+7 - round 10 + eor v10.16b, v10.16b, v9.16b //MODULO - fold into mid + + aese v1.16b, v31.16b //AES block 4k+5 - round 13 + mov x6, v0.d[0] //AES block 4k+4 - mov low + + aese v2.16b, v30.16b + aesmc v2.16b, v2.16b //AES block 4k+6 - round 12 + fmov d0, x10 //CTR block 4k+8 + + aese v3.16b, v29.16b + aesmc v3.16b, v3.16b //AES block 4k+7 - round 11 + fmov v0.d[1], x9 //CTR block 4k+8 + + pmull v8.1q, v10.1d, v8.1d //MODULO - mid 64b align with low + eor v1.16b, v5.16b, v1.16b //AES block 4k+5 - result + rev w9, w12 //CTR block 4k+9 + + aese v2.16b, v31.16b //AES block 4k+6 - round 13 + orr x9, x11, x9, lsl #32 //CTR block 4k+9 + cmp x0, x5 //.LOOP CONTROL + + add w12, w12, #1 //CTR block 4k+9 + + eor x6, x6, x13 //AES block 4k+4 - round 14 low +#ifdef __AARCH64EB__ + rev x6, x6 +#endif + eor x7, x7, x14 //AES block 4k+4 - round 14 high +#ifdef __AARCH64EB__ + rev x7, x7 +#endif + mov x20, v1.d[1] //AES block 4k+5 - mov high + eor v2.16b, v6.16b, v2.16b //AES block 4k+6 - result + eor v11.16b, v11.16b, v8.16b //MODULO - fold into low + + aese v3.16b, v30.16b + aesmc v3.16b, v3.16b //AES block 4k+7 - round 12 + mov x19, v1.d[0] //AES block 4k+5 - mov low + + fmov d1, x10 //CTR block 4k+9 + ext v10.16b, v10.16b, v10.16b, #8 //MODULO - other mid alignment + + fmov v1.d[1], x9 //CTR block 4k+9 + rev w9, w12 //CTR block 4k+10 + add w12, w12, #1 //CTR block 4k+10 + + aese v3.16b, v31.16b //AES block 4k+7 - round 13 + orr x9, x11, x9, lsl #32 //CTR block 4k+10 + + rev64 v5.16b, v5.16b //GHASH block 4k+5 + eor x20, x20, x14 //AES block 4k+5 - round 14 high +#ifdef __AARCH64EB__ + rev x20, x20 +#endif + stp x6, x7, [x2], #16 //AES block 4k+4 - store result + + eor x19, x19, x13 //AES block 4k+5 - round 14 low +#ifdef __AARCH64EB__ + rev x19, x19 +#endif + stp x19, x20, [x2], #16 //AES block 4k+5 - store result + + rev64 v4.16b, v4.16b //GHASH block 4k+4 + eor v11.16b, v11.16b, v10.16b //MODULO - fold into low + b.lt .L256_dec_main_loop + + +.L256_dec_prepretail: 
//PREPRETAIL + ext v11.16b, v11.16b, v11.16b, #8 //PRE 0 + mov x21, v2.d[0] //AES block 4k+2 - mov low + eor v3.16b, v7.16b, v3.16b //AES block 4k+3 - result + + aese v0.16b, v18.16b + aesmc v0.16b, v0.16b //AES block 4k+4 - round 0 + mov x22, v2.d[1] //AES block 4k+2 - mov high + + aese v1.16b, v18.16b + aesmc v1.16b, v1.16b //AES block 4k+5 - round 0 + fmov d2, x10 //CTR block 4k+6 + + fmov v2.d[1], x9 //CTR block 4k+6 + rev w9, w12 //CTR block 4k+7 + eor v4.16b, v4.16b, v11.16b //PRE 1 + + rev64 v6.16b, v6.16b //GHASH block 4k+2 + orr x9, x11, x9, lsl #32 //CTR block 4k+7 + mov x23, v3.d[0] //AES block 4k+3 - mov low + + aese v1.16b, v19.16b + aesmc v1.16b, v1.16b //AES block 4k+5 - round 1 + mov x24, v3.d[1] //AES block 4k+3 - mov high + + pmull v11.1q, v4.1d, v15.1d //GHASH block 4k - low + mov d8, v4.d[1] //GHASH block 4k - mid + fmov d3, x10 //CTR block 4k+7 + + pmull2 v9.1q, v4.2d, v15.2d //GHASH block 4k - high + fmov v3.d[1], x9 //CTR block 4k+7 + + aese v2.16b, v18.16b + aesmc v2.16b, v2.16b //AES block 4k+6 - round 0 + mov d10, v17.d[1] //GHASH block 4k - mid + + aese v0.16b, v19.16b + aesmc v0.16b, v0.16b //AES block 4k+4 - round 1 + eor v8.8b, v8.8b, v4.8b //GHASH block 4k - mid + + pmull2 v4.1q, v5.2d, v14.2d //GHASH block 4k+1 - high + + aese v2.16b, v19.16b + aesmc v2.16b, v2.16b //AES block 4k+6 - round 1 + rev64 v7.16b, v7.16b //GHASH block 4k+3 + + aese v3.16b, v18.16b + aesmc v3.16b, v3.16b //AES block 4k+7 - round 0 + + pmull v10.1q, v8.1d, v10.1d //GHASH block 4k - mid + eor v9.16b, v9.16b, v4.16b //GHASH block 4k+1 - high + + pmull v8.1q, v5.1d, v14.1d //GHASH block 4k+1 - low + + aese v3.16b, v19.16b + aesmc v3.16b, v3.16b //AES block 4k+7 - round 1 + mov d4, v5.d[1] //GHASH block 4k+1 - mid + + aese v0.16b, v20.16b + aesmc v0.16b, v0.16b //AES block 4k+4 - round 2 + + aese v1.16b, v20.16b + aesmc v1.16b, v1.16b //AES block 4k+5 - round 2 + eor v11.16b, v11.16b, v8.16b //GHASH block 4k+1 - low + + aese v2.16b, v20.16b + aesmc v2.16b, v2.16b //AES block 4k+6 - round 2 + + aese v0.16b, v21.16b + aesmc v0.16b, v0.16b //AES block 4k+4 - round 3 + mov d8, v6.d[1] //GHASH block 4k+2 - mid + + aese v3.16b, v20.16b + aesmc v3.16b, v3.16b //AES block 4k+7 - round 2 + eor v4.8b, v4.8b, v5.8b //GHASH block 4k+1 - mid + + pmull v5.1q, v6.1d, v13.1d //GHASH block 4k+2 - low + + aese v0.16b, v22.16b + aesmc v0.16b, v0.16b //AES block 4k+4 - round 4 + + aese v3.16b, v21.16b + aesmc v3.16b, v3.16b //AES block 4k+7 - round 3 + eor v8.8b, v8.8b, v6.8b //GHASH block 4k+2 - mid + + pmull v4.1q, v4.1d, v17.1d //GHASH block 4k+1 - mid + + aese v0.16b, v23.16b + aesmc v0.16b, v0.16b //AES block 4k+4 - round 5 + eor v11.16b, v11.16b, v5.16b //GHASH block 4k+2 - low + + aese v3.16b, v22.16b + aesmc v3.16b, v3.16b //AES block 4k+7 - round 4 + + pmull2 v5.1q, v7.2d, v12.2d //GHASH block 4k+3 - high + eor v10.16b, v10.16b, v4.16b //GHASH block 4k+1 - mid + + pmull2 v4.1q, v6.2d, v13.2d //GHASH block 4k+2 - high + + aese v3.16b, v23.16b + aesmc v3.16b, v3.16b //AES block 4k+7 - round 5 + ins v8.d[1], v8.d[0] //GHASH block 4k+2 - mid + + aese v2.16b, v21.16b + aesmc v2.16b, v2.16b //AES block 4k+6 - round 3 + + aese v1.16b, v21.16b + aesmc v1.16b, v1.16b //AES block 4k+5 - round 3 + eor v9.16b, v9.16b, v4.16b //GHASH block 4k+2 - high + + pmull v4.1q, v7.1d, v12.1d //GHASH block 4k+3 - low + + aese v2.16b, v22.16b + aesmc v2.16b, v2.16b //AES block 4k+6 - round 4 + mov d6, v7.d[1] //GHASH block 4k+3 - mid + + aese v1.16b, v22.16b + aesmc v1.16b, v1.16b //AES block 4k+5 - round 4 + + pmull2 
v8.1q, v8.2d, v16.2d //GHASH block 4k+2 - mid + + aese v2.16b, v23.16b + aesmc v2.16b, v2.16b //AES block 4k+6 - round 5 + eor v6.8b, v6.8b, v7.8b //GHASH block 4k+3 - mid + + aese v1.16b, v23.16b + aesmc v1.16b, v1.16b //AES block 4k+5 - round 5 + + aese v3.16b, v24.16b + aesmc v3.16b, v3.16b //AES block 4k+7 - round 6 + eor v10.16b, v10.16b, v8.16b //GHASH block 4k+2 - mid + + aese v2.16b, v24.16b + aesmc v2.16b, v2.16b //AES block 4k+6 - round 6 + + aese v0.16b, v24.16b + aesmc v0.16b, v0.16b //AES block 4k+4 - round 6 + movi v8.8b, #0xc2 + + aese v1.16b, v24.16b + aesmc v1.16b, v1.16b //AES block 4k+5 - round 6 + eor v11.16b, v11.16b, v4.16b //GHASH block 4k+3 - low + + pmull v6.1q, v6.1d, v16.1d //GHASH block 4k+3 - mid + + aese v3.16b, v25.16b + aesmc v3.16b, v3.16b //AES block 4k+7 - round 7 + eor v9.16b, v9.16b, v5.16b //GHASH block 4k+3 - high + + aese v1.16b, v25.16b + aesmc v1.16b, v1.16b //AES block 4k+5 - round 7 + + aese v0.16b, v25.16b + aesmc v0.16b, v0.16b //AES block 4k+4 - round 7 + eor v10.16b, v10.16b, v6.16b //GHASH block 4k+3 - mid + + aese v3.16b, v26.16b + aesmc v3.16b, v3.16b //AES block 4k+7 - round 8 + + aese v2.16b, v25.16b + aesmc v2.16b, v2.16b //AES block 4k+6 - round 7 + eor v6.16b, v11.16b, v9.16b //MODULO - karatsuba tidy up + + aese v1.16b, v26.16b + aesmc v1.16b, v1.16b //AES block 4k+5 - round 8 + + aese v0.16b, v26.16b + aesmc v0.16b, v0.16b //AES block 4k+4 - round 8 + shl d8, d8, #56 //mod_constant + + aese v2.16b, v26.16b + aesmc v2.16b, v2.16b //AES block 4k+6 - round 8 + + aese v1.16b, v27.16b + aesmc v1.16b, v1.16b //AES block 4k+5 - round 9 + eor v10.16b, v10.16b, v6.16b //MODULO - karatsuba tidy up + + pmull v7.1q, v9.1d, v8.1d //MODULO - top 64b align with mid + + aese v2.16b, v27.16b + aesmc v2.16b, v2.16b //AES block 4k+6 - round 9 + ext v9.16b, v9.16b, v9.16b, #8 //MODULO - other top alignment + + aese v3.16b, v27.16b + aesmc v3.16b, v3.16b //AES block 4k+7 - round 9 + + aese v0.16b, v27.16b + aesmc v0.16b, v0.16b //AES block 4k+4 - round 9 + eor v10.16b, v10.16b, v7.16b //MODULO - fold into mid + + aese v2.16b, v28.16b + aesmc v2.16b, v2.16b //AES block 4k+6 - round 10 + + aese v3.16b, v28.16b + aesmc v3.16b, v3.16b //AES block 4k+7 - round 10 + + aese v0.16b, v28.16b + aesmc v0.16b, v0.16b //AES block 4k+4 - round 10 + eor x22, x22, x14 //AES block 4k+2 - round 14 high +#ifdef __AARCH64EB__ + rev x22, x22 +#endif + aese v1.16b, v28.16b + aesmc v1.16b, v1.16b //AES block 4k+5 - round 10 + eor x23, x23, x13 //AES block 4k+3 - round 14 low +#ifdef __AARCH64EB__ + rev x23, x23 +#endif + aese v2.16b, v29.16b + aesmc v2.16b, v2.16b //AES block 4k+6 - round 11 + eor v10.16b, v10.16b, v9.16b //MODULO - fold into mid + + aese v0.16b, v29.16b + aesmc v0.16b, v0.16b //AES block 4k+4 - round 11 + add w12, w12, #1 //CTR block 4k+7 + + aese v1.16b, v29.16b + aesmc v1.16b, v1.16b //AES block 4k+5 - round 11 + eor x21, x21, x13 //AES block 4k+2 - round 14 low +#ifdef __AARCH64EB__ + rev x21, x21 +#endif + + aese v2.16b, v30.16b + aesmc v2.16b, v2.16b //AES block 4k+6 - round 12 + + pmull v8.1q, v10.1d, v8.1d //MODULO - mid 64b align with low + eor x24, x24, x14 //AES block 4k+3 - round 14 high +#ifdef __AARCH64EB__ + rev x24, x24 +#endif + + aese v3.16b, v29.16b + aesmc v3.16b, v3.16b //AES block 4k+7 - round 11 + stp x21, x22, [x2], #16 //AES block 4k+2 - store result + + aese v1.16b, v30.16b + aesmc v1.16b, v1.16b //AES block 4k+5 - round 12 + ext v10.16b, v10.16b, v10.16b, #8 //MODULO - other mid alignment + + aese v0.16b, v30.16b + aesmc v0.16b, 
v0.16b //AES block 4k+4 - round 12 + stp x23, x24, [x2], #16 //AES block 4k+3 - store result + + aese v3.16b, v30.16b + aesmc v3.16b, v3.16b //AES block 4k+7 - round 12 + eor v11.16b, v11.16b, v8.16b //MODULO - fold into low + + aese v1.16b, v31.16b //AES block 4k+5 - round 13 + + aese v0.16b, v31.16b //AES block 4k+4 - round 13 + + aese v3.16b, v31.16b //AES block 4k+7 - round 13 + + aese v2.16b, v31.16b //AES block 4k+6 - round 13 + eor v11.16b, v11.16b, v10.16b //MODULO - fold into low +.L256_dec_tail: //TAIL + + sub x5, x4, x0 //main_end_input_ptr is number of bytes left to process + ld1 { v5.16b}, [x0], #16 //AES block 4k+4 - load ciphertext + + eor v0.16b, v5.16b, v0.16b //AES block 4k+4 - result + + mov x6, v0.d[0] //AES block 4k+4 - mov low + + mov x7, v0.d[1] //AES block 4k+4 - mov high + ext v8.16b, v11.16b, v11.16b, #8 //prepare final partial tag + + cmp x5, #48 + + eor x6, x6, x13 //AES block 4k+4 - round 14 low +#ifdef __AARCH64EB__ + rev x6, x6 +#endif + + eor x7, x7, x14 //AES block 4k+4 - round 14 high +#ifdef __AARCH64EB__ + rev x7, x7 +#endif + b.gt .L256_dec_blocks_more_than_3 + + sub w12, w12, #1 + mov v3.16b, v2.16b + movi v10.8b, #0 + + movi v11.8b, #0 + cmp x5, #32 + + movi v9.8b, #0 + mov v2.16b, v1.16b + b.gt .L256_dec_blocks_more_than_2 + + sub w12, w12, #1 + + mov v3.16b, v1.16b + cmp x5, #16 + b.gt .L256_dec_blocks_more_than_1 + + sub w12, w12, #1 + b .L256_dec_blocks_less_than_1 +.L256_dec_blocks_more_than_3: //blocks left > 3 + rev64 v4.16b, v5.16b //GHASH final-3 block + ld1 { v5.16b}, [x0], #16 //AES final-2 block - load ciphertext + + stp x6, x7, [x2], #16 //AES final-3 block - store result + + mov d10, v17.d[1] //GHASH final-3 block - mid + + eor v4.16b, v4.16b, v8.16b //feed in partial tag + + eor v0.16b, v5.16b, v1.16b //AES final-2 block - result + + mov d22, v4.d[1] //GHASH final-3 block - mid + + mov x6, v0.d[0] //AES final-2 block - mov low + + mov x7, v0.d[1] //AES final-2 block - mov high + + eor v22.8b, v22.8b, v4.8b //GHASH final-3 block - mid + + movi v8.8b, #0 //suppress further partial tag feed in + + pmull2 v9.1q, v4.2d, v15.2d //GHASH final-3 block - high + + pmull v10.1q, v22.1d, v10.1d //GHASH final-3 block - mid + eor x6, x6, x13 //AES final-2 block - round 14 low +#ifdef __AARCH64EB__ + rev x6, x6 +#endif + + pmull v11.1q, v4.1d, v15.1d //GHASH final-3 block - low + eor x7, x7, x14 //AES final-2 block - round 14 high +#ifdef __AARCH64EB__ + rev x7, x7 +#endif +.L256_dec_blocks_more_than_2: //blocks left > 2 + + rev64 v4.16b, v5.16b //GHASH final-2 block + ld1 { v5.16b}, [x0], #16 //AES final-1 block - load ciphertext + + eor v4.16b, v4.16b, v8.16b //feed in partial tag + stp x6, x7, [x2], #16 //AES final-2 block - store result + + eor v0.16b, v5.16b, v2.16b //AES final-1 block - result + + mov d22, v4.d[1] //GHASH final-2 block - mid + + pmull v21.1q, v4.1d, v14.1d //GHASH final-2 block - low + + pmull2 v20.1q, v4.2d, v14.2d //GHASH final-2 block - high + + eor v22.8b, v22.8b, v4.8b //GHASH final-2 block - mid + mov x6, v0.d[0] //AES final-1 block - mov low + + mov x7, v0.d[1] //AES final-1 block - mov high + eor v11.16b, v11.16b, v21.16b //GHASH final-2 block - low + movi v8.8b, #0 //suppress further partial tag feed in + + pmull v22.1q, v22.1d, v17.1d //GHASH final-2 block - mid + + eor v9.16b, v9.16b, v20.16b //GHASH final-2 block - high + eor x6, x6, x13 //AES final-1 block - round 14 low +#ifdef __AARCH64EB__ + rev x6, x6 +#endif + + eor v10.16b, v10.16b, v22.16b //GHASH final-2 block - mid + eor x7, x7, x14 //AES final-1 block - 
round 14 high +#ifdef __AARCH64EB__ + rev x7, x7 +#endif +.L256_dec_blocks_more_than_1: //blocks left > 1 + + stp x6, x7, [x2], #16 //AES final-1 block - store result + rev64 v4.16b, v5.16b //GHASH final-1 block + + ld1 { v5.16b}, [x0], #16 //AES final block - load ciphertext + + eor v4.16b, v4.16b, v8.16b //feed in partial tag + movi v8.8b, #0 //suppress further partial tag feed in + + mov d22, v4.d[1] //GHASH final-1 block - mid + + eor v0.16b, v5.16b, v3.16b //AES final block - result + + pmull2 v20.1q, v4.2d, v13.2d //GHASH final-1 block - high + + eor v22.8b, v22.8b, v4.8b //GHASH final-1 block - mid + + pmull v21.1q, v4.1d, v13.1d //GHASH final-1 block - low + mov x6, v0.d[0] //AES final block - mov low + + ins v22.d[1], v22.d[0] //GHASH final-1 block - mid + + mov x7, v0.d[1] //AES final block - mov high + + pmull2 v22.1q, v22.2d, v16.2d //GHASH final-1 block - mid + eor x6, x6, x13 //AES final block - round 14 low +#ifdef __AARCH64EB__ + rev x6, x6 +#endif + eor v11.16b, v11.16b, v21.16b //GHASH final-1 block - low + + eor v9.16b, v9.16b, v20.16b //GHASH final-1 block - high + + eor v10.16b, v10.16b, v22.16b //GHASH final-1 block - mid + eor x7, x7, x14 //AES final block - round 14 high +#ifdef __AARCH64EB__ + rev x7, x7 +#endif +.L256_dec_blocks_less_than_1: //blocks left <= 1 + + and x1, x1, #127 //bit_length %= 128 + mvn x14, xzr //rk14_h = 0xffffffffffffffff + + sub x1, x1, #128 //bit_length -= 128 + mvn x13, xzr //rk14_l = 0xffffffffffffffff + + ldp x4, x5, [x2] //load existing bytes we need to not overwrite + neg x1, x1 //bit_length = 128 - #bits in input (in range [1,128]) + + and x1, x1, #127 //bit_length %= 128 + + lsr x14, x14, x1 //rk14_h is mask for top 64b of last block + cmp x1, #64 + + csel x9, x13, x14, lt + csel x10, x14, xzr, lt + + fmov d0, x9 //ctr0b is mask for last block + and x6, x6, x9 + + mov v0.d[1], x10 + bic x4, x4, x9 //mask out low existing bytes + +#ifndef __AARCH64EB__ + rev w9, w12 +#else + mov w9, w12 +#endif + + bic x5, x5, x10 //mask out high existing bytes + + orr x6, x6, x4 + + and x7, x7, x10 + + orr x7, x7, x5 + + and v5.16b, v5.16b, v0.16b //possibly partial last block has zeroes in highest bits + + rev64 v4.16b, v5.16b //GHASH final block + + eor v4.16b, v4.16b, v8.16b //feed in partial tag + + pmull v21.1q, v4.1d, v12.1d //GHASH final block - low + + mov d8, v4.d[1] //GHASH final block - mid + + eor v8.8b, v8.8b, v4.8b //GHASH final block - mid + + pmull2 v20.1q, v4.2d, v12.2d //GHASH final block - high + + pmull v8.1q, v8.1d, v16.1d //GHASH final block - mid + + eor v9.16b, v9.16b, v20.16b //GHASH final block - high + + eor v11.16b, v11.16b, v21.16b //GHASH final block - low + + eor v10.16b, v10.16b, v8.16b //GHASH final block - mid + movi v8.8b, #0xc2 + + eor v6.16b, v11.16b, v9.16b //MODULO - karatsuba tidy up + + shl d8, d8, #56 //mod_constant + + eor v10.16b, v10.16b, v6.16b //MODULO - karatsuba tidy up + + pmull v7.1q, v9.1d, v8.1d //MODULO - top 64b align with mid + + ext v9.16b, v9.16b, v9.16b, #8 //MODULO - other top alignment + + eor v10.16b, v10.16b, v7.16b //MODULO - fold into mid + + eor v10.16b, v10.16b, v9.16b //MODULO - fold into mid + + pmull v8.1q, v10.1d, v8.1d //MODULO - mid 64b align with low + + ext v10.16b, v10.16b, v10.16b, #8 //MODULO - other mid alignment + + eor v11.16b, v11.16b, v8.16b //MODULO - fold into low + + stp x6, x7, [x2] + + str w9, [x16, #12] //store the updated counter + + eor v11.16b, v11.16b, v10.16b //MODULO - fold into low + ext v11.16b, v11.16b, v11.16b, #8 + rev64 v11.16b, v11.16b + mov x0, x15 
+ st1 { v11.16b }, [x3] + + ldp x21, x22, [sp, #16] + ldp x23, x24, [sp, #32] + ldp d8, d9, [sp, #48] + ldp d10, d11, [sp, #64] + ldp d12, d13, [sp, #80] + ldp d14, d15, [sp, #96] + ldp x19, x20, [sp], #112 + ret + +.L256_dec_ret: + mov w0, #0x0 + + ret +.size aes_gcm_dec_256_kernel,.-aes_gcm_dec_256_kernel + +.globl clear_fpu_regs_v8 +.type clear_fpu_regs_v8,%function +.align 4 +clear_fpu_regs_v8: + // Not sure there is a vzeroall on ARM neon + ldr x0,=.Lzeroes + ld1 {v0.2d, v1.2d, v2.2d, v3.2d},[x0] + ld1 {v4.2d, v5.2d, v6.2d, v7.2d},[x0] + ld1 {v8.2d, v9.2d, v10.2d, v11.2d},[x0] + ld1 {v12.2d, v13.2d, v14.2d, v15.2d},[x0] + ld1 {v16.2d, v17.2d, v18.2d, v19.2d},[x0] + ld1 {v20.2d, v21.2d, v22.2d, v23.2d},[x0] + ld1 {v24.2d, v25.2d, v26.2d, v27.2d},[x0] + ld1 {v28.2d, v29.2d, v30.2d, v31.2d},[x0] + ret +.size clear_fpu_regs_v8,.-clear_fpu_regs_v8 + +.globl gcm_xor_v8 +.type gcm_xor_v8,%function +.align 4 +gcm_xor_v8: + ld1 {v0.2d},[x0] + ld1 {v1.2d},[x1] + eor v0.16b,v0.16b,v1.16b + st1 {v0.2d},[x1] + ret +.size gcm_xor_v8,.-gcm_xor_v8 + +.globl atomic_toggle_boolean_nv +.type atomic_toggle_boolean_nv,%function +.align 4 +atomic_toggle_boolean_nv: + mov w1,#0x1 + ldeoral w2,w1,[x0] + eor w0,w2,w1 + ret +.size atomic_toggle_boolean_nv,.-atomic_toggle_boolean_nv + +.byte 71,72,65,83,72,32,102,111,114,32,65,82,77,118,56,44,32,67,82,89,80,84,79,71,65,77,83,32,98,121,32,60,97,112,112,114,111,64,111,112,101,110,115,115,108,46,111,114,103,62,0 + + +.align 8 +.Lzeroes: +.byte 0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0 +.align 8 + +#endif diff --git a/module/icp/asm-aarch64/modes/arm_arch.h b/module/icp/asm-aarch64/modes/arm_arch.h new file mode 100644 index 000000000000..acd8aee4d519 --- /dev/null +++ b/module/icp/asm-aarch64/modes/arm_arch.h @@ -0,0 +1,219 @@ +/* + * Copyright 2011-2024 The OpenSSL Project Authors. All Rights Reserved. + * + * Licensed under the Apache License 2.0 (the "License"). You may not use + * this file except in compliance with the License. You can obtain a copy + * in the file LICENSE in the source distribution or at + * https://www.openssl.org/source/license.html + */ + +#ifndef OSSL_CRYPTO_ARM_ARCH_H +# define OSSL_CRYPTO_ARM_ARCH_H + +# if !defined(__ARM_ARCH__) +# if defined(__CC_ARM) +# define __ARM_ARCH__ __TARGET_ARCH_ARM +# if defined(__BIG_ENDIAN) +# define __ARMEB__ +# else +# define __ARMEL__ +# endif +# elif defined(__GNUC__) +# if defined(__aarch64__) +# define __ARM_ARCH__ 8 + /* + * Why doesn't gcc define __ARM_ARCH__? Instead it defines + * bunch of below macros. See all_architectures[] table in + * gcc/config/arm/arm.c. On a side note it defines + * __ARMEL__/__ARMEB__ for little-/big-endian. 
+ */ +# elif defined(__ARM_ARCH) +# define __ARM_ARCH__ __ARM_ARCH +# elif defined(__ARM_ARCH_8A__) +# define __ARM_ARCH__ 8 +# elif defined(__ARM_ARCH_7__) || defined(__ARM_ARCH_7A__) || \ + defined(__ARM_ARCH_7R__)|| defined(__ARM_ARCH_7M__) || \ + defined(__ARM_ARCH_7EM__) +# define __ARM_ARCH__ 7 +# elif defined(__ARM_ARCH_6__) || defined(__ARM_ARCH_6J__) || \ + defined(__ARM_ARCH_6K__)|| defined(__ARM_ARCH_6M__) || \ + defined(__ARM_ARCH_6Z__)|| defined(__ARM_ARCH_6ZK__) || \ + defined(__ARM_ARCH_6T2__) +# define __ARM_ARCH__ 6 +# elif defined(__ARM_ARCH_5__) || defined(__ARM_ARCH_5T__) || \ + defined(__ARM_ARCH_5E__)|| defined(__ARM_ARCH_5TE__) || \ + defined(__ARM_ARCH_5TEJ__) +# define __ARM_ARCH__ 5 +# elif defined(__ARM_ARCH_4__) || defined(__ARM_ARCH_4T__) +# define __ARM_ARCH__ 4 +# else +# error "unsupported ARM architecture" +# endif +# elif defined(__ARM_ARCH) +# define __ARM_ARCH__ __ARM_ARCH +# endif +# endif + +# if !defined(__ARM_MAX_ARCH__) +# define __ARM_MAX_ARCH__ __ARM_ARCH__ +# endif + +# if __ARM_MAX_ARCH__<__ARM_ARCH__ +# error "__ARM_MAX_ARCH__ can't be less than __ARM_ARCH__" +# elif __ARM_MAX_ARCH__!=__ARM_ARCH__ +# if __ARM_ARCH__<7 && __ARM_MAX_ARCH__>=7 && defined(__ARMEB__) +# error "can't build universal big-endian binary" +# endif +# endif + +# ifndef __ASSEMBLER__ +extern unsigned int OPENSSL_armcap_P; +extern unsigned int OPENSSL_arm_midr; +extern unsigned int OPENSSL_armv8_rsa_neonized; +# endif + +# define ARMV7_NEON (1<<0) +# define ARMV7_TICK (1<<1) +# define ARMV8_AES (1<<2) +# define ARMV8_SHA1 (1<<3) +# define ARMV8_SHA256 (1<<4) +# define ARMV8_PMULL (1<<5) +# define ARMV8_SHA512 (1<<6) +# define ARMV8_CPUID (1<<7) +# define ARMV8_RNG (1<<8) +# define ARMV8_SM3 (1<<9) +# define ARMV8_SM4 (1<<10) +# define ARMV8_SHA3 (1<<11) +# define ARMV8_UNROLL8_EOR3 (1<<12) +# define ARMV8_SVE (1<<13) +# define ARMV8_SVE2 (1<<14) +# define ARMV8_HAVE_SHA3_AND_WORTH_USING (1<<15) +# define ARMV8_UNROLL12_EOR3 (1<<16) + +/* + * MIDR_EL1 system register + * + * 63___ _ ___32_31___ _ ___24_23_____20_19_____16_15__ _ __4_3_______0 + * | | | | | | | + * |RES0 | Implementer | Variant | Arch | PartNum |Revision| + * |____ _ _____|_____ _ _____|_________|_______ _|____ _ ___|________| + * + */ + +# define ARM_CPU_IMP_ARM 0x41 +# define HISI_CPU_IMP 0x48 +# define ARM_CPU_IMP_APPLE 0x61 +# define ARM_CPU_IMP_MICROSOFT 0x6D +# define ARM_CPU_IMP_AMPERE 0xC0 + +# define ARM_CPU_PART_CORTEX_A72 0xD08 +# define ARM_CPU_PART_N1 0xD0C +# define ARM_CPU_PART_V1 0xD40 +# define ARM_CPU_PART_N2 0xD49 +# define HISI_CPU_PART_KP920 0xD01 +# define ARM_CPU_PART_V2 0xD4F + +# define APPLE_CPU_PART_M1_ICESTORM 0x022 +# define APPLE_CPU_PART_M1_FIRESTORM 0x023 +# define APPLE_CPU_PART_M1_ICESTORM_PRO 0x024 +# define APPLE_CPU_PART_M1_FIRESTORM_PRO 0x025 +# define APPLE_CPU_PART_M1_ICESTORM_MAX 0x028 +# define APPLE_CPU_PART_M1_FIRESTORM_MAX 0x029 +# define APPLE_CPU_PART_M2_BLIZZARD 0x032 +# define APPLE_CPU_PART_M2_AVALANCHE 0x033 +# define APPLE_CPU_PART_M2_BLIZZARD_PRO 0x034 +# define APPLE_CPU_PART_M2_AVALANCHE_PRO 0x035 +# define APPLE_CPU_PART_M2_BLIZZARD_MAX 0x038 +# define APPLE_CPU_PART_M2_AVALANCHE_MAX 0x039 + +# define MICROSOFT_CPU_PART_COBALT_100 0xD49 + +# define MIDR_PARTNUM_SHIFT 4 +# define MIDR_PARTNUM_MASK (0xfffU << MIDR_PARTNUM_SHIFT) +# define MIDR_PARTNUM(midr) \ + (((midr) & MIDR_PARTNUM_MASK) >> MIDR_PARTNUM_SHIFT) + +# define MIDR_IMPLEMENTER_SHIFT 24 +# define MIDR_IMPLEMENTER_MASK (0xffU << MIDR_IMPLEMENTER_SHIFT) +# define MIDR_IMPLEMENTER(midr) \ + 
(((midr) & MIDR_IMPLEMENTER_MASK) >> MIDR_IMPLEMENTER_SHIFT) + +# define MIDR_ARCHITECTURE_SHIFT 16 +# define MIDR_ARCHITECTURE_MASK (0xfU << MIDR_ARCHITECTURE_SHIFT) +# define MIDR_ARCHITECTURE(midr) \ + (((midr) & MIDR_ARCHITECTURE_MASK) >> MIDR_ARCHITECTURE_SHIFT) + +# define MIDR_CPU_MODEL_MASK \ + (MIDR_IMPLEMENTER_MASK | \ + MIDR_PARTNUM_MASK | \ + MIDR_ARCHITECTURE_MASK) + +# define MIDR_CPU_MODEL(imp, partnum) \ + (((imp) << MIDR_IMPLEMENTER_SHIFT) | \ + (0xfU << MIDR_ARCHITECTURE_SHIFT) | \ + ((partnum) << MIDR_PARTNUM_SHIFT)) + +# define MIDR_IS_CPU_MODEL(midr, imp, partnum) \ + (((midr) & MIDR_CPU_MODEL_MASK) == MIDR_CPU_MODEL(imp, partnum)) + +#if defined(__ASSEMBLER__) + + /* + * Support macros for + * - Armv8.3-A Pointer Authentication and + * - Armv8.5-A Branch Target Identification + * features which require emitting a .note.gnu.property section with the + * appropriate architecture-dependent feature bits set. + * Read more: "ELF for the Arm® 64-bit Architecture" + */ + +# if defined(__ARM_FEATURE_BTI_DEFAULT) && __ARM_FEATURE_BTI_DEFAULT == 1 +# define GNU_PROPERTY_AARCH64_BTI (1 << 0) /* Has Branch Target Identification */ +# define AARCH64_VALID_CALL_TARGET hint #34 /* BTI 'c' */ +# else +# define GNU_PROPERTY_AARCH64_BTI 0 /* No Branch Target Identification */ +# define AARCH64_VALID_CALL_TARGET +# endif + +# if defined(__ARM_FEATURE_PAC_DEFAULT) && \ + (__ARM_FEATURE_PAC_DEFAULT & 1) == 1 /* Signed with A-key */ +# define GNU_PROPERTY_AARCH64_POINTER_AUTH \ + (1 << 1) /* Has Pointer Authentication */ +# define AARCH64_SIGN_LINK_REGISTER hint #25 /* PACIASP */ +# define AARCH64_VALIDATE_LINK_REGISTER hint #29 /* AUTIASP */ +# elif defined(__ARM_FEATURE_PAC_DEFAULT) && \ + (__ARM_FEATURE_PAC_DEFAULT & 2) == 2 /* Signed with B-key */ +# define GNU_PROPERTY_AARCH64_POINTER_AUTH \ + (1 << 1) /* Has Pointer Authentication */ +# define AARCH64_SIGN_LINK_REGISTER hint #27 /* PACIBSP */ +# define AARCH64_VALIDATE_LINK_REGISTER hint #31 /* AUTIBSP */ +# else +# define GNU_PROPERTY_AARCH64_POINTER_AUTH 0 /* No Pointer Authentication */ +# if GNU_PROPERTY_AARCH64_BTI != 0 +# define AARCH64_SIGN_LINK_REGISTER AARCH64_VALID_CALL_TARGET +# else +# define AARCH64_SIGN_LINK_REGISTER +# endif +# define AARCH64_VALIDATE_LINK_REGISTER +# endif + +# if GNU_PROPERTY_AARCH64_POINTER_AUTH != 0 || GNU_PROPERTY_AARCH64_BTI != 0 + .pushsection .note.gnu.property, "a"; + .balign 8; + .long 4; + .long 0x10; + .long 0x5; + .asciz "GNU"; + .long 0xc0000000; /* GNU_PROPERTY_AARCH64_FEATURE_1_AND */ + .long 4; + .long (GNU_PROPERTY_AARCH64_POINTER_AUTH | GNU_PROPERTY_AARCH64_BTI); + .long 0; + .popsection; +# endif + +# endif /* defined __ASSEMBLER__ */ + +# define IS_CPU_SUPPORT_UNROLL8_EOR3() \ + (OPENSSL_armcap_P & ARMV8_UNROLL8_EOR3) + +#endif diff --git a/module/icp/asm-aarch64/modes/ghashv8-armx.S b/module/icp/asm-aarch64/modes/ghashv8-armx.S new file mode 100644 index 000000000000..918c66349eac --- /dev/null +++ b/module/icp/asm-aarch64/modes/ghashv8-armx.S @@ -0,0 +1,656 @@ +/* + * Copyright 2011-2024 The OpenSSL Project Authors. All Rights Reserved. + * + * Licensed under the Apache License 2.0 (the "License"). You may not use + * this file except in compliance with the License. 
You can obtain a copy + * in the file LICENSE in the source distribution or at + * https://www.openssl.org/source/license.html + */ + +#if defined(__aarch64__) +#include "arm_arch.h" + +.arch armv8-a+crypto +.text +.globl gcm_init_v8 +.type gcm_init_v8,%function +.align 4 +gcm_init_v8: + AARCH64_VALID_CALL_TARGET + ld1 {v17.2d},[x1] //load input H + movi v19.16b,#0xe1 + shl v19.2d,v19.2d,#57 //0xc2.0 + // KCF/ICP stores H in network byte order with the hi qword first + // so we need to swap all bytes, not the 2 qwords: + rev64 v17.16b,v17.16b + ext v3.16b,v17.16b,v17.16b,#8 + ushr v18.2d,v19.2d,#63 + dup v17.4s,v17.s[1] + ext v16.16b,v18.16b,v19.16b,#8 //t0=0xc2....01 + ushr v18.2d,v3.2d,#63 + sshr v17.4s,v17.4s,#31 //broadcast carry bit + and v18.16b,v18.16b,v16.16b + shl v3.2d,v3.2d,#1 + ext v18.16b,v18.16b,v18.16b,#8 + and v16.16b,v16.16b,v17.16b + orr v3.16b,v3.16b,v18.16b //H<<<=1 + eor v20.16b,v3.16b,v16.16b //twisted H + st1 {v20.2d},[x0],#16 //store Htable[0] + + //calculate H^2 + ext v16.16b,v20.16b,v20.16b,#8 //Karatsuba pre-processing + pmull v0.1q,v20.1d,v20.1d + eor v16.16b,v16.16b,v20.16b + pmull2 v2.1q,v20.2d,v20.2d + pmull v1.1q,v16.1d,v16.1d + + ext v17.16b,v0.16b,v2.16b,#8 //Karatsuba post-processing + eor v18.16b,v0.16b,v2.16b + eor v1.16b,v1.16b,v17.16b + eor v1.16b,v1.16b,v18.16b + pmull v18.1q,v0.1d,v19.1d //1st phase + + ins v2.d[0],v1.d[1] + ins v1.d[1],v0.d[0] + eor v0.16b,v1.16b,v18.16b + + ext v18.16b,v0.16b,v0.16b,#8 //2nd phase + pmull v0.1q,v0.1d,v19.1d + eor v18.16b,v18.16b,v2.16b + eor v22.16b,v0.16b,v18.16b + + ext v17.16b,v22.16b,v22.16b,#8 //Karatsuba pre-processing + eor v17.16b,v17.16b,v22.16b + ext v21.16b,v16.16b,v17.16b,#8 //pack Karatsuba pre-processed + st1 {v21.2d,v22.2d},[x0],#32 //store Htable[1..2] + //calculate H^3 and H^4 + pmull v0.1q,v20.1d, v22.1d + pmull v5.1q,v22.1d,v22.1d + pmull2 v2.1q,v20.2d, v22.2d + pmull2 v7.1q,v22.2d,v22.2d + pmull v1.1q,v16.1d,v17.1d + pmull v6.1q,v17.1d,v17.1d + + ext v16.16b,v0.16b,v2.16b,#8 //Karatsuba post-processing + ext v17.16b,v5.16b,v7.16b,#8 + eor v18.16b,v0.16b,v2.16b + eor v1.16b,v1.16b,v16.16b + eor v4.16b,v5.16b,v7.16b + eor v6.16b,v6.16b,v17.16b + eor v1.16b,v1.16b,v18.16b + pmull v18.1q,v0.1d,v19.1d //1st phase + eor v6.16b,v6.16b,v4.16b + pmull v4.1q,v5.1d,v19.1d + + ins v2.d[0],v1.d[1] + ins v7.d[0],v6.d[1] + ins v1.d[1],v0.d[0] + ins v6.d[1],v5.d[0] + eor v0.16b,v1.16b,v18.16b + eor v5.16b,v6.16b,v4.16b + + ext v18.16b,v0.16b,v0.16b,#8 //2nd phase + ext v4.16b,v5.16b,v5.16b,#8 + pmull v0.1q,v0.1d,v19.1d + pmull v5.1q,v5.1d,v19.1d + eor v18.16b,v18.16b,v2.16b + eor v4.16b,v4.16b,v7.16b + eor v23.16b, v0.16b,v18.16b //H^3 + eor v25.16b,v5.16b,v4.16b //H^4 + + ext v16.16b,v23.16b, v23.16b,#8 //Karatsuba pre-processing + ext v17.16b,v25.16b,v25.16b,#8 + ext v18.16b,v22.16b,v22.16b,#8 + eor v16.16b,v16.16b,v23.16b + eor v17.16b,v17.16b,v25.16b + eor v18.16b,v18.16b,v22.16b + ext v24.16b,v16.16b,v17.16b,#8 //pack Karatsuba pre-processed + st1 {v23.2d,v24.2d,v25.2d},[x0],#48 //store Htable[3..5] + + //calculate H^5 and H^6 + pmull v0.1q,v22.1d, v23.1d + pmull v5.1q,v23.1d,v23.1d + pmull2 v2.1q,v22.2d, v23.2d + pmull2 v7.1q,v23.2d,v23.2d + pmull v1.1q,v16.1d,v18.1d + pmull v6.1q,v16.1d,v16.1d + + ext v16.16b,v0.16b,v2.16b,#8 //Karatsuba post-processing + ext v17.16b,v5.16b,v7.16b,#8 + eor v18.16b,v0.16b,v2.16b + eor v1.16b,v1.16b,v16.16b + eor v4.16b,v5.16b,v7.16b + eor v6.16b,v6.16b,v17.16b + eor v1.16b,v1.16b,v18.16b + pmull v18.1q,v0.1d,v19.1d //1st phase + eor v6.16b,v6.16b,v4.16b + pmull 
v4.1q,v5.1d,v19.1d + + ins v2.d[0],v1.d[1] + ins v7.d[0],v6.d[1] + ins v1.d[1],v0.d[0] + ins v6.d[1],v5.d[0] + eor v0.16b,v1.16b,v18.16b + eor v5.16b,v6.16b,v4.16b + + ext v18.16b,v0.16b,v0.16b,#8 //2nd phase + ext v4.16b,v5.16b,v5.16b,#8 + pmull v0.1q,v0.1d,v19.1d + pmull v5.1q,v5.1d,v19.1d + eor v18.16b,v18.16b,v2.16b + eor v4.16b,v4.16b,v7.16b + eor v26.16b,v0.16b,v18.16b //H^5 + eor v28.16b,v5.16b,v4.16b //H^6 + + ext v16.16b,v26.16b, v26.16b,#8 //Karatsuba pre-processing + ext v17.16b,v28.16b,v28.16b,#8 + ext v18.16b,v22.16b,v22.16b,#8 + eor v16.16b,v16.16b,v26.16b + eor v17.16b,v17.16b,v28.16b + eor v18.16b,v18.16b,v22.16b + ext v27.16b,v16.16b,v17.16b,#8 //pack Karatsuba pre-processed + st1 {v26.2d,v27.2d,v28.2d},[x0],#48 //store Htable[6..8] + + //calculate H^7 and H^8 + pmull v0.1q,v22.1d,v26.1d + pmull v5.1q,v22.1d,v28.1d + pmull2 v2.1q,v22.2d,v26.2d + pmull2 v7.1q,v22.2d,v28.2d + pmull v1.1q,v16.1d,v18.1d + pmull v6.1q,v17.1d,v18.1d + + ext v16.16b,v0.16b,v2.16b,#8 //Karatsuba post-processing + ext v17.16b,v5.16b,v7.16b,#8 + eor v18.16b,v0.16b,v2.16b + eor v1.16b,v1.16b,v16.16b + eor v4.16b,v5.16b,v7.16b + eor v6.16b,v6.16b,v17.16b + eor v1.16b,v1.16b,v18.16b + pmull v18.1q,v0.1d,v19.1d //1st phase + eor v6.16b,v6.16b,v4.16b + pmull v4.1q,v5.1d,v19.1d + + ins v2.d[0],v1.d[1] + ins v7.d[0],v6.d[1] + ins v1.d[1],v0.d[0] + ins v6.d[1],v5.d[0] + eor v0.16b,v1.16b,v18.16b + eor v5.16b,v6.16b,v4.16b + + ext v18.16b,v0.16b,v0.16b,#8 //2nd phase + ext v4.16b,v5.16b,v5.16b,#8 + pmull v0.1q,v0.1d,v19.1d + pmull v5.1q,v5.1d,v19.1d + eor v18.16b,v18.16b,v2.16b + eor v4.16b,v4.16b,v7.16b + eor v29.16b,v0.16b,v18.16b //H^7 + eor v31.16b,v5.16b,v4.16b //H^8 + + ext v16.16b,v29.16b,v29.16b,#8 //Karatsuba pre-processing + ext v17.16b,v31.16b,v31.16b,#8 + eor v16.16b,v16.16b,v29.16b + eor v17.16b,v17.16b,v31.16b + ext v30.16b,v16.16b,v17.16b,#8 //pack Karatsuba pre-processed + st1 {v29.2d,v30.2d,v31.2d},[x0] //store Htable[9..11] + ret +.size gcm_init_v8,.-gcm_init_v8 +.globl gcm_gmult_v8 +.type gcm_gmult_v8,%function +.align 4 +gcm_gmult_v8: + AARCH64_VALID_CALL_TARGET + ld1 {v17.2d},[x0] //load Xi + movi v19.16b,#0xe1 + ld1 {v20.2d,v21.2d},[x1] //load twisted H, ... 
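+	//compose the 0xc2.0 reduction constant from 0xe1, as in gcm_ghash_v8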
+ shl v19.2d,v19.2d,#57 +#ifndef __AARCH64EB__ + rev64 v17.16b,v17.16b +#endif + ext v3.16b,v17.16b,v17.16b,#8 + + pmull v0.1q,v20.1d,v3.1d //H.lo·Xi.lo + eor v17.16b,v17.16b,v3.16b //Karatsuba pre-processing + pmull2 v2.1q,v20.2d,v3.2d //H.hi·Xi.hi + pmull v1.1q,v21.1d,v17.1d //(H.lo+H.hi)·(Xi.lo+Xi.hi) + + ext v17.16b,v0.16b,v2.16b,#8 //Karatsuba post-processing + eor v18.16b,v0.16b,v2.16b + eor v1.16b,v1.16b,v17.16b + eor v1.16b,v1.16b,v18.16b + pmull v18.1q,v0.1d,v19.1d //1st phase of reduction + + ins v2.d[0],v1.d[1] + ins v1.d[1],v0.d[0] + eor v0.16b,v1.16b,v18.16b + + ext v18.16b,v0.16b,v0.16b,#8 //2nd phase of reduction + pmull v0.1q,v0.1d,v19.1d + eor v18.16b,v18.16b,v2.16b + eor v0.16b,v0.16b,v18.16b + +#ifndef __AARCH64EB__ + rev64 v0.16b,v0.16b +#endif + ext v0.16b,v0.16b,v0.16b,#8 + st1 {v0.2d},[x0] //write out Xi + + ret +.size gcm_gmult_v8,.-gcm_gmult_v8 +.globl gcm_ghash_v8 +.type gcm_ghash_v8,%function +.align 4 +gcm_ghash_v8: + AARCH64_VALID_CALL_TARGET + cmp x3,#64 + b.hs .Lgcm_ghash_v8_4x + ld1 {v0.2d},[x0] //load [rotated] Xi + //"[rotated]" means that + //loaded value would have + //to be rotated in order to + //make it appear as in + //algorithm specification + subs x3,x3,#32 //see if x3 is 32 or larger + mov x12,#16 //x12 is used as post- + //increment for input pointer; + //as loop is modulo-scheduled + //x12 is zeroed just in time + //to preclude overstepping + //inp[len], which means that + //last block[s] are actually + //loaded twice, but last + //copy is not processed + ld1 {v20.2d,v21.2d},[x1],#32 //load twisted H, ..., H^2 + movi v19.16b,#0xe1 + ld1 {v22.2d},[x1] + csel x12,xzr,x12,eq //is it time to zero x12? + ext v0.16b,v0.16b,v0.16b,#8 //rotate Xi + ld1 {v16.2d},[x2],#16 //load [rotated] I[0] + shl v19.2d,v19.2d,#57 //compose 0xc2.0 constant +#ifndef __AARCH64EB__ + rev64 v16.16b,v16.16b + rev64 v0.16b,v0.16b +#endif + ext v3.16b,v16.16b,v16.16b,#8 //rotate I[0] + b.lo .Lodd_tail_v8 //x3 was less than 32 + ld1 {v17.2d},[x2],x12 //load [rotated] I[1] +#ifndef __AARCH64EB__ + rev64 v17.16b,v17.16b +#endif + ext v7.16b,v17.16b,v17.16b,#8 + eor v3.16b,v3.16b,v0.16b //I[i]^=Xi + pmull v4.1q,v20.1d,v7.1d //H·Ii+1 + eor v17.16b,v17.16b,v7.16b //Karatsuba pre-processing + pmull2 v6.1q,v20.2d,v7.2d + b .Loop_mod2x_v8 + +.align 4 +.Loop_mod2x_v8: + ext v18.16b,v3.16b,v3.16b,#8 + subs x3,x3,#32 //is there more data? + pmull v0.1q,v22.1d,v3.1d //H^2.lo·Xi.lo + csel x12,xzr,x12,lo //is it time to zero x12? + + pmull v5.1q,v21.1d,v17.1d + eor v18.16b,v18.16b,v3.16b //Karatsuba pre-processing + pmull2 v2.1q,v22.2d,v3.2d //H^2.hi·Xi.hi + eor v0.16b,v0.16b,v4.16b //accumulate + pmull2 v1.1q,v21.2d,v18.2d //(H^2.lo+H^2.hi)·(Xi.lo+Xi.hi) + ld1 {v16.2d},[x2],x12 //load [rotated] I[i+2] + + eor v2.16b,v2.16b,v6.16b + csel x12,xzr,x12,eq //is it time to zero x12? 
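+	//accumulate the middle Karatsuba term contributed by H·Ii+1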
+ eor v1.16b,v1.16b,v5.16b + + ext v17.16b,v0.16b,v2.16b,#8 //Karatsuba post-processing + eor v18.16b,v0.16b,v2.16b + eor v1.16b,v1.16b,v17.16b + ld1 {v17.2d},[x2],x12 //load [rotated] I[i+3] +#ifndef __AARCH64EB__ + rev64 v16.16b,v16.16b +#endif + eor v1.16b,v1.16b,v18.16b + pmull v18.1q,v0.1d,v19.1d //1st phase of reduction + +#ifndef __AARCH64EB__ + rev64 v17.16b,v17.16b +#endif + ins v2.d[0],v1.d[1] + ins v1.d[1],v0.d[0] + ext v7.16b,v17.16b,v17.16b,#8 + ext v3.16b,v16.16b,v16.16b,#8 + eor v0.16b,v1.16b,v18.16b + pmull v4.1q,v20.1d,v7.1d //H·Ii+1 + eor v3.16b,v3.16b,v2.16b //accumulate v3.16b early + + ext v18.16b,v0.16b,v0.16b,#8 //2nd phase of reduction + pmull v0.1q,v0.1d,v19.1d + eor v3.16b,v3.16b,v18.16b + eor v17.16b,v17.16b,v7.16b //Karatsuba pre-processing + eor v3.16b,v3.16b,v0.16b + pmull2 v6.1q,v20.2d,v7.2d + b.hs .Loop_mod2x_v8 //there was at least 32 more bytes + + eor v2.16b,v2.16b,v18.16b + ext v3.16b,v16.16b,v16.16b,#8 //re-construct v3.16b + adds x3,x3,#32 //re-construct x3 + eor v0.16b,v0.16b,v2.16b //re-construct v0.16b + b.eq .Ldone_v8 //is x3 zero? +.Lodd_tail_v8: + ext v18.16b,v0.16b,v0.16b,#8 + eor v3.16b,v3.16b,v0.16b //inp^=Xi + eor v17.16b,v16.16b,v18.16b //v17.16b is rotated inp^Xi + + pmull v0.1q,v20.1d,v3.1d //H.lo·Xi.lo + eor v17.16b,v17.16b,v3.16b //Karatsuba pre-processing + pmull2 v2.1q,v20.2d,v3.2d //H.hi·Xi.hi + pmull v1.1q,v21.1d,v17.1d //(H.lo+H.hi)·(Xi.lo+Xi.hi) + + ext v17.16b,v0.16b,v2.16b,#8 //Karatsuba post-processing + eor v18.16b,v0.16b,v2.16b + eor v1.16b,v1.16b,v17.16b + eor v1.16b,v1.16b,v18.16b + pmull v18.1q,v0.1d,v19.1d //1st phase of reduction + + ins v2.d[0],v1.d[1] + ins v1.d[1],v0.d[0] + eor v0.16b,v1.16b,v18.16b + + ext v18.16b,v0.16b,v0.16b,#8 //2nd phase of reduction + pmull v0.1q,v0.1d,v19.1d + eor v18.16b,v18.16b,v2.16b + eor v0.16b,v0.16b,v18.16b + +.Ldone_v8: +#ifndef __AARCH64EB__ + rev64 v0.16b,v0.16b +#endif + ext v0.16b,v0.16b,v0.16b,#8 + st1 {v0.2d},[x0] //write out Xi + + ret +.size gcm_ghash_v8,.-gcm_ghash_v8 +.type gcm_ghash_v8_4x,%function +.align 4 +gcm_ghash_v8_4x: +.Lgcm_ghash_v8_4x: + ld1 {v0.2d},[x0] //load [rotated] Xi + ld1 {v20.2d,v21.2d,v22.2d},[x1],#48 //load twisted H, ..., H^2 + movi v19.16b,#0xe1 + ld1 {v26.2d,v27.2d,v28.2d},[x1] //load twisted H^3, ..., H^4 + shl v19.2d,v19.2d,#57 //compose 0xc2.0 constant + + ld1 {v4.2d,v5.2d,v6.2d,v7.2d},[x2],#64 +#ifndef __AARCH64EB__ + rev64 v0.16b,v0.16b + rev64 v5.16b,v5.16b + rev64 v6.16b,v6.16b + rev64 v7.16b,v7.16b + rev64 v4.16b,v4.16b +#endif + ext v25.16b,v7.16b,v7.16b,#8 + ext v24.16b,v6.16b,v6.16b,#8 + ext v23.16b,v5.16b,v5.16b,#8 + + pmull v29.1q,v20.1d,v25.1d //H·Ii+3 + eor v7.16b,v7.16b,v25.16b + pmull2 v31.1q,v20.2d,v25.2d + pmull v30.1q,v21.1d,v7.1d + + pmull v16.1q,v22.1d,v24.1d //H^2·Ii+2 + eor v6.16b,v6.16b,v24.16b + pmull2 v24.1q,v22.2d,v24.2d + pmull2 v6.1q,v21.2d,v6.2d + + eor v29.16b,v29.16b,v16.16b + eor v31.16b,v31.16b,v24.16b + eor v30.16b,v30.16b,v6.16b + + pmull v7.1q,v26.1d,v23.1d //H^3·Ii+1 + eor v5.16b,v5.16b,v23.16b + pmull2 v23.1q,v26.2d,v23.2d + pmull v5.1q,v27.1d,v5.1d + + eor v29.16b,v29.16b,v7.16b + eor v31.16b,v31.16b,v23.16b + eor v30.16b,v30.16b,v5.16b + + subs x3,x3,#128 + b.lo .Ltail4x + + b .Loop4x + +.align 4 +.Loop4x: + eor v16.16b,v4.16b,v0.16b + ld1 {v4.2d,v5.2d,v6.2d,v7.2d},[x2],#64 + ext v3.16b,v16.16b,v16.16b,#8 +#ifndef __AARCH64EB__ + rev64 v5.16b,v5.16b + rev64 v6.16b,v6.16b + rev64 v7.16b,v7.16b + rev64 v4.16b,v4.16b +#endif + + pmull v0.1q,v28.1d,v3.1d //H^4·(Xi+Ii) + eor v16.16b,v16.16b,v3.16b + pmull2 
v2.1q,v28.2d,v3.2d + ext v25.16b,v7.16b,v7.16b,#8 + pmull2 v1.1q,v27.2d,v16.2d + + eor v0.16b,v0.16b,v29.16b + eor v2.16b,v2.16b,v31.16b + ext v24.16b,v6.16b,v6.16b,#8 + eor v1.16b,v1.16b,v30.16b + ext v23.16b,v5.16b,v5.16b,#8 + + ext v17.16b,v0.16b,v2.16b,#8 //Karatsuba post-processing + eor v18.16b,v0.16b,v2.16b + pmull v29.1q,v20.1d,v25.1d //H·Ii+3 + eor v7.16b,v7.16b,v25.16b + eor v1.16b,v1.16b,v17.16b + pmull2 v31.1q,v20.2d,v25.2d + eor v1.16b,v1.16b,v18.16b + pmull v30.1q,v21.1d,v7.1d + + pmull v18.1q,v0.1d,v19.1d //1st phase of reduction + ins v2.d[0],v1.d[1] + ins v1.d[1],v0.d[0] + pmull v16.1q,v22.1d,v24.1d //H^2·Ii+2 + eor v6.16b,v6.16b,v24.16b + pmull2 v24.1q,v22.2d,v24.2d + eor v0.16b,v1.16b,v18.16b + pmull2 v6.1q,v21.2d,v6.2d + + eor v29.16b,v29.16b,v16.16b + eor v31.16b,v31.16b,v24.16b + eor v30.16b,v30.16b,v6.16b + + ext v18.16b,v0.16b,v0.16b,#8 //2nd phase of reduction + pmull v0.1q,v0.1d,v19.1d + pmull v7.1q,v26.1d,v23.1d //H^3·Ii+1 + eor v5.16b,v5.16b,v23.16b + eor v18.16b,v18.16b,v2.16b + pmull2 v23.1q,v26.2d,v23.2d + pmull v5.1q,v27.1d,v5.1d + + eor v0.16b,v0.16b,v18.16b + eor v29.16b,v29.16b,v7.16b + eor v31.16b,v31.16b,v23.16b + ext v0.16b,v0.16b,v0.16b,#8 + eor v30.16b,v30.16b,v5.16b + + subs x3,x3,#64 + b.hs .Loop4x + +.Ltail4x: + eor v16.16b,v4.16b,v0.16b + ext v3.16b,v16.16b,v16.16b,#8 + + pmull v0.1q,v28.1d,v3.1d //H^4·(Xi+Ii) + eor v16.16b,v16.16b,v3.16b + pmull2 v2.1q,v28.2d,v3.2d + pmull2 v1.1q,v27.2d,v16.2d + + eor v0.16b,v0.16b,v29.16b + eor v2.16b,v2.16b,v31.16b + eor v1.16b,v1.16b,v30.16b + + adds x3,x3,#64 + b.eq .Ldone4x + + cmp x3,#32 + b.lo .Lone + b.eq .Ltwo +.Lthree: + ext v17.16b,v0.16b,v2.16b,#8 //Karatsuba post-processing + eor v18.16b,v0.16b,v2.16b + eor v1.16b,v1.16b,v17.16b + ld1 {v4.2d,v5.2d,v6.2d},[x2] + eor v1.16b,v1.16b,v18.16b +#ifndef __AARCH64EB__ + rev64 v5.16b,v5.16b + rev64 v6.16b,v6.16b + rev64 v4.16b,v4.16b +#endif + + pmull v18.1q,v0.1d,v19.1d //1st phase of reduction + ins v2.d[0],v1.d[1] + ins v1.d[1],v0.d[0] + ext v24.16b,v6.16b,v6.16b,#8 + ext v23.16b,v5.16b,v5.16b,#8 + eor v0.16b,v1.16b,v18.16b + + pmull v29.1q,v20.1d,v24.1d //H·Ii+2 + eor v6.16b,v6.16b,v24.16b + + ext v18.16b,v0.16b,v0.16b,#8 //2nd phase of reduction + pmull v0.1q,v0.1d,v19.1d + eor v18.16b,v18.16b,v2.16b + pmull2 v31.1q,v20.2d,v24.2d + pmull v30.1q,v21.1d,v6.1d + eor v0.16b,v0.16b,v18.16b + pmull v7.1q,v22.1d,v23.1d //H^2·Ii+1 + eor v5.16b,v5.16b,v23.16b + ext v0.16b,v0.16b,v0.16b,#8 + + pmull2 v23.1q,v22.2d,v23.2d + eor v16.16b,v4.16b,v0.16b + pmull2 v5.1q,v21.2d,v5.2d + ext v3.16b,v16.16b,v16.16b,#8 + + eor v29.16b,v29.16b,v7.16b + eor v31.16b,v31.16b,v23.16b + eor v30.16b,v30.16b,v5.16b + + pmull v0.1q,v26.1d,v3.1d //H^3·(Xi+Ii) + eor v16.16b,v16.16b,v3.16b + pmull2 v2.1q,v26.2d,v3.2d + pmull v1.1q,v27.1d,v16.1d + + eor v0.16b,v0.16b,v29.16b + eor v2.16b,v2.16b,v31.16b + eor v1.16b,v1.16b,v30.16b + b .Ldone4x + +.align 4 +.Ltwo: + ext v17.16b,v0.16b,v2.16b,#8 //Karatsuba post-processing + eor v18.16b,v0.16b,v2.16b + eor v1.16b,v1.16b,v17.16b + ld1 {v4.2d,v5.2d},[x2] + eor v1.16b,v1.16b,v18.16b +#ifndef __AARCH64EB__ + rev64 v5.16b,v5.16b + rev64 v4.16b,v4.16b +#endif + + pmull v18.1q,v0.1d,v19.1d //1st phase of reduction + ins v2.d[0],v1.d[1] + ins v1.d[1],v0.d[0] + ext v23.16b,v5.16b,v5.16b,#8 + eor v0.16b,v1.16b,v18.16b + + ext v18.16b,v0.16b,v0.16b,#8 //2nd phase of reduction + pmull v0.1q,v0.1d,v19.1d + eor v18.16b,v18.16b,v2.16b + eor v0.16b,v0.16b,v18.16b + ext v0.16b,v0.16b,v0.16b,#8 + + pmull v29.1q,v20.1d,v23.1d //H·Ii+1 + eor 
v5.16b,v5.16b,v23.16b + + eor v16.16b,v4.16b,v0.16b + ext v3.16b,v16.16b,v16.16b,#8 + + pmull2 v31.1q,v20.2d,v23.2d + pmull v30.1q,v21.1d,v5.1d + + pmull v0.1q,v22.1d,v3.1d //H^2·(Xi+Ii) + eor v16.16b,v16.16b,v3.16b + pmull2 v2.1q,v22.2d,v3.2d + pmull2 v1.1q,v21.2d,v16.2d + + eor v0.16b,v0.16b,v29.16b + eor v2.16b,v2.16b,v31.16b + eor v1.16b,v1.16b,v30.16b + b .Ldone4x + +.align 4 +.Lone: + ext v17.16b,v0.16b,v2.16b,#8 //Karatsuba post-processing + eor v18.16b,v0.16b,v2.16b + eor v1.16b,v1.16b,v17.16b + ld1 {v4.2d},[x2] + eor v1.16b,v1.16b,v18.16b +#ifndef __AARCH64EB__ + rev64 v4.16b,v4.16b +#endif + + pmull v18.1q,v0.1d,v19.1d //1st phase of reduction + ins v2.d[0],v1.d[1] + ins v1.d[1],v0.d[0] + eor v0.16b,v1.16b,v18.16b + + ext v18.16b,v0.16b,v0.16b,#8 //2nd phase of reduction + pmull v0.1q,v0.1d,v19.1d + eor v18.16b,v18.16b,v2.16b + eor v0.16b,v0.16b,v18.16b + ext v0.16b,v0.16b,v0.16b,#8 + + eor v16.16b,v4.16b,v0.16b + ext v3.16b,v16.16b,v16.16b,#8 + + pmull v0.1q,v20.1d,v3.1d + eor v16.16b,v16.16b,v3.16b + pmull2 v2.1q,v20.2d,v3.2d + pmull v1.1q,v21.1d,v16.1d + +.Ldone4x: + ext v17.16b,v0.16b,v2.16b,#8 //Karatsuba post-processing + eor v18.16b,v0.16b,v2.16b + eor v1.16b,v1.16b,v17.16b + eor v1.16b,v1.16b,v18.16b + + pmull v18.1q,v0.1d,v19.1d //1st phase of reduction + ins v2.d[0],v1.d[1] + ins v1.d[1],v0.d[0] + eor v0.16b,v1.16b,v18.16b + + ext v18.16b,v0.16b,v0.16b,#8 //2nd phase of reduction + pmull v0.1q,v0.1d,v19.1d + eor v18.16b,v18.16b,v2.16b + eor v0.16b,v0.16b,v18.16b + ext v0.16b,v0.16b,v0.16b,#8 + +#ifndef __AARCH64EB__ + rev64 v0.16b,v0.16b +#endif + st1 {v0.2d},[x0] //write out Xi + + ret +.size gcm_ghash_v8_4x,.-gcm_ghash_v8_4x +.byte 71,72,65,83,72,32,102,111,114,32,65,82,77,118,56,44,32,67,82,89,80,84,79,71,65,77,83,32,98,121,32,60,97,112,112,114,111,64,111,112,101,110,115,115,108,46,111,114,103,62,0 +.align 2 +.align 2 + +#endif \ No newline at end of file diff --git a/module/icp/include/aes/aes_impl.h b/module/icp/include/aes/aes_impl.h index d26ced58ff1e..7d620249f872 100644 --- a/module/icp/include/aes/aes_impl.h +++ b/module/icp/include/aes/aes_impl.h @@ -198,6 +198,9 @@ extern ASMABI void aes_decrypt_amd64(const uint32_t rk[], int Nr, #if defined(__x86_64) && defined(HAVE_AES) extern const aes_impl_ops_t aes_aesni_impl; #endif +#if defined(__aarch64__) && defined(HAVE_ARM_AES) +extern const aes_impl_ops_t aes_armv8_crypto_impl; +#endif /* * Initializes fastest implementation diff --git a/module/icp/include/modes/modes.h b/module/icp/include/modes/modes.h index daa0335b5c3b..5d17c62a68b6 100644 --- a/module/icp/include/modes/modes.h +++ b/module/icp/include/modes/modes.h @@ -42,7 +42,15 @@ extern "C" { #if defined(__x86_64__) && defined(HAVE_AVX) && \ defined(HAVE_AES) && defined(HAVE_PCLMULQDQ) #define CAN_USE_GCM_ASM -extern boolean_t gcm_avx_can_use_movbe; +#endif + +#if defined(__aarch64__) && defined(HAVE_ARM_AES) && \ + (defined(HAVE_KERNEL_NEON) || defined(HAVE_KERNEL_FPU_INTERNAL)) +#define CAN_USE_GCM_ASM +#endif + +#if defined(CAN_USE_GCM_ASM) +extern boolean_t gcm_hardware_can_use_movbe; #endif #define CCM_MODE 0x00000010 @@ -173,7 +181,7 @@ typedef struct gcm_ctx { uint64_t gcm_len_a_len_c[2]; uint8_t *gcm_pt_buf; #ifdef CAN_USE_GCM_ASM - boolean_t gcm_use_avx; + boolean_t gcm_use_hardware; #endif } gcm_ctx_t; diff --git a/module/zcommon/simd_stat.c b/module/zcommon/simd_stat.c index 33c15140cdb9..e669b56074cf 100644 --- a/module/zcommon/simd_stat.c +++ b/module/zcommon/simd_stat.c @@ -139,6 +139,10 @@ simd_stat_kstat_data(char *buf, size_t size, void 
*data) off += SIMD_STAT_PRINT(simd_stat_kstat_payload, "sha256", zfs_sha256_available()); #if defined(__aarch64__) + off += SIMD_STAT_PRINT(simd_stat_kstat_payload, + "aes", zfs_aes_available()); + off += SIMD_STAT_PRINT(simd_stat_kstat_payload, + "pmull", zfs_pmull_available()); /* * This technically can exist on 32b ARM but we don't * define hooks to check for it and I didn't want to