From 52cebb1f4513b9eafca832da559513670de0a295 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Tarek=20Ziad=C3=A9?= Date: Mon, 2 Dec 2024 10:09:49 +0100 Subject: [PATCH] added a firefox matmul backend --- build.sh | 15 +- cmake/onnxruntime_webassembly.cmake | 1 + include/onnxruntime/gemmology.h | 1390 ++++++++ include/onnxruntime/gemmology_fwd.h | 282 ++ .../arch/generic/xsimd_generic_arithmetic.hpp | 241 ++ .../arch/generic/xsimd_generic_complex.hpp | 108 + .../arch/generic/xsimd_generic_details.hpp | 316 ++ .../arch/generic/xsimd_generic_logical.hpp | 208 ++ .../xsimd/arch/generic/xsimd_generic_math.hpp | 2499 +++++++++++++++ .../arch/generic/xsimd_generic_memory.hpp | 672 ++++ .../arch/generic/xsimd_generic_rounding.hpp | 72 + .../arch/generic/xsimd_generic_trigo.hpp | 969 ++++++ include/onnxruntime/xsimd/arch/xsimd_avx.hpp | 1820 +++++++++++ include/onnxruntime/xsimd/arch/xsimd_avx2.hpp | 1021 ++++++ .../onnxruntime/xsimd/arch/xsimd_avx512bw.hpp | 701 ++++ .../onnxruntime/xsimd/arch/xsimd_avx512cd.hpp | 28 + .../onnxruntime/xsimd/arch/xsimd_avx512dq.hpp | 212 ++ .../onnxruntime/xsimd/arch/xsimd_avx512er.hpp | 20 + .../onnxruntime/xsimd/arch/xsimd_avx512f.hpp | 2167 +++++++++++++ .../xsimd/arch/xsimd_avx512ifma.hpp | 20 + .../onnxruntime/xsimd/arch/xsimd_avx512pf.hpp | 20 + .../xsimd/arch/xsimd_avx512vbmi.hpp | 20 + .../xsimd/arch/xsimd_avx512vnni_avx512bw.hpp | 20 + .../arch/xsimd_avx512vnni_avx512vbmi.hpp | 20 + .../onnxruntime/xsimd/arch/xsimd_avxvnni.hpp | 20 + .../xsimd/arch/xsimd_constants.hpp | 391 +++ .../onnxruntime/xsimd/arch/xsimd_emulated.hpp | 771 +++++ .../onnxruntime/xsimd/arch/xsimd_fma3_avx.hpp | 80 + .../xsimd/arch/xsimd_fma3_avx2.hpp | 46 + .../onnxruntime/xsimd/arch/xsimd_fma3_sse.hpp | 79 + include/onnxruntime/xsimd/arch/xsimd_fma4.hpp | 79 + .../onnxruntime/xsimd/arch/xsimd_generic.hpp | 23 + .../xsimd/arch/xsimd_generic_fwd.hpp | 44 + .../xsimd/arch/xsimd_i8mm_neon64.hpp | 17 + include/onnxruntime/xsimd/arch/xsimd_isa.hpp | 130 + include/onnxruntime/xsimd/arch/xsimd_neon.hpp | 2813 +++++++++++++++++ .../onnxruntime/xsimd/arch/xsimd_neon64.hpp | 1536 +++++++++ include/onnxruntime/xsimd/arch/xsimd_rvv.hpp | 1500 +++++++++ .../onnxruntime/xsimd/arch/xsimd_scalar.hpp | 1223 +++++++ include/onnxruntime/xsimd/arch/xsimd_sse2.hpp | 1763 +++++++++++ include/onnxruntime/xsimd/arch/xsimd_sse3.hpp | 64 + .../onnxruntime/xsimd/arch/xsimd_sse4_1.hpp | 339 ++ .../onnxruntime/xsimd/arch/xsimd_sse4_2.hpp | 44 + .../onnxruntime/xsimd/arch/xsimd_ssse3.hpp | 175 + include/onnxruntime/xsimd/arch/xsimd_sve.hpp | 1148 +++++++ include/onnxruntime/xsimd/arch/xsimd_wasm.hpp | 1703 ++++++++++ .../onnxruntime/xsimd/config/xsimd_arch.hpp | 238 ++ .../onnxruntime/xsimd/config/xsimd_config.hpp | 462 +++ .../onnxruntime/xsimd/config/xsimd_cpuid.hpp | 262 ++ .../onnxruntime/xsimd/config/xsimd_inline.hpp | 23 + .../onnxruntime/xsimd/math/xsimd_rem_pio2.hpp | 719 +++++ .../xsimd/memory/xsimd_aligned_allocator.hpp | 349 ++ .../xsimd/memory/xsimd_alignment.hpp | 91 + .../xsimd/types/xsimd_all_registers.hpp | 52 + include/onnxruntime/xsimd/types/xsimd_api.hpp | 2700 ++++++++++++++++ .../xsimd/types/xsimd_avx2_register.hpp | 39 + .../xsimd/types/xsimd_avx512bw_register.hpp | 47 + .../xsimd/types/xsimd_avx512cd_register.hpp | 47 + .../xsimd/types/xsimd_avx512dq_register.hpp | 47 + .../xsimd/types/xsimd_avx512er_register.hpp | 47 + .../xsimd/types/xsimd_avx512f_register.hpp | 73 + .../xsimd/types/xsimd_avx512ifma_register.hpp | 47 + .../xsimd/types/xsimd_avx512pf_register.hpp | 47 + 
.../xsimd/types/xsimd_avx512vbmi_register.hpp | 47 + .../xsimd_avx512vnni_avx512bw_register.hpp | 50 + .../xsimd_avx512vnni_avx512vbmi_register.hpp | 50 + .../xsimd/types/xsimd_avx_register.hpp | 60 + .../xsimd/types/xsimd_avxvnni_register.hpp | 39 + .../onnxruntime/xsimd/types/xsimd_batch.hpp | 1492 +++++++++ .../xsimd/types/xsimd_batch_constant.hpp | 300 ++ .../xsimd/types/xsimd_emulated_register.hpp | 80 + .../xsimd/types/xsimd_fma3_avx2_register.hpp | 45 + .../xsimd/types/xsimd_fma3_avx_register.hpp | 45 + .../xsimd/types/xsimd_fma3_sse_register.hpp | 45 + .../xsimd/types/xsimd_fma4_register.hpp | 41 + .../xsimd/types/xsimd_generic_arch.hpp | 47 + .../types/xsimd_i8mm_neon64_register.hpp | 50 + .../xsimd/types/xsimd_neon64_register.hpp | 51 + .../xsimd/types/xsimd_neon_register.hpp | 154 + .../xsimd/types/xsimd_register.hpp | 94 + .../xsimd/types/xsimd_rvv_register.hpp | 497 +++ .../xsimd/types/xsimd_sse2_register.hpp | 59 + .../xsimd/types/xsimd_sse3_register.hpp | 44 + .../xsimd/types/xsimd_sse4_1_register.hpp | 43 + .../xsimd/types/xsimd_sse4_2_register.hpp | 43 + .../xsimd/types/xsimd_ssse3_register.hpp | 43 + .../xsimd/types/xsimd_sve_register.hpp | 156 + .../onnxruntime/xsimd/types/xsimd_traits.hpp | 324 ++ .../onnxruntime/xsimd/types/xsimd_utils.hpp | 530 ++++ .../xsimd/types/xsimd_wasm_register.hpp | 59 + include/onnxruntime/xsimd/xsimd.hpp | 69 + .../contrib_ops/cpu/cpu_contrib_kernels.cc | 6 +- .../quantization/firefox_matmul_integer.cc | 236 ++ .../cpu/quantization/firefox_matmul_integer.h | 309 ++ onnxruntime/core/framework/session_state.cc | 3 + .../core/graph/contrib_ops/contrib_defs.cc | 51 + onnxruntime/core/graph/contrib_ops/ms_opset.h | 2 + .../firefox_matmul_integer_test.cc | 50 + .../test/framework/inference_session_test.cc | 3 + onnxruntime/wasm/pre-jsep.js | 60 +- onnxruntime/wasm/pre.js | 91 +- 101 files changed, 37449 insertions(+), 39 deletions(-) create mode 100644 include/onnxruntime/gemmology.h create mode 100644 include/onnxruntime/gemmology_fwd.h create mode 100644 include/onnxruntime/xsimd/arch/generic/xsimd_generic_arithmetic.hpp create mode 100644 include/onnxruntime/xsimd/arch/generic/xsimd_generic_complex.hpp create mode 100644 include/onnxruntime/xsimd/arch/generic/xsimd_generic_details.hpp create mode 100644 include/onnxruntime/xsimd/arch/generic/xsimd_generic_logical.hpp create mode 100644 include/onnxruntime/xsimd/arch/generic/xsimd_generic_math.hpp create mode 100644 include/onnxruntime/xsimd/arch/generic/xsimd_generic_memory.hpp create mode 100644 include/onnxruntime/xsimd/arch/generic/xsimd_generic_rounding.hpp create mode 100644 include/onnxruntime/xsimd/arch/generic/xsimd_generic_trigo.hpp create mode 100644 include/onnxruntime/xsimd/arch/xsimd_avx.hpp create mode 100644 include/onnxruntime/xsimd/arch/xsimd_avx2.hpp create mode 100644 include/onnxruntime/xsimd/arch/xsimd_avx512bw.hpp create mode 100644 include/onnxruntime/xsimd/arch/xsimd_avx512cd.hpp create mode 100644 include/onnxruntime/xsimd/arch/xsimd_avx512dq.hpp create mode 100644 include/onnxruntime/xsimd/arch/xsimd_avx512er.hpp create mode 100644 include/onnxruntime/xsimd/arch/xsimd_avx512f.hpp create mode 100644 include/onnxruntime/xsimd/arch/xsimd_avx512ifma.hpp create mode 100644 include/onnxruntime/xsimd/arch/xsimd_avx512pf.hpp create mode 100644 include/onnxruntime/xsimd/arch/xsimd_avx512vbmi.hpp create mode 100644 include/onnxruntime/xsimd/arch/xsimd_avx512vnni_avx512bw.hpp create mode 100644 include/onnxruntime/xsimd/arch/xsimd_avx512vnni_avx512vbmi.hpp create mode 100644 
include/onnxruntime/xsimd/arch/xsimd_avxvnni.hpp create mode 100644 include/onnxruntime/xsimd/arch/xsimd_constants.hpp create mode 100644 include/onnxruntime/xsimd/arch/xsimd_emulated.hpp create mode 100644 include/onnxruntime/xsimd/arch/xsimd_fma3_avx.hpp create mode 100644 include/onnxruntime/xsimd/arch/xsimd_fma3_avx2.hpp create mode 100644 include/onnxruntime/xsimd/arch/xsimd_fma3_sse.hpp create mode 100644 include/onnxruntime/xsimd/arch/xsimd_fma4.hpp create mode 100644 include/onnxruntime/xsimd/arch/xsimd_generic.hpp create mode 100644 include/onnxruntime/xsimd/arch/xsimd_generic_fwd.hpp create mode 100644 include/onnxruntime/xsimd/arch/xsimd_i8mm_neon64.hpp create mode 100644 include/onnxruntime/xsimd/arch/xsimd_isa.hpp create mode 100644 include/onnxruntime/xsimd/arch/xsimd_neon.hpp create mode 100644 include/onnxruntime/xsimd/arch/xsimd_neon64.hpp create mode 100644 include/onnxruntime/xsimd/arch/xsimd_rvv.hpp create mode 100644 include/onnxruntime/xsimd/arch/xsimd_scalar.hpp create mode 100644 include/onnxruntime/xsimd/arch/xsimd_sse2.hpp create mode 100644 include/onnxruntime/xsimd/arch/xsimd_sse3.hpp create mode 100644 include/onnxruntime/xsimd/arch/xsimd_sse4_1.hpp create mode 100644 include/onnxruntime/xsimd/arch/xsimd_sse4_2.hpp create mode 100644 include/onnxruntime/xsimd/arch/xsimd_ssse3.hpp create mode 100644 include/onnxruntime/xsimd/arch/xsimd_sve.hpp create mode 100644 include/onnxruntime/xsimd/arch/xsimd_wasm.hpp create mode 100644 include/onnxruntime/xsimd/config/xsimd_arch.hpp create mode 100644 include/onnxruntime/xsimd/config/xsimd_config.hpp create mode 100644 include/onnxruntime/xsimd/config/xsimd_cpuid.hpp create mode 100644 include/onnxruntime/xsimd/config/xsimd_inline.hpp create mode 100644 include/onnxruntime/xsimd/math/xsimd_rem_pio2.hpp create mode 100644 include/onnxruntime/xsimd/memory/xsimd_aligned_allocator.hpp create mode 100644 include/onnxruntime/xsimd/memory/xsimd_alignment.hpp create mode 100644 include/onnxruntime/xsimd/types/xsimd_all_registers.hpp create mode 100644 include/onnxruntime/xsimd/types/xsimd_api.hpp create mode 100644 include/onnxruntime/xsimd/types/xsimd_avx2_register.hpp create mode 100644 include/onnxruntime/xsimd/types/xsimd_avx512bw_register.hpp create mode 100644 include/onnxruntime/xsimd/types/xsimd_avx512cd_register.hpp create mode 100644 include/onnxruntime/xsimd/types/xsimd_avx512dq_register.hpp create mode 100644 include/onnxruntime/xsimd/types/xsimd_avx512er_register.hpp create mode 100644 include/onnxruntime/xsimd/types/xsimd_avx512f_register.hpp create mode 100644 include/onnxruntime/xsimd/types/xsimd_avx512ifma_register.hpp create mode 100644 include/onnxruntime/xsimd/types/xsimd_avx512pf_register.hpp create mode 100644 include/onnxruntime/xsimd/types/xsimd_avx512vbmi_register.hpp create mode 100644 include/onnxruntime/xsimd/types/xsimd_avx512vnni_avx512bw_register.hpp create mode 100644 include/onnxruntime/xsimd/types/xsimd_avx512vnni_avx512vbmi_register.hpp create mode 100644 include/onnxruntime/xsimd/types/xsimd_avx_register.hpp create mode 100644 include/onnxruntime/xsimd/types/xsimd_avxvnni_register.hpp create mode 100644 include/onnxruntime/xsimd/types/xsimd_batch.hpp create mode 100644 include/onnxruntime/xsimd/types/xsimd_batch_constant.hpp create mode 100644 include/onnxruntime/xsimd/types/xsimd_emulated_register.hpp create mode 100644 include/onnxruntime/xsimd/types/xsimd_fma3_avx2_register.hpp create mode 100644 include/onnxruntime/xsimd/types/xsimd_fma3_avx_register.hpp create mode 100644 
include/onnxruntime/xsimd/types/xsimd_fma3_sse_register.hpp create mode 100644 include/onnxruntime/xsimd/types/xsimd_fma4_register.hpp create mode 100644 include/onnxruntime/xsimd/types/xsimd_generic_arch.hpp create mode 100644 include/onnxruntime/xsimd/types/xsimd_i8mm_neon64_register.hpp create mode 100644 include/onnxruntime/xsimd/types/xsimd_neon64_register.hpp create mode 100644 include/onnxruntime/xsimd/types/xsimd_neon_register.hpp create mode 100644 include/onnxruntime/xsimd/types/xsimd_register.hpp create mode 100644 include/onnxruntime/xsimd/types/xsimd_rvv_register.hpp create mode 100644 include/onnxruntime/xsimd/types/xsimd_sse2_register.hpp create mode 100644 include/onnxruntime/xsimd/types/xsimd_sse3_register.hpp create mode 100644 include/onnxruntime/xsimd/types/xsimd_sse4_1_register.hpp create mode 100644 include/onnxruntime/xsimd/types/xsimd_sse4_2_register.hpp create mode 100644 include/onnxruntime/xsimd/types/xsimd_ssse3_register.hpp create mode 100644 include/onnxruntime/xsimd/types/xsimd_sve_register.hpp create mode 100644 include/onnxruntime/xsimd/types/xsimd_traits.hpp create mode 100644 include/onnxruntime/xsimd/types/xsimd_utils.hpp create mode 100644 include/onnxruntime/xsimd/types/xsimd_wasm_register.hpp create mode 100644 include/onnxruntime/xsimd/xsimd.hpp create mode 100644 onnxruntime/contrib_ops/cpu/quantization/firefox_matmul_integer.cc create mode 100644 onnxruntime/contrib_ops/cpu/quantization/firefox_matmul_integer.h create mode 100644 onnxruntime/test/contrib_ops/firefox_matmul_integer_test.cc diff --git a/build.sh b/build.sh index bf799ac8b7211..0b293effe6330 100755 --- a/build.sh +++ b/build.sh @@ -1,21 +1,24 @@ #!/bin/bash # Copyright (c) Microsoft Corporation. All rights reserved. # Licensed under the MIT License. 
+set -ex # Get directory this script is in -DIR="$( cd "$( dirname "${BASH_SOURCE[0]}" )" && pwd )" +DIR="$(cd "$(dirname "${BASH_SOURCE[0]}")" && pwd)" OS=$(uname -s) if [ "$OS" = "Darwin" ]; then - DIR_OS="MacOS" + DIR_OS="MacOS" else - DIR_OS="Linux" + DIR_OS="Linux" fi if [[ "$*" == *"--ios"* ]]; then - DIR_OS="iOS" + DIR_OS="iOS" elif [[ "$*" == *"--android"* ]]; then - DIR_OS="Android" + DIR_OS="Android" fi -python3 $DIR/tools/ci_build/build.py --build_dir $DIR/build/$DIR_OS "$@" +PYTHON="${PYTHON:-python3}" + +$PYTHON $DIR/tools/ci_build/build.py --build_dir $DIR/build/$DIR_OS "$@" diff --git a/cmake/onnxruntime_webassembly.cmake b/cmake/onnxruntime_webassembly.cmake index 66268cefac9ef..3a5575b163b35 100644 --- a/cmake/onnxruntime_webassembly.cmake +++ b/cmake/onnxruntime_webassembly.cmake @@ -382,6 +382,7 @@ jsepDownload:_pp_") "SHELL:-s ASYNCIFY_STACK_SIZE=65536" "SHELL:-s ASYNCIFY_EXPORTS=['OrtRun']" "SHELL:-s ASYNCIFY_IMPORTS=['Module.jsepCopy','Module.jsepCopyAsync','jsepDownload']" + "SHELL:-s ERROR_ON_UNDEFINED_SYMBOLS=0" ) set_target_properties(onnxruntime_webassembly PROPERTIES LINK_DEPENDS ${ONNXRUNTIME_ROOT}/wasm/pre-jsep.js) endif() diff --git a/include/onnxruntime/gemmology.h b/include/onnxruntime/gemmology.h new file mode 100644 index 0000000000000..332afe166870d --- /dev/null +++ b/include/onnxruntime/gemmology.h @@ -0,0 +1,1390 @@ +#ifndef GEMMOLOGY_H +#define GEMMOLOGY_H + +#include "gemmology_fwd.h" + +#include +#include +#include + +#ifdef GEMMOLOGY_WITH_STD_THREAD +#include +#include +#endif + +#include "xsimd/xsimd.hpp" + +namespace gemmology { + +namespace { + +// +// Arch specific implementation of various elementary operations +// + +namespace kernel { + +#ifdef __AVX512BW__ +template +std::tuple, xsimd::batch> +interleave(xsimd::batch first, xsimd::batch second, + xsimd::kernel::requires_arch) { + return {_mm512_unpacklo_epi8(first, second), + _mm512_unpackhi_epi8(first, second)}; +} + +template +std::tuple, xsimd::batch> +interleave(xsimd::batch first, + xsimd::batch second, + xsimd::kernel::requires_arch) { + return {_mm512_unpacklo_epi16(first, second), + _mm512_unpackhi_epi16(first, second)}; +} + +template +std::tuple, xsimd::batch> +interleave(xsimd::batch first, + xsimd::batch second, + xsimd::kernel::requires_arch) { + return {_mm512_unpacklo_epi32(first, second), + _mm512_unpackhi_epi32(first, second)}; +} + +template +std::tuple, xsimd::batch> +interleave(xsimd::batch first, + xsimd::batch second, + xsimd::kernel::requires_arch) { + return {_mm512_unpacklo_epi64(first, second), + _mm512_unpackhi_epi64(first, second)}; +} + +template +xsimd::batch +deinterleave(xsimd::batch first, + xsimd::batch second, + xsimd::kernel::requires_arch) { + return _mm512_packs_epi16(first, second); +} + +template +xsimd::batch +deinterleave(xsimd::batch first, + xsimd::batch second, + xsimd::kernel::requires_arch) { + return _mm512_packs_epi32(first, second); +} + +template +inline xsimd::batch +madd(xsimd::batch x, xsimd::batch y, + xsimd::kernel::requires_arch) { + return _mm512_madd_epi16(x, y); +} + +template +inline xsimd::batch +madd(xsimd::batch x, xsimd::batch y, + xsimd::kernel::requires_arch) { + return _mm512_maddubs_epi16(x, y); +} + +template +inline xsimd::batch +madd(xsimd::batch x, xsimd::batch y, + xsimd::kernel::requires_arch) { + return _mm512_madd_epi16(x, y); +} + +template +inline xsimd::batch +PermuteSummer(xsimd::batch pack0123, + xsimd::batch pack4567, + xsimd::kernel::requires_arch) { + // Form [0th 128-bit register of pack0123, 0st 128-bit 
register of pack4567, + // 2nd 128-bit register of pack0123, 2nd 128-bit register of pack4567] + __m512i mix0 = + _mm512_mask_permutex_epi64(pack0123, 0xcc, pack4567, (0 << 4) | (1 << 6)); + // Form [1st 128-bit register of pack0123, 1st 128-bit register of pack4567, + // 3rd 128-bit register of pack0123, 3rd 128-bit register of pack4567] + __m512i mix1 = + _mm512_mask_permutex_epi64(pack4567, 0x33, pack0123, 2 | (3 << 2)); + __m512i added = _mm512_add_epi32(mix0, mix1); + // Now we have 0 1 2 3 4 5 6 7 0 1 2 3 4 5 6 7. + // Fold register over itself. + return _mm256_add_epi32(_mm512_castsi512_si256(added), + _mm512_extracti64x4_epi64(added, 1)); +} +#endif + +#ifdef __AVX2__ +template +std::tuple, xsimd::batch> +interleave(xsimd::batch first, xsimd::batch second, + xsimd::kernel::requires_arch) { + return {_mm256_unpacklo_epi8(first, second), + _mm256_unpackhi_epi8(first, second)}; +} + +template +std::tuple, xsimd::batch> +interleave(xsimd::batch first, + xsimd::batch second, + xsimd::kernel::requires_arch) { + return {_mm256_unpacklo_epi16(first, second), + _mm256_unpackhi_epi16(first, second)}; +} + +template +std::tuple, xsimd::batch> +interleave(xsimd::batch first, + xsimd::batch second, + xsimd::kernel::requires_arch) { + return {_mm256_unpacklo_epi32(first, second), + _mm256_unpackhi_epi32(first, second)}; +} + +template +std::tuple, xsimd::batch> +interleave(xsimd::batch first, + xsimd::batch second, + xsimd::kernel::requires_arch) { + return {_mm256_unpacklo_epi64(first, second), + _mm256_unpackhi_epi64(first, second)}; +} + +template +xsimd::batch +deinterleave(xsimd::batch first, + xsimd::batch second, + xsimd::kernel::requires_arch) { + return _mm256_packs_epi16(first, second); +} + +template +xsimd::batch +deinterleave(xsimd::batch first, + xsimd::batch second, + xsimd::kernel::requires_arch) { + return _mm256_packs_epi32(first, second); +} + +template +inline xsimd::batch +madd(xsimd::batch x, xsimd::batch y, + xsimd::kernel::requires_arch) { + return _mm256_madd_epi16(x, y); +} + +template +inline xsimd::batch +madd(xsimd::batch x, xsimd::batch y, + xsimd::kernel::requires_arch) { + return _mm256_maddubs_epi16(x, y); +} + +template +inline xsimd::batch +madd(xsimd::batch x, xsimd::batch y, + xsimd::kernel::requires_arch) { + return _mm256_maddubs_epi16(xsimd::abs(x), _mm256_sign_epi8(y, x)); +} + +template +inline xsimd::batch +PermuteSummer(xsimd::batch pack0123, + xsimd::batch pack4567, + xsimd::kernel::requires_arch) { + // This instruction generates 1s 2s 3s 4s 5f 6f 7f 8f + __m256i rev = _mm256_permute2f128_si256(pack0123, pack4567, 0x21); + // This instruction generates 1f 2f 3f 4f 5s 6s 7s 8s + __m256i blended = _mm256_blend_epi32(pack0123, pack4567, 0xf0); + return _mm256_add_epi32(rev, blended); +} + +template +inline xsimd::batch Pack0123(xsimd::batch sum0, + xsimd::batch sum1, + xsimd::batch sum2, + xsimd::batch sum3, + xsimd::kernel::requires_arch) { + auto pack01 = _mm256_hadd_epi32(sum0, sum1); + auto pack23 = _mm256_hadd_epi32(sum2, sum3); + return _mm256_hadd_epi32(pack01, pack23); +} + +#ifdef __AVXVNNI__ + +template +inline xsimd::batch +maddw(xsimd::batch x, xsimd::batch y, + xsimd::batch z, + xsimd::kernel::requires_arch) { + return _mm256_dpbusd_avx_epi32(z, x, y); +} +#endif + +#ifdef __AVX512VNNI__ + +template +inline xsimd::batch +maddw(xsimd::batch x, xsimd::batch y, + xsimd::batch z, + xsimd::kernel::requires_arch>) { + return _mm512_dpbusd_epi32(z, x, y); +} + +template +inline xsimd::batch +maddw(xsimd::batch x, xsimd::batch y, + xsimd::batch z, 
+ xsimd::kernel::requires_arch>) { + return _mm512_dpbusd_epi32(z, x, y); +} +#endif + +#endif + +#ifdef __SSSE3__ + +template +inline xsimd::batch +madd(xsimd::batch x, xsimd::batch y, + xsimd::kernel::requires_arch) { + return _mm_maddubs_epi16(x, y); +} + +template +inline xsimd::batch +madd(xsimd::batch x, xsimd::batch y, + xsimd::kernel::requires_arch) { + return _mm_maddubs_epi16(xsimd::abs(x), _mm_sign_epi8(y, x)); +} + +template +inline xsimd::batch Pack0123(xsimd::batch sum0, + xsimd::batch sum1, + xsimd::batch sum2, + xsimd::batch sum3, + xsimd::kernel::requires_arch) { + auto pack01 = _mm_hadd_epi32(sum0, sum1); + auto pack23 = _mm_hadd_epi32(sum2, sum3); + return _mm_hadd_epi32(pack01, pack23); +} +#endif + +#ifdef __SSE2__ +template +std::tuple, xsimd::batch> +interleave(xsimd::batch first, xsimd::batch second, + xsimd::kernel::requires_arch) { + return {xsimd::zip_lo(first, second), xsimd::zip_hi(first, second)}; +} + +template +std::tuple, xsimd::batch> +interleave(xsimd::batch first, + xsimd::batch second, + xsimd::kernel::requires_arch) { + return {xsimd::zip_lo(first, second), xsimd::zip_hi(first, second)}; +} + +template +std::tuple, xsimd::batch> +interleave(xsimd::batch first, + xsimd::batch second, + xsimd::kernel::requires_arch) { + return {xsimd::zip_lo(first, second), xsimd::zip_hi(first, second)}; +} + +template +std::tuple, xsimd::batch> +interleave(xsimd::batch first, + xsimd::batch second, + xsimd::kernel::requires_arch) { + return {xsimd::zip_lo(first, second), xsimd::zip_hi(first, second)}; +} + +template +xsimd::batch +deinterleave(xsimd::batch first, + xsimd::batch second, + xsimd::kernel::requires_arch) { + return _mm_packs_epi16(first, second); +} + +template +xsimd::batch +deinterleave(xsimd::batch first, + xsimd::batch second, + xsimd::kernel::requires_arch) { + return _mm_packs_epi32(first, second); +} + +template +inline xsimd::batch +madd(xsimd::batch x, xsimd::batch y, + xsimd::kernel::requires_arch) { + return _mm_madd_epi16(x, y); +} + +template +inline xsimd::batch +madd(xsimd::batch a, xsimd::batch b, + xsimd::kernel::requires_arch) { + // Adapted from + // https://stackoverflow.com/questions/19957709/how-to-achieve-8bit-madd-using-sse2 + // a = 0x00 0x01 0xFE 0x04 ... + // b = 0x00 0x02 0x80 0x84 ... + + // To extend signed 8-bit value, MSB has to be set to 0xFF + __m128i sign_mask_b = _mm_cmplt_epi8(b, _mm_setzero_si128()); + + // sign_mask_b = 0x00 0x00 0xFF 0xFF ... + + // Unpack positives with 0x00, negatives with 0xFF + __m128i a_epi16_l = _mm_unpacklo_epi8(a, _mm_setzero_si128()); + __m128i a_epi16_h = _mm_unpackhi_epi8(a, _mm_setzero_si128()); + __m128i b_epi16_l = _mm_unpacklo_epi8(b, sign_mask_b); + __m128i b_epi16_h = _mm_unpackhi_epi8(b, sign_mask_b); + + // Here - valid 16-bit signed integers corresponding to the 8-bit input + // a_epi16_l = 0x00 0x00 0x01 0x00 0xFE 0xFF 0x04 0x00 ... + + // Get the a[i] * b[i] + a[i+1] * b[i+1] for both low and high parts + __m128i madd_epi32_l = _mm_madd_epi16(a_epi16_l, b_epi16_l); + __m128i madd_epi32_h = _mm_madd_epi16(a_epi16_h, b_epi16_h); + + // Now go back from 32-bit values to 16-bit values & signed saturate + return _mm_packs_epi32(madd_epi32_l, madd_epi32_h); +} + +template +inline xsimd::batch +madd(xsimd::batch a, xsimd::batch b, + xsimd::kernel::requires_arch) { + // adapted + // https://stackoverflow.com/questions/19957709/how-to-achieve-8bit-madd-using-sse2 + // a = 0x00 0x01 0xFE 0x04 ... + // b = 0x00 0x02 0x80 0x84 ... 
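// --------------------------------------------------------------------------
// [Editorial illustration, not part of this patch] Scalar reference for the
// 8-bit `madd` kernels in this section, assuming the pmaddubsw-style
// semantics these intrinsics emulate: each 16-bit output lane is the
// saturated sum of two adjacent 8-bit products. The helper name below is
// hypothetical.
#include <cstdint>

inline int16_t madd_u8s8_reference(uint8_t a0, int8_t b0,
                                   uint8_t a1, int8_t b1) {
  // a comes from the unsigned (activation) side, b from the signed (weight)
  // side; widen before multiplying so the products cannot overflow.
  int32_t sum = static_cast<int32_t>(a0) * b0 + static_cast<int32_t>(a1) * b1;
  // Saturate to int16_t, as _mm_maddubs_epi16 (and the SSE2 emulation built
  // from _mm_madd_epi16 + _mm_packs_epi32) does.
  if (sum > 32767) sum = 32767;
  if (sum < -32768) sum = -32768;
  return static_cast<int16_t>(sum);
}
// --------------------------------------------------------------------------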
+ + // To extend signed 8-bit value, MSB has to be set to 0xFF + __m128i sign_mask_a = _mm_cmplt_epi8(a, _mm_setzero_si128()); + __m128i sign_mask_b = _mm_cmplt_epi8(b, _mm_setzero_si128()); + + // sign_mask_a = 0x00 0x00 0xFF 0x00 ... + // sign_mask_b = 0x00 0x00 0xFF 0xFF ... + + // Unpack positives with 0x00, negatives with 0xFF + __m128i a_epi16_l = _mm_unpacklo_epi8(a, sign_mask_a); + __m128i a_epi16_h = _mm_unpackhi_epi8(a, sign_mask_a); + __m128i b_epi16_l = _mm_unpacklo_epi8(b, sign_mask_b); + __m128i b_epi16_h = _mm_unpackhi_epi8(b, sign_mask_b); + + // Here - valid 16-bit signed integers corresponding to the 8-bit input + // a_epi16_l = 0x00 0x00 0x01 0x00 0xFE 0xFF 0x04 0x00 ... + + // Get the a[i] * b[i] + a[i+1] * b[i+1] for both low and high parts + __m128i madd_epi32_l = _mm_madd_epi16(a_epi16_l, b_epi16_l); + __m128i madd_epi32_h = _mm_madd_epi16(a_epi16_h, b_epi16_h); + + // Now go back from 32-bit values to 16-bit values & signed saturate + return _mm_packs_epi32(madd_epi32_l, madd_epi32_h); +} + +template +inline std::tuple, xsimd::batch> +PermuteSummer(xsimd::batch pack0123, + xsimd::batch pack4567, + xsimd::kernel::requires_arch) { + return {pack0123, pack4567}; +} + +#endif + +#if __ARM_ARCH >= 7 +template +std::tuple, xsimd::batch> +interleave(xsimd::batch first, xsimd::batch second, + xsimd::kernel::requires_arch) { + return {xsimd::zip_lo(first, second), xsimd::zip_hi(first, second)}; +} + +template +std::tuple, xsimd::batch> +interleave(xsimd::batch first, + xsimd::batch second, + xsimd::kernel::requires_arch) { + return {xsimd::zip_lo(first, second), xsimd::zip_hi(first, second)}; +} + +template +std::tuple, xsimd::batch> +interleave(xsimd::batch first, + xsimd::batch second, + xsimd::kernel::requires_arch) { + return {xsimd::zip_lo(first, second), xsimd::zip_hi(first, second)}; +} + +template +std::tuple, xsimd::batch> +interleave(xsimd::batch first, + xsimd::batch second, + xsimd::kernel::requires_arch) { + return {xsimd::zip_lo(first, second), xsimd::zip_hi(first, second)}; +} + +template +xsimd::batch +deinterleave(xsimd::batch first, + xsimd::batch second, + xsimd::kernel::requires_arch) { + + return vcombine_s8(vqmovn_s16(first), vqmovn_s16(second)); +} + +template +xsimd::batch +deinterleave(xsimd::batch first, + xsimd::batch second, + xsimd::kernel::requires_arch) { + return vcombine_s16(vqmovn_s32(first), vqmovn_s32(second)); +} + +template +inline xsimd::batch +madd(xsimd::batch x, xsimd::batch y, + xsimd::kernel::requires_arch) { + + int32x4_t low = vmull_s16(vget_low_s16(x), vget_low_s16(y)); + int32x4_t high = vmull_s16(vget_high_s16(x), vget_high_s16(y)); + + int32x2_t low_sum = vpadd_s32(vget_low_s32(low), vget_high_s32(low)); + int32x2_t high_sum = vpadd_s32(vget_low_s32(high), vget_high_s32(high)); + + return vcombine_s32(low_sum, high_sum); +} + +template +inline xsimd::batch +madd(xsimd::batch x, xsimd::batch y, + xsimd::kernel::requires_arch) { + + // This would be much simpler if x86 would choose to zero extend OR sign + // extend, not both. This could probably be optimized better. + + // Zero extend x + int16x8_t x_odd = + vreinterpretq_s16_u16(vshrq_n_u16(vreinterpretq_u16_u8(x), 8)); + int16x8_t x_even = vreinterpretq_s16_u16( + vbicq_u16(vreinterpretq_u16_u8(x), vdupq_n_u16(0xff00))); + + // Sign extend by shifting left then shifting right. 
+ int16x8_t y_even = vshrq_n_s16(vshlq_n_s16(vreinterpretq_s16_s8(y), 8), 8); + int16x8_t y_odd = vshrq_n_s16(vreinterpretq_s16_s8(y), 8); + + // multiply + int16x8_t prod1 = vmulq_s16(x_even, y_even); + int16x8_t prod2 = vmulq_s16(x_odd, y_odd); + + // saturated add + return vqaddq_s16(prod1, prod2); +} + +template +inline xsimd::batch +madd(xsimd::batch x, xsimd::batch y, + xsimd::kernel::requires_arch) { + int16x8_t low = vmull_s8(vget_low_s8(x), vget_low_s8(y)); + int16x8_t high = vmull_s8(vget_high_s8(x), vget_high_s8(y)); + + int16x4_t low_sum = vpadd_s16(vget_low_s16(low), vget_high_s16(low)); + int16x4_t high_sum = vpadd_s16(vget_low_s16(high), vget_high_s16(high)); + + return vcombine_s16(low_sum, high_sum); +} + +template +inline std::tuple, xsimd::batch> +PermuteSummer(xsimd::batch pack0123, + xsimd::batch pack4567, + xsimd::kernel::requires_arch) { + return {pack0123, pack4567}; +} +#endif + +#ifdef __aarch64__ +template +std::tuple, xsimd::batch> +interleave(xsimd::batch first, xsimd::batch second, + xsimd::kernel::requires_arch) { + return {vzip1q_s8(first, second), vzip2q_s8(first, second)}; +} + +template +std::tuple, xsimd::batch> +interleave(xsimd::batch first, + xsimd::batch second, + xsimd::kernel::requires_arch) { + return {vzip1q_s16(first, second), vzip2q_s16(first, second)}; +} + +template +std::tuple, xsimd::batch> +interleave(xsimd::batch first, + xsimd::batch second, + xsimd::kernel::requires_arch) { + return {vzip1q_s32(first, second), vzip2q_s32(first, second)}; +} + +template +std::tuple, xsimd::batch> +interleave(xsimd::batch first, + xsimd::batch second, + xsimd::kernel::requires_arch) { + return {vzip1q_s64(first, second), vzip2q_s64(first, second)}; +} + +template +xsimd::batch +deinterleave(xsimd::batch first, + xsimd::batch second, + xsimd::kernel::requires_arch) { + + return vqmovn_high_s16(vqmovn_s16(first), second); +} + +template +xsimd::batch +deinterleave(xsimd::batch first, + xsimd::batch second, + xsimd::kernel::requires_arch) { + return vqmovn_high_s32(vqmovn_s32(first), second); +} + +#ifdef __ARM_FEATURE_MATMUL_INT8 +template +inline xsimd::batch +maddw(xsimd::batch x, xsimd::batch y, + xsimd::batch z, + xsimd::kernel::requires_arch>) { + return vusdotq_s32(z, x, y); +} +#endif + +template +inline xsimd::batch +maddw(xsimd::batch x, xsimd::batch y, + xsimd::batch z, + xsimd::kernel::requires_arch) { + int16x8_t tl = vmulq_s16(vreinterpretq_s16_u16(vmovl_u8(vget_low_u8(x))), + vmovl_s8(vget_low_s8(y))); + int16x8_t th = vmulq_s16(vreinterpretq_s16_u16(vmovl_u8(vget_high_u8(x))), + vmovl_s8(vget_high_s8(y))); + return vpadalq_s16(vpadalq_s16(z, tl), th); +} + +template +inline xsimd::batch +maddw(xsimd::batch x, xsimd::batch y, + xsimd::kernel::requires_arch) { + int16x8_t tl = vmulq_s16(vreinterpretq_s16_u16(vmovl_u8(vget_low_u8(x))), + vmovl_s8(vget_low_s8(y))); + int16x8_t th = vmulq_s16(vreinterpretq_s16_u16(vmovl_u8(vget_high_u8(x))), + vmovl_s8(vget_high_s8(y))); + return vpadalq_s16(vpaddlq_s16(tl), th); +} + +template +inline xsimd::batch Pack0123(xsimd::batch sum0, + xsimd::batch sum1, + xsimd::batch sum2, + xsimd::batch sum3, + xsimd::kernel::requires_arch) { + auto pack01 = vpaddq_s32(sum0, sum1); + auto pack23 = vpaddq_s32(sum2, sum3); + return vpaddq_s32(pack01, pack23); +} + +#endif + +template +inline xsimd::batch +maddw(xsimd::batch x, xsimd::batch y, + xsimd::batch z, + xsimd::kernel::requires_arch) { + return z + madd(xsimd::batch(1), madd(x, y, Arch{}), Arch{}); +} + +template +inline xsimd::batch +maddw(xsimd::batch x, 
xsimd::batch y, + xsimd::kernel::requires_arch) { + return maddw(x, y, xsimd::batch(0), Arch{}); +} + +} // namespace kernel + +// +// Generic dispatcher for interleave, deinterleave madd and PermuteSummer +// + +template +std::tuple, xsimd::batch> +interleave(xsimd::batch first, xsimd::batch second) { + return kernel::interleave(first, second, Arch{}); +} + +template +xsimd::batch deinterleave(xsimd::batch first, + xsimd::batch second) { + return kernel::deinterleave(first, second, Arch{}); +} +template +xsimd::batch deinterleave(xsimd::batch first, + xsimd::batch second) { + return kernel::deinterleave(first, second, Arch{}); +} + +template +inline xsimd::batch madd(xsimd::batch x, + xsimd::batch y) { + return kernel::madd(x, y, Arch{}); +} +template +inline xsimd::batch madd(xsimd::batch x, + xsimd::batch y) { + return kernel::madd(x, y, Arch{}); +} +template +inline xsimd::batch madd(xsimd::batch x, + xsimd::batch y) { + return kernel::madd(x, y, Arch{}); +} +template +inline xsimd::batch maddw(xsimd::batch x, + xsimd::batch y, + xsimd::batch z + ) { + return kernel::maddw(x, y, z, Arch{}); +} +template +inline xsimd::batch maddw(xsimd::batch x, + xsimd::batch y + ) { + return kernel::maddw(x, y, Arch{}); +} + +template +inline auto PermuteSummer(xsimd::batch pack0123, + xsimd::batch pack4567) + -> decltype(kernel::PermuteSummer(pack0123, pack4567, Arch{})) { + return kernel::PermuteSummer(pack0123, pack4567, Arch{}); +} + + +namespace kernel { + + template + inline xsimd::batch Pack0123(xsimd::batch sum0, + xsimd::batch sum1, + xsimd::batch sum2, + xsimd::batch sum3, + xsimd::kernel::requires_arch) { + + std::tie(sum0, sum1) = interleave(sum0, sum1, Arch{}); + auto pack01 = sum0 + sum1; + std::tie(sum2, sum3) = interleave(sum2, sum3, Arch{}); + auto pack23 = sum2 + sum3; + + auto packed = interleave(xsimd::bitwise_cast(pack01), + xsimd::bitwise_cast(pack23), + Arch{}); + return xsimd::bitwise_cast(std::get<0>(packed)) + + xsimd::bitwise_cast(std::get<1>(packed)); + } +} + +template +inline xsimd::batch Pack0123(xsimd::batch sum0, + xsimd::batch sum1, + xsimd::batch sum2, + xsimd::batch sum3) { + return kernel::Pack0123(sum0, sum1, sum2, sum3, Arch{}); +} + +template +static inline xsimd::batch +quantize(xsimd::batch input, + xsimd::batch quant_mult) { + return xsimd::nearbyint_as_int(input * quant_mult); +} + +template +inline xsimd::batch +QuantizerGrab(const float *input, xsimd::batch quant_mult_reg) { + return quantize(xsimd::batch::load_unaligned(input), + quant_mult_reg); +} + +#ifdef __AVX512BW__ +inline __m512 Concat(const __m256 first, const __m256 second) { + // INTGEMM_AVX512DQ but that goes with INTGEMM_AVX512BW anyway. + return _mm512_insertf32x8(_mm512_castps256_ps512(first), second, 1); +} + +// Like QuantizerGrab, but allows 32-byte halves (i.e. 8 columns) to be +// controlled independently. +/* Only INTGEMM_AVX512F is necessary but due to GCC 5.4 bug we have to set + * INTGEMM_AVX512BW */ +inline __m512i QuantizerGrabHalves(const float *input0, const float *input1, + const __m512 quant_mult_reg) { + __m512 appended = Concat(_mm256_loadu_ps(input0), _mm256_loadu_ps(input1)); + appended = _mm512_mul_ps(appended, quant_mult_reg); + return _mm512_cvtps_epi32(appended); +} +#else +template +inline xsimd::batch +QuantizerGrabHalves(const float *input0, const float *input1, + xsimd::batch quant_mult_reg); +#endif + +/* Read 8 floats at a time from input0, input1, input2, and input3. Quantize + * them to 8-bit by multiplying with quant_mult_reg then rounding. 
Concatenate + * the result into one register and return it. + */ +class QuantizeTile8 { + template struct Tiler { + static constexpr uint32_t get(std::size_t i, std::size_t n) { + size_t factor = xsimd::batch::size / 4; + return (i % factor) * 4 + i / factor; + } + }; + +public: + template + static inline xsimd::batch + Consecutive(xsimd::batch quant_mult, const float *input) { + return Tile(quant_mult, input + 0 * xsimd::batch::size, + input + 1 * xsimd::batch::size, + input + 2 * xsimd::batch::size, + input + 3 * xsimd::batch::size); + } + + template + static inline xsimd::batch + ConsecutiveU(xsimd::batch quant_mult, const float *input) { + return TileU(quant_mult, input + 0 * xsimd::batch::size, + input + 1 * xsimd::batch::size, + input + 2 * xsimd::batch::size, + input + 3 * xsimd::batch::size); + } + + template + static inline xsimd::batch + ConsecutiveWithWrapping(xsimd::batch quant_mult, + const float *input, size_t cols_left, size_t cols, + size_t row_step) { + using batchf32 = xsimd::batch; + const float *inputs[4]; + for (size_t i = 0; i < std::size(inputs); ++i) { + while (cols_left < batchf32::size) { + input += cols * (row_step - 1); + cols_left += cols; + } + inputs[i] = input; + input += batchf32::size; + cols_left -= batchf32::size; + } + return Tile(quant_mult, inputs[0], inputs[1], inputs[2], inputs[3]); + } + + template + static inline xsimd::batch + ForReshape(xsimd::batch quant_mult, const float *input, + size_t cols) { + using batchf32 = xsimd::batch; + using batch8 = xsimd::batch; + using batch16 = xsimd::batch; + using batch32 = xsimd::batch; + + // Put higher rows in the second half of the register. These will jumble + // around in the same way then conveniently land in the right place. + if constexpr (batchf32::size == 16) { + const batch8 neg127(-127); + // In reverse order: grabbing the first 32-bit values from each 128-bit + // register, then the second 32-bit values, etc. Grab 4 registers at a + // time in 32-bit format. + batch32 g0 = + QuantizerGrabHalves(input + 0 * cols, input + 2 * cols, quant_mult); + batch32 g1 = + QuantizerGrabHalves(input + 16 * cols, input + 18 * cols, quant_mult); + batch32 g2 = + QuantizerGrabHalves(input + 32 * cols, input + 34 * cols, quant_mult); + batch32 g3 = + QuantizerGrabHalves(input + 48 * cols, input + 50 * cols, quant_mult); + + // Pack 32-bit to 16-bit. + batch16 packed0 = deinterleave(g0, g1); + batch16 packed1 = deinterleave(g2, g3); + // Pack 16-bit to 8-bit. + batch8 packed = deinterleave(packed0, packed1); + // Ban -128. + packed = xsimd::max(packed, neg127); + + return xsimd::bitwise_cast( + xsimd::swizzle(xsimd::bitwise_cast(packed), + xsimd::make_batch_constant>())); + } else if constexpr (batchf32::size == 8) + return Tile(quant_mult, input, input + 2 * cols, input + 16 * cols, + input + 18 * cols); + else if constexpr (batchf32::size == 4) + // Skip a row. + return Tile(quant_mult, input, input + 4, input + 2 * cols, + input + 2 * cols + 4); + else + return {}; + } + + template + static inline xsimd::batch + Tile(xsimd::batch quant_mult, const float *input0, + const float *input1, const float *input2, const float *input3) { + using batch8 = xsimd::batch; + using batch16 = xsimd::batch; + using batch32 = xsimd::batch; + + const batch8 neg127(-127); + // Grab 4 registers at a time in 32-bit format. 
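// --------------------------------------------------------------------------
// [Editorial illustration, not part of this patch] Worked example of the
// Tiler mask used by the swizzles in ForReshape/Tile/TileU. For an 8-lane
// 32-bit register (e.g. AVX2), factor = 8 / 4 = 2 and get(i, n) yields the
// mask {0, 4, 1, 5, 2, 6, 3, 7}: lane i of the swizzled result takes lane
// mask[i] of the packed register, turning the interleaved column order
// 0 2 4 6 1 3 5 7 back into 0 1 2 3 4 5 6 7. A standalone re-statement of
// the same formula (the real Tiler is templated on the batch type; this
// helper name is hypothetical):
constexpr unsigned tiler_get_reference(unsigned i, unsigned lanes) {
  const unsigned factor = lanes / 4;   // 2 for an 8-lane (AVX2) register
  return (i % factor) * 4 + i / factor;
}
static_assert(tiler_get_reference(1, 8) == 4,
              "lane 1 of the swizzled result pulls column 1");
// --------------------------------------------------------------------------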
+ batch32 g0 = QuantizerGrab(input0, quant_mult); + batch32 g1 = QuantizerGrab(input1, quant_mult); + batch32 g2 = QuantizerGrab(input2, quant_mult); + batch32 g3 = QuantizerGrab(input3, quant_mult); + // Pack 32-bit to 16-bit. + batch16 packed0 = deinterleave(g0, g1); + batch16 packed1 = deinterleave(g2, g3); + // Pack 16-bit to 8-bit. + batch8 packed = deinterleave(packed0, packed1); + // Ban -128. + packed = xsimd::max(packed, neg127); + + if constexpr (batch32::size == 4) + return packed; + // Currently in 0 1 2 3 8 9 10 11 16 17 18 19 24 25 26 27 4 5 6 7 12 13 14 + // 15 20 21 22 23 28 29 30 31 Or as 32-bit integers 0 2 4 6 1 3 5 7 + // Technically this could be removed so long as the rows are bigger than 16 + // and the values are only used for GEMM. + return xsimd::bitwise_cast( + xsimd::swizzle(xsimd::bitwise_cast(packed), + xsimd::make_batch_constant>())); + } + +private: + // A version that produces uint8_ts + template + static inline xsimd::batch + TileU(xsimd::batch quant_mult, const float *input0, + const float *input1, const float *input2, const float *input3) { + using batch8 = xsimd::batch; + using batch16 = xsimd::batch; + using batch32 = xsimd::batch; + + const batch8 neg127 = -127; + const batch8 pos127 = +127; + // Grab 4 registers at a time in 32-bit format. + batch32 g0 = QuantizerGrab(input0, quant_mult); + batch32 g1 = QuantizerGrab(input1, quant_mult); + batch32 g2 = QuantizerGrab(input2, quant_mult); + batch32 g3 = QuantizerGrab(input3, quant_mult); + // Pack 32-bit to 16-bit. + batch16 packed0 = deinterleave(g0, g1); + batch16 packed1 = deinterleave(g2, g3); + // Pack 16-bit to 8-bit. + batch8 packed = deinterleave(packed0, packed1); + // Ban -128. + packed = xsimd::max(packed, neg127); // Could be removed if we use +128 + packed = packed + pos127; + if (batch32::size == 4) + return xsimd::bitwise_cast(packed); + // Currently in 0 1 2 3 8 9 10 11 16 17 18 19 24 25 26 27 4 5 6 7 12 13 14 + // 15 20 21 22 23 28 29 30 31 Or as 32-bit integers 0 2 4 6 1 3 5 7 + // Technically this could be removed so long as the rows are bigger than 16 + // and the values are only used for GEMM. 
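// --------------------------------------------------------------------------
// [Editorial illustration, not part of this patch] Scalar equivalent of the
// per-element quantization performed by Tile above: scale by quant_mult,
// round to nearest (a stand-in for xsimd::nearbyint_as_int), saturate through
// the 32->16->8 bit packs, and ban -128 so the signed path can always negate
// the value. TileU additionally adds 127 to shift the result into unsigned
// range. The helper name below is hypothetical.
#include <cmath>
#include <cstdint>

inline int8_t quantize_reference(float value, float quant_mult) {
  int q = static_cast<int>(std::nearbyint(value * quant_mult));
  if (q > 127) q = 127;     // the saturating packs clamp the high side
  if (q < -127) q = -127;   // max(packed, -127) bans -128
  return static_cast<int8_t>(q);
}
// --------------------------------------------------------------------------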
+ return xsimd::bitwise_cast( + xsimd::swizzle(xsimd::bitwise_cast(packed), + xsimd::make_batch_constant>())); + } +}; + +template +inline void Transpose16InLane( + xsimd::batch &r0, xsimd::batch &r1, + xsimd::batch &r2, xsimd::batch &r3, + xsimd::batch &r4, xsimd::batch &r5, + xsimd::batch &r6, xsimd::batch &r7) { + /* r0: columns 0 1 2 3 4 5 6 7 from row 0 + r1: columns 0 1 2 3 4 5 6 7 from row 1*/ + auto r0_16 = xsimd::bitwise_cast(r0); + auto r1_16 = xsimd::bitwise_cast(r1); + auto r2_16 = xsimd::bitwise_cast(r2); + auto r3_16 = xsimd::bitwise_cast(r3); + auto r4_16 = xsimd::bitwise_cast(r4); + auto r5_16 = xsimd::bitwise_cast(r5); + auto r6_16 = xsimd::bitwise_cast(r6); + auto r7_16 = xsimd::bitwise_cast(r7); + + std::tie(r0_16, r1_16) = interleave(r0_16, r1_16); + std::tie(r2_16, r3_16) = interleave(r2_16, r3_16); + std::tie(r4_16, r5_16) = interleave(r4_16, r5_16); + std::tie(r6_16, r7_16) = interleave(r6_16, r7_16); + /* r0: columns 0 0 1 1 2 2 3 3 from rows 0 and 1 + r1: columns 4 4 5 5 6 6 7 7 from rows 0 and 1 + r2: columns 0 0 1 1 2 2 3 3 from rows 2 and 3 + r3: columns 4 4 5 5 6 6 7 7 from rows 2 and 3 + r4: columns 0 0 1 1 2 2 3 3 from rows 4 and 5 + r5: columns 4 4 5 5 6 6 7 7 from rows 4 and 5 + r6: columns 0 0 1 1 2 2 3 3 from rows 6 and 7 + r7: columns 4 4 5 5 6 6 7 7 from rows 6 and 7*/ + auto r0_32 = xsimd::bitwise_cast(r0_16); + auto r2_32 = xsimd::bitwise_cast(r2_16); + auto r1_32 = xsimd::bitwise_cast(r1_16); + auto r3_32 = xsimd::bitwise_cast(r3_16); + auto r4_32 = xsimd::bitwise_cast(r4_16); + auto r6_32 = xsimd::bitwise_cast(r6_16); + auto r5_32 = xsimd::bitwise_cast(r5_16); + auto r7_32 = xsimd::bitwise_cast(r7_16); + + std::tie(r0_32, r2_32) = interleave(r0_32, r2_32); + std::tie(r1_32, r3_32) = interleave(r1_32, r3_32); + std::tie(r4_32, r6_32) = interleave(r4_32, r6_32); + std::tie(r5_32, r7_32) = interleave(r5_32, r7_32); + /* r0: columns 0 0 0 0 1 1 1 1 from rows 0, 1, 2, and 3 + r1: columns 4 4 4 4 5 5 5 5 from rows 0, 1, 2, and 3 + r2: columns 2 2 2 2 3 3 3 3 from rows 0, 1, 2, and 3 + r3: columns 6 6 6 6 7 7 7 7 from rows 0, 1, 2, and 3 + r4: columns 0 0 0 0 1 1 1 1 from rows 4, 5, 6, and 7 + r5: columns 4 4 4 4 5 5 5 5 from rows 4, 5, 6, and 7 + r6: columns 2 2 2 2 3 3 3 3 from rows 4, 5, 6, and 7 + r7: columns 6 6 6 6 7 7 7 7 from rows 4, 5, 6, and 7*/ + + auto r0_64 = xsimd::bitwise_cast(r0_32); + auto r2_64 = xsimd::bitwise_cast(r2_32); + auto r1_64 = xsimd::bitwise_cast(r1_32); + auto r3_64 = xsimd::bitwise_cast(r3_32); + auto r4_64 = xsimd::bitwise_cast(r4_32); + auto r6_64 = xsimd::bitwise_cast(r6_32); + auto r5_64 = xsimd::bitwise_cast(r5_32); + auto r7_64 = xsimd::bitwise_cast(r7_32); + + std::tie(r0_64, r4_64) = interleave(r0_64, r4_64); + std::tie(r1_64, r5_64) = interleave(r1_64, r5_64); + std::tie(r2_64, r6_64) = interleave(r2_64, r6_64); + std::tie(r3_64, r7_64) = interleave(r3_64, r7_64); + + r0 = xsimd::bitwise_cast(r0_64); + r1 = xsimd::bitwise_cast(r1_64); + r2 = xsimd::bitwise_cast(r2_64); + r3 = xsimd::bitwise_cast(r3_64); + r4 = xsimd::bitwise_cast(r4_64); + r5 = xsimd::bitwise_cast(r5_64); + r6 = xsimd::bitwise_cast(r6_64); + r7 = xsimd::bitwise_cast(r7_64); + /* r0: columns 0 0 0 0 0 0 0 0 from rows 0 through 7 + r1: columns 4 4 4 4 4 4 4 4 from rows 0 through 7 + r2: columns 2 2 2 2 2 2 2 2 from rows 0 through 7 + r3: columns 6 6 6 6 6 6 6 6 from rows 0 through 7 + r4: columns 1 1 1 1 1 1 1 1 from rows 0 through 7 + r5: columns 5 5 5 5 5 5 5 5 from rows 0 through 7*/ + /* Empirically gcc is able to remove these movs and just rename 
the outputs of + * Interleave64. */ + std::swap(r1, r4); + std::swap(r3, r6); +} + +template +void SelectColumnsOfB(const xsimd::batch *input, + xsimd::batch *output, + size_t rows_bytes /* number of bytes in a row */, + const IntegerTy *cols_begin, const IntegerTy *cols_end) { + using batch8 = xsimd::batch; + /* Do columns for multiples of 8.*/ + size_t register_rows = rows_bytes / batch8::size; + const batch8 *starts[8]; + for (; cols_begin != cols_end; cols_begin += 8) { + for (size_t k = 0; k < 8; ++k) { + starts[k] = + input + (cols_begin[k] & 7) + (cols_begin[k] & ~7) * register_rows; + } + for (size_t r = 0; r < register_rows; ++r) { + for (size_t k = 0; k < 8; ++k) { + *(output++) = *starts[k]; + starts[k] += 8; + } + } + } +} + +} // namespace + +namespace callbacks { +template +xsimd::batch Unquantize::operator()(xsimd::batch total, size_t, size_t, + size_t) { + return xsimd::batch_cast(total) * unquant_mult; +} + +template +std::tuple, xsimd::batch> Unquantize::operator()( + std::tuple, xsimd::batch> total, + size_t, size_t, size_t) { + return std::make_tuple( + xsimd::batch_cast(std::get<0>(total)) * unquant_mult, + xsimd::batch_cast(std::get<1>(total)) * unquant_mult); +} + +template +xsimd::batch AddBias::operator()(xsimd::batch total, size_t, + size_t col_idx, size_t) { + return total + xsimd::batch::load_aligned(bias_addr + col_idx); +} + +template +std::tuple, xsimd::batch> +AddBias::operator()( + std::tuple, xsimd::batch> total, + size_t, size_t col_idx, size_t) { + return std::make_tuple( + std::get<0>(total) + xsimd::batch::load_aligned(bias_addr + col_idx + 0), + std::get<1>(total) + + xsimd::batch::load_aligned(bias_addr + col_idx + + xsimd::batch::size)); +} + +template +void Write::operator()(xsimd::batch result, size_t row_idx, + size_t col_idx, size_t col_size) { + result.store_aligned(output_addr + row_idx * col_size + col_idx); +} + +template +void Write::operator()(xsimd::batch result, size_t row_idx, + size_t col_idx, size_t col_size) { + xsimd::bitwise_cast(result).store_aligned( + output_addr + row_idx * col_size + col_idx); +} + +template +void Write::operator()( + std::tuple, xsimd::batch> result, + size_t row_idx, size_t col_idx, size_t col_size) { + std::get<0>(result).store_aligned(output_addr + row_idx * col_size + col_idx + + 0); + std::get<1>(result).store_aligned(output_addr + row_idx * col_size + col_idx + + xsimd::batch::size); +} + +template +void Write::operator()( + std::tuple, xsimd::batch> result, + size_t row_idx, size_t col_idx, size_t col_size) { + xsimd::bitwise_cast(std::get<0>(result)) + .store_aligned(output_addr + row_idx * col_size + col_idx + 0); + xsimd::bitwise_cast(std::get<1>(result)) + .store_aligned(output_addr + row_idx * col_size + col_idx + + xsimd::batch::size); +} + +template +void UnquantizeAndWrite::operator()(T const &total, size_t row_idx, + size_t col_idx, size_t col_size) { + auto unquantized = unquantize(total, row_idx, col_idx, col_size); + write(unquantized, row_idx, col_idx, col_size); +} + +template +void UnquantizeAndAddBiasAndWrite::operator()(T const &total, size_t row_idx, + size_t col_idx, size_t col_size) { + auto unquantized = unquantize(total, row_idx, col_idx, col_size); + auto bias_added = add_bias(unquantized, row_idx, col_idx, col_size); + write(bias_added, row_idx, col_idx, col_size); +} +} // namespace callbacks + +template +void Engine::QuantizeU(const float *input, uint8_t *output, + float quant_mult, size_t size) { + using batch8 = xsimd::batch; + + xsimd::batch q(quant_mult); + const float 
*end = input + size; + for (; input != end; input += batch8::size, output += batch8::size) { + auto tile = QuantizeTile8::ConsecutiveU(q, input); + tile.store_aligned(output); + } +} + +template +void Engine::Quantize(const float *const input, int8_t *const output, + float quant_mult, size_t size) { + using batch8 = xsimd::batch; + + const std::size_t kBatch = batch8::size; + const std::size_t fast_end = size & ~(kBatch - 1); + + xsimd::batch q(quant_mult); + for (std::size_t i = 0; i < fast_end; i += kBatch) { + auto tile = QuantizeTile8::Consecutive(q, input + i); + tile.store_aligned(output + i); + } + + std::size_t overhang = size & (kBatch - 1); + if (!overhang) + return; + /* Each does size(xsimd::batch) / 32 == kBatch / 4 floats at a + * time. If we're allowed to read one of them, then we can read the whole + * register. + */ + const float *inputs[4]; + std::size_t i; + for (i = 0; i < (overhang + (kBatch / 4) - 1) / (kBatch / 4); ++i) { + inputs[i] = &input[fast_end + i * (kBatch / 4)]; + } + /* These will be clipped off. */ + for (; i < 4; ++i) { + inputs[i] = &input[fast_end]; + } + auto result = + QuantizeTile8::Tile(q, inputs[0], inputs[1], inputs[2], inputs[3]); + alignas(Arch::alignment()) int8_t buffer[kBatch]; + result.store_aligned(buffer); + std::memcpy(output + (size & ~(kBatch - 1)), buffer, overhang); +} + +template +template +void Engine::SelectColumnsB(const int8_t *input, int8_t *output, + size_t rows, const IntegerTy *cols_begin, + const IntegerTy *cols_end) { + using batch8 = xsimd::batch; + SelectColumnsOfB(reinterpret_cast(input), + reinterpret_cast(output), rows, cols_begin, + cols_end); +} + +template +void Engine::PrepareBTransposed(const float *input, int8_t *output, + float quant_mult, size_t cols, + size_t rows) { + using batch8 = xsimd::batch; + const size_t RegisterElemsInt = batch8::size; + const size_t kColStride = 8; + + xsimd::batch q(quant_mult); + auto *output_it = reinterpret_cast(output); + size_t r = 0; + size_t c = 0; + while (r < rows) { + for (size_t ri = 0; ri < 8; ++ri) + *output_it++ = QuantizeTile8::ConsecutiveWithWrapping( + q, input + (r + ri) * cols + c, cols - c, cols, 8); + c += RegisterElemsInt; + while (c >= cols) { + r += kColStride; + c -= cols; + } + } +} + +template +void Engine::PrepareBQuantizedTransposed(const int8_t *input, + int8_t *output, size_t cols, + size_t rows) { + using batch8 = xsimd::batch; + const size_t RegisterElems = batch8::size; + const size_t kColStride = 8; + + auto *output_it = reinterpret_cast(output); + for (size_t r = 0; r < rows; r += kColStride) + for (size_t c = 0; c < cols; c += RegisterElems) + for (size_t ri = 0; ri < 8; ++ri) + *output_it++ = + *reinterpret_cast(input + (r + ri) * cols + c); +} + +template +void Engine::PrepareB(const float *input, int8_t *output_shadow, + float quant_mult, size_t rows, size_t cols) { + using batch8 = xsimd::batch; + + xsimd::batch q(quant_mult); + /* Currently all multipliers have a stride of 8 columns.*/ + const size_t kColStride = 8; + auto *output = reinterpret_cast(output_shadow); + for (size_t c = 0; c < cols; c += kColStride) { + for (size_t r = 0; r < rows; r += sizeof(*output), output += 8) { + output[0] = + QuantizeTile8::ForReshape(q, input + cols * (r + 0) + c, cols); + output[1] = + QuantizeTile8::ForReshape(q, input + cols * (r + 1) + c, cols); + output[2] = + QuantizeTile8::ForReshape(q, input + cols * (r + 4) + c, cols); + output[3] = + QuantizeTile8::ForReshape(q, input + cols * (r + 5) + c, cols); + output[4] = + QuantizeTile8::ForReshape(q, 
input + cols * (r + 8) + c, cols); + output[5] = + QuantizeTile8::ForReshape(q, input + cols * (r + 9) + c, cols); + output[6] = + QuantizeTile8::ForReshape(q, input + cols * (r + 12) + c, cols); + output[7] = + QuantizeTile8::ForReshape(q, input + cols * (r + 13) + c, cols); + std::tie(output[0], output[1]) = + interleave(xsimd::bitwise_cast(output[0]), + xsimd::bitwise_cast(output[1])); + std::tie(output[2], output[3]) = + interleave(xsimd::bitwise_cast(output[2]), + xsimd::bitwise_cast(output[3])); + std::tie(output[4], output[5]) = + interleave(xsimd::bitwise_cast(output[4]), + xsimd::bitwise_cast(output[5])); + std::tie(output[6], output[7]) = + interleave(xsimd::bitwise_cast(output[6]), + xsimd::bitwise_cast(output[7])); + Transpose16InLane(output[0], output[1], output[2], output[3], output[4], + output[5], output[6], output[7]); + } + } +} + +template +void Engine::PrepareA(const float *input, int8_t *output, + float quant_mult, size_t rows, size_t cols) { + Quantize(input, output, quant_mult, rows * cols); +} + +template +void Engine::Shift::PrepareA(const float *input, uint8_t *output, + float quant_mult, size_t rows, size_t cols) { + QuantizeU(input, output, quant_mult, rows * cols); +} + +struct SequentialExecutionEngine { + + template + inline void operator()(size_t Start, size_t End, size_t Stride, F&& f) { + for(size_t i = Start; i < End; i += Stride) { + f(i); + } + } + +}; + +template +template +void Engine::Shift::Multiply(const uint8_t *A, const int8_t *B, + size_t A_rows, size_t width, size_t B_cols, + Callback callback, ExecutionEngine& engine) { + + using batch8 = xsimd::batch; + using ubatch8 = xsimd::batch; + using batch32 = xsimd::batch; + + engine(0, B_cols, 8, [A, B, A_rows, width, B_cols, &callback](size_t B0_colidx) { + const size_t simd_width = width / batch8::size; + const auto *B0_col = + reinterpret_cast(B) + simd_width * B0_colidx; + /* Process one row of A at a time. Doesn't seem to be faster to do multiple + * rows of A at once.*/ + for (size_t A_rowidx = 0; A_rowidx < A_rows; ++A_rowidx) { + const auto *A_row = + reinterpret_cast(A + A_rowidx * width); + /* These will be packed 16-bit integers containing sums for each row of B + multiplied by the row of A. Iterate over shared (inner) dimension.*/ + /* Upcast to 32-bit and horizontally add. 
Seems a bit faster if this is + * declared here.*/ + size_t k = 0; + ubatch8 a = *(A_row + k); + batch32 isum0 = maddw(a, *(B0_col + k * 8)); + batch32 isum1 = maddw(a, *(B0_col + k * 8 + 1)); + batch32 isum2 = maddw(a, *(B0_col + k * 8 + 2)); + batch32 isum3 = maddw(a, *(B0_col + k * 8 + 3)); + batch32 isum4 = maddw(a, *(B0_col + k * 8 + 4)); + batch32 isum5 = maddw(a, *(B0_col + k * 8 + 5)); + batch32 isum6 = maddw(a, *(B0_col + k * 8 + 6)); + batch32 isum7 = maddw(a, *(B0_col + k * 8 + 7)); + for (k = 1; k < simd_width; ++k) { + a = *(A_row + k); + /* Multiply 8-bit, horizontally add to packed 16-bit integers.*/ + /* Upcast to 32-bit and horizontally add.*/ + isum0 = maddw(a, *(B0_col + k * 8 + 0), isum0); + isum1 = maddw(a, *(B0_col + k * 8 + 1), isum1); + isum2 = maddw(a, *(B0_col + k * 8 + 2), isum2); + isum3 = maddw(a, *(B0_col + k * 8 + 3), isum3); + isum4 = maddw(a, *(B0_col + k * 8 + 4), isum4); + isum5 = maddw(a, *(B0_col + k * 8 + 5), isum5); + isum6 = maddw(a, *(B0_col + k * 8 + 6), isum6); + isum7 = maddw(a, *(B0_col + k * 8 + 7), isum7); + } + /* Reduce sums within 128-bit lanes.*/ + auto pack0123 = Pack0123(isum0, isum1, isum2, isum3); + auto pack4567 = Pack0123(isum4, isum5, isum6, isum7); + /*The specific implementation may need to reduce further.*/ + auto total = PermuteSummer(pack0123, pack4567); + callback(total, A_rowidx, B0_colidx, B_cols); + } + }); +} + +template +template +void Engine::Shift::PrepareBias(const int8_t *B, size_t width, + size_t B_cols, Callback C) { + using batch8 = xsimd::batch; + const size_t simd_width = width / batch8::size; + xsimd::batch a(1); + for (size_t j = 0; j < B_cols; j += 8) { + /*Process one row of A at a time. Doesn't seem to be faster to do multiple + * rows of A at once.*/ + const int8_t *B_j = B + j * width; + + /* Rather than initializing as zeros and adding, just initialize the + * first.*/ + /* These will be packed 16-bit integers containing sums for each column of + * B multiplied by the row of A.*/ + /* Upcast to 32-bit and horizontally add. 
Seems a bit faster if this is + * declared here.*/ + auto isum0 = maddw(a, batch8::load_aligned(&B_j[0 * batch8::size])); + auto isum1 = maddw(a, batch8::load_aligned(&B_j[1 * batch8::size])); + auto isum2 = maddw(a, batch8::load_aligned(&B_j[2 * batch8::size])); + auto isum3 = maddw(a, batch8::load_aligned(&B_j[3 * batch8::size])); + auto isum4 = maddw(a, batch8::load_aligned(&B_j[4 * batch8::size])); + auto isum5 = maddw(a, batch8::load_aligned(&B_j[5 * batch8::size])); + auto isum6 = maddw(a, batch8::load_aligned(&B_j[6 * batch8::size])); + auto isum7 = maddw(a, batch8::load_aligned(&B_j[7 * batch8::size])); + + B_j += 8 * batch8::size; + + for (size_t k = 1; k < simd_width; ++k, B_j += 8 * batch8::size) { + isum0 = maddw(a, batch8::load_aligned(&B_j[0 * batch8::size]), isum0); + isum1 = maddw(a, batch8::load_aligned(&B_j[1 * batch8::size]), isum1); + isum2 = maddw(a, batch8::load_aligned(&B_j[2 * batch8::size]), isum2); + isum3 = maddw(a, batch8::load_aligned(&B_j[3 * batch8::size]), isum3); + isum4 = maddw(a, batch8::load_aligned(&B_j[4 * batch8::size]), isum4); + isum5 = maddw(a, batch8::load_aligned(&B_j[5 * batch8::size]), isum5); + isum6 = maddw(a, batch8::load_aligned(&B_j[6 * batch8::size]), isum6); + isum7 = maddw(a, batch8::load_aligned(&B_j[7 * batch8::size]), isum7); + } + + auto pack0123 = Pack0123(isum0, isum1, isum2, isum3); + auto pack4567 = Pack0123(isum4, isum5, isum6, isum7); + + auto total = PermuteSummer(pack0123, pack4567); + C(total, 0, j, B_cols); + } +} + +} // namespace gemmology + +#endif diff --git a/include/onnxruntime/gemmology_fwd.h b/include/onnxruntime/gemmology_fwd.h new file mode 100644 index 0000000000000..ba5a2490ed879 --- /dev/null +++ b/include/onnxruntime/gemmology_fwd.h @@ -0,0 +1,282 @@ +/*************************************************************** + * _ * + * | | * + * __ _ ___ _ __ ___ _ __ ___ ___ | | ___ __ _ _ _ * + * / _` |/ _ \ '_ ` _ \| '_ ` _ \ / _ \| |/ _ \ / _` | | | | * + * | (_| | __/ | | | | | | | | | | (_) | | (_) | (_| | |_| | * + * \__, |\___|_| |_| |_|_| |_| |_|\___/|_|\___/ \__, |\__, | * + * __/ | __/ | __/ | * + * |___/ |___/ |___/ * + * * + * version 0.1 * + ***************************************************************/ + +#ifndef GEMMOLOGY_FWD_H +#define GEMMOLOGY_FWD_H + +#include +#include +#include +#include "xsimd/xsimd.hpp" + +#ifdef GEMMOLOGY_WITH_STD_THREAD +#include +#include +#endif + +namespace gemmology { + +struct SequentialExecutionEngine; + +#ifdef GEMMOLOGY_WITH_STD_THREAD +struct StdThreadExecutionEngine { + + StdThreadExecutionEngine(size_t PoolSize) : MaxPoolSize(PoolSize) { + Pool.reserve(PoolSize - 1); + } + + template + inline void operator()(size_t Start, size_t End, size_t Stride, F&& f) { + const size_t NbIter = (End - Start) / Stride; + const size_t NbThread = std::min(NbIter, MaxPoolSize); + const size_t Chunk = (NbIter / NbThread) * Stride; + + size_t Curr = Start, Next = Start; + + for(size_t threadID = 0; threadID < NbThread - 1; ++threadID) { + Next += Chunk; + Pool.emplace_back([=]() { + for(size_t i = Curr; i < Next; i += Stride) { + f(i); + }; + }); + Curr = Next; + } + + for(size_t i = Next; i < End; i += Stride) { + f(i); + }; + for(size_t threadID = 0; threadID < Pool.size(); ++threadID) { + Pool[threadID].join(); + } + Pool.clear(); + } + + private: + const size_t MaxPoolSize; + std::vector Pool; + +}; + +#endif + +#ifdef _OPENMP +struct OpenMPExecutionEngine { + + template + inline void operator()(size_t Start, size_t End, size_t Stride, F&& f) { +#pragma omp parallel for + 
for(size_t i = Start; i < End; i += Stride) { + f(i); + } + } + +}; +#endif + +namespace callbacks { + +struct Unquantize { + float unquant_mult; + template + xsimd::batch operator()(xsimd::batch total, size_t, size_t, size_t); + template + std::tuple, xsimd::batch> operator()( + std::tuple, xsimd::batch> + total, + size_t, size_t, size_t); +}; + +struct AddBias { + const float *bias_addr; + template + xsimd::batch operator()(xsimd::batch total, size_t, size_t col_idx, + size_t); + template + std::tuple, xsimd::batch> + operator()( + std::tuple, xsimd::batch> total, + size_t, size_t col_idx, size_t); +}; + +struct Write { + float *output_addr; + + Write(float *o) : output_addr(o) {} + + template + void operator()(xsimd::batch result, size_t row_idx, + size_t col_idx, size_t col_size); + template + void operator()(xsimd::batch result, size_t row_idx, + size_t col_idx, size_t col_size); + + template + void operator()( + std::tuple, xsimd::batch> result, + size_t row_idx, size_t col_idx, size_t col_size); + + template + void operator()( + std::tuple, xsimd::batch> + result, + size_t row_idx, size_t col_idx, size_t col_size); +}; + +struct UnquantizeAndWrite { + + Unquantize unquantize; + Write write; + + UnquantizeAndWrite(float factor, float *output) + : unquantize{factor}, write{output} {} + + template + void operator()(T const &total, size_t row_idx, size_t col_idx, + size_t col_size); +}; + +struct UnquantizeAndAddBiasAndWrite { + + Unquantize unquantize; + AddBias add_bias; + Write write; + + UnquantizeAndAddBiasAndWrite(float factor, const float *bias, float *output) + : unquantize{factor}, add_bias{bias}, write{output} {} + + template + void operator()(T const &total, size_t row_idx, size_t col_idx, + size_t col_size); +}; + +} // namespace callbacks + +// +// Arch-specific implementation of each routine +// +template struct Engine { + + static void QuantizeU(const float *input, uint8_t *output, float quant_mult, + size_t size); + + static void Quantize(const float *const input, int8_t *const output, + float quant_mult, size_t size); + + template + static void SelectColumnsB(const int8_t *input, int8_t *output, size_t rows, + const IntegerTy *cols_begin, + const IntegerTy *cols_end); + + static void PrepareBTransposed(const float *input, int8_t *output, + float quant_mult, size_t cols, size_t rows); + + static void PrepareBQuantizedTransposed(const int8_t *input, int8_t *output, + size_t cols, size_t rows); + + static void PrepareB(const float *input, int8_t *output_shadow, + float quant_mult, size_t rows, size_t cols); + + static void PrepareA(const float *input, int8_t *output, float quant_mult, + size_t rows, size_t cols); + + struct Shift { + + static void PrepareA(const float *input, uint8_t *output, float quant_mult, + size_t rows, size_t cols); + + template + static void Multiply(const uint8_t *A, const int8_t *B, size_t A_rows, + size_t width, size_t B_cols, Callback callback, + ExecutionEngine& engine); + + template + static void PrepareBias(const int8_t *B, size_t width, size_t B_cols, + Callback C); + }; +}; + +// +// Top-level wrappers that mostly match intgemm API +// + +template +inline void QuantizeU(const float *input, uint8_t *output, float quant_mult, + size_t size) { + return Engine::QuantizeU(input, output, quant_mult, size); +} + +template +inline void Quantize(const float *const input, int8_t *const output, + float quant_mult, size_t size) { + return Engine::Quantize(input, output, quant_mult, size); +} + +template +inline void SelectColumnsB(const int8_t *input, 
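Shift::Multiply distributes work over blocks of output rows through the ExecutionEngine's operator()(Start, End, Stride, f) shown above, so any type with a compatible call operator can be plugged in. A minimal sketch of a conforming engine (sequential; the forward-declared SequentialExecutionEngine in gemmology.h presumably does the same):

#include <cstddef>

// Runs the row loop in the calling thread; drop-in replacement for the
// threaded or OpenMP engines above.
struct MySequentialEngine {
  template <class F>
  void operator()(std::size_t Start, std::size_t End, std::size_t Stride, F&& f) {
    for (std::size_t i = Start; i < End; i += Stride)
      f(i); // one block of output rows per invocation
  }
};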
int8_t *output, size_t rows, + const IntegerTy *cols_begin, + const IntegerTy *cols_end) { + return Engine::SelectColumnsB(input, output, rows, cols_begin, + cols_end); +} + +template +inline void PrepareBTransposed(const float *input, int8_t *output, + float quant_mult, size_t cols, size_t rows) { + return Engine::PrepareBTransposed(input, output, quant_mult, cols, + rows); +} + +template +inline void PrepareBQuantizedTransposed(const int8_t *input, int8_t *output, + size_t cols, size_t rows) { + return Engine::PrepareBQuantizedTransposed(input, output, cols, rows); +} + +template +inline void PrepareB(const float *input, int8_t *output_shadow, + float quant_mult, size_t rows, size_t cols) { + return Engine::PrepareB(input, output_shadow, quant_mult, rows, cols); +} + +template +inline void PrepareA(const float *input, int8_t *output, float quant_mult, + size_t rows, size_t cols) { + return Engine::PrepareA(input, output, quant_mult, rows, cols); +} + +namespace Shift { + +template +inline void PrepareA(const float *input, uint8_t *output, float quant_mult, + size_t rows, size_t cols) { + return Engine::Shift::PrepareA(input, output, quant_mult, rows, cols); +} + +template +inline void Multiply(const uint8_t *A, const int8_t *B, size_t A_rows, + size_t width, size_t B_cols, Callback C, ExecutionEngine&& engine={}) { + return Engine::Shift::Multiply(A, B, A_rows, width, B_cols, C, engine); +} + +template +inline void PrepareBias(const int8_t *B, size_t width, size_t B_cols, + Callback C) { + return Engine::Shift::PrepareBias(B, width, B_cols, C); +} + +} // namespace Shift + +} // namespace gemmology + +#endif diff --git a/include/onnxruntime/xsimd/arch/generic/xsimd_generic_arithmetic.hpp b/include/onnxruntime/xsimd/arch/generic/xsimd_generic_arithmetic.hpp new file mode 100644 index 0000000000000..e7916b0d43641 --- /dev/null +++ b/include/onnxruntime/xsimd/arch/generic/xsimd_generic_arithmetic.hpp @@ -0,0 +1,241 @@ +/*************************************************************************** + * Copyright (c) Johan Mabille, Sylvain Corlay, Wolf Vollprecht and * + * Martin Renou * + * Copyright (c) QuantStack * + * Copyright (c) Serge Guelton * + * * + * Distributed under the terms of the BSD 3-Clause License. * + * * + * The full license is in the file LICENSE, distributed with this software. 
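Putting the wrappers together, a typical int8 GEMM with the shift trick looks roughly like the sketch below. The quantization multipliers, the -127 correction factor passed to PrepareBias and the explicit Arch template argument follow the intgemm conventions these wrappers are said to mostly match; the exact template parameter lists are not visible in this rendering, so treat the call syntax as an approximation rather than the definitive API.

#include <cstddef>
#include <cstdint>
#include <vector>
#include "gemmology.h"

using Arch = xsimd::default_arch; // assumption: wrappers take the xsimd arch as first parameter

void int8_gemm(const float* A, const float* B, const float* bias, float* out,
               std::size_t A_rows, std::size_t width, std::size_t B_cols) {
  const float a_quant = 127.0f / 2.0f;              // assumed scale: 127 / max|A|
  const float b_quant = 127.0f / 2.0f;              // assumed scale: 127 / max|B|
  const float unquant = 1.0f / (a_quant * b_quant);

  std::vector<uint8_t> A_q(A_rows * width);
  std::vector<int8_t> B_q(width * B_cols);
  gemmology::Shift::PrepareA<Arch>(A, A_q.data(), a_quant, A_rows, width);
  gemmology::PrepareB<Arch>(B, B_q.data(), b_quant, width, B_cols);

  // The +127 shift applied to A adds 127 * sum(B column) to every output;
  // PrepareBias folds the opposite correction into the bias vector.
  std::vector<float> corrected_bias(bias, bias + B_cols);
  gemmology::Shift::PrepareBias<Arch>(
      B_q.data(), width, B_cols,
      gemmology::callbacks::UnquantizeAndAddBiasAndWrite(-127.0f * unquant, bias,
                                                         corrected_bias.data()));

  gemmology::Shift::Multiply<Arch>(
      A_q.data(), B_q.data(), A_rows, width, B_cols,
      gemmology::callbacks::UnquantizeAndAddBiasAndWrite(unquant, corrected_bias.data(), out));
}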
* + ****************************************************************************/ + +#ifndef XSIMD_GENERIC_ARITHMETIC_HPP +#define XSIMD_GENERIC_ARITHMETIC_HPP + +#include +#include +#include + +#include "./xsimd_generic_details.hpp" + +namespace xsimd +{ + + namespace kernel + { + + using namespace types; + + // bitwise_lshift + template ::value, void>::type*/> + XSIMD_INLINE batch bitwise_lshift(batch const& self, batch const& other, requires_arch) noexcept + { + return detail::apply([](T x, T y) noexcept + { return x << y; }, + self, other); + } + + // bitwise_rshift + template ::value, void>::type*/> + XSIMD_INLINE batch bitwise_rshift(batch const& self, batch const& other, requires_arch) noexcept + { + return detail::apply([](T x, T y) noexcept + { return x >> y; }, + self, other); + } + + // decr + template + XSIMD_INLINE batch decr(batch const& self, requires_arch) noexcept + { + return self - T(1); + } + + // decr_if + template + XSIMD_INLINE batch decr_if(batch const& self, Mask const& mask, requires_arch) noexcept + { + return select(mask, decr(self), self); + } + + // div + template ::value, void>::type> + XSIMD_INLINE batch div(batch const& self, batch const& other, requires_arch) noexcept + { + return detail::apply([](T x, T y) noexcept -> T + { return x / y; }, + self, other); + } + + // fma + template + XSIMD_INLINE batch fma(batch const& x, batch const& y, batch const& z, requires_arch) noexcept + { + return x * y + z; + } + + template + XSIMD_INLINE batch, A> fma(batch, A> const& x, batch, A> const& y, batch, A> const& z, requires_arch) noexcept + { + auto res_r = fms(x.real(), y.real(), fms(x.imag(), y.imag(), z.real())); + auto res_i = fma(x.real(), y.imag(), fma(x.imag(), y.real(), z.imag())); + return { res_r, res_i }; + } + + // fms + template + XSIMD_INLINE batch fms(batch const& x, batch const& y, batch const& z, requires_arch) noexcept + { + return x * y - z; + } + + template + XSIMD_INLINE batch, A> fms(batch, A> const& x, batch, A> const& y, batch, A> const& z, requires_arch) noexcept + { + auto res_r = fms(x.real(), y.real(), fma(x.imag(), y.imag(), z.real())); + auto res_i = fma(x.real(), y.imag(), fms(x.imag(), y.real(), z.imag())); + return { res_r, res_i }; + } + + // fnma + template + XSIMD_INLINE batch fnma(batch const& x, batch const& y, batch const& z, requires_arch) noexcept + { + return -x * y + z; + } + + template + XSIMD_INLINE batch, A> fnma(batch, A> const& x, batch, A> const& y, batch, A> const& z, requires_arch) noexcept + { + auto res_r = -fms(x.real(), y.real(), fma(x.imag(), y.imag(), z.real())); + auto res_i = -fma(x.real(), y.imag(), fms(x.imag(), y.real(), z.imag())); + return { res_r, res_i }; + } + + // fnms + template + XSIMD_INLINE batch fnms(batch const& x, batch const& y, batch const& z, requires_arch) noexcept + { + return -x * y - z; + } + + template + XSIMD_INLINE batch, A> fnms(batch, A> const& x, batch, A> const& y, batch, A> const& z, requires_arch) noexcept + { + auto res_r = -fms(x.real(), y.real(), fms(x.imag(), y.imag(), z.real())); + auto res_i = -fma(x.real(), y.imag(), fma(x.imag(), y.real(), z.imag())); + return { res_r, res_i }; + } + + // hadd + template ::value, void>::type*/> + XSIMD_INLINE T hadd(batch const& self, requires_arch) noexcept + { + alignas(A::alignment()) T buffer[batch::size]; + self.store_aligned(buffer); + T res = 0; + for (T val : buffer) + { + res += val; + } + return res; + } + + // incr + template + XSIMD_INLINE batch incr(batch const& self, requires_arch) noexcept + { + return self + T(1); + } + 
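The generic kernels above (hadd in particular) fall back to the same pattern whenever no native instruction exists: spill the register to an aligned buffer, run the scalar operation, and reload. A stand-alone illustration of that pattern, assuming xsimd is on the include path:

#include <xsimd/xsimd.hpp>

// Same result as xsimd::reduce_add(v), written out the way the generic
// hadd kernel does it.
template <class T, class A>
T horizontal_add_fallback(xsimd::batch<T, A> const& v) {
  alignas(A::alignment()) T buffer[xsimd::batch<T, A>::size];
  v.store_aligned(buffer);        // spill the register
  T sum = T(0);
  for (T x : buffer)
    sum += x;                     // scalar reduction
  return sum;
}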
+ // incr_if + template + XSIMD_INLINE batch incr_if(batch const& self, Mask const& mask, requires_arch) noexcept + { + return select(mask, incr(self), self); + } + + // mul + template ::value, void>::type*/> + XSIMD_INLINE batch mul(batch const& self, batch const& other, requires_arch) noexcept + { + return detail::apply([](T x, T y) noexcept -> T + { return x * y; }, + self, other); + } + + // rotl + template + XSIMD_INLINE batch rotl(batch const& self, STy other, requires_arch) noexcept + { + constexpr auto N = std::numeric_limits::digits; + return (self << other) | (self >> (N - other)); + } + + // rotr + template + XSIMD_INLINE batch rotr(batch const& self, STy other, requires_arch) noexcept + { + constexpr auto N = std::numeric_limits::digits; + return (self >> other) | (self << (N - other)); + } + + // sadd + template + XSIMD_INLINE batch sadd(batch const& self, batch const& other, requires_arch) noexcept + { + return add(self, other); // no saturated arithmetic on floating point numbers + } + template ::value, void>::type*/> + XSIMD_INLINE batch sadd(batch const& self, batch const& other, requires_arch) noexcept + { + if (std::is_signed::value) + { + auto mask = (other >> (8 * sizeof(T) - 1)); + auto self_pos_branch = min(std::numeric_limits::max() - other, self); + auto self_neg_branch = max(std::numeric_limits::min() - other, self); + return other + select(batch_bool(mask.data), self_neg_branch, self_pos_branch); + } + else + { + const auto diffmax = std::numeric_limits::max() - self; + const auto mindiff = min(diffmax, other); + return self + mindiff; + } + } + template + XSIMD_INLINE batch sadd(batch const& self, batch const& other, requires_arch) noexcept + { + return add(self, other); // no saturated arithmetic on floating point numbers + } + + // ssub + template + XSIMD_INLINE batch ssub(batch const& self, batch const& other, requires_arch) noexcept + { + return sub(self, other); // no saturated arithmetic on floating point numbers + } + template ::value, void>::type*/> + XSIMD_INLINE batch ssub(batch const& self, batch const& other, requires_arch) noexcept + { + if (std::is_signed::value) + { + return sadd(self, -other); + } + else + { + const auto diff = min(self, other); + return self - diff; + } + } + template + XSIMD_INLINE batch ssub(batch const& self, batch const& other, requires_arch) noexcept + { + return sub(self, other); // no saturated arithmetic on floating point numbers + } + + } + +} + +#endif diff --git a/include/onnxruntime/xsimd/arch/generic/xsimd_generic_complex.hpp b/include/onnxruntime/xsimd/arch/generic/xsimd_generic_complex.hpp new file mode 100644 index 0000000000000..812c592aec03c --- /dev/null +++ b/include/onnxruntime/xsimd/arch/generic/xsimd_generic_complex.hpp @@ -0,0 +1,108 @@ +/*************************************************************************** + * Copyright (c) Johan Mabille, Sylvain Corlay, Wolf Vollprecht and * + * Martin Renou * + * Copyright (c) QuantStack * + * Copyright (c) Serge Guelton * + * * + * Distributed under the terms of the BSD 3-Clause License. * + * * + * The full license is in the file LICENSE, distributed with this software. 
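The saturated kernels above avoid branches by clamping each lane against its remaining headroom before adding. A scalar rendition of the same logic for 8-bit operands (helper names are illustrative):

#include <algorithm>
#include <cstdint>
#include <limits>

// Signed case: the vector code selects on the sign bit of `other`; here a
// plain branch makes the clamping explicit.
inline int8_t sadd_scalar(int8_t self, int8_t other) {
  constexpr int maxv = std::numeric_limits<int8_t>::max();
  constexpr int minv = std::numeric_limits<int8_t>::min();
  if (other >= 0)
    return int8_t(std::min(maxv - other, int(self)) + other); // keep self+other <= 127
  return int8_t(std::max(minv - other, int(self)) + other);   // keep self+other >= -128
}

// Unsigned case: add at most the remaining headroom.
inline uint8_t sadd_scalar(uint8_t self, uint8_t other) {
  const uint8_t diffmax = uint8_t(std::numeric_limits<uint8_t>::max() - self);
  return uint8_t(self + std::min(diffmax, other));
}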
* + ****************************************************************************/ + +#ifndef XSIMD_GENERIC_COMPLEX_HPP +#define XSIMD_GENERIC_COMPLEX_HPP + +#include + +#include "./xsimd_generic_details.hpp" + +namespace xsimd +{ + + namespace kernel + { + + using namespace types; + + // real + template + XSIMD_INLINE batch real(batch const& self, requires_arch) noexcept + { + return self; + } + + template + XSIMD_INLINE batch real(batch, A> const& self, requires_arch) noexcept + { + return self.real(); + } + + // imag + template + XSIMD_INLINE batch imag(batch const& /*self*/, requires_arch) noexcept + { + return batch(T(0)); + } + + template + XSIMD_INLINE batch imag(batch, A> const& self, requires_arch) noexcept + { + return self.imag(); + } + + // arg + template + XSIMD_INLINE real_batch_type_t> arg(batch const& self, requires_arch) noexcept + { + return atan2(imag(self), real(self)); + } + + // conj + template + XSIMD_INLINE complex_batch_type_t> conj(batch const& self, requires_arch) noexcept + { + return { real(self), -imag(self) }; + } + + // norm + template + XSIMD_INLINE real_batch_type_t> norm(batch const& self, requires_arch) noexcept + { + return { fma(real(self), real(self), imag(self) * imag(self)) }; + } + + // proj + template + XSIMD_INLINE complex_batch_type_t> proj(batch const& self, requires_arch) noexcept + { + using batch_type = complex_batch_type_t>; + using real_batch = typename batch_type::real_batch; + using real_value_type = typename real_batch::value_type; + auto cond = xsimd::isinf(real(self)) || xsimd::isinf(imag(self)); + return select(cond, + batch_type(constants::infinity(), + copysign(real_batch(real_value_type(0)), imag(self))), + batch_type(self)); + } + + template + XSIMD_INLINE batch_bool isnan(batch, A> const& self, requires_arch) noexcept + { + return batch_bool(isnan(self.real()) || isnan(self.imag())); + } + + template + XSIMD_INLINE batch_bool isinf(batch, A> const& self, requires_arch) noexcept + { + return batch_bool(isinf(self.real()) || isinf(self.imag())); + } + + template + XSIMD_INLINE batch_bool isfinite(batch, A> const& self, requires_arch) noexcept + { + return batch_bool(isfinite(self.real()) && isfinite(self.imag())); + } + } +} + +#endif diff --git a/include/onnxruntime/xsimd/arch/generic/xsimd_generic_details.hpp b/include/onnxruntime/xsimd/arch/generic/xsimd_generic_details.hpp new file mode 100644 index 0000000000000..a9af608c88c56 --- /dev/null +++ b/include/onnxruntime/xsimd/arch/generic/xsimd_generic_details.hpp @@ -0,0 +1,316 @@ +/*************************************************************************** + * Copyright (c) Johan Mabille, Sylvain Corlay, Wolf Vollprecht and * + * Martin Renou * + * Copyright (c) QuantStack * + * Copyright (c) Serge Guelton * + * * + * Distributed under the terms of the BSD 3-Clause License. * + * * + * The full license is in the file LICENSE, distributed with this software. * + ****************************************************************************/ + +#ifndef XSIMD_GENERIC_DETAILS_HPP +#define XSIMD_GENERIC_DETAILS_HPP + +#include + +#include "../../math/xsimd_rem_pio2.hpp" +#include "../../types/xsimd_generic_arch.hpp" +#include "../../types/xsimd_utils.hpp" +#include "../xsimd_constants.hpp" + +namespace xsimd +{ + // Forward declaration. Should we put them in a separate file? 
+ template + XSIMD_INLINE batch abs(batch const& self) noexcept; + template + XSIMD_INLINE batch abs(batch, A> const& self) noexcept; + template + XSIMD_INLINE bool any(batch_bool const& self) noexcept; + template + XSIMD_INLINE batch atan2(batch const& self, batch const& other) noexcept; + template + XSIMD_INLINE batch batch_cast(batch const&, batch const& out) noexcept; + template + XSIMD_INLINE batch bitofsign(batch const& self) noexcept; + template + XSIMD_INLINE batch bitwise_cast(batch const& self) noexcept; + template + XSIMD_INLINE batch cos(batch const& self) noexcept; + template + XSIMD_INLINE batch cosh(batch const& self) noexcept; + template + XSIMD_INLINE batch exp(batch const& self) noexcept; + template + XSIMD_INLINE batch fma(batch const& x, batch const& y, batch const& z) noexcept; + template + XSIMD_INLINE batch fms(batch const& x, batch const& y, batch const& z) noexcept; + template + XSIMD_INLINE batch frexp(const batch& x, const batch, A>& e) noexcept; + template + XSIMD_INLINE batch horner(const batch& self) noexcept; + template + XSIMD_INLINE batch hypot(const batch& self) noexcept; + template + XSIMD_INLINE batch_bool is_even(batch const& self) noexcept; + template + XSIMD_INLINE batch_bool is_flint(batch const& self) noexcept; + template + XSIMD_INLINE batch_bool is_odd(batch const& self) noexcept; + template + XSIMD_INLINE typename batch::batch_bool_type isinf(batch const& self) noexcept; + template + XSIMD_INLINE typename batch::batch_bool_type isfinite(batch const& self) noexcept; + template + XSIMD_INLINE typename batch::batch_bool_type isnan(batch const& self) noexcept; + template + XSIMD_INLINE batch ldexp(const batch& x, const batch, A>& e) noexcept; + template + XSIMD_INLINE batch log(batch const& self) noexcept; + template + XSIMD_INLINE batch nearbyint(batch const& self) noexcept; + template + XSIMD_INLINE batch, A> nearbyint_as_int(const batch& x) noexcept; + template + XSIMD_INLINE T reduce_add(batch const&) noexcept; + template + XSIMD_INLINE batch select(batch_bool const&, batch const&, batch const&) noexcept; + template + XSIMD_INLINE batch, A> select(batch_bool const&, batch, A> const&, batch, A> const&) noexcept; + template + XSIMD_INLINE batch sign(batch const& self) noexcept; + template + XSIMD_INLINE batch signnz(batch const& self) noexcept; + template + XSIMD_INLINE batch sin(batch const& self) noexcept; + template + XSIMD_INLINE batch sinh(batch const& self) noexcept; + template + XSIMD_INLINE std::pair, batch> sincos(batch const& self) noexcept; + template + XSIMD_INLINE batch sqrt(batch const& self) noexcept; + template + XSIMD_INLINE batch tan(batch const& self) noexcept; + template + XSIMD_INLINE batch, A> to_float(batch const& self) noexcept; + template + XSIMD_INLINE batch, A> to_int(batch const& self) noexcept; + template + XSIMD_INLINE batch trunc(batch const& self) noexcept; + + namespace kernel + { + + namespace detail + { + template + XSIMD_INLINE batch apply(F&& func, batch const& self, batch const& other) noexcept + { + constexpr std::size_t size = batch::size; + alignas(A::alignment()) T self_buffer[size]; + alignas(A::alignment()) T other_buffer[size]; + self.store_aligned(&self_buffer[0]); + other.store_aligned(&other_buffer[0]); + for (std::size_t i = 0; i < size; ++i) + { + self_buffer[i] = func(self_buffer[i], other_buffer[i]); + } + return batch::load_aligned(self_buffer); + } + + template + XSIMD_INLINE batch apply_transform(F&& func, batch const& self) noexcept + { + static_assert(batch::size == batch::size, + "Source and 
destination sizes must match"); + constexpr std::size_t src_size = batch::size; + constexpr std::size_t dest_size = batch::size; + alignas(A::alignment()) T self_buffer[src_size]; + alignas(A::alignment()) U other_buffer[dest_size]; + self.store_aligned(&self_buffer[0]); + for (std::size_t i = 0; i < src_size; ++i) + { + other_buffer[i] = func(self_buffer[i]); + } + return batch::load_aligned(other_buffer); + } + } + + // some generic fast_cast conversion + namespace detail + { + template + XSIMD_INLINE batch fast_cast(batch const& self, batch const&, requires_arch) noexcept + { + return bitwise_cast(self); + } + template + XSIMD_INLINE batch fast_cast(batch const& self, batch const&, requires_arch) noexcept + { + return bitwise_cast(self); + } + template + XSIMD_INLINE batch fast_cast(batch const& self, batch const&, requires_arch) noexcept + { + return bitwise_cast(self); + } + template + XSIMD_INLINE batch fast_cast(batch const& self, batch const&, requires_arch) noexcept + { + return bitwise_cast(self); + } + template + XSIMD_INLINE batch fast_cast(batch const& self, batch const&, requires_arch) noexcept + { + return bitwise_cast(self); + } + template + XSIMD_INLINE batch fast_cast(batch const& self, batch const&, requires_arch) noexcept + { + return bitwise_cast(self); + } + template + XSIMD_INLINE batch fast_cast(batch const& self, batch const&, requires_arch) noexcept + { + return bitwise_cast(self); + } + template + XSIMD_INLINE batch fast_cast(batch const& self, batch const&, requires_arch) noexcept + { + return bitwise_cast(self); + } + + // Provide a generic uint32_t -> float cast only if we have a + // non-generic int32_t -> float fast_cast + template const&>(), std::declval const&>(), A {}))> + XSIMD_INLINE batch fast_cast(batch const& v, batch const&, requires_arch) noexcept + { + // see https://stackoverflow.com/questions/34066228/how-to-perform-uint32-float-conversion-with-sse + batch msk_lo(0xFFFF); + batch cnst65536f(65536.0f); + + auto v_lo = batch_cast(v & msk_lo); /* extract the 16 lowest significant bits of self */ + auto v_hi = batch_cast(v >> 16); /* 16 most significant bits of v */ + auto v_lo_flt = batch_cast(v_lo); /* No rounding */ + auto v_hi_flt = batch_cast(v_hi); /* No rounding */ + v_hi_flt = cnst65536f * v_hi_flt; /* No rounding */ + return v_hi_flt + v_lo_flt; /* Rounding may occur here, mul and add may fuse to fma for haswell and newer */ + } + + // Provide a generic float -> uint32_t cast only if we have a + // non-generic float -> int32_t fast_cast + template const&>(), std::declval const&>(), A {}))> + XSIMD_INLINE batch fast_cast(batch const& v, batch const&, requires_arch) noexcept + { + auto is_large = v >= batch(1u << 31); + auto small_v = bitwise_cast(batch_cast(v)); + auto large_v = bitwise_cast( + batch_cast(v - batch(1u << 31)) + ^ batch(1u << 31)); + return bitwise_cast(select(is_large, large_v, small_v)); + } + } + + namespace detail + { + // Generic conversion handling machinery. Each architecture must define + // conversion function when such conversions exits in the form of + // intrinsic. 
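The uint32_t to float fast_cast above works because each 16-bit half converts exactly through the signed int32 path; only the final addition can round. A scalar equivalent:

#include <cstdint>

inline float u32_to_float(uint32_t v) {
  const float lo = static_cast<float>(static_cast<int32_t>(v & 0xFFFFu)); // exact
  const float hi = static_cast<float>(static_cast<int32_t>(v >> 16));     // exact
  return 65536.0f * hi + lo; // single rounding step, may fuse into an FMA
}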
Then we use that information to automatically decide whether + // to use scalar or vector conversion when doing load / store / batch_cast + struct with_fast_conversion + { + }; + struct with_slow_conversion + { + }; + + template + struct conversion_type_impl + { + using type = with_slow_conversion; + }; + + using xsimd::detail::void_t; + + template + struct conversion_type_impl&>(), + std::declval&>(), + std::declval()))>> + { + using type = with_fast_conversion; + }; + + template + using conversion_type = typename conversion_type_impl::type; + } + + namespace detail + { + /* origin: boost/simdfunction/horn.hpp*/ + /* + * ==================================================== + * copyright 2016 NumScale SAS + * + * Distributed under the Boost Software License, Version 1.0. + * (See copy at http://boost.org/LICENSE_1_0.txt) + * ==================================================== + */ + template + XSIMD_INLINE B coef() noexcept + { + using value_type = typename B::value_type; + return B(bit_cast(as_unsigned_integer_t(c))); + } + template + XSIMD_INLINE B horner(const B&) noexcept + { + return B(typename B::value_type(0.)); + } + + template + XSIMD_INLINE B horner(const B&) noexcept + { + return coef(); + } + + template + XSIMD_INLINE B horner(const B& self) noexcept + { + return fma(self, horner(self), coef()); + } + + /* origin: boost/simdfunction/horn1.hpp*/ + /* + * ==================================================== + * copyright 2016 NumScale SAS + * + * Distributed under the Boost Software License, Version 1.0. + * (See copy at http://boost.org/LICENSE_1_0.txt) + * ==================================================== + */ + template + XSIMD_INLINE B horner1(const B&) noexcept + { + return B(1.); + } + + template + XSIMD_INLINE B horner1(const B& x) noexcept + { + return x + detail::coef(); + } + + template + XSIMD_INLINE B horner1(const B& x) noexcept + { + return fma(x, horner1(x), detail::coef()); + } + } + + } + +} + +#endif diff --git a/include/onnxruntime/xsimd/arch/generic/xsimd_generic_logical.hpp b/include/onnxruntime/xsimd/arch/generic/xsimd_generic_logical.hpp new file mode 100644 index 0000000000000..4f5dd8e4bd04e --- /dev/null +++ b/include/onnxruntime/xsimd/arch/generic/xsimd_generic_logical.hpp @@ -0,0 +1,208 @@ +/*************************************************************************** + * Copyright (c) Johan Mabille, Sylvain Corlay, Wolf Vollprecht and * + * Martin Renou * + * Copyright (c) QuantStack * + * Copyright (c) Serge Guelton * + * * + * Distributed under the terms of the BSD 3-Clause License. * + * * + * The full license is in the file LICENSE, distributed with this software. 
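The horner/coef helpers above store polynomial coefficients as the raw bit pattern of their IEEE-754 value, decode them with a bit cast, and fold the polynomial with fma. A scalar sketch of the same idea for three coefficients:

#include <cmath>
#include <cstdint>
#include <cstring>

// Same role as xsimd's bit_cast of an encoded coefficient.
inline float coef_from_bits(uint32_t bits) {
  float f;
  std::memcpy(&f, &bits, sizeof f);
  return f;
}

// p(x) = c0 + x*(c1 + x*c2), folded with fma like the vector horner.
inline float horner3(float x, uint32_t c0, uint32_t c1, uint32_t c2) {
  float acc = coef_from_bits(c2);
  acc = std::fma(x, acc, coef_from_bits(c1));
  return std::fma(x, acc, coef_from_bits(c0));
}
// e.g. horner3(x, 0x3f800000, 0x3f000000, 0x3e800000) evaluates 1 + 0.5*x + 0.25*x^2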
* + ****************************************************************************/ + +#ifndef XSIMD_GENERIC_LOGICAL_HPP +#define XSIMD_GENERIC_LOGICAL_HPP + +#include "./xsimd_generic_details.hpp" + +#include + +namespace xsimd +{ + + namespace kernel + { + + using namespace types; + + // count + template + XSIMD_INLINE size_t count(batch_bool const& self, requires_arch) noexcept + { + uint64_t m = self.mask(); + XSIMD_IF_CONSTEXPR(batch_bool::size < 14) + { + // https://graphics.stanford.edu/~seander/bithacks.html#CountBitsSet64 + return (m * 0x200040008001ULL & 0x111111111111111ULL) % 0xf; + } + else + { +#if defined __has_builtin +#if __has_builtin(__builtin_popcountg) +#define builtin_popcount(v) __builtin_popcountg(v) +#endif +#endif + +#ifdef builtin_popcount + return builtin_popcount(m); +#else + // FIXME: we could do better by dispatching to the appropriate + // popcount instruction depending on the arch... + XSIMD_IF_CONSTEXPR(batch_bool::size <= 32) + { + uint32_t m32 = static_cast(m); + // https://graphics.stanford.edu/~seander/bithacks.html#CountBitsSetParallel + m32 = m32 - ((m32 >> 1) & 0x55555555); // reuse input as temporary + m32 = (m32 & 0x33333333) + ((m32 >> 2) & 0x33333333); // temp + return (((m32 + (m32 >> 4)) & 0xF0F0F0F) * 0x1010101) >> 24; // count + } + else + { + // https://graphics.stanford.edu/~seander/bithacks.html#CountBitsSetParallel + m = m - ((m >> 1) & (uint64_t) ~(uint64_t)0 / 3); // temp + m = (m & (uint64_t) ~(uint64_t)0 / 15 * 3) + ((m >> 2) & (uint64_t) ~(uint64_t)0 / 15 * 3); // temp + m = (m + (m >> 4)) & (uint64_t) ~(uint64_t)0 / 255 * 15; // temp + return (m * ((uint64_t) ~(uint64_t)0 / 255)) >> (sizeof(uint64_t) - 1) * CHAR_BIT; // count + } +#endif + } + } + + // from mask + template + XSIMD_INLINE batch_bool from_mask(batch_bool const&, uint64_t mask, requires_arch) noexcept + { + alignas(A::alignment()) bool buffer[batch_bool::size]; + // This is inefficient but should never be called. It's just a + // temporary implementation until arm support is added. 
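count() above uses the classic SWAR popcount when no builtin is available (plus a multiply-and-modulus shortcut for masks narrower than 14 bits). The 32-bit fallback, written out as a stand-alone function:

#include <cstdint>

inline uint32_t popcount32(uint32_t m) {
  m = m - ((m >> 1) & 0x55555555u);                              // 2-bit partial sums
  m = (m & 0x33333333u) + ((m >> 2) & 0x33333333u);              // 4-bit partial sums
  return (((m + (m >> 4)) & 0x0F0F0F0Fu) * 0x01010101u) >> 24;   // fold the byte sums
}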
+ for (size_t i = 0; i < batch_bool::size; ++i) + buffer[i] = mask & (1ull << i); + return batch_bool::load_aligned(buffer); + } + + // ge + template + XSIMD_INLINE batch_bool ge(batch const& self, batch const& other, requires_arch) noexcept + { + return other <= self; + } + + // gt + template + XSIMD_INLINE batch_bool gt(batch const& self, batch const& other, requires_arch) noexcept + { + return other < self; + } + + // is_even + template + XSIMD_INLINE batch_bool is_even(batch const& self, requires_arch) noexcept + { + return is_flint(self * T(0.5)); + } + + // is_flint + template + XSIMD_INLINE batch_bool is_flint(batch const& self, requires_arch) noexcept + { + auto frac = select(isnan(self - self), constants::nan>(), self - trunc(self)); + return frac == T(0.); + } + + // is_odd + template + XSIMD_INLINE batch_bool is_odd(batch const& self, requires_arch) noexcept + { + return is_even(self - T(1.)); + } + + // isinf + template ::value, void>::type> + XSIMD_INLINE batch_bool isinf(batch const&, requires_arch) noexcept + { + return batch_bool(false); + } + template + XSIMD_INLINE batch_bool isinf(batch const& self, requires_arch) noexcept + { + return abs(self) == std::numeric_limits::infinity(); + } + template + XSIMD_INLINE batch_bool isinf(batch const& self, requires_arch) noexcept + { + return abs(self) == std::numeric_limits::infinity(); + } + + // isfinite + template ::value, void>::type> + XSIMD_INLINE batch_bool isfinite(batch const&, requires_arch) noexcept + { + return batch_bool(true); + } + template + XSIMD_INLINE batch_bool isfinite(batch const& self, requires_arch) noexcept + { + return (self - self) == 0.f; + } + template + XSIMD_INLINE batch_bool isfinite(batch const& self, requires_arch) noexcept + { + return (self - self) == 0.; + } + + // isnan + template ::value, void>::type> + XSIMD_INLINE batch_bool isnan(batch const&, requires_arch) noexcept + { + return batch_bool(false); + } + + // le + template ::value, void>::type> + XSIMD_INLINE batch_bool le(batch const& self, batch const& other, requires_arch) noexcept + { + return (self < other) || (self == other); + } + + // neq + template + XSIMD_INLINE batch_bool neq(batch const& self, batch const& other, requires_arch) noexcept + { + return !(other == self); + } + + // logical_and + template + XSIMD_INLINE batch logical_and(batch const& self, batch const& other, requires_arch) noexcept + { + return detail::apply([](T x, T y) noexcept + { return x && y; }, + self, other); + } + + // logical_or + template + XSIMD_INLINE batch logical_or(batch const& self, batch const& other, requires_arch) noexcept + { + return detail::apply([](T x, T y) noexcept + { return x || y; }, + self, other); + } + + // mask + template + XSIMD_INLINE uint64_t mask(batch_bool const& self, requires_arch) noexcept + { + alignas(A::alignment()) bool buffer[batch_bool::size]; + self.store_aligned(buffer); + // This is inefficient but should never be called. It's just a + // temporary implementation until arm support is added. 
+ uint64_t res = 0; + for (size_t i = 0; i < batch_bool::size; ++i) + if (buffer[i]) + res |= 1ul << i; + return res; + } + } +} + +#endif diff --git a/include/onnxruntime/xsimd/arch/generic/xsimd_generic_math.hpp b/include/onnxruntime/xsimd/arch/generic/xsimd_generic_math.hpp new file mode 100644 index 0000000000000..b8db7f805d141 --- /dev/null +++ b/include/onnxruntime/xsimd/arch/generic/xsimd_generic_math.hpp @@ -0,0 +1,2499 @@ +/*************************************************************************** + * Copyright (c) Johan Mabille, Sylvain Corlay, Wolf Vollprecht and * + * Martin Renou * + * Copyright (c) QuantStack * + * Copyright (c) Serge Guelton * + * * + * Distributed under the terms of the BSD 3-Clause License. * + * * + * The full license is in the file LICENSE, distributed with this software. * + ****************************************************************************/ + +#ifndef XSIMD_GENERIC_MATH_HPP +#define XSIMD_GENERIC_MATH_HPP + +#include "../xsimd_scalar.hpp" +#include "./xsimd_generic_details.hpp" +#include "./xsimd_generic_trigo.hpp" + +#include + +namespace xsimd +{ + + namespace kernel + { + + using namespace types; + // abs + template + XSIMD_INLINE batch abs(batch const& self, requires_arch) noexcept + { + if (std::is_unsigned::value) + return self; + else + { + auto sign = bitofsign(self); + auto inv = self ^ sign; + return inv - sign; + } + } + + template + XSIMD_INLINE batch abs(batch, A> const& z, requires_arch) noexcept + { + return hypot(z.real(), z.imag()); + } + + // avg + namespace detail + { + template + XSIMD_INLINE batch avg(batch const& x, batch const& y, std::true_type, std::false_type) noexcept + { + return (x & y) + ((x ^ y) >> 1); + } + + template + XSIMD_INLINE batch avg(batch const& x, batch const& y, std::true_type, std::true_type) noexcept + { + // Inspired by + // https://stackoverflow.com/questions/5697500/take-the-average-of-two-signed-numbers-in-c + auto t = (x & y) + ((x ^ y) >> 1); + auto t_u = bitwise_cast::type>(t); + auto avg = t + (bitwise_cast(t_u >> (8 * sizeof(T) - 1)) & (x ^ y)); + return avg; + } + + template + XSIMD_INLINE batch avg(batch const& x, batch const& y, std::false_type, std::true_type) noexcept + { + return (x + y) / 2; + } + } + + template + XSIMD_INLINE batch avg(batch const& x, batch const& y, requires_arch) noexcept + { + return detail::avg(x, y, typename std::is_integral::type {}, typename std::is_signed::type {}); + } + + // avgr + namespace detail + { + template + XSIMD_INLINE batch avgr(batch const& x, batch const& y, std::true_type) noexcept + { + constexpr unsigned shift = 8 * sizeof(T) - 1; + auto adj = std::is_signed::value ? 
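Two idioms from the kernels above, in scalar form: abs negates through the sign mask without branching, and avg computes the unsigned mean without risking overflow in the intermediate sum.

#include <cstdint>

inline int32_t abs_branchless(int32_t x) {
  const int32_t sign = x >> 31;     // all ones if negative, else zero
  return (x ^ sign) - sign;         // conditional two's-complement negate
}

inline uint32_t avg_no_overflow(uint32_t x, uint32_t y) {
  return (x & y) + ((x ^ y) >> 1);  // common bits + half of the differing bits
}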
((x ^ y) & 0x1) : (((x ^ y) << shift) >> shift); + return ::xsimd::kernel::avg(x, y, A {}) + adj; + } + + template + XSIMD_INLINE batch avgr(batch const& x, batch const& y, std::false_type) noexcept + { + return ::xsimd::kernel::avg(x, y, A {}); + } + } + + template + XSIMD_INLINE batch avgr(batch const& x, batch const& y, requires_arch) noexcept + { + return detail::avgr(x, y, typename std::is_integral::type {}); + } + + // batch_cast + template + XSIMD_INLINE batch batch_cast(batch const& self, batch const&, requires_arch) noexcept + { + return self; + } + + namespace detail + { + template + XSIMD_INLINE batch batch_cast(batch const& self, batch const& out, requires_arch, with_fast_conversion) noexcept + { + return fast_cast(self, out, A {}); + } + template + XSIMD_INLINE batch batch_cast(batch const& self, batch const&, requires_arch, with_slow_conversion) noexcept + { + static_assert(!std::is_same::value, "there should be no conversion for this type combination"); + using batch_type_in = batch; + using batch_type_out = batch; + static_assert(batch_type_in::size == batch_type_out::size, "compatible sizes"); + alignas(A::alignment()) T_in buffer_in[batch_type_in::size]; + alignas(A::alignment()) T_out buffer_out[batch_type_out::size]; + self.store_aligned(&buffer_in[0]); + std::copy(std::begin(buffer_in), std::end(buffer_in), std::begin(buffer_out)); + return batch_type_out::load_aligned(buffer_out); + } + + } + + template + XSIMD_INLINE batch batch_cast(batch const& self, batch const& out, requires_arch) noexcept + { + return detail::batch_cast(self, out, A {}, detail::conversion_type {}); + } + + // bitofsign + template + XSIMD_INLINE batch bitofsign(batch const& self, requires_arch) noexcept + { + static_assert(std::is_integral::value, "int type implementation"); + if (std::is_unsigned::value) + return batch(0); + else + return self >> (T)(8 * sizeof(T) - 1); + } + + template + XSIMD_INLINE batch bitofsign(batch const& self, requires_arch) noexcept + { + return self & constants::signmask>(); + } + template + XSIMD_INLINE batch bitofsign(batch const& self, requires_arch) noexcept + { + return self & constants::signmask>(); + } + + // bitwise_cast + template + XSIMD_INLINE batch bitwise_cast(batch const& self, batch const&, requires_arch) noexcept + { + return self; + } + + // cbrt + /* origin: boost/simd/arch/common/simd/function/cbrt.hpp */ + /* + * ==================================================== + * copyright 2016 NumScale SAS + * + * Distributed under the Boost Software License, Version 1.0. 
+ * (See copy at http://boost.org/LICENSE_1_0.txt) + * ==================================================== + */ + template + XSIMD_INLINE batch cbrt(batch const& self, requires_arch) noexcept + { + using batch_type = batch; + batch_type z = abs(self); +#ifndef XSIMD_NO_DENORMALS + auto denormal = z < constants::smallestposval(); + z = select(denormal, z * constants::twotonmb(), z); + batch_type f = select(denormal, constants::twotonmbo3(), batch_type(1.)); +#endif + const batch_type CBRT2(bit_cast(0x3fa14518)); + const batch_type CBRT4(bit_cast(0x3fcb2ff5)); + const batch_type CBRT2I(bit_cast(0x3f4b2ff5)); + const batch_type CBRT4I(bit_cast(0x3f214518)); + using i_type = as_integer_t; + i_type e; + batch_type x = frexp(z, e); + x = detail::horner(x); + auto flag = e >= i_type(0); + i_type e1 = abs(e); + i_type rem = e1; + e1 /= i_type(3); + rem -= e1 * i_type(3); + e = e1 * sign(e); + const batch_type cbrt2 = select(batch_bool_cast(flag), CBRT2, CBRT2I); + const batch_type cbrt4 = select(batch_bool_cast(flag), CBRT4, CBRT4I); + batch_type fact = select(batch_bool_cast(rem == i_type(1)), cbrt2, batch_type(1.)); + fact = select(batch_bool_cast(rem == i_type(2)), cbrt4, fact); + x = ldexp(x * fact, e); + x -= (x - z / (x * x)) * batch_type(1.f / 3.f); +#ifndef XSIMD_NO_DENORMALS + x = (x | bitofsign(self)) * f; +#else + x = x | bitofsign(self); +#endif +#ifndef XSIMD_NO_INFINITIES + return select(self == batch_type(0.) || isinf(self), self, x); +#else + return select(self == batch_type(0.), self, x); +#endif + } + + template + XSIMD_INLINE batch cbrt(batch const& self, requires_arch) noexcept + { + using batch_type = batch; + batch_type z = abs(self); +#ifndef XSIMD_NO_DENORMALS + auto denormal = z < constants::smallestposval(); + z = select(denormal, z * constants::twotonmb(), z); + batch_type f = select(denormal, constants::twotonmbo3(), batch_type(1.)); +#endif + const batch_type CBRT2(bit_cast(int64_t(0x3ff428a2f98d728b))); + const batch_type CBRT4(bit_cast(int64_t(0x3ff965fea53d6e3d))); + const batch_type CBRT2I(bit_cast(int64_t(0x3fe965fea53d6e3d))); + const batch_type CBRT4I(bit_cast(int64_t(0x3fe428a2f98d728b))); + using i_type = as_integer_t; + i_type e; + batch_type x = frexp(z, e); + x = detail::horner(x); + auto flag = e >= typename i_type::value_type(0); + i_type e1 = abs(e); + i_type rem = e1; + e1 /= i_type(3); + rem -= e1 * i_type(3); + e = e1 * sign(e); + const batch_type cbrt2 = select(batch_bool_cast(flag), CBRT2, CBRT2I); + const batch_type cbrt4 = select(batch_bool_cast(flag), CBRT4, CBRT4I); + batch_type fact = select(batch_bool_cast(rem == i_type(1)), cbrt2, batch_type(1.)); + fact = select(batch_bool_cast(rem == i_type(2)), cbrt4, fact); + x = ldexp(x * fact, e); + x -= (x - z / (x * x)) * batch_type(1. / 3.); + x -= (x - z / (x * x)) * batch_type(1. / 3.); +#ifndef XSIMD_NO_DENORMALS + x = (x | bitofsign(self)) * f; +#else + x = x | bitofsign(self); +#endif +#ifndef XSIMD_NO_INFINITIES + return select(self == batch_type(0.) 
|| isinf(self), self, x); +#else + return select(self == batch_type(0.), self, x); +#endif + } + + // clip + template + XSIMD_INLINE batch clip(batch const& self, batch const& lo, batch const& hi, requires_arch) noexcept + { + return min(hi, max(self, lo)); + } + + // copysign + template ::value, void>::type> + XSIMD_INLINE batch copysign(batch const& self, batch const& other, requires_arch) noexcept + { + return abs(self) | bitofsign(other); + } + + // erf + + namespace detail + { + /* origin: boost/simd/arch/common/detail/generic/erf_kernel.hpp */ + /* + * ==================================================== + * copyright 2016 NumScale SAS + * + * Distributed under the Boost Software License, Version 1.0. + * (See copy at http://boost.org/LICENSE_1_0.txt) + * ==================================================== + */ + template + struct erf_kernel; + + template + struct erf_kernel> + { + using batch_type = batch; + // computes erf(a0)/a0 + // x is sqr(a0) and 0 <= abs(a0) <= 2/3 + static XSIMD_INLINE batch_type erf1(const batch_type& x) noexcept + { + return detail::horner(x); + } + + // computes erfc(x)*exp(sqr(x)) + // x >= 2/3 + static XSIMD_INLINE batch_type erfc2(const batch_type& x) noexcept + { + return detail::horner(x); + } + + static XSIMD_INLINE batch_type erfc3(const batch_type& x) noexcept + { + return (batch_type(1.) - x) * detail::horner(x); + } + }; + + template + struct erf_kernel> + { + using batch_type = batch; + // computes erf(a0)/a0 + // x is sqr(a0) and 0 <= abs(a0) <= 0.65 + static XSIMD_INLINE batch_type erf1(const batch_type& x) noexcept + { + return detail::horner(x) + / detail::horner(x); + } + + // computes erfc(x)*exp(x*x) + // 0.65 <= abs(x) <= 2.2 + static XSIMD_INLINE batch_type erfc2(const batch_type& x) noexcept + { + return detail::horner(x) + / detail::horner(x); + } + + // computes erfc(x)*exp(x*x) + // 2.2 <= abs(x) <= 6 + static XSIMD_INLINE batch_type erfc3(const batch_type& x) noexcept + { + return detail::horner(x) + / detail::horner(x); + } + + // computes erfc(rx)*exp(rx*rx) + // x >= 6 rx = 1/x + static XSIMD_INLINE batch_type erfc4(const batch_type& x) noexcept + { + return detail::horner(x); + } + }; + } + /* origin: boost/simd/arch/common/simd/function/erf.hpp */ + /* + * ==================================================== + * copyright 2016 NumScale SAS + * + * Distributed under the Boost Software License, Version 1.0. + * (See copy at http://boost.org/LICENSE_1_0.txt) + * ==================================================== + */ + + template + XSIMD_INLINE batch erf(batch const& self, requires_arch) noexcept + { + using batch_type = batch; + batch_type x = abs(self); + batch_type r1(0.); + auto test1 = x < batch_type(2.f / 3.f); + if (any(test1)) + { + r1 = self * detail::erf_kernel::erf1(x * x); + if (all(test1)) + return r1; + } + batch_type z = x / (batch_type(1.) + x); + z -= batch_type(0.4f); + batch_type r2 = batch_type(1.) 
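copysign above is pure bit manipulation: clear the sign bit of the magnitude argument and OR in the sign bit of the other. The scalar equivalent for single precision:

#include <cstdint>
#include <cstring>

inline float copysign_bits(float x, float y) {
  uint32_t xb, yb;
  std::memcpy(&xb, &x, sizeof xb);
  std::memcpy(&yb, &y, sizeof yb);
  const uint32_t out = (xb & 0x7FFFFFFFu) | (yb & 0x80000000u); // magnitude of x, sign of y
  float r;
  std::memcpy(&r, &out, sizeof r);
  return r; // matches std::copysign(x, y)
}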
- exp(-x * x) * detail::erf_kernel::erfc2(z); + r2 = select(self < batch_type(0.), -r2, r2); + r1 = select(test1, r1, r2); +#ifndef XSIMD_NO_INFINITIES + r1 = select(xsimd::isinf(self), sign(self), r1); +#endif + return r1; + } + + template + XSIMD_INLINE batch erf(batch const& self, requires_arch) noexcept + { + using batch_type = batch; + batch_type x = abs(self); + batch_type xx = x * x; + batch_type lim1(0.65); + batch_type lim2(2.2); + auto test1 = x < lim1; + batch_type r1(0.); + if (any(test1)) + { + r1 = self * detail::erf_kernel::erf1(xx); + if (all(test1)) + return r1; + } + auto test2 = x < lim2; + auto test3 = test2 && !test1; + batch_type ex = exp(-xx); + if (any(test3)) + { + batch_type z = batch_type(1.) - ex * detail::erf_kernel::erfc2(x); + batch_type r2 = select(self < batch_type(0.), -z, z); + r1 = select(test1, r1, r2); + if (all(test1 || test3)) + return r1; + } + batch_type z = batch_type(1.) - ex * detail::erf_kernel::erfc3(x); + z = select(self < batch_type(0.), -z, z); +#ifndef XSIMD_NO_INFINITIES + z = select(xsimd::isinf(self), sign(self), z); +#endif + return select(test2, r1, z); + } + + // erfc + template + XSIMD_INLINE batch erfc(batch const& self, requires_arch) noexcept + { + using batch_type = batch; + batch_type x = abs(self); + auto test0 = self < batch_type(0.); + batch_type r1(0.); + auto test1 = 3.f * x < 2.f; + batch_type z = x / (batch_type(1.) + x); + if (any(test1)) + { + r1 = detail::erf_kernel::erfc3(z); + if (all(test1)) + return select(test0, batch_type(2.) - r1, r1); + } + + z -= batch_type(0.4f); + batch_type r2 = exp(-x * x) * detail::erf_kernel::erfc2(z); + r1 = select(test1, r1, r2); +#ifndef XSIMD_NO_INFINITIES + r1 = select(x == constants::infinity(), batch_type(0.), r1); +#endif + return select(test0, batch_type(2.) - r1, r1); + } + + template + XSIMD_INLINE batch erfc(batch const& self, requires_arch) noexcept + { + using batch_type = batch; + batch_type x = abs(self); + batch_type xx = x * x; + batch_type lim1(0.65); + batch_type lim2(2.2); + auto test0 = self < batch_type(0.); + auto test1 = x < lim1; + batch_type r1(0.); + if (any(test1)) + { + r1 = batch_type(1.) - x * detail::erf_kernel::erf1(xx); + if (all(test1)) + return select(test0, batch_type(2.) - r1, r1); + } + auto test2 = x < lim2; + auto test3 = test2 && !test1; + batch_type ex = exp(-xx); + if (any(test3)) + { + batch_type z = ex * detail::erf_kernel::erfc2(x); + r1 = select(test1, r1, z); + if (all(test1 || test3)) + return select(test0, batch_type(2.) - r1, r1); + } + batch_type z = ex * detail::erf_kernel::erfc3(x); + r1 = select(test2, r1, z); +#ifndef XSIMD_NO_INFINITIES + r1 = select(x == constants::infinity(), batch_type(0.), r1); +#endif + return select(test0, batch_type(2.) - r1, r1); + } + + // estrin + namespace detail + { + + template + struct estrin + { + B x; + + template + XSIMD_INLINE B operator()(const Ts&... 
coefs) noexcept + { + return eval(coefs...); + } + + private: + XSIMD_INLINE B eval(const B& c0) noexcept + { + return c0; + } + + XSIMD_INLINE B eval(const B& c0, const B& c1) noexcept + { + return fma(x, c1, c0); + } + + template + XSIMD_INLINE B eval(::xsimd::detail::index_sequence, const Tuple& tuple) + { + return estrin { x * x }(std::get(tuple)...); + } + + template + XSIMD_INLINE B eval(const std::tuple& tuple) noexcept + { + return eval(::xsimd::detail::make_index_sequence(), tuple); + } + + template + XSIMD_INLINE B eval(const std::tuple& tuple, const B& c0) noexcept + { + return eval(std::tuple_cat(tuple, std::make_tuple(eval(c0)))); + } + + template + XSIMD_INLINE B eval(const std::tuple& tuple, const B& c0, const B& c1) noexcept + { + return eval(std::tuple_cat(tuple, std::make_tuple(eval(c0, c1)))); + } + + template + XSIMD_INLINE B eval(const std::tuple& tuple, const B& c0, const B& c1, const Ts&... coefs) noexcept + { + return eval(std::tuple_cat(tuple, std::make_tuple(eval(c0, c1))), coefs...); + } + + template + XSIMD_INLINE B eval(const B& c0, const B& c1, const Ts&... coefs) noexcept + { + return eval(std::make_tuple(eval(c0, c1)), coefs...); + } + }; + } + + template + XSIMD_INLINE batch estrin(const batch& self) noexcept + { + using batch_type = batch; + return detail::estrin { self }(detail::coef()...); + } + + // exp + /* origin: boost/simd/arch/common/detail/simd/expo_base.hpp */ + /* + * ==================================================== + * copyright 2016 NumScale SAS + * + * Distributed under the Boost Software License, Version 1.0. + * (See copy at http://boost.org/LICENSE_1_0.txt) + * ==================================================== + */ + namespace detail + { + enum exp_reduction_tag + { + exp_tag, + exp2_tag, + exp10_tag + }; + + template + struct exp_reduction_base; + + template + struct exp_reduction_base + { + static constexpr B maxlog() noexcept + { + return constants::maxlog(); + } + + static constexpr B minlog() noexcept + { + return constants::minlog(); + } + }; + + template + struct exp_reduction_base + { + static constexpr B maxlog() noexcept + { + return constants::maxlog10(); + } + + static constexpr B minlog() noexcept + { + return constants::minlog10(); + } + }; + + template + struct exp_reduction_base + { + static constexpr B maxlog() noexcept + { + return constants::maxlog2(); + } + + static constexpr B minlog() noexcept + { + return constants::minlog2(); + } + }; + + template + struct exp_reduction; + + template + struct exp_reduction : exp_reduction_base, exp_tag> + { + using batch_type = batch; + static XSIMD_INLINE batch_type approx(const batch_type& x) noexcept + { + batch_type y = detail::horner(x); + return ++fma(y, x * x, x); + } + + static XSIMD_INLINE batch_type reduce(const batch_type& a, batch_type& x) noexcept + { + batch_type k = nearbyint(constants::invlog_2() * a); + x = fnma(k, constants::log_2hi(), a); + x = fnma(k, constants::log_2lo(), x); + return k; + } + }; + + template + struct exp_reduction : exp_reduction_base, exp10_tag> + { + using batch_type = batch; + static XSIMD_INLINE batch_type approx(const batch_type& x) noexcept + { + return ++(detail::horner(x) + * x); + } + + static XSIMD_INLINE batch_type reduce(const batch_type& a, batch_type& x) noexcept + { + batch_type k = nearbyint(constants::invlog10_2() * a); + x = fnma(k, constants::log10_2hi(), a); + x -= k * constants::log10_2lo(); + return k; + } + }; + + template + struct exp_reduction : exp_reduction_base, exp2_tag> + { + using batch_type = batch; + 
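estrin above evaluates a polynomial by pairing coefficients into (c0 + c1*x) terms and recursing on x*x, which exposes more instruction-level parallelism than a plain Horner chain at the same operation count. For four coefficients the scheme reduces to:

#include <cmath>

// p(x) = (c0 + c1*x) + (c2 + c3*x) * x^2
inline double estrin4(double x, double c0, double c1, double c2, double c3) {
  const double x2 = x * x;
  const double lo = std::fma(c1, x, c0);  // c0 + c1*x
  const double hi = std::fma(c3, x, c2);  // c2 + c3*x
  return std::fma(hi, x2, lo);            // the two halves can be computed in parallel
}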
static XSIMD_INLINE batch_type approx(const batch_type& x) noexcept + { + batch_type y = detail::horner(x); + return ++fma(y, x * x, x * constants::log_2()); + } + + static XSIMD_INLINE batch_type reduce(const batch_type& a, batch_type& x) noexcept + { + batch_type k = nearbyint(a); + x = (a - k); + return k; + } + }; + + template + struct exp_reduction : exp_reduction_base, exp_tag> + { + using batch_type = batch; + static XSIMD_INLINE batch_type approx(const batch_type& x) noexcept + { + batch_type t = x * x; + return fnma(t, + detail::horner(t), + x); + } + + static XSIMD_INLINE batch_type reduce(const batch_type& a, batch_type& hi, batch_type& lo, batch_type& x) noexcept + { + batch_type k = nearbyint(constants::invlog_2() * a); + hi = fnma(k, constants::log_2hi(), a); + lo = k * constants::log_2lo(); + x = hi - lo; + return k; + } + + static XSIMD_INLINE batch_type finalize(const batch_type& x, const batch_type& c, const batch_type& hi, const batch_type& lo) noexcept + { + return batch_type(1.) - (((lo - (x * c) / (batch_type(2.) - c)) - hi)); + } + }; + + template + struct exp_reduction : exp_reduction_base, exp10_tag> + { + using batch_type = batch; + static XSIMD_INLINE batch_type approx(const batch_type& x) noexcept + { + batch_type xx = x * x; + batch_type px = x * detail::horner(xx); + batch_type x2 = px / (detail::horner1(xx) - px); + return ++(x2 + x2); + } + + static XSIMD_INLINE batch_type reduce(const batch_type& a, batch_type&, batch_type&, batch_type& x) noexcept + { + batch_type k = nearbyint(constants::invlog10_2() * a); + x = fnma(k, constants::log10_2hi(), a); + x = fnma(k, constants::log10_2lo(), x); + return k; + } + + static XSIMD_INLINE batch_type finalize(const batch_type&, const batch_type& c, const batch_type&, const batch_type&) noexcept + { + return c; + } + }; + + template + struct exp_reduction : exp_reduction_base, exp2_tag> + { + using batch_type = batch; + static XSIMD_INLINE batch_type approx(const batch_type& x) noexcept + { + batch_type t = x * x; + return fnma(t, + detail::horner(t), + x); + } + + static XSIMD_INLINE batch_type reduce(const batch_type& a, batch_type&, batch_type&, batch_type& x) noexcept + { + batch_type k = nearbyint(a); + x = (a - k) * constants::log_2(); + return k; + } + + static XSIMD_INLINE batch_type finalize(const batch_type& x, const batch_type& c, const batch_type&, const batch_type&) noexcept + { + return batch_type(1.) + x + x * c / (batch_type(2.) 
- c); + } + }; + + template + XSIMD_INLINE batch exp(batch const& self) noexcept + { + using batch_type = batch; + using reducer_t = exp_reduction; + batch_type x; + batch_type k = reducer_t::reduce(self, x); + x = reducer_t::approx(x); + x = select(self <= reducer_t::minlog(), batch_type(0.), ldexp(x, to_int(k))); + x = select(self >= reducer_t::maxlog(), constants::infinity(), x); + return x; + } + + template + XSIMD_INLINE batch exp(batch const& self) noexcept + { + using batch_type = batch; + using reducer_t = exp_reduction; + batch_type hi, lo, x; + batch_type k = reducer_t::reduce(self, hi, lo, x); + batch_type c = reducer_t::approx(x); + c = reducer_t::finalize(x, c, hi, lo); + c = select(self <= reducer_t::minlog(), batch_type(0.), ldexp(c, to_int(k))); + c = select(self >= reducer_t::maxlog(), constants::infinity(), c); + return c; + } + } + + template + XSIMD_INLINE batch exp(batch const& self, requires_arch) noexcept + { + return detail::exp(self); + } + + template + XSIMD_INLINE batch, A> exp(batch, A> const& self, requires_arch) noexcept + { + using batch_type = batch, A>; + auto isincos = sincos(self.imag()); + return exp(self.real()) * batch_type(std::get<1>(isincos), std::get<0>(isincos)); + } + + // exp10 + template + XSIMD_INLINE batch exp10(batch const& self, requires_arch) noexcept + { + return detail::exp(self); + } + + // exp2 + template + XSIMD_INLINE batch exp2(batch const& self, requires_arch) noexcept + { + return detail::exp(self); + } + + // expm1 + namespace detail + { + /* origin: boost/simd/arch/common/detail/generic/expm1_kernel.hpp */ + /* + * ==================================================== + * copyright 2016 NumScale SAS + * + * Distributed under the Boost Software License, Version 1.0. + * (See copy at http://boost.org/LICENSE_1_0.txt) + * ==================================================== + */ + template + static XSIMD_INLINE batch expm1(const batch& a) noexcept + { + using batch_type = batch; + batch_type k = nearbyint(constants::invlog_2() * a); + batch_type x = fnma(k, constants::log_2hi(), a); + x = fnma(k, constants::log_2lo(), x); + batch_type hx = x * batch_type(0.5); + batch_type hxs = x * hx; + batch_type r = detail::horner(hxs); + batch_type t = fnma(r, hx, batch_type(3.)); + batch_type e = hxs * ((r - t) / (batch_type(6.) - x * t)); + e = fms(x, e, hxs); + using i_type = as_integer_t; + i_type ik = to_int(k); + batch_type two2mk = ::xsimd::bitwise_cast((constants::maxexponent() - ik) << constants::nmb()); + batch_type y = batch_type(1.) - two2mk - (e - x); + return ldexp(y, ik); + } + + template + static XSIMD_INLINE batch expm1(const batch& a) noexcept + { + using batch_type = batch; + batch_type k = nearbyint(constants::invlog_2() * a); + batch_type hi = fnma(k, constants::log_2hi(), a); + batch_type lo = k * constants::log_2lo(); + batch_type x = hi - lo; + batch_type hxs = x * x * batch_type(0.5); + batch_type r = detail::horner(hxs); + batch_type t = batch_type(3.) - r * batch_type(0.5) * x; + batch_type e = hxs * ((r - t) / (batch_type(6) - x * t)); + batch_type c = (hi - x) - lo; + e = (x * (e - c) - c) - hxs; + using i_type = as_integer_t; + i_type ik = to_int(k); + batch_type two2mk = ::xsimd::bitwise_cast((constants::maxexponent() - ik) << constants::nmb()); + batch_type ct1 = batch_type(1.) 
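The exp() kernels above all follow the same shape: pick k = nearbyint(x / ln 2), reduce to r = x - k*ln 2 (with ln 2 split into hi/lo parts for accuracy), approximate e^r with a polynomial, scale by 2^k via ldexp, and clamp against minlog/maxlog. A scalar sketch with a deliberately short Taylor polynomial (illustrative accuracy only; the kernel uses higher-order minimax coefficients and the hi/lo split):

#include <cmath>

inline float exp_reduced(float a) {
  const float inv_ln2 = 1.4426950408889634f;
  const float ln2 = 0.6931471805599453f;
  const float k = std::nearbyint(a * inv_ln2);
  const float r = a - k * ln2;                      // |r| <= ln2/2
  const float p = 1.0f + r * (1.0f + r * (0.5f + r * (1.0f / 6 + r * (1.0f / 24))));
  return std::ldexp(p, static_cast<int>(k));        // p * 2^k
}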
- two2mk - (e - x); + batch_type ct2 = ++(x - (e + two2mk)); + batch_type y = select(k < batch_type(20.), ct1, ct2); + return ldexp(y, ik); + } + + } + + template + XSIMD_INLINE batch expm1(batch const& self, requires_arch) noexcept + { + using batch_type = batch; + return select(self < constants::logeps(), + batch_type(-1.), + select(self > constants::maxlog(), + constants::infinity(), + detail::expm1(self))); + } + + template + XSIMD_INLINE batch, A> expm1(const batch, A>& z, requires_arch) noexcept + { + using batch_type = batch, A>; + using real_batch = typename batch_type::real_batch; + real_batch isin = sin(z.imag()); + real_batch rem1 = expm1(z.real()); + real_batch re = rem1 + 1.; + real_batch si = sin(z.imag() * 0.5); + return { rem1 - 2. * re * si * si, re * isin }; + } + + // polar + template + XSIMD_INLINE batch, A> polar(const batch& r, const batch& theta, requires_arch) noexcept + { + auto sincosTheta = sincos(theta); + return { r * sincosTheta.second, r * sincosTheta.first }; + } + + // fdim + template + XSIMD_INLINE batch fdim(batch const& self, batch const& other, requires_arch) noexcept + { + return fmax(batch(0), self - other); + } + + // fmod + template + XSIMD_INLINE batch fmod(batch const& self, batch const& other, requires_arch) noexcept + { + return fnma(trunc(self / other), other, self); + } + + // frexp + /* origin: boost/simd/arch/common/simd/function/ifrexp.hpp */ + /* + * ==================================================== + * copyright 2016 NumScale SAS + * + * Distributed under the Boost Software License, Version 1.0. + * (See copy at http://boost.org/LICENSE_1_0.txt) + * ==================================================== + */ + template + XSIMD_INLINE batch frexp(const batch& self, batch, A>& exp, requires_arch) noexcept + { + using batch_type = batch; + using int_type = as_integer_t; + using i_type = batch; + i_type m1f = constants::mask1frexp(); + i_type r1 = m1f & ::xsimd::bitwise_cast(self); + batch_type x = self & ::xsimd::bitwise_cast(~m1f); + exp = (r1 >> constants::nmb()) - constants::maxexponentm1(); + exp = select(batch_bool_cast(self != batch_type(0.)), exp, i_type(typename i_type::value_type(0))); + return select((self != batch_type(0.)), x | ::xsimd::bitwise_cast(constants::mask2frexp()), batch_type(0.)); + } + + // from bool + template + XSIMD_INLINE batch from_bool(batch_bool const& self, requires_arch) noexcept + { + return batch(self.data) & batch(1); + } + + // horner + template + XSIMD_INLINE batch horner(const batch& self) noexcept + { + return detail::horner, Coefs...>(self); + } + + // hypot + template + XSIMD_INLINE batch hypot(batch const& self, batch const& other, requires_arch) noexcept + { + return sqrt(fma(self, self, other * other)); + } + + // ipow + template + XSIMD_INLINE batch ipow(batch const& self, ITy other, requires_arch) noexcept + { + return ::xsimd::detail::ipow(self, other); + } + + // ldexp + /* origin: boost/simd/arch/common/simd/function/ldexp.hpp */ + /* + * ==================================================== + * copyright 2016 NumScale SAS + * + * Distributed under the Boost Software License, Version 1.0. 
+ * (See copy at http://boost.org/LICENSE_1_0.txt) + * ==================================================== + */ + template + XSIMD_INLINE batch ldexp(const batch& self, const batch, A>& other, requires_arch) noexcept + { + using batch_type = batch; + using itype = as_integer_t; + itype ik = other + constants::maxexponent(); + ik = ik << constants::nmb(); + return self * ::xsimd::bitwise_cast(ik); + } + + // lgamma + template + XSIMD_INLINE batch lgamma(batch const& self, requires_arch) noexcept; + + namespace detail + { + /* origin: boost/simd/arch/common/detail/generic/gammaln_kernel.hpp */ + /* + * ==================================================== + * copyright 2016 NumScale SAS + * + * Distributed under the Boost Software License, Version 1.0. + * (See copy at http://boost.org/LICENSE_1_0.txt) + * ==================================================== + */ + template + static XSIMD_INLINE batch gammalnB(const batch& x) noexcept + { + return horner, + 0x3ed87730, // 4.227843421859038E-001 + 0x3ea51a64, // 3.224669577325661E-001, + 0xbd89f07e, // -6.735323259371034E-002, + 0x3ca89ed8, // 2.058355474821512E-002, + 0xbbf164fd, // -7.366775108654962E-003, + 0x3b3ba883, // 2.863437556468661E-003, + 0xbaabeab1, // -1.311620815545743E-003, + 0x3a1ebb94 // 6.055172732649237E-004 + >(x); + } + + template + static XSIMD_INLINE batch gammalnC(const batch& x) noexcept + { + return horner, + 0xbf13c468, // -5.772156501719101E-001 + 0x3f528d34, // 8.224670749082976E-001, + 0xbecd27a8, // -4.006931650563372E-001, + 0x3e8a898b, // 2.705806208275915E-001, + 0xbe53c04f, // -2.067882815621965E-001, + 0x3e2d4dab, // 1.692415923504637E-001, + 0xbe22d329, // -1.590086327657347E-001, + 0x3e0c3c4f // 1.369488127325832E-001 + >(x); + } + + template + static XSIMD_INLINE batch gammaln2(const batch& x) noexcept + { + return horner, + 0x3daaaa94, // 8.333316229807355E-002f + 0xbb358701, // -2.769887652139868E-003f, + 0x3a31fd69 // 6.789774945028216E-004f + >(x); + } + + template + static XSIMD_INLINE batch gammaln1(const batch& x) noexcept + { + return horner, + 0xc12a0c675418055eull, // -8.53555664245765465627E5 + 0xc13a45890219f20bull, // -1.72173700820839662146E6, + 0xc131bc82f994db51ull, // -1.16237097492762307383E6, + 0xc1143d73f89089e5ull, // -3.31612992738871184744E5, + 0xc0e2f234355bb93eull, // -3.88016315134637840924E4, + 0xc09589018ff36761ull // -1.37825152569120859100E3 + >(x) + / horner, + 0xc13ece4b6a11e14aull, // -2.01889141433532773231E6 + 0xc1435255892ff34cull, // -2.53252307177582951285E6, + 0xc131628671950043ull, // -1.13933444367982507207E6, + 0xc10aeb84b9744c9bull, // -2.20528590553854454839E5, + 0xc0d0aa0d7b89d757ull, // -1.70642106651881159223E4, + 0xc075fd0d1cf312b2ull, // -3.51815701436523470549E2, + 0x3ff0000000000000ull // 1.00000000000000000000E0 + >(x); + } + + template + static XSIMD_INLINE batch gammalnA(const batch& x) noexcept + { + return horner, + 0x3fb555555555554bull, // 8.33333333333331927722E-2 + 0xbf66c16c16b0a5a1ull, // -2.77777777730099687205E-3, + 0x3f4a019f20dc5ebbull, // 7.93650340457716943945E-4, + 0xbf437fbdb580e943ull, // -5.95061904284301438324E-4, + 0x3f4a985027336661ull // 8.11614167470508450300E-4 + >(x); + } + + /* origin: boost/simd/arch/common/simd/function/gammaln.hpp */ + /* + * ==================================================== + * copyright 2016 NumScale SAS + * + * Distributed under the Boost Software License, Version 1.0. 
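frexp and ldexp above operate directly on the exponent field rather than calling libm: frexp strips the exponent and substitutes that of 0.5, while ldexp builds 2^n from its bits and multiplies. Scalar models for single precision (no zero, denormal or overflow handling, matching the vector kernels):

#include <cstdint>
#include <cstring>

// Returns m in [0.5, 1) and e such that x == m * 2^e, for normal finite x.
inline float frexp_bits(float x, int32_t* e) {
  uint32_t bits;
  std::memcpy(&bits, &x, sizeof bits);
  *e = int32_t((bits & 0x7F800000u) >> 23) - 126;             // biased exponent minus 126
  const uint32_t mant = (bits & ~0x7F800000u) | 0x3F000000u;  // keep sign/mantissa, force exponent of 0.5
  float m;
  std::memcpy(&m, &mant, sizeof m);
  return m;
}

// x * 2^n, valid while n + 127 stays in [1, 254].
inline float ldexp_bits(float x, int32_t n) {
  const uint32_t bits = static_cast<uint32_t>(n + 127) << 23; // exponent field of 2^n
  float two_n;
  std::memcpy(&two_n, &bits, sizeof two_n);
  return x * two_n;
}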
+ * (See copy at http://boost.org/LICENSE_1_0.txt) + * ==================================================== + */ + template + struct lgamma_impl; + + template + struct lgamma_impl> + { + using batch_type = batch; + static XSIMD_INLINE batch_type compute(const batch_type& a) noexcept + { + auto inf_result = (a <= batch_type(0.)) && is_flint(a); + batch_type x = select(inf_result, constants::nan(), a); + batch_type q = abs(x); +#ifndef XSIMD_NO_INFINITIES + inf_result = (x == constants::infinity()) || inf_result; +#endif + auto ltza = a < batch_type(0.); + batch_type r(0); + batch_type r1 = other(q); + if (any(ltza)) + { + r = select(inf_result, constants::infinity(), negative(q, r1)); + if (all(ltza)) + return r; + } + batch_type r2 = select(ltza, r, r1); + return select(a == constants::minusinfinity(), constants::nan(), select(inf_result, constants::infinity(), r2)); + } + + private: + static XSIMD_INLINE batch_type negative(const batch_type& q, const batch_type& w) noexcept + { + batch_type p = floor(q); + batch_type z = q - p; + auto test2 = z < batch_type(0.5); + z = select(test2, z - batch_type(1.), z); + z = q * sin(z, trigo_pi_tag()); + return -log(constants::invpi() * abs(z)) - w; + } + + static XSIMD_INLINE batch_type other(const batch_type& x) noexcept + { + auto xlt650 = (x < batch_type(6.5)); + batch_type r0x = x; + batch_type r0z = x; + batch_type r0s = batch_type(1.); + batch_type r1 = batch_type(0.); + batch_type p = constants::nan(); + if (any(xlt650)) + { + batch_type z = batch_type(1.); + batch_type tx = select(xlt650, x, batch_type(0.)); + batch_type nx = batch_type(0.); + const batch_type _075 = batch_type(0.75); + const batch_type _150 = batch_type(1.50); + const batch_type _125 = batch_type(1.25); + const batch_type _250 = batch_type(2.50); + auto xge150 = (x >= _150); + auto txgt250 = (tx > _250); + + // x >= 1.5 + while (any(xge150 && txgt250)) + { + nx = select(txgt250, nx - batch_type(1.), nx); + tx = select(txgt250, x + nx, tx); + z = select(txgt250, z * tx, z); + txgt250 = (tx > _250); + } + r0x = select(xge150, x + nx - batch_type(2.), x); + r0z = select(xge150, z, r0z); + r0s = select(xge150, batch_type(1.), r0s); + + // x >= 1.25 && x < 1.5 + auto xge125 = (x >= _125); + auto xge125t = xge125 && !xge150; + if (any(xge125)) + { + r0x = select(xge125t, x - batch_type(1.), r0x); + r0z = select(xge125t, z * x, r0z); + r0s = select(xge125t, batch_type(-1.), r0s); + } + + // x >= 0.75 && x < 1.5 + batch_bool kernelC(false); + auto xge075 = (x >= _075); + auto xge075t = xge075 && !xge125; + if (any(xge075t)) + { + kernelC = xge075t; + r0x = select(xge075t, x - batch_type(1.), x); + r0z = select(xge075t, batch_type(1.), r0z); + r0s = select(xge075t, batch_type(-1.), r0s); + p = gammalnC(r0x); + } + + // tx < 1.5 && x < 0.75 + auto txlt150 = (tx < _150) && !xge075; + if (any(txlt150)) + { + auto orig = txlt150; + while (any(txlt150)) + { + z = select(txlt150, z * tx, z); + nx = select(txlt150, nx + batch_type(1.), nx); + tx = select(txlt150, x + nx, tx); + txlt150 = (tx < _150) && !xge075; + } + r0x = select(orig, r0x + nx - batch_type(2.), r0x); + r0z = select(orig, z, r0z); + r0s = select(orig, batch_type(-1.), r0s); + } + p = select(kernelC, p, gammalnB(r0x)); + if (all(xlt650)) + return fma(r0x, p, r0s * log(abs(r0z))); + } + r0z = select(xlt650, abs(r0z), x); + batch_type m = log(r0z); + r1 = fma(r0x, p, r0s * m); + batch_type r2 = fma(x - batch_type(0.5), m, constants::logsqrt2pi() - x); + r2 += gammaln2(batch_type(1.) 
/ (x * x)) / x; + return select(xlt650, r1, r2); + } + }; + + template + struct lgamma_impl> + { + using batch_type = batch; + + static XSIMD_INLINE batch_type compute(const batch_type& a) noexcept + { + auto inf_result = (a <= batch_type(0.)) && is_flint(a); + batch_type x = select(inf_result, constants::nan(), a); + batch_type q = abs(x); +#ifndef XSIMD_NO_INFINITIES + inf_result = (q == constants::infinity()); +#endif + auto test = (a < batch_type(-34.)); + batch_type r = constants::nan(); + if (any(test)) + { + r = large_negative(q); + if (all(test)) + return select(inf_result, constants::nan(), r); + } + batch_type r1 = other(a); + batch_type r2 = select(test, r, r1); + return select(a == constants::minusinfinity(), constants::nan(), select(inf_result, constants::infinity(), r2)); + } + + private: + // FIXME: cannot mark this one as XSIMD_INLINE because there's a + // recursive loop on `lgamma'. + static inline batch_type large_negative(const batch_type& q) noexcept + { + batch_type w = lgamma(q); + batch_type p = floor(q); + batch_type z = q - p; + auto test2 = (z < batch_type(0.5)); + z = select(test2, z - batch_type(1.), z); + z = q * sin(z, trigo_pi_tag()); + z = abs(z); + return constants::logpi() - log(z) - w; + } + + static XSIMD_INLINE batch_type other(const batch_type& xx) noexcept + { + batch_type x = xx; + auto test = (x < batch_type(13.)); + batch_type r1 = batch_type(0.); + if (any(test)) + { + batch_type z = batch_type(1.); + batch_type p = batch_type(0.); + batch_type u = select(test, x, batch_type(0.)); + auto test1 = (u >= batch_type(3.)); + while (any(test1)) + { + p = select(test1, p - batch_type(1.), p); + u = select(test1, x + p, u); + z = select(test1, z * u, z); + test1 = (u >= batch_type(3.)); + } + + auto test2 = (u < batch_type(2.)); + while (any(test2)) + { + z = select(test2, z / u, z); + p = select(test2, p + batch_type(1.), p); + u = select(test2, x + p, u); + test2 = (u < batch_type(2.)); + } + + z = abs(z); + x += p - batch_type(2.); + r1 = x * gammaln1(x) + log(z); + if (all(test)) + return r1; + } + batch_type r2 = fma(xx - batch_type(0.5), log(xx), constants::logsqrt2pi() - xx); + batch_type p = batch_type(1.) / (xx * xx); + r2 += gammalnA(p) / xx; + return select(test, r1, r2); + } + }; + } + + template + XSIMD_INLINE batch lgamma(batch const& self, requires_arch) noexcept + { + return detail::lgamma_impl>::compute(self); + } + + // log + /* origin: boost/simd/arch/common/simd/function/log.hpp */ + /* + * ==================================================== + * copyright 2016 NumScale SAS + * + * Distributed under the Boost Software License, Version 1.0. + * (See copy at http://boost.org/LICENSE_1_0.txt) + * ==================================================== + */ + template + XSIMD_INLINE batch log(batch const& self, requires_arch) noexcept + { + using batch_type = batch; + using int_type = as_integer_t; + using i_type = batch; + batch_type x = self; + i_type k(0); + auto isnez = (self != batch_type(0.)); +#ifndef XSIMD_NO_DENORMALS + auto test = (self < constants::smallestposval()) && isnez; + if (any(test)) + { + k = select(batch_bool_cast(test), k - i_type(23), k); + x = select(test, x * batch_type(8388608ul), x); + } +#endif + i_type ix = ::xsimd::bitwise_cast(x); + ix += 0x3f800000 - 0x3f3504f3; + k += (ix >> 23) - 0x7f; + ix = (ix & i_type(0x007fffff)) + 0x3f3504f3; + x = ::xsimd::bitwise_cast(ix); + batch_type f = --x; + batch_type s = f / (batch_type(2.) 
+ f); + batch_type z = s * s; + batch_type w = z * z; + batch_type t1 = w * detail::horner(w); + batch_type t2 = z * detail::horner(w); + batch_type R = t2 + t1; + batch_type hfsq = batch_type(0.5) * f * f; + batch_type dk = to_float(k); + batch_type r = fma(dk, constants::log_2hi(), fma(s, (hfsq + R), dk * constants::log_2lo()) - hfsq + f); +#ifndef XSIMD_NO_INFINITIES + batch_type zz = select(isnez, select(self == constants::infinity(), constants::infinity(), r), constants::minusinfinity()); +#else + batch_type zz = select(isnez, r, constants::minusinfinity()); +#endif + return select(!(self >= batch_type(0.)), constants::nan(), zz); + } + + template + XSIMD_INLINE batch log(batch const& self, requires_arch) noexcept + { + using batch_type = batch; + using int_type = as_integer_t; + using i_type = batch; + + batch_type x = self; + i_type hx = ::xsimd::bitwise_cast(x) >> 32; + i_type k(0); + auto isnez = (self != batch_type(0.)); +#ifndef XSIMD_NO_DENORMALS + auto test = (self < constants::smallestposval()) && isnez; + if (any(test)) + { + k = select(batch_bool_cast(test), k - i_type(54), k); + x = select(test, x * batch_type(18014398509481984ull), x); + } +#endif + hx += 0x3ff00000 - 0x3fe6a09e; + k += (hx >> 20) - 0x3ff; + batch_type dk = to_float(k); + hx = (hx & i_type(0x000fffff)) + 0x3fe6a09e; + x = ::xsimd::bitwise_cast(hx << 32 | (i_type(0xffffffff) & ::xsimd::bitwise_cast(x))); + + batch_type f = --x; + batch_type hfsq = batch_type(0.5) * f * f; + batch_type s = f / (batch_type(2.) + f); + batch_type z = s * s; + batch_type w = z * z; + + batch_type t1 = w * detail::horner(w); + batch_type t2 = z * detail::horner(w); + batch_type R = t2 + t1; + batch_type r = fma(dk, constants::log_2hi(), fma(s, (hfsq + R), dk * constants::log_2lo()) - hfsq + f); +#ifndef XSIMD_NO_INFINITIES + batch_type zz = select(isnez, select(self == constants::infinity(), constants::infinity(), r), constants::minusinfinity()); +#else + batch_type zz = select(isnez, r, constants::minusinfinity()); +#endif + return select(!(self >= batch_type(0.)), constants::nan(), zz); + } + + template + XSIMD_INLINE batch, A> log(const batch, A>& z, requires_arch) noexcept + { + return batch, A>(log(abs(z)), atan2(z.imag(), z.real())); + } + + // log2 + template + XSIMD_INLINE batch log2(batch const& self, requires_arch) noexcept + { + using batch_type = batch; + using int_type = as_integer_t; + using i_type = batch; + batch_type x = self; + i_type k(0); + auto isnez = (self != batch_type(0.)); +#ifndef XSIMD_NO_DENORMALS + auto test = (self < constants::smallestposval()) && isnez; + if (any(test)) + { + k = select(batch_bool_cast(test), k - i_type(25), k); + x = select(test, x * batch_type(33554432ul), x); + } +#endif + i_type ix = ::xsimd::bitwise_cast(x); + ix += 0x3f800000 - 0x3f3504f3; + k += (ix >> 23) - 0x7f; + ix = (ix & i_type(0x007fffff)) + 0x3f3504f3; + x = ::xsimd::bitwise_cast(ix); + batch_type f = --x; + batch_type s = f / (batch_type(2.) 
+ f); + batch_type z = s * s; + batch_type w = z * z; + batch_type t1 = w * detail::horner(w); + batch_type t2 = z * detail::horner(w); + batch_type R = t1 + t2; + batch_type hfsq = batch_type(0.5) * f * f; + batch_type dk = to_float(k); + batch_type r = fma(fms(s, hfsq + R, hfsq) + f, constants::invlog_2(), dk); +#ifndef XSIMD_NO_INFINITIES + batch_type zz = select(isnez, select(self == constants::infinity(), constants::infinity(), r), constants::minusinfinity()); +#else + batch_type zz = select(isnez, r, constants::minusinfinity()); +#endif + return select(!(self >= batch_type(0.)), constants::nan(), zz); + } + + template + XSIMD_INLINE batch log2(batch const& self, requires_arch) noexcept + { + using batch_type = batch; + using int_type = as_integer_t; + using i_type = batch; + batch_type x = self; + i_type hx = ::xsimd::bitwise_cast(x) >> 32; + i_type k(0); + auto isnez = (self != batch_type(0.)); +#ifndef XSIMD_NO_DENORMALS + auto test = (self < constants::smallestposval()) && isnez; + if (any(test)) + { + k = select(batch_bool_cast(test), k - i_type(54), k); + x = select(test, x * batch_type(18014398509481984ull), x); + } +#endif + hx += 0x3ff00000 - 0x3fe6a09e; + k += (hx >> 20) - 0x3ff; + hx = (hx & i_type(0x000fffff)) + 0x3fe6a09e; + x = ::xsimd::bitwise_cast(hx << 32 | (i_type(0xffffffff) & ::xsimd::bitwise_cast(x))); + batch_type f = --x; + batch_type s = f / (batch_type(2.) + f); + batch_type z = s * s; + batch_type w = z * z; + batch_type t1 = w * detail::horner(w); + batch_type t2 = z * detail::horner(w); + batch_type R = t2 + t1; + batch_type hfsq = batch_type(0.5) * f * f; + batch_type hi = f - hfsq; + hi = hi & ::xsimd::bitwise_cast((constants::allbits() << 32)); + batch_type lo = fma(s, hfsq + R, f - hi - hfsq); + batch_type val_hi = hi * constants::invlog_2hi(); + batch_type val_lo = fma(lo + hi, constants::invlog_2lo(), lo * constants::invlog_2hi()); + batch_type dk = to_float(k); + batch_type w1 = dk + val_hi; + val_lo += (dk - w1) + val_hi; + val_hi = w1; + batch_type r = val_lo + val_hi; +#ifndef XSIMD_NO_INFINITIES + batch_type zz = select(isnez, select(self == constants::infinity(), constants::infinity(), r), constants::minusinfinity()); +#else + batch_type zz = select(isnez, r, constants::minusinfinity()); +#endif + return select(!(self >= batch_type(0.)), constants::nan(), zz); + } + + namespace detail + { + template + XSIMD_INLINE batch logN_complex_impl(const batch& z, typename batch::value_type base) noexcept + { + using batch_type = batch; + using rv_type = typename batch_type::value_type; + return log(z) / batch_type(rv_type(base)); + } + } + + template + XSIMD_INLINE batch, A> log2(batch, A> const& self, requires_arch) noexcept + { + return detail::logN_complex_impl(self, std::log(2)); + } + + // log10 + /* origin: FreeBSD /usr/src/lib/msun/src/e_log10f.c */ + /* + * ==================================================== + * Copyright (C) 1993 by Sun Microsystems, Inc. All rights reserved. + * + * Developed at SunPro, a Sun Microsystems, Inc. business. + * Permission to use, copy, modify, and distribute this + * software is freely granted, provided that this notice + * is preserved. 
+ * ==================================================== + */ + template + XSIMD_INLINE batch log10(batch const& self, requires_arch) noexcept + { + using batch_type = batch; + const batch_type + ivln10hi(4.3432617188e-01f), + ivln10lo(-3.1689971365e-05f), + log10_2hi(3.0102920532e-01f), + log10_2lo(7.9034151668e-07f); + using int_type = as_integer_t; + using i_type = batch; + batch_type x = self; + i_type k(0); + auto isnez = (self != batch_type(0.)); +#ifndef XSIMD_NO_DENORMALS + auto test = (self < constants::smallestposval()) && isnez; + if (any(test)) + { + k = select(batch_bool_cast(test), k - i_type(25), k); + x = select(test, x * batch_type(33554432ul), x); + } +#endif + i_type ix = ::xsimd::bitwise_cast(x); + ix += 0x3f800000 - 0x3f3504f3; + k += (ix >> 23) - 0x7f; + ix = (ix & i_type(0x007fffff)) + 0x3f3504f3; + x = ::xsimd::bitwise_cast(ix); + batch_type f = --x; + batch_type s = f / (batch_type(2.) + f); + batch_type z = s * s; + batch_type w = z * z; + batch_type t1 = w * detail::horner(w); + batch_type t2 = z * detail::horner(w); + batch_type R = t2 + t1; + batch_type dk = to_float(k); + batch_type hfsq = batch_type(0.5) * f * f; + batch_type hibits = f - hfsq; + hibits &= ::xsimd::bitwise_cast(i_type(0xfffff000)); + batch_type lobits = fma(s, hfsq + R, f - hibits - hfsq); + batch_type r = fma(dk, log10_2hi, + fma(hibits, ivln10hi, + fma(lobits, ivln10hi, + fma(lobits + hibits, ivln10lo, dk * log10_2lo)))); +#ifndef XSIMD_NO_INFINITIES + batch_type zz = select(isnez, select(self == constants::infinity(), constants::infinity(), r), constants::minusinfinity()); +#else + batch_type zz = select(isnez, r, constants::minusinfinity()); +#endif + return select(!(self >= batch_type(0.)), constants::nan(), zz); + } + + template + XSIMD_INLINE batch log10(batch const& self, requires_arch) noexcept + { + using batch_type = batch; + const batch_type + ivln10hi(4.34294481878168880939e-01), + ivln10lo(2.50829467116452752298e-11), + log10_2hi(3.01029995663611771306e-01), + log10_2lo(3.69423907715893078616e-13); + using int_type = as_integer_t; + using i_type = batch; + batch_type x = self; + i_type hx = ::xsimd::bitwise_cast(x) >> 32; + i_type k(0); + auto isnez = (self != batch_type(0.)); +#ifndef XSIMD_NO_DENORMALS + auto test = (self < constants::smallestposval()) && isnez; + if (any(test)) + { + k = select(batch_bool_cast(test), k - i_type(54), k); + x = select(test, x * batch_type(18014398509481984ull), x); + } +#endif + hx += 0x3ff00000 - 0x3fe6a09e; + k += (hx >> 20) - 0x3ff; + hx = (hx & i_type(0x000fffff)) + 0x3fe6a09e; + x = ::xsimd::bitwise_cast(hx << 32 | (i_type(0xffffffff) & ::xsimd::bitwise_cast(x))); + batch_type f = --x; + batch_type dk = to_float(k); + batch_type s = f / (batch_type(2.) 
+ f); + batch_type z = s * s; + batch_type w = z * z; + batch_type t1 = w * detail::horner(w); + batch_type t2 = z * detail::horner(w); + batch_type R = t2 + t1; + batch_type hfsq = batch_type(0.5) * f * f; + batch_type hi = f - hfsq; + hi = hi & ::xsimd::bitwise_cast(constants::allbits() << 32); + batch_type lo = f - hi - hfsq + s * (hfsq + R); + batch_type val_hi = hi * ivln10hi; + batch_type y = dk * log10_2hi; + batch_type val_lo = dk * log10_2lo + (lo + hi) * ivln10lo + lo * ivln10hi; + batch_type w1 = y + val_hi; + val_lo += (y - w1) + val_hi; + val_hi = w1; + batch_type r = val_lo + val_hi; +#ifndef XSIMD_NO_INFINITIES + batch_type zz = select(isnez, select(self == constants::infinity(), constants::infinity(), r), constants::minusinfinity()); +#else + batch_type zz = select(isnez, r, constants::minusinfinity()); +#endif + return select(!(self >= batch_type(0.)), constants::nan(), zz); + } + + template + XSIMD_INLINE batch, A> log10(const batch, A>& z, requires_arch) noexcept + { + return detail::logN_complex_impl(z, std::log(10)); + } + + // log1p + /* origin: boost/simd/arch/common/simd/function/log1p.hpp */ + /* + * ==================================================== + * copyright 2016 NumScale SAS + * + * Distributed under the Boost Software License, Version 1.0. + * (See copy at http://boost.org/LICENSE_1_0.txt) + * ==================================================== + */ + template + XSIMD_INLINE batch log1p(batch const& self, requires_arch) noexcept + { + using batch_type = batch; + using int_type = as_integer_t; + using i_type = batch; + const batch_type uf = self + batch_type(1.); + auto isnez = (uf != batch_type(0.)); + i_type iu = ::xsimd::bitwise_cast(uf); + iu += 0x3f800000 - 0x3f3504f3; + i_type k = (iu >> 23) - 0x7f; + iu = (iu & i_type(0x007fffff)) + 0x3f3504f3; + batch_type f = --(::xsimd::bitwise_cast(iu)); + batch_type s = f / (batch_type(2.) + f); + batch_type z = s * s; + batch_type w = z * z; + batch_type t1 = w * detail::horner(w); + batch_type t2 = z * detail::horner(w); + batch_type R = t2 + t1; + batch_type hfsq = batch_type(0.5) * f * f; + batch_type dk = to_float(k); + /* correction term ~ log(1+x)-log(u), avoid underflow in c/u */ + batch_type c = select(batch_bool_cast(k >= i_type(2)), batch_type(1.) - (uf - self), self - (uf - batch_type(1.))) / uf; + batch_type r = fma(dk, constants::log_2hi(), fma(s, (hfsq + R), dk * constants::log_2lo() + c) - hfsq + f); +#ifndef XSIMD_NO_INFINITIES + batch_type zz = select(isnez, select(self == constants::infinity(), constants::infinity(), r), constants::minusinfinity()); +#else + batch_type zz = select(isnez, r, constants::minusinfinity()); +#endif + return select(!(uf >= batch_type(0.)), constants::nan(), zz); + } + + template + XSIMD_INLINE batch log1p(batch const& self, requires_arch) noexcept + { + using batch_type = batch; + using int_type = as_integer_t; + using i_type = batch; + const batch_type uf = self + batch_type(1.); + auto isnez = (uf != batch_type(0.)); + i_type hu = ::xsimd::bitwise_cast(uf) >> 32; + hu += 0x3ff00000 - 0x3fe6a09e; + i_type k = (hu >> 20) - 0x3ff; + /* correction term ~ log(1+x)-log(u), avoid underflow in c/u */ + batch_type c = select(batch_bool_cast(k >= i_type(2)), batch_type(1.) - (uf - self), self - (uf - batch_type(1.))) / uf; + hu = (hu & i_type(0x000fffff)) + 0x3fe6a09e; + batch_type f = ::xsimd::bitwise_cast((hu << 32) | (i_type(0xffffffff) & ::xsimd::bitwise_cast(uf))); + f = --f; + batch_type hfsq = batch_type(0.5) * f * f; + batch_type s = f / (batch_type(2.) 
+ f); + batch_type z = s * s; + batch_type w = z * z; + batch_type t1 = w * detail::horner(w); + batch_type t2 = z * detail::horner(w); + batch_type R = t2 + t1; + batch_type dk = to_float(k); + batch_type r = fma(dk, constants::log_2hi(), fma(s, hfsq + R, dk * constants::log_2lo() + c) - hfsq + f); +#ifndef XSIMD_NO_INFINITIES + batch_type zz = select(isnez, select(self == constants::infinity(), constants::infinity(), r), constants::minusinfinity()); +#else + batch_type zz = select(isnez, r, constants::minusinfinity()); +#endif + return select(!(uf >= batch_type(0.)), constants::nan(), zz); + } + + template + XSIMD_INLINE batch, A> log1p(batch, A> const& self, requires_arch) noexcept + { + using batch_type = batch, A>; + using real_batch = typename batch_type::real_batch; + batch_type u = 1 + self; + batch_type logu = log(u); + return select(u == batch_type(1.), + self, + select(u.real() <= real_batch(0.), + logu, + logu * self / (u - batch_type(1.)))); + } + + // mod + template ::value, void>::type> + XSIMD_INLINE batch mod(batch const& self, batch const& other, requires_arch) noexcept + { + return detail::apply([](T x, T y) noexcept -> T + { return x % y; }, + self, other); + } + + // nearbyint + template ::value, void>::type> + XSIMD_INLINE batch nearbyint(batch const& self, requires_arch) noexcept + { + return self; + } + namespace detail + { + template + XSIMD_INLINE batch nearbyintf(batch const& self) noexcept + { + using batch_type = batch; + batch_type s = bitofsign(self); + batch_type v = self ^ s; + batch_type t2n = constants::twotonmb(); + // Under fast-math, reordering is possible and the compiler optimizes d + // to v. That's not what we want, so prevent compiler optimization here. + // FIXME: it may be better to emit a memory barrier here (?). 
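+            // The trick: for |v| < 2^(mantissa bits), (v + twotonmb) - twotonmb rounds v
+            // to the nearest integer using the FPU's current rounding mode; the select on
+            // (v < t2n) below guards the magnitude. The volatile temporary in the
+            // __FAST_MATH__ branch keeps fast-math from folding the add/sub pair away.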
+#ifdef __FAST_MATH__ + volatile batch_type d0 = v + t2n; + batch_type d = *(batch_type*)(void*)(&d0) - t2n; +#else + batch_type d0 = v + t2n; + batch_type d = d0 - t2n; +#endif + return s ^ select(v < t2n, d, v); + } + } + template + XSIMD_INLINE batch nearbyint(batch const& self, requires_arch) noexcept + { + return detail::nearbyintf(self); + } + template + XSIMD_INLINE batch nearbyint(batch const& self, requires_arch) noexcept + { + return detail::nearbyintf(self); + } + + // nearbyint_as_int + template ::value, void>::type> + XSIMD_INLINE batch nearbyint_as_int(batch const& self, requires_arch) noexcept + { + return self; + } + + // nearbyint_as_int + template + XSIMD_INLINE batch, A> + nearbyint_as_int(batch const& self, requires_arch) noexcept + { + using U = as_integer_t; + return kernel::detail::apply_transform([](float x) noexcept -> U + { return std::nearbyintf(x); }, + self); + } + + template + XSIMD_INLINE batch, A> + nearbyint_as_int(batch const& self, requires_arch) noexcept + { + using U = as_integer_t; + return kernel::detail::apply_transform([](double x) noexcept -> U + { return std::nearbyint(x); }, + self); + } + + // nextafter + namespace detail + { + template ::value> + struct nextafter_kernel + { + using batch_type = batch; + + static XSIMD_INLINE batch_type next(batch_type const& b) noexcept + { + return b; + } + + static XSIMD_INLINE batch_type prev(batch_type const& b) noexcept + { + return b; + } + }; + + template + struct bitwise_cast_batch; + + template + struct bitwise_cast_batch + { + using type = batch; + }; + + template + struct bitwise_cast_batch + { + using type = batch; + }; + + template + struct nextafter_kernel + { + using batch_type = batch; + using int_batch = typename bitwise_cast_batch::type; + using int_type = typename int_batch::value_type; + + static XSIMD_INLINE batch_type next(const batch_type& b) noexcept + { + batch_type n = ::xsimd::bitwise_cast(::xsimd::bitwise_cast(b) + int_type(1)); + return select(b == constants::infinity(), b, n); + } + + static XSIMD_INLINE batch_type prev(const batch_type& b) noexcept + { + batch_type p = ::xsimd::bitwise_cast(::xsimd::bitwise_cast(b) - int_type(1)); + return select(b == constants::minusinfinity(), b, p); + } + }; + } + template + XSIMD_INLINE batch nextafter(batch const& from, batch const& to, requires_arch) noexcept + { + using kernel = detail::nextafter_kernel; + return select(from == to, from, + select(to > from, kernel::next(from), kernel::prev(from))); + } + + // pow + /* origin: boost/simd/arch/common/simd/function/pow.hpp*/ + /* + * ==================================================== + * copyright 2016 NumScale SAS + * + * Distributed under the Boost Software License, Version 1.0. 
+ * (See copy at http://boost.org/LICENSE_1_0.txt) + * ==================================================== + */ + template + XSIMD_INLINE batch pow(batch const& self, batch const& other, requires_arch) noexcept + { + using batch_type = batch; + const auto zero = batch_type(0.); + auto negself = self < zero; + auto iszeropowpos = self == zero && other >= zero; + auto adj_self = select(iszeropowpos, batch_type(1), abs(self)); + batch_type z = exp(other * log(adj_self)); + z = select(iszeropowpos, zero, z); + z = select(is_odd(other) && negself, -z, z); + auto invalid = negself && !(is_flint(other) || isinf(other)); + return select(invalid, constants::nan(), z); + } + + template + XSIMD_INLINE batch, A> pow(const batch, A>& a, const batch, A>& z, requires_arch) noexcept + { + using cplx_batch = batch, A>; + using real_batch = typename cplx_batch::real_batch; + real_batch absa = abs(a); + real_batch arga = arg(a); + real_batch x = z.real(); + real_batch y = z.imag(); + real_batch r = pow(absa, x); + real_batch theta = x * arga; + real_batch ze(0); + auto cond = (y == ze); + r = select(cond, r, r * exp(-y * arga)); + theta = select(cond, theta, theta + y * log(absa)); + auto sincosTheta = xsimd::sincos(theta); + return select(absa == ze, cplx_batch(ze), cplx_batch(r * sincosTheta.second, r * sincosTheta.first)); + } + + template + inline batch, A> pow(const batch, A>& a, const batch& z, requires_arch) noexcept + { + using cplx_batch = batch, A>; + + auto absa = abs(a); + auto arga = arg(a); + auto r = pow(absa, z); + + auto theta = z * arga; + auto sincosTheta = xsimd::sincos(theta); + return select(absa == 0, cplx_batch(0), cplx_batch(r * sincosTheta.second, r * sincosTheta.first)); + } + + template + inline batch, A> pow(const batch& a, const batch, A>& z, requires_arch) noexcept + { + return pow(batch, A> { a, batch {} }, z); + } + + // reciprocal + template ::value, void>::type> + XSIMD_INLINE batch reciprocal(batch const& self, + requires_arch) noexcept + { + using batch_type = batch; + return div(batch_type(1), self); + } + + // reduce_add + template + XSIMD_INLINE std::complex reduce_add(batch, A> const& self, requires_arch) noexcept + { + return { reduce_add(self.real()), reduce_add(self.imag()) }; + } + + namespace detail + { + template + struct split_high + { + static constexpr T get(T i, T) + { + return i >= N ? 
(i % 2) : i + N; + } + }; + + template + XSIMD_INLINE T reduce(Op, batch const& self, std::integral_constant) noexcept + { + return self.get(0); + } + + template + XSIMD_INLINE T reduce(Op op, batch const& self, std::integral_constant) noexcept + { + using index_type = as_unsigned_integer_t; + batch split = swizzle(self, make_batch_constant>()); + return reduce(op, op(split, self), std::integral_constant()); + } + } + + // reduce_max + template + XSIMD_INLINE T reduce_max(batch const& self, requires_arch) noexcept + { + return detail::reduce([](batch const& x, batch const& y) + { return max(x, y); }, + self, std::integral_constant::size>()); + } + + // reduce_min + template + XSIMD_INLINE T reduce_min(batch const& self, requires_arch) noexcept + { + return detail::reduce([](batch const& x, batch const& y) + { return min(x, y); }, + self, std::integral_constant::size>()); + } + + // remainder + template + XSIMD_INLINE batch remainder(batch const& self, batch const& other, requires_arch) noexcept + { + return fnma(nearbyint(self / other), other, self); + } + template + XSIMD_INLINE batch remainder(batch const& self, batch const& other, requires_arch) noexcept + { + return fnma(nearbyint(self / other), other, self); + } + template ::value, void>::type> + XSIMD_INLINE batch remainder(batch const& self, batch const& other, requires_arch) noexcept + { + auto mod = self % other; + return select(mod <= other / 2, mod, mod - other); + } + + // select + template + XSIMD_INLINE batch, A> select(batch_bool const& cond, batch, A> const& true_br, batch, A> const& false_br, requires_arch) noexcept + { + return { select(cond, true_br.real(), false_br.real()), select(cond, true_br.imag(), false_br.imag()) }; + } + + // sign + template ::value, void>::type> + XSIMD_INLINE batch sign(batch const& self, requires_arch) noexcept + { + using batch_type = batch; + batch_type res = select(self > batch_type(0), batch_type(1), batch_type(0)) - select(self < batch_type(0), batch_type(1), batch_type(0)); + return res; + } + + namespace detail + { + template + XSIMD_INLINE batch signf(batch const& self) noexcept + { + using batch_type = batch; + batch_type res = select(self > batch_type(0.f), batch_type(1.f), batch_type(0.f)) - select(self < batch_type(0.f), batch_type(1.f), batch_type(0.f)); +#ifdef XSIMD_NO_NANS + return res; +#else + return select(isnan(self), constants::nan(), res); +#endif + } + } + + template + XSIMD_INLINE batch sign(batch const& self, requires_arch) noexcept + { + return detail::signf(self); + } + template + XSIMD_INLINE batch sign(batch const& self, requires_arch) noexcept + { + return detail::signf(self); + } + template + XSIMD_INLINE batch, A> sign(const batch, A>& z, requires_arch) noexcept + { + using batch_type = batch, A>; + using real_batch = typename batch_type::real_batch; + auto rz = z.real(); + auto iz = z.imag(); + return select(rz != real_batch(0.), + batch_type(sign(rz)), + batch_type(sign(iz))); + } + + // signnz + template ::value, void>::type> + XSIMD_INLINE batch signnz(batch const& self, requires_arch) noexcept + { + using batch_type = batch; + return (self >> (sizeof(T) * 8 - 1)) | batch_type(1.); + } + + namespace detail + { + template + XSIMD_INLINE batch signnzf(batch const& self) noexcept + { + using batch_type = batch; +#ifndef XSIMD_NO_NANS + return select(isnan(self), constants::nan(), batch_type(1.) | (constants::signmask() & self)); +#else + return batch_type(1.) 
| (constants::signmask() & self); +#endif + } + } + + template + XSIMD_INLINE batch signnz(batch const& self, requires_arch) noexcept + { + return detail::signnzf(self); + } + template + XSIMD_INLINE batch signnz(batch const& self, requires_arch) noexcept + { + return detail::signnzf(self); + } + + // sqrt + template + XSIMD_INLINE batch, A> sqrt(batch, A> const& z, requires_arch) noexcept + { + + constexpr T csqrt_scale_factor = std::is_same::value ? 6.7108864e7f : 1.8014398509481984e16; + constexpr T csqrt_scale = std::is_same::value ? 1.220703125e-4f : 7.450580596923828125e-9; + using batch_type = batch, A>; + using real_batch = batch; + real_batch x = z.real(); + real_batch y = z.imag(); + real_batch sqrt_x = sqrt(fabs(x)); + real_batch sqrt_hy = sqrt(0.5 * fabs(y)); + auto cond = (fabs(x) > real_batch(4.) || fabs(y) > real_batch(4.)); + x = select(cond, x * 0.25, x * csqrt_scale_factor); + y = select(cond, y * 0.25, y * csqrt_scale_factor); + real_batch scale = select(cond, real_batch(2.), real_batch(csqrt_scale)); + real_batch r = abs(batch_type(x, y)); + + auto condxp = x > real_batch(0.); + real_batch t0 = select(condxp, xsimd::sqrt(0.5 * (r + x)), xsimd::sqrt(0.5 * (r - x))); + real_batch r0 = scale * fabs((0.5 * y) / t0); + t0 *= scale; + real_batch t = select(condxp, t0, r0); + r = select(condxp, r0, t0); + batch_type resg = select(y < real_batch(0.), batch_type(t, -r), batch_type(t, r)); + real_batch ze(0.); + + return select(y == ze, + select(x == ze, + batch_type(ze, ze), + select(x < ze, batch_type(ze, sqrt_x), batch_type(sqrt_x, ze))), + select(x == ze, + select(y > ze, batch_type(sqrt_hy, sqrt_hy), batch_type(sqrt_hy, -sqrt_hy)), + resg)); + } + + // tgamma + + namespace detail + { + /* origin: boost/simd/arch/common/detail/generic/stirling_kernel.hpp */ + /* + * ==================================================== + * copyright 2016 NumScale SAS + * + * Distributed under the Boost Software License, Version 1.0. + * (See copy at http://boost.org/LICENSE_1_0.txt) + * ==================================================== + */ + template + struct stirling_kernel; + + template + struct stirling_kernel> + { + using batch_type = batch; + static XSIMD_INLINE batch_type compute(const batch_type& x) noexcept + { + return horner(x); + } + + static XSIMD_INLINE batch_type split_limit() noexcept + { + return batch_type(bit_cast(uint32_t(0x41d628f6))); + } + + static XSIMD_INLINE batch_type large_limit() noexcept + { + return batch_type(bit_cast(uint32_t(0x420c28f3))); + } + }; + + template + struct stirling_kernel> + { + using batch_type = batch; + static XSIMD_INLINE batch_type compute(const batch_type& x) noexcept + { + return horner(x); + } + + static XSIMD_INLINE batch_type split_limit() noexcept + { + return batch_type(bit_cast(uint64_t(0x4061e083ba3443d4))); + } + + static XSIMD_INLINE batch_type large_limit() noexcept + { + return batch_type(bit_cast(uint64_t(0x4065800000000000))); + } + }; + + /* origin: boost/simd/arch/common/simd/function/stirling.hpp */ + /* + * ==================================================== + * copyright 2016 NumScale SAS + * + * Distributed under the Boost Software License, Version 1.0. 
+ * (See copy at http://boost.org/LICENSE_1_0.txt) + * ==================================================== + */ + template + XSIMD_INLINE batch stirling(const batch& a) noexcept + { + using batch_type = batch; + const batch_type stirlingsplitlim = stirling_kernel::split_limit(); + const batch_type stirlinglargelim = stirling_kernel::large_limit(); + batch_type x = select(a >= batch_type(0.), a, constants::nan()); + batch_type w = batch_type(1.) / x; + w = fma(w, stirling_kernel::compute(w), batch_type(1.)); + batch_type y = exp(-x); + auto test = (x < stirlingsplitlim); + batch_type z = x - batch_type(0.5); + z = select(test, z, batch_type(0.5) * z); + batch_type v = exp(z * log(abs(x))); + y *= v; + y = select(test, y, y * v); + y *= constants::sqrt_2pi() * w; +#ifndef XSIMD_NO_INFINITIES + y = select(isinf(x), x, y); +#endif + return select(x > stirlinglargelim, constants::infinity(), y); + } + + /* origin: boost/simd/arch/common/detail/generic/gamma_kernel.hpp */ + /* + * ==================================================== + * copyright 2016 NumScale SAS + * + * Distributed under the Boost Software License, Version 1.0. + * (See copy at http://boost.org/LICENSE_1_0.txt) + * ==================================================== + */ + template + struct tgamma_kernel; + + template + struct tgamma_kernel> + { + using batch_type = batch; + static XSIMD_INLINE batch_type compute(const batch_type& x) noexcept + { + return horner(x); + } + }; + + template + struct tgamma_kernel> + { + using batch_type = batch; + static XSIMD_INLINE batch_type compute(const batch_type& x) noexcept + { + return horner(x) + / horner(x); + } + }; + + /* origin: boost/simd/arch/common/simd/function/gamma.hpp */ + /* + * ==================================================== + * copyright 2016 NumScale SAS + * + * Distributed under the Boost Software License, Version 1.0. + * (See copy at http://boost.org/LICENSE_1_0.txt) + * ==================================================== + */ + template + XSIMD_INLINE B tgamma_large_negative(const B& a) noexcept + { + B st = stirling(a); + B p = floor(a); + B sgngam = select(is_even(p), -B(1.), B(1.)); + B z = a - p; + auto test2 = z < B(0.5); + z = select(test2, z - B(1.), z); + z = a * sin(z, trigo_pi_tag()); + z = abs(z); + return sgngam * constants::pi() / (z * st); + } + + template + XSIMD_INLINE B tgamma_other(const B& a, const BB& test) noexcept + { + B x = select(test, B(2.), a); +#ifndef XSIMD_NO_INFINITIES + auto inf_result = (a == constants::infinity()); + x = select(inf_result, B(2.), x); +#endif + B z = B(1.); + auto test1 = (x >= B(3.)); + while (any(test1)) + { + x = select(test1, x - B(1.), x); + z = select(test1, z * x, z); + test1 = (x >= B(3.)); + } + test1 = (x < B(0.)); + while (any(test1)) + { + z = select(test1, z / x, z); + x = select(test1, x + B(1.), x); + test1 = (x < B(0.)); + } + auto test2 = (x < B(2.)); + while (any(test2)) + { + z = select(test2, z / x, z); + x = select(test2, x + B(1.), x); + test2 = (x < B(2.)); + } + x = z * tgamma_kernel::compute(x - B(2.)); +#ifndef XSIMD_NO_INFINITIES + return select(inf_result, a, x); +#else + return x; +#endif + } + } + + template + XSIMD_INLINE batch tgamma(batch const& self, requires_arch) noexcept + { + using batch_type = batch; + auto nan_result = (self < batch_type(0.) 
&& is_flint(self)); +#ifndef XSIMD_NO_INVALIDS + nan_result = isnan(self) || nan_result; +#endif + batch_type q = abs(self); + auto test = (self < batch_type(-33.)); + batch_type r = constants::nan(); + if (any(test)) + { + r = detail::tgamma_large_negative(q); + if (all(test)) + return select(nan_result, constants::nan(), r); + } + batch_type r1 = detail::tgamma_other(self, test); + batch_type r2 = select(test, r, r1); + return select(self == batch_type(0.), copysign(constants::infinity(), self), select(nan_result, constants::nan(), r2)); + } + + } + +} + +#endif diff --git a/include/onnxruntime/xsimd/arch/generic/xsimd_generic_memory.hpp b/include/onnxruntime/xsimd/arch/generic/xsimd_generic_memory.hpp new file mode 100644 index 0000000000000..fbe1bbc136620 --- /dev/null +++ b/include/onnxruntime/xsimd/arch/generic/xsimd_generic_memory.hpp @@ -0,0 +1,672 @@ +/*************************************************************************** + * Copyright (c) Johan Mabille, Sylvain Corlay, Wolf Vollprecht and * + * Martin Renou * + * Copyright (c) QuantStack * + * Copyright (c) Serge Guelton * + * * + * Distributed under the terms of the BSD 3-Clause License. * + * * + * The full license is in the file LICENSE, distributed with this software. * + ****************************************************************************/ + +#ifndef XSIMD_GENERIC_MEMORY_HPP +#define XSIMD_GENERIC_MEMORY_HPP + +#include +#include +#include + +#include "../../types/xsimd_batch_constant.hpp" +#include "./xsimd_generic_details.hpp" + +namespace xsimd +{ + template + struct batch_constant; + + template + struct batch_bool_constant; + + namespace kernel + { + + using namespace types; + + // compress + namespace detail + { + template + XSIMD_INLINE batch create_compress_swizzle_mask(I bitmask, ::xsimd::detail::index_sequence) + { + batch swizzle_mask(IT(0)); + alignas(A::alignment()) IT mask_buffer[batch::size] = { Is... }; + size_t inserted = 0; + for (size_t i = 0; i < sizeof...(Is); ++i) + if ((bitmask >> i) & 1u) + std::swap(mask_buffer[inserted++], mask_buffer[i]); + return batch::load_aligned(&mask_buffer[0]); + } + } + + template + XSIMD_INLINE batch + compress(batch const& x, batch_bool const& mask, + kernel::requires_arch) noexcept + { + using IT = as_unsigned_integer_t; + constexpr std::size_t size = batch_bool::size; + auto bitmask = mask.mask(); + auto z = select(mask, x, batch((T)0)); + auto compress_mask = detail::create_compress_swizzle_mask(bitmask, ::xsimd::detail::make_index_sequence()); + return swizzle(z, compress_mask); + } + + // expand + namespace detail + { + template + XSIMD_INLINE batch create_expand_swizzle_mask(I bitmask, ::xsimd::detail::index_sequence) + { + batch swizzle_mask(IT(0)); + IT j = 0; + (void)std::initializer_list { ((swizzle_mask = insert(swizzle_mask, j, index())), (j += ((bitmask >> Is) & 1u)), true)... 
}; + return swizzle_mask; + } + } + + template + XSIMD_INLINE batch + expand(batch const& x, batch_bool const& mask, + kernel::requires_arch) noexcept + { + constexpr std::size_t size = batch_bool::size; + auto bitmask = mask.mask(); + auto swizzle_mask = detail::create_expand_swizzle_mask, A>(bitmask, ::xsimd::detail::make_index_sequence()); + auto z = swizzle(x, swizzle_mask); + return select(mask, z, batch(T(0))); + } + + // extract_pair + template + XSIMD_INLINE batch extract_pair(batch const& self, batch const& other, std::size_t i, requires_arch) noexcept + { + constexpr std::size_t size = batch::size; + assert(i < size && "index in bounds"); + + alignas(A::alignment()) T self_buffer[size]; + self.store_aligned(self_buffer); + + alignas(A::alignment()) T other_buffer[size]; + other.store_aligned(other_buffer); + + alignas(A::alignment()) T concat_buffer[size]; + + for (std::size_t j = 0; j < (size - i); ++j) + { + concat_buffer[j] = other_buffer[i + j]; + if (j < i) + { + concat_buffer[size - 1 - j] = self_buffer[i - 1 - j]; + } + } + return batch::load_aligned(concat_buffer); + } + + // gather + namespace detail + { + // Not using XSIMD_INLINE here as it makes msvc hand got ever on avx512 + template ::type = 0> + inline batch gather(U const* src, batch const& index, + ::xsimd::index I) noexcept + { + return insert(batch {}, static_cast(src[index.get(I)]), I); + } + + template ::type = 0> + inline batch + gather(U const* src, batch const& index, ::xsimd::index I) noexcept + { + static_assert(N <= batch::size, "Incorrect value in recursion!"); + + const auto test = gather(src, index, {}); + return insert(test, static_cast(src[index.get(I)]), I); + } + } // namespace detail + + template + XSIMD_INLINE batch + gather(batch const&, T const* src, batch const& index, + kernel::requires_arch) noexcept + { + static_assert(batch::size == batch::size, + "Index and destination sizes must match"); + + return detail::gather::size - 1, T, A>(src, index, {}); + } + + // Gather with runtime indexes and mismatched strides. + template + XSIMD_INLINE detail::sizes_mismatch_t> + gather(batch const&, U const* src, batch const& index, + kernel::requires_arch) noexcept + { + static_assert(batch::size == batch::size, + "Index and destination sizes must match"); + + return detail::gather::size - 1, T, A>(src, index, {}); + } + + // Gather with runtime indexes and matching strides. 
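+        // When sizeof(T) == sizeof(U), the elements are gathered with the source
+        // type's kernel and converted in one shot via batch_cast below, instead of
+        // the per-lane insert recursion used by the mismatched-stride overload above.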
+ template + XSIMD_INLINE detail::stride_match_t> + gather(batch const&, U const* src, batch const& index, + kernel::requires_arch) noexcept + { + static_assert(batch::size == batch::size, + "Index and destination sizes must match"); + + return batch_cast(kernel::gather(batch {}, src, index, A {})); + } + + // insert + template + XSIMD_INLINE batch insert(batch const& self, T val, index, requires_arch) noexcept + { + struct index_mask + { + static constexpr bool get(size_t index, size_t /* size*/) + { + return index != I; + } + }; + batch tmp(val); + return select(make_batch_bool_constant(), self, tmp); + } + + // get + template + XSIMD_INLINE T get(batch const& self, ::xsimd::index, requires_arch) noexcept + { + alignas(A::alignment()) T buffer[batch::size]; + self.store_aligned(&buffer[0]); + return buffer[I]; + } + + template + XSIMD_INLINE T get(batch_bool const& self, ::xsimd::index, requires_arch) noexcept + { + alignas(A::alignment()) T buffer[batch_bool::size]; + self.store_aligned(&buffer[0]); + return buffer[I]; + } + + template + XSIMD_INLINE auto get(batch, A> const& self, ::xsimd::index, requires_arch) noexcept -> typename batch, A>::value_type + { + alignas(A::alignment()) T buffer[batch, A>::size]; + self.store_aligned(&buffer[0]); + return buffer[I]; + } + + template + XSIMD_INLINE T get(batch const& self, std::size_t i, requires_arch) noexcept + { + alignas(A::alignment()) T buffer[batch::size]; + self.store_aligned(&buffer[0]); + return buffer[i]; + } + + template + XSIMD_INLINE T get(batch_bool const& self, std::size_t i, requires_arch) noexcept + { + alignas(A::alignment()) bool buffer[batch_bool::size]; + self.store_aligned(&buffer[0]); + return buffer[i]; + } + + template + XSIMD_INLINE auto get(batch, A> const& self, std::size_t i, requires_arch) noexcept -> typename batch, A>::value_type + { + using T2 = typename batch, A>::value_type; + alignas(A::alignment()) T2 buffer[batch, A>::size]; + self.store_aligned(&buffer[0]); + return buffer[i]; + } + + // load_aligned + namespace detail + { + template + XSIMD_INLINE batch load_aligned(T_in const* mem, convert, requires_arch, with_fast_conversion) noexcept + { + using batch_type_in = batch; + using batch_type_out = batch; + return fast_cast(batch_type_in::load_aligned(mem), batch_type_out(), A {}); + } + template + XSIMD_INLINE batch load_aligned(T_in const* mem, convert, requires_arch, with_slow_conversion) noexcept + { + static_assert(!std::is_same::value, "there should be a direct load for this type combination"); + using batch_type_out = batch; + alignas(A::alignment()) T_out buffer[batch_type_out::size]; + std::copy(mem, mem + batch_type_out::size, std::begin(buffer)); + return batch_type_out::load_aligned(buffer); + } + } + template + XSIMD_INLINE batch load_aligned(T_in const* mem, convert cvt, requires_arch) noexcept + { + return detail::load_aligned(mem, cvt, A {}, detail::conversion_type {}); + } + + // load_unaligned + namespace detail + { + template + XSIMD_INLINE batch load_unaligned(T_in const* mem, convert, requires_arch, with_fast_conversion) noexcept + { + using batch_type_in = batch; + using batch_type_out = batch; + return fast_cast(batch_type_in::load_unaligned(mem), batch_type_out(), A {}); + } + + template + XSIMD_INLINE batch load_unaligned(T_in const* mem, convert cvt, requires_arch, with_slow_conversion) noexcept + { + static_assert(!std::is_same::value, "there should be a direct load for this type combination"); + return load_aligned(mem, cvt, generic {}, with_slow_conversion {}); + } + } + 
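A minimal sketch of how these conversion kernels are reached from user code, assuming xsimd's templated batch<T>::load_unaligned(U const*) entry point; the include path and function name here are illustrative only, not part of the patch:

    #include <cstdint>
    #include <xsimd/xsimd.hpp>

    // Loading int32_t data into a float batch routes through the kernels above:
    // depending on the architecture it takes the fast_cast path or the buffered
    // slow-conversion fallback.
    xsimd::batch<float> load_as_float(const int32_t* src)
    {
        return xsimd::batch<float>::load_unaligned(src);
    }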
template + XSIMD_INLINE batch load_unaligned(T_in const* mem, convert cvt, requires_arch) noexcept + { + return detail::load_unaligned(mem, cvt, generic {}, detail::conversion_type {}); + } + + // rotate_right + template + XSIMD_INLINE batch rotate_right(batch const& self, requires_arch) noexcept + { + struct rotate_generator + { + static constexpr size_t get(size_t index, size_t size) + { + return (index - N) % size; + } + }; + + return swizzle(self, make_batch_constant, A, rotate_generator>(), A {}); + } + + template + XSIMD_INLINE batch, A> rotate_right(batch, A> const& self, requires_arch) noexcept + { + return { rotate_right(self.real()), rotate_right(self.imag()) }; + } + + // rotate_left + template + XSIMD_INLINE batch rotate_left(batch const& self, requires_arch) noexcept + { + struct rotate_generator + { + static constexpr size_t get(size_t index, size_t size) + { + return (index + N) % size; + } + }; + + return swizzle(self, make_batch_constant, A, rotate_generator>(), A {}); + } + + template + XSIMD_INLINE batch, A> rotate_left(batch, A> const& self, requires_arch) noexcept + { + return { rotate_left(self.real()), rotate_left(self.imag()) }; + } + + // Scatter with runtime indexes. + namespace detail + { + template ::type = 0> + XSIMD_INLINE void scatter(batch const& src, U* dst, + batch const& index, + ::xsimd::index I) noexcept + { + dst[index.get(I)] = static_cast(src.get(I)); + } + + template ::type = 0> + XSIMD_INLINE void + scatter(batch const& src, U* dst, batch const& index, + ::xsimd::index I) noexcept + { + static_assert(N <= batch::size, "Incorrect value in recursion!"); + + kernel::detail::scatter( + src, dst, index, {}); + dst[index.get(I)] = static_cast(src.get(I)); + } + } // namespace detail + + template + XSIMD_INLINE void + scatter(batch const& src, T* dst, + batch const& index, + kernel::requires_arch) noexcept + { + static_assert(batch::size == batch::size, + "Source and index sizes must match"); + kernel::detail::scatter::size - 1, T, A, T, V>( + src, dst, index, {}); + } + + template + XSIMD_INLINE detail::sizes_mismatch_t + scatter(batch const& src, U* dst, + batch const& index, + kernel::requires_arch) noexcept + { + static_assert(batch::size == batch::size, + "Source and index sizes must match"); + kernel::detail::scatter::size - 1, T, A, U, V>( + src, dst, index, {}); + } + + template + XSIMD_INLINE detail::stride_match_t + scatter(batch const& src, U* dst, + batch const& index, + kernel::requires_arch) noexcept + { + static_assert(batch::size == batch::size, + "Source and index sizes must match"); + const auto tmp = batch_cast(src); + kernel::scatter(tmp, dst, index, A {}); + } + + // shuffle + namespace detail + { + constexpr bool is_swizzle_fst(size_t) + { + return true; + } + template + constexpr bool is_swizzle_fst(size_t bsize, ITy index, ITys... indices) + { + return index < bsize && is_swizzle_fst(bsize, indices...); + } + constexpr bool is_swizzle_snd(size_t) + { + return true; + } + template + constexpr bool is_swizzle_snd(size_t bsize, ITy index, ITys... indices) + { + return index >= bsize && is_swizzle_snd(bsize, indices...); + } + + constexpr bool is_zip_lo(size_t) + { + return true; + } + + template + constexpr bool is_zip_lo(size_t, ITy) + { + return false; + } + + template + constexpr bool is_zip_lo(size_t bsize, ITy0 index0, ITy1 index1, ITys... 
indices) + { + return index0 == (bsize - (sizeof...(indices) + 2)) && index1 == (2 * bsize - (sizeof...(indices) + 2)) && is_zip_lo(bsize, indices...); + } + + constexpr bool is_zip_hi(size_t) + { + return true; + } + + template + constexpr bool is_zip_hi(size_t, ITy) + { + return false; + } + + template + constexpr bool is_zip_hi(size_t bsize, ITy0 index0, ITy1 index1, ITys... indices) + { + return index0 == (bsize / 2 + bsize - (sizeof...(indices) + 2)) && index1 == (bsize / 2 + 2 * bsize - (sizeof...(indices) + 2)) && is_zip_hi(bsize, indices...); + } + + constexpr bool is_select(size_t) + { + return true; + } + + template + constexpr bool is_select(size_t bsize, ITy index, ITys... indices) + { + return (index < bsize ? index : index - bsize) == (bsize - sizeof...(ITys)) && is_select(bsize, indices...); + } + + } + + template + XSIMD_INLINE batch shuffle(batch const& x, batch const& y, batch_constant, requires_arch) noexcept + { + constexpr size_t bsize = sizeof...(Indices); + static_assert(bsize == batch::size, "valid shuffle"); + + // Detect common patterns + XSIMD_IF_CONSTEXPR(detail::is_swizzle_fst(bsize, Indices...)) + { + return swizzle(x, batch_constant= bsize) ? 0 /* never happens */ : Indices)...>()); + } + + XSIMD_IF_CONSTEXPR(detail::is_swizzle_snd(bsize, Indices...)) + { + return swizzle(y, batch_constant= bsize) ? (Indices - bsize) : 0 /* never happens */)...>()); + } + + XSIMD_IF_CONSTEXPR(detail::is_zip_lo(bsize, Indices...)) + { + return zip_lo(x, y); + } + + XSIMD_IF_CONSTEXPR(detail::is_zip_hi(bsize, Indices...)) + { + return zip_hi(x, y); + } + + XSIMD_IF_CONSTEXPR(detail::is_select(bsize, Indices...)) + { + return select(batch_bool_constant(), x, y); + } + +#if defined(__has_builtin) && !defined(XSIMD_WITH_EMULATED) +#if __has_builtin(__builtin_shufflevector) +#define builtin_shuffle __builtin_shufflevector +#endif +#endif + +#if defined(builtin_shuffle) + typedef T vty __attribute__((__vector_size__(sizeof(batch)))); + return (typename batch::register_type)builtin_shuffle((vty)x.data, (vty)y.data, Indices...); + +// FIXME: my experiments show that GCC only correctly optimizes this builtin +// starting at GCC 13, where it already has __builtin_shuffle_vector +// +// #elif __has_builtin(__builtin_shuffle) || GCC >= 6 +// typedef ITy integer_vector_type __attribute__((vector_size(sizeof(batch)))); +// return __builtin_shuffle(x.data, y.data, integer_vector_type{Indices...}); +#else + // Use a generic_pattern. It is suboptimal but clang optimizes this + // pretty well. + batch x_lane = swizzle(x, batch_constant= bsize) ? (Indices - bsize) : Indices)...>()); + batch y_lane = swizzle(y, batch_constant= bsize) ? 
(Indices - bsize) : Indices)...>()); + batch_bool_constant select_x_lane; + return select(select_x_lane, x_lane, y_lane); +#endif + } + + // store + template + XSIMD_INLINE void store(batch_bool const& self, bool* mem, requires_arch) noexcept + { + using batch_type = batch; + constexpr auto size = batch_bool::size; + alignas(A::alignment()) T buffer[size]; + kernel::store_aligned(&buffer[0], batch_type(self), A {}); + for (std::size_t i = 0; i < size; ++i) + mem[i] = bool(buffer[i]); + } + + // store_aligned + template + XSIMD_INLINE void store_aligned(T_out* mem, batch const& self, requires_arch) noexcept + { + static_assert(!std::is_same::value, "there should be a direct store for this type combination"); + alignas(A::alignment()) T_in buffer[batch::size]; + store_aligned(&buffer[0], self); + std::copy(std::begin(buffer), std::end(buffer), mem); + } + + // store_unaligned + template + XSIMD_INLINE void store_unaligned(T_out* mem, batch const& self, requires_arch) noexcept + { + static_assert(!std::is_same::value, "there should be a direct store for this type combination"); + return store_aligned(mem, self, generic {}); + } + + // swizzle + template + XSIMD_INLINE batch, A> swizzle(batch, A> const& self, batch_constant mask, requires_arch) noexcept + { + return { swizzle(self.real(), mask), swizzle(self.imag(), mask) }; + } + + template + XSIMD_INLINE batch swizzle(batch const& self, batch mask, requires_arch) noexcept + { + constexpr size_t size = batch::size; + alignas(A::alignment()) T self_buffer[size]; + store_aligned(&self_buffer[0], self); + + alignas(A::alignment()) ITy mask_buffer[size]; + store_aligned(&mask_buffer[0], mask); + + alignas(A::alignment()) T out_buffer[size]; + for (size_t i = 0; i < size; ++i) + out_buffer[i] = self_buffer[mask_buffer[i]]; + return batch::load_aligned(out_buffer); + } + + template + XSIMD_INLINE batch, A> swizzle(batch, A> const& self, batch mask, requires_arch) noexcept + { + return { swizzle(self.real(), mask), swizzle(self.imag(), mask) }; + } + + // load_complex_aligned + namespace detail + { + template + XSIMD_INLINE batch, A> load_complex(batch const& /*hi*/, batch const& /*lo*/, requires_arch) noexcept + { + static_assert(std::is_same::value, "load_complex not implemented for the required architecture"); + } + + template + XSIMD_INLINE batch complex_high(batch, A> const& /*src*/, requires_arch) noexcept + { + static_assert(std::is_same::value, "complex_high not implemented for the required architecture"); + } + + template + XSIMD_INLINE batch complex_low(batch, A> const& /*src*/, requires_arch) noexcept + { + static_assert(std::is_same::value, "complex_low not implemented for the required architecture"); + } + } + + template + XSIMD_INLINE batch, A> load_complex_aligned(std::complex const* mem, convert>, requires_arch) noexcept + { + using real_batch = batch; + T_in const* buffer = reinterpret_cast(mem); + real_batch hi = real_batch::load_aligned(buffer), + lo = real_batch::load_aligned(buffer + real_batch::size); + return detail::load_complex(hi, lo, A {}); + } + + // load_complex_unaligned + template + XSIMD_INLINE batch, A> load_complex_unaligned(std::complex const* mem, convert>, requires_arch) noexcept + { + using real_batch = batch; + T_in const* buffer = reinterpret_cast(mem); + real_batch hi = real_batch::load_unaligned(buffer), + lo = real_batch::load_unaligned(buffer + real_batch::size); + return detail::load_complex(hi, lo, A {}); + } + + // store_complex_aligned + template + XSIMD_INLINE void store_complex_aligned(std::complex* 
dst, batch, A> const& src, requires_arch) noexcept + { + using real_batch = batch; + real_batch hi = detail::complex_high(src, A {}); + real_batch lo = detail::complex_low(src, A {}); + T_out* buffer = reinterpret_cast(dst); + lo.store_aligned(buffer); + hi.store_aligned(buffer + real_batch::size); + } + + // store_compelx_unaligned + template + XSIMD_INLINE void store_complex_unaligned(std::complex* dst, batch, A> const& src, requires_arch) noexcept + { + using real_batch = batch; + real_batch hi = detail::complex_high(src, A {}); + real_batch lo = detail::complex_low(src, A {}); + T_out* buffer = reinterpret_cast(dst); + lo.store_unaligned(buffer); + hi.store_unaligned(buffer + real_batch::size); + } + + // transpose + template + XSIMD_INLINE void transpose(batch* matrix_begin, batch* matrix_end, requires_arch) noexcept + { + assert((matrix_end - matrix_begin == batch::size) && "correctly sized matrix"); + (void)matrix_end; + alignas(A::alignment()) T scratch_buffer[batch::size * batch::size]; + for (size_t i = 0; i < batch::size; ++i) + { + matrix_begin[i].store_aligned(&scratch_buffer[i * batch::size]); + } + // FIXME: this is super naive we can probably do better. + for (size_t i = 0; i < batch::size; ++i) + { + for (size_t j = 0; j < i; ++j) + { + std::swap(scratch_buffer[i * batch::size + j], + scratch_buffer[j * batch::size + i]); + } + } + for (size_t i = 0; i < batch::size; ++i) + { + matrix_begin[i] = batch::load_aligned(&scratch_buffer[i * batch::size]); + } + } + + } + +} + +#endif diff --git a/include/onnxruntime/xsimd/arch/generic/xsimd_generic_rounding.hpp b/include/onnxruntime/xsimd/arch/generic/xsimd_generic_rounding.hpp new file mode 100644 index 0000000000000..daf7b58ea718d --- /dev/null +++ b/include/onnxruntime/xsimd/arch/generic/xsimd_generic_rounding.hpp @@ -0,0 +1,72 @@ +/*************************************************************************** + * Copyright (c) Johan Mabille, Sylvain Corlay, Wolf Vollprecht and * + * Martin Renou * + * Copyright (c) QuantStack * + * Copyright (c) Serge Guelton * + * * + * Distributed under the terms of the BSD 3-Clause License. * + * * + * The full license is in the file LICENSE, distributed with this software. 
 *
+ ****************************************************************************/
+
+#ifndef XSIMD_GENERIC_ROUNDING_HPP
+#define XSIMD_GENERIC_ROUNDING_HPP
+
+#include "./xsimd_generic_details.hpp"
+
+namespace xsimd
+{
+
+    namespace kernel
+    {
+
+        using namespace types;
+
+        // ceil
+        template <class A, class T>
+        XSIMD_INLINE batch<T, A> ceil(batch<T, A> const& self, requires_arch<generic>) noexcept
+        {
+            batch<T, A> truncated_self = trunc(self);
+            return select(truncated_self < self, truncated_self + 1, truncated_self);
+        }
+
+        // floor
+        template <class A, class T>
+        XSIMD_INLINE batch<T, A> floor(batch<T, A> const& self, requires_arch<generic>) noexcept
+        {
+            batch<T, A> truncated_self = trunc(self);
+            return select(truncated_self > self, truncated_self - 1, truncated_self);
+        }
+
+        // round
+        template <class A, class T>
+        XSIMD_INLINE batch<T, A> round(batch<T, A> const& self, requires_arch<generic>) noexcept
+        {
+            auto v = abs(self);
+            auto c = ceil(v);
+            auto cp = select(c - 0.5 > v, c - 1, c);
+            return select(v > constants::maxflint<batch<T, A>>(), self, copysign(cp, self));
+        }
+
+        // trunc
+        template <class A, class T, class = typename std::enable_if<std::is_integral<T>::value, void>::type>
+        XSIMD_INLINE batch<T, A> trunc(batch<T, A> const& self, requires_arch<generic>) noexcept
+        {
+            return self;
+        }
+        template <class A>
+        XSIMD_INLINE batch<float, A> trunc(batch<float, A> const& self, requires_arch<generic>) noexcept
+        {
+            return select(abs(self) < constants::maxflint<batch<float, A>>(), to_float(to_int(self)), self);
+        }
+        template <class A>
+        XSIMD_INLINE batch<double, A> trunc(batch<double, A> const& self, requires_arch<generic>) noexcept
+        {
+            return select(abs(self) < constants::maxflint<batch<double, A>>(), to_float(to_int(self)), self);
+        }
+
+    }
+
+}
+
+#endif
diff --git a/include/onnxruntime/xsimd/arch/generic/xsimd_generic_trigo.hpp b/include/onnxruntime/xsimd/arch/generic/xsimd_generic_trigo.hpp
new file mode 100644
index 0000000000000..b1bb68f25e9f9
--- /dev/null
+++ b/include/onnxruntime/xsimd/arch/generic/xsimd_generic_trigo.hpp
@@ -0,0 +1,969 @@
+/***************************************************************************
+ * Copyright (c) Johan Mabille, Sylvain Corlay, Wolf Vollprecht and *
+ * Martin Renou *
+ * Copyright (c) QuantStack *
+ * Copyright (c) Serge Guelton *
+ * *
+ * Distributed under the terms of the BSD 3-Clause License. *
+ * *
+ * The full license is in the file LICENSE, distributed with this software. *
+ ****************************************************************************/
+
+#ifndef XSIMD_GENERIC_TRIGO_HPP
+#define XSIMD_GENERIC_TRIGO_HPP
+
+#include "./xsimd_generic_details.hpp"
+
+#include <array>
+
+namespace xsimd
+{
+
+    namespace kernel
+    {
+        /* origin: boost/simd/arch/common/detail/simd/trig_base.hpp */
+        /*
+         * ====================================================
+         * copyright 2016 NumScale SAS
+         *
+         * Distributed under the Boost Software License, Version 1.0.
+ * (See copy at http://boost.org/LICENSE_1_0.txt) + * ==================================================== + */ + + using namespace types; + + // acos + template + XSIMD_INLINE batch acos(batch const& self, requires_arch) noexcept + { + using batch_type = batch; + batch_type x = abs(self); + auto x_larger_05 = x > batch_type(0.5); + x = select(x_larger_05, sqrt(fma(batch_type(-0.5), x, batch_type(0.5))), self); + x = asin(x); + x = select(x_larger_05, x + x, x); + x = select(self < batch_type(-0.5), constants::pi() - x, x); + return select(x_larger_05, x, constants::pio2() - x); + } + template + XSIMD_INLINE batch, A> acos(const batch, A>& z, requires_arch) noexcept + { + using batch_type = batch, A>; + using real_batch = typename batch_type::real_batch; + batch_type tmp = asin(z); + return { constants::pio2() - tmp.real(), -tmp.imag() }; + } + + // acosh + /* origin: boost/simd/arch/common/simd/function/acosh.hpp */ + /* + * ==================================================== + * copyright 2016 NumScale SAS + * + * Distributed under the Boost Software License, Version 1.0. + * (See copy at http://boost.org/LICENSE_1_0.txt) + * ==================================================== + */ + template + XSIMD_INLINE batch acosh(batch const& self, requires_arch) noexcept + { + using batch_type = batch; + batch_type x = self - batch_type(1.); + auto test = x > constants::oneotwoeps(); + batch_type z = select(test, self, x + sqrt(x + x + x * x)); + batch_type l1pz = log1p(z); + return select(test, l1pz + constants::log_2(), l1pz); + } + template + XSIMD_INLINE batch, A> acosh(const batch, A>& z, requires_arch) noexcept + { + using batch_type = batch, A>; + batch_type w = acos(z); + w = batch_type(-w.imag(), w.real()); + return w; + } + + // asin + template + XSIMD_INLINE batch asin(batch const& self, requires_arch) noexcept + { + using batch_type = batch; + batch_type x = abs(self); + batch_type sign = bitofsign(self); + auto x_larger_05 = x > batch_type(0.5); + batch_type z = select(x_larger_05, batch_type(0.5) * (batch_type(1.) - x), x * x); + x = select(x_larger_05, sqrt(z), x); + batch_type z1 = detail::horner(z); + z1 = fma(z1, z * x, x); + z = select(x_larger_05, constants::pio2() - (z1 + z1), z1); + return z ^ sign; + } + template + XSIMD_INLINE batch asin(batch const& self, requires_arch) noexcept + { + using batch_type = batch; + batch_type x = abs(self); + auto small_cond = x < constants::sqrteps(); + batch_type ct1 = batch_type(bit_cast(int64_t(0x3fe4000000000000))); + batch_type zz1 = batch_type(1.) - x; + batch_type vp = zz1 * detail::horner(zz1) / detail::horner1(zz1); + zz1 = sqrt(zz1 + zz1); + batch_type z = constants::pio4() - zz1; + zz1 = fms(zz1, vp, constants::pio_2lo()); + z = z - zz1; + zz1 = z + constants::pio4(); + batch_type zz2 = self * self; + z = zz2 * detail::horner(zz2) / detail::horner1(zz2); + zz2 = fma(x, z, x); + return select(x > batch_type(1.), constants::nan(), + select(small_cond, x, + select(x > ct1, zz1, zz2)) + ^ bitofsign(self)); + } + template + XSIMD_INLINE batch, A> asin(const batch, A>& z, requires_arch) noexcept + { + using batch_type = batch, A>; + using real_batch = typename batch_type::real_batch; + real_batch x = z.real(); + real_batch y = z.imag(); + + batch_type ct(-y, x); + batch_type zz(real_batch(1.) 
- (x - y) * (x + y), -2 * x * y); + zz = log(ct + sqrt(zz)); + batch_type resg(zz.imag(), -zz.real()); + + return select(y == real_batch(0.), + select(fabs(x) > real_batch(1.), + batch_type(constants::pio2(), real_batch(0.)), + batch_type(asin(x), real_batch(0.))), + resg); + } + + // asinh + /* origin: boost/simd/arch/common/simd/function/asinh.hpp */ + /* + * ==================================================== + * copyright 2016 NumScale SAS + * + * Distributed under the Boost Software License, Version 1.0. + * (See copy at http://boost.org/LICENSE_1_0.txt) + * ==================================================== + */ + namespace detail + { + template ::value, void>::type> + XSIMD_INLINE batch + average(const batch& x1, const batch& x2) noexcept + { + return (x1 & x2) + ((x1 ^ x2) >> 1); + } + + template + XSIMD_INLINE batch + averagef(const batch& x1, const batch& x2) noexcept + { + using batch_type = batch; + return fma(x1, batch_type(0.5), x2 * batch_type(0.5)); + } + template + XSIMD_INLINE batch average(batch const& x1, batch const& x2) noexcept + { + return averagef(x1, x2); + } + template + XSIMD_INLINE batch average(batch const& x1, batch const& x2) noexcept + { + return averagef(x1, x2); + } + } + template + XSIMD_INLINE batch asinh(batch const& self, requires_arch) noexcept + { + using batch_type = batch; + batch_type x = abs(self); + auto lthalf = x < batch_type(0.5); + batch_type x2 = x * x; + batch_type bts = bitofsign(self); + batch_type z(0.); + if (any(lthalf)) + { + z = detail::horner(x2) + * x; + if (all(lthalf)) + return z ^ bts; + } + batch_type tmp = select(x > constants::oneosqrteps(), x, detail::average(x, hypot(batch_type(1.), x))); +#ifndef XSIMD_NO_NANS + return select(isnan(self), constants::nan(), select(lthalf, z, log(tmp) + constants::log_2()) ^ bts); +#else + return select(lthalf, z, log(tmp) + constants::log_2()) ^ bts; +#endif + } + template + XSIMD_INLINE batch asinh(batch const& self, requires_arch) noexcept + { + using batch_type = batch; + batch_type x = abs(self); + auto test = x > constants::oneosqrteps(); + batch_type z = select(test, x - batch_type(1.), x + x * x / (batch_type(1.) 
+ hypot(batch_type(1.), x))); +#ifndef XSIMD_NO_INFINITIES + z = select(x == constants::infinity(), x, z); +#endif + batch_type l1pz = log1p(z); + z = select(test, l1pz + constants::log_2(), l1pz); + return bitofsign(self) ^ z; + } + template + XSIMD_INLINE batch, A> asinh(const batch, A>& z, requires_arch) noexcept + { + using batch_type = batch, A>; + batch_type w = asin(batch_type(-z.imag(), z.real())); + w = batch_type(w.imag(), -w.real()); + return w; + } + + // atan + namespace detail + { + template + static XSIMD_INLINE batch kernel_atan(const batch& x, const batch& recx) noexcept + { + using batch_type = batch; + const auto flag1 = x < constants::tan3pio8(); + const auto flag2 = (x >= batch_type(bit_cast((uint32_t)0x3ed413cd))) && flag1; + batch_type yy = select(flag1, batch_type(0.), constants::pio2()); + yy = select(flag2, constants::pio4(), yy); + batch_type xx = select(flag1, x, -recx); + xx = select(flag2, (x - batch_type(1.)) / (x + batch_type(1.)), xx); + const batch_type z = xx * xx; + batch_type z1 = detail::horner(z); + z1 = fma(xx, z1 * z, xx); + z1 = select(flag2, z1 + constants::pio_4lo(), z1); + z1 = select(!flag1, z1 + constants::pio_2lo(), z1); + return yy + z1; + } + template + static XSIMD_INLINE batch kernel_atan(const batch& x, const batch& recx) noexcept + { + using batch_type = batch; + const auto flag1 = x < constants::tan3pio8(); + const auto flag2 = (x >= constants::tanpio8()) && flag1; + batch_type yy = select(flag1, batch_type(0.), constants::pio2()); + yy = select(flag2, constants::pio4(), yy); + batch_type xx = select(flag1, x, -recx); + xx = select(flag2, (x - batch_type(1.)) / (x + batch_type(1.)), xx); + batch_type z = xx * xx; + z *= detail::horner(z) + / detail::horner1(z); + z = fma(xx, z, xx); + z = select(flag2, z + constants::pio_4lo(), z); + z = z + select(flag1, batch_type(0.), constants::pio_2lo()); + return yy + z; + } + } + template + XSIMD_INLINE batch atan(batch const& self, requires_arch) noexcept + { + using batch_type = batch; + const batch_type absa = abs(self); + const batch_type x = detail::kernel_atan(absa, batch_type(1.) / absa); + return x ^ bitofsign(self); + } + template + XSIMD_INLINE batch, A> atan(const batch, A>& z, requires_arch) noexcept + { + using batch_type = batch, A>; + using real_batch = typename batch_type::real_batch; + real_batch x = z.real(); + real_batch y = z.imag(); + real_batch x2 = x * x; + real_batch one(1.); + real_batch a = one - x2 - (y * y); + real_batch w = 0.5 * atan2(2. * x, a); + real_batch num = y + one; + num = x2 + num * num; + real_batch den = y - one; + den = x2 + den * den; + batch_type res = select((x == real_batch(0.)) && (y == real_batch(1.)), + batch_type(real_batch(0.), constants::infinity()), + batch_type(w, 0.25 * log(num / den))); + return res; + } + + // atanh + /* origin: boost/simd/arch/common/simd/function/acosh.hpp */ + /* + * ==================================================== + * copyright 2016 NumScale SAS + * + * Distributed under the Boost Software License, Version 1.0. + * (See copy at http://boost.org/LICENSE_1_0.txt) + * ==================================================== + */ + template + XSIMD_INLINE batch atanh(batch const& self, requires_arch) noexcept + { + using batch_type = batch; + batch_type x = abs(self); + batch_type t = x + x; + batch_type z = batch_type(1.) 
- x; + auto test = x < batch_type(0.5); + batch_type tmp = select(test, x, t) / z; + return bitofsign(self) ^ (batch_type(0.5) * log1p(select(test, fma(t, tmp, t), tmp))); + } + template + XSIMD_INLINE batch, A> atanh(const batch, A>& z, requires_arch) noexcept + { + using batch_type = batch, A>; + batch_type w = atan(batch_type(-z.imag(), z.real())); + w = batch_type(w.imag(), -w.real()); + return w; + } + + // atan2 + template + XSIMD_INLINE batch atan2(batch const& self, batch const& other, requires_arch) noexcept + { + using batch_type = batch; + const batch_type q = abs(self / other); + const batch_type z = detail::kernel_atan(q, batch_type(1.) / q); + return select(other > batch_type(0.), z, constants::pi() - z) * signnz(self); + } + + // cos + namespace detail + { + template + XSIMD_INLINE batch quadrant(const batch& x) noexcept + { + return x & batch(3); + } + + template + XSIMD_INLINE batch quadrant(const batch& x) noexcept + { + return to_float(quadrant(to_int(x))); + } + + template + XSIMD_INLINE batch quadrant(const batch& x) noexcept + { + using batch_type = batch; + batch_type a = x * batch_type(0.25); + return (a - floor(a)) * batch_type(4.); + } + /* origin: boost/simd/arch/common/detail/simd/f_trig_evaluation.hpp */ + /* + * ==================================================== + * copyright 2016 NumScale SAS + * + * Distributed under the Boost Software License, Version 1.0. + * (See copy at http://boost.org/LICENSE_1_0.txt) + * ==================================================== + */ + + template + XSIMD_INLINE batch cos_eval(const batch& z) noexcept + { + using batch_type = batch; + batch_type y = detail::horner(z); + return batch_type(1.) + fma(z, batch_type(-0.5), y * z * z); + } + + template + XSIMD_INLINE batch sin_eval(const batch& z, const batch& x) noexcept + { + using batch_type = batch; + batch_type y = detail::horner(z); + return fma(y * z, x, x); + } + + template + static XSIMD_INLINE batch base_tancot_eval(const batch& z) noexcept + { + using batch_type = batch; + batch_type zz = z * z; + batch_type y = detail::horner(zz); + return fma(y, zz * z, z); + } + + template + static XSIMD_INLINE batch tan_eval(const batch& z, const BB& test) noexcept + { + using batch_type = batch; + batch_type y = base_tancot_eval(z); + return select(test, y, -batch_type(1.) / y); + } + + template + static XSIMD_INLINE batch cot_eval(const batch& z, const BB& test) noexcept + { + using batch_type = batch; + batch_type y = base_tancot_eval(z); + return select(test, batch_type(1.) / y, -y); + } + + /* origin: boost/simd/arch/common/detail/simd/d_trig_evaluation.hpp */ + /* + * ==================================================== + * copyright 2016 NumScale SAS + * + * Distributed under the Boost Software License, Version 1.0. + * (See copy at http://boost.org/LICENSE_1_0.txt) + * ==================================================== + */ + template + static XSIMD_INLINE batch cos_eval(const batch& z) noexcept + { + using batch_type = batch; + batch_type y = detail::horner(z); + return batch_type(1.) 
- y * z; + } + + template + static XSIMD_INLINE batch sin_eval(const batch& z, const batch& x) noexcept + { + using batch_type = batch; + batch_type y = detail::horner(z); + return fma(y * z, x, x); + } + + template + static XSIMD_INLINE batch base_tancot_eval(const batch& z) noexcept + { + using batch_type = batch; + batch_type zz = z * z; + batch_type num = detail::horner(zz); + batch_type den = detail::horner1(zz); + return fma(z, (zz * (num / den)), z); + } + + template + static XSIMD_INLINE batch tan_eval(const batch& z, const BB& test) noexcept + { + using batch_type = batch; + batch_type y = base_tancot_eval(z); + return select(test, y, -batch_type(1.) / y); + } + + template + static XSIMD_INLINE batch cot_eval(const batch& z, const BB& test) noexcept + { + using batch_type = batch; + batch_type y = base_tancot_eval(z); + return select(test, batch_type(1.) / y, -y); + } + /* origin: boost/simd/arch/common/detail/simd/trig_reduction.hpp */ + /* + * ==================================================== + * copyright 2016 NumScale SAS + * + * Distributed under the Boost Software License, Version 1.0. + * (See copy at http://boost.org/LICENSE_1_0.txt) + * ==================================================== + */ + + struct trigo_radian_tag + { + }; + struct trigo_pi_tag + { + }; + + template + struct trigo_reducer + { + static XSIMD_INLINE B reduce(const B& x, B& xr) noexcept + { + if (all(x <= constants::pio4())) + { + xr = x; + return B(0.); + } + else if (all(x <= constants::pio2())) + { + auto test = x > constants::pio4(); + xr = x - constants::pio2_1(); + xr -= constants::pio2_2(); + xr -= constants::pio2_3(); + xr = select(test, xr, x); + return select(test, B(1.), B(0.)); + } + else if (all(x <= constants::twentypi())) + { + B xi = nearbyint(x * constants::twoopi()); + xr = fnma(xi, constants::pio2_1(), x); + xr -= xi * constants::pio2_2(); + xr -= xi * constants::pio2_3(); + return quadrant(xi); + } + else if (all(x <= constants::mediumpi())) + { + B fn = nearbyint(x * constants::twoopi()); + B r = x - fn * constants::pio2_1(); + B w = fn * constants::pio2_1t(); + B t = r; + w = fn * constants::pio2_2(); + r = t - w; + w = fn * constants::pio2_2t() - ((t - r) - w); + t = r; + w = fn * constants::pio2_3(); + r = t - w; + w = fn * constants::pio2_3t() - ((t - r) - w); + xr = r - w; + return quadrant(fn); + } + else + { + static constexpr std::size_t size = B::size; + using value_type = typename B::value_type; + alignas(B) std::array tmp; + alignas(B) std::array txr; + alignas(B) std::array args; + x.store_aligned(args.data()); + + for (std::size_t i = 0; i < size; ++i) + { + double arg = args[i]; + if (arg == std::numeric_limits::infinity()) + { + tmp[i] = 0.; + txr[i] = std::numeric_limits::quiet_NaN(); + } + else + { + double y[2]; + std::int32_t n = ::xsimd::detail::__ieee754_rem_pio2(arg, y); + tmp[i] = value_type(n & 3); + txr[i] = value_type(y[0]); + } + } + xr = B::load_aligned(&txr[0]); + B res = B::load_aligned(&tmp[0]); + return res; + } + } + }; + + template + struct trigo_reducer + { + static XSIMD_INLINE B reduce(const B& x, B& xr) noexcept + { + B xi = nearbyint(x * B(2.)); + B x2 = x - xi * B(0.5); + xr = x2 * constants::pi(); + return quadrant(xi); + } + }; + + } + template + XSIMD_INLINE batch cos(batch const& self, requires_arch) noexcept + { + using batch_type = batch; + const batch_type x = abs(self); + batch_type xr = constants::nan(); + const batch_type n = detail::trigo_reducer::reduce(x, xr); + auto tmp = select(n >= batch_type(2.), batch_type(1.), 
batch_type(0.)); + auto swap_bit = fma(batch_type(-2.), tmp, n); + auto sign_bit = select((swap_bit ^ tmp) != batch_type(0.), constants::signmask(), batch_type(0.)); + const batch_type z = xr * xr; + const batch_type se = detail::sin_eval(z, xr); + const batch_type ce = detail::cos_eval(z); + const batch_type z1 = select(swap_bit != batch_type(0.), se, ce); + return z1 ^ sign_bit; + } + + template + XSIMD_INLINE batch, A> cos(batch, A> const& z, requires_arch) noexcept + { + return { cos(z.real()) * cosh(z.imag()), -sin(z.real()) * sinh(z.imag()) }; + } + + // cosh + + /* origin: boost/simd/arch/common/simd/function/cosh.hpp */ + /* + * ==================================================== + * copyright 2016 NumScale SAS + * + * Distributed under the Boost Software License, Version 1.0. + * (See copy at http://boost.org/LICENSE_1_0.txt) + * ==================================================== + */ + + template + XSIMD_INLINE batch cosh(batch const& self, requires_arch) noexcept + { + using batch_type = batch; + batch_type x = abs(self); + auto test1 = x > (constants::maxlog() - constants::log_2()); + batch_type fac = select(test1, batch_type(0.5), batch_type(1.)); + batch_type tmp = exp(x * fac); + batch_type tmp1 = batch_type(0.5) * tmp; + return select(test1, tmp1 * tmp, detail::average(tmp, batch_type(1.) / tmp)); + } + template + XSIMD_INLINE batch, A> cosh(const batch, A>& z, requires_arch) noexcept + { + auto x = z.real(); + auto y = z.imag(); + return { cosh(x) * cos(y), sinh(x) * sin(y) }; + } + + // sin + namespace detail + { + template + XSIMD_INLINE batch sin(batch const& self, Tag = Tag()) noexcept + { + using batch_type = batch; + const batch_type x = abs(self); + batch_type xr = constants::nan(); + const batch_type n = detail::trigo_reducer::reduce(x, xr); + auto tmp = select(n >= batch_type(2.), batch_type(1.), batch_type(0.)); + auto swap_bit = fma(batch_type(-2.), tmp, n); + auto sign_bit = bitofsign(self) ^ select(tmp != batch_type(0.), constants::signmask(), batch_type(0.)); + const batch_type z = xr * xr; + const batch_type se = detail::sin_eval(z, xr); + const batch_type ce = detail::cos_eval(z); + const batch_type z1 = select(swap_bit == batch_type(0.), se, ce); + return z1 ^ sign_bit; + } + } + + template + XSIMD_INLINE batch sin(batch const& self, requires_arch) noexcept + { + return detail::sin(self); + } + + template + XSIMD_INLINE batch, A> sin(batch, A> const& z, requires_arch) noexcept + { + return { sin(z.real()) * cosh(z.imag()), cos(z.real()) * sinh(z.imag()) }; + } + + // sincos + template + XSIMD_INLINE std::pair, batch> sincos(batch const& self, requires_arch) noexcept + { + using batch_type = batch; + const batch_type x = abs(self); + batch_type xr = constants::nan(); + const batch_type n = detail::trigo_reducer::reduce(x, xr); + auto tmp = select(n >= batch_type(2.), batch_type(1.), batch_type(0.)); + auto swap_bit = fma(batch_type(-2.), tmp, n); + const batch_type z = xr * xr; + const batch_type se = detail::sin_eval(z, xr); + const batch_type ce = detail::cos_eval(z); + auto sin_sign_bit = bitofsign(self) ^ select(tmp != batch_type(0.), constants::signmask(), batch_type(0.)); + const batch_type sin_z1 = select(swap_bit == batch_type(0.), se, ce); + auto cos_sign_bit = select((swap_bit ^ tmp) != batch_type(0.), constants::signmask(), batch_type(0.)); + const batch_type cos_z1 = select(swap_bit != batch_type(0.), se, ce); + return std::make_pair(sin_z1 ^ sin_sign_bit, cos_z1 ^ cos_sign_bit); + } + + template + XSIMD_INLINE std::pair, A>, batch, A>> + 
sincos(batch, A> const& z, requires_arch) noexcept + { + using batch_type = batch, A>; + using real_batch = typename batch_type::real_batch; + real_batch rcos = cos(z.real()); + real_batch rsin = sin(z.real()); + real_batch icosh = cosh(z.imag()); + real_batch isinh = sinh(z.imag()); + return std::make_pair(batch_type(rsin * icosh, rcos * isinh), batch_type(rcos * icosh, -rsin * isinh)); + } + + // sinh + namespace detail + { + /* origin: boost/simd/arch/common/detail/generic/sinh_kernel.hpp */ + /* + * ==================================================== + * copyright 2016 NumScale SAS + * + * Distributed under the Boost Software License, Version 1.0. + * (See copy at http://boost.org/LICENSE_1_0.txt) + * ==================================================== + */ + template + XSIMD_INLINE batch sinh_kernel(batch const& self) noexcept + { + using batch_type = batch; + batch_type sqr_self = self * self; + return detail::horner(sqr_self) + * self; + } + + template + XSIMD_INLINE batch sinh_kernel(batch const& self) noexcept + { + using batch_type = batch; + batch_type sqrself = self * self; + return fma(self, (detail::horner(sqrself) + / detail::horner1(sqrself)) + * sqrself, + self); + } + } + /* origin: boost/simd/arch/common/simd/function/sinh.hpp */ + /* + * ==================================================== + * copyright 2016 NumScale SAS + * + * Distributed under the Boost Software License, Version 1.0. + * (See copy at http://boost.org/LICENSE_1_0.txt) + * ==================================================== + */ + template + XSIMD_INLINE batch sinh(batch const& a, requires_arch) noexcept + { + using batch_type = batch; + batch_type half(0.5); + batch_type x = abs(a); + auto lt1 = x < batch_type(1.); + batch_type bts = bitofsign(a); + batch_type z(0.); + if (any(lt1)) + { + z = detail::sinh_kernel(x); + if (all(lt1)) + return z ^ bts; + } + auto test1 = x > (constants::maxlog() - constants::log_2()); + batch_type fac = select(test1, half, batch_type(1.)); + batch_type tmp = exp(x * fac); + batch_type tmp1 = half * tmp; + batch_type r = select(test1, tmp1 * tmp, tmp1 - half / tmp); + return select(lt1, z, r) ^ bts; + } + template + XSIMD_INLINE batch, A> sinh(const batch, A>& z, requires_arch) noexcept + { + auto x = z.real(); + auto y = z.imag(); + return { sinh(x) * cos(y), cosh(x) * sin(y) }; + } + + // tan + template + XSIMD_INLINE batch tan(batch const& self, requires_arch) noexcept + { + using batch_type = batch; + const batch_type x = abs(self); + batch_type xr = constants::nan(); + const batch_type n = detail::trigo_reducer::reduce(x, xr); + auto tmp = select(n >= batch_type(2.), batch_type(1.), batch_type(0.)); + auto swap_bit = fma(batch_type(-2.), tmp, n); + auto test = (swap_bit == batch_type(0.)); + const batch_type y = detail::tan_eval(xr, test); + return y ^ bitofsign(self); + } + template + XSIMD_INLINE batch, A> tan(batch, A> const& z, requires_arch) noexcept + { + using batch_type = batch, A>; + using real_batch = typename batch_type::real_batch; + real_batch d = cos(2 * z.real()) + cosh(2 * z.imag()); + batch_type winf(constants::infinity(), constants::infinity()); + real_batch wreal = sin(2 * z.real()) / d; + real_batch wimag = sinh(2 * z.imag()); + batch_type wres = select(isinf(wimag), batch_type(wreal, real_batch(1.)), batch_type(wreal, wimag / d)); + return select(d == real_batch(0.), winf, wres); + } + + // tanh + namespace detail + { + /* origin: boost/simd/arch/common/detail/generic/tanh_kernel.hpp */ + /* + * 
==================================================== + * copyright 2016 NumScale SAS + * + * Distributed under the Boost Software License, Version 1.0. + * (See copy at http://boost.org/LICENSE_1_0.txt) + * ==================================================== + */ + template + struct tanh_kernel; + + template + struct tanh_kernel> + { + using batch_type = batch; + static XSIMD_INLINE batch_type tanh(const batch_type& x) noexcept + { + batch_type sqrx = x * x; + return fma(detail::horner(sqrx) + * sqrx, + x, x); + } + + static XSIMD_INLINE batch_type cotanh(const batch_type& x) noexcept + { + return batch_type(1.) / tanh(x); + } + }; + + template + struct tanh_kernel> + { + using batch_type = batch; + static XSIMD_INLINE batch_type tanh(const batch_type& x) noexcept + { + batch_type sqrx = x * x; + return fma(sqrx * p(sqrx) / q(sqrx), x, x); + } + + static XSIMD_INLINE batch_type cotanh(const batch_type& x) noexcept + { + batch_type sqrx = x * x; + batch_type qval = q(sqrx); + return qval / (x * fma(p(sqrx), sqrx, qval)); + } + + static XSIMD_INLINE batch_type p(const batch_type& x) noexcept + { + return detail::horner(x); + } + + static XSIMD_INLINE batch_type q(const batch_type& x) noexcept + { + return detail::horner1(x); + } + }; + + } + /* origin: boost/simd/arch/common/simd/function/tanh.hpp */ + /* + * ==================================================== + * copyright 2016 NumScale SAS + * + * Distributed under the Boost Software License, Version 1.0. + * (See copy at http://boost.org/LICENSE_1_0.txt) + * ==================================================== + */ + template + XSIMD_INLINE batch tanh(batch const& self, requires_arch) noexcept + { + using batch_type = batch; + batch_type one(1.); + batch_type x = abs(self); + auto test = x < (batch_type(5.) / batch_type(8.)); + batch_type bts = bitofsign(self); + batch_type z = one; + if (any(test)) + { + z = detail::tanh_kernel::tanh(x); + if (all(test)) + return z ^ bts; + } + batch_type r = fma(batch_type(-2.), one / (one + exp(x + x)), one); + return select(test, z, r) ^ bts; + } + template + XSIMD_INLINE batch, A> tanh(const batch, A>& z, requires_arch) noexcept + { + using real_batch = typename batch, A>::real_batch; + auto x = z.real(); + auto y = z.imag(); + real_batch two(2); + auto d = cosh(two * x) + cos(two * y); + return { sinh(two * x) / d, sin(two * y) / d }; + } + + } + +} + +#endif diff --git a/include/onnxruntime/xsimd/arch/xsimd_avx.hpp b/include/onnxruntime/xsimd/arch/xsimd_avx.hpp new file mode 100644 index 0000000000000..116ea7762472e --- /dev/null +++ b/include/onnxruntime/xsimd/arch/xsimd_avx.hpp @@ -0,0 +1,1820 @@ +/*************************************************************************** + * Copyright (c) Johan Mabille, Sylvain Corlay, Wolf Vollprecht and * + * Martin Renou * + * Copyright (c) QuantStack * + * Copyright (c) Serge Guelton * + * * + * Distributed under the terms of the BSD 3-Clause License. * + * * + * The full license is in the file LICENSE, distributed with this software. 
* + ****************************************************************************/ + +#ifndef XSIMD_AVX_HPP +#define XSIMD_AVX_HPP + +#include +#include +#include + +#include "../types/xsimd_avx_register.hpp" + +namespace xsimd +{ + + namespace kernel + { + using namespace types; + + // fwd + template + XSIMD_INLINE batch insert(batch const& self, T val, index, requires_arch) noexcept; + + namespace detail + { + XSIMD_INLINE void split_avx(__m256i val, __m128i& low, __m128i& high) noexcept + { + low = _mm256_castsi256_si128(val); + high = _mm256_extractf128_si256(val, 1); + } + XSIMD_INLINE void split_avx(__m256 val, __m128& low, __m128& high) noexcept + { + low = _mm256_castps256_ps128(val); + high = _mm256_extractf128_ps(val, 1); + } + XSIMD_INLINE void split_avx(__m256d val, __m128d& low, __m128d& high) noexcept + { + low = _mm256_castpd256_pd128(val); + high = _mm256_extractf128_pd(val, 1); + } + XSIMD_INLINE __m256i merge_sse(__m128i low, __m128i high) noexcept + { + return _mm256_insertf128_si256(_mm256_castsi128_si256(low), high, 1); + } + XSIMD_INLINE __m256 merge_sse(__m128 low, __m128 high) noexcept + { + return _mm256_insertf128_ps(_mm256_castps128_ps256(low), high, 1); + } + XSIMD_INLINE __m256d merge_sse(__m128d low, __m128d high) noexcept + { + return _mm256_insertf128_pd(_mm256_castpd128_pd256(low), high, 1); + } + template + XSIMD_INLINE __m256i fwd_to_sse(F f, __m256i self) noexcept + { + __m128i self_low, self_high; + split_avx(self, self_low, self_high); + __m128i res_low = f(self_low); + __m128i res_high = f(self_high); + return merge_sse(res_low, res_high); + } + template + XSIMD_INLINE __m256i fwd_to_sse(F f, __m256i self, __m256i other) noexcept + { + __m128i self_low, self_high, other_low, other_high; + split_avx(self, self_low, self_high); + split_avx(other, other_low, other_high); + __m128i res_low = f(self_low, other_low); + __m128i res_high = f(self_high, other_high); + return merge_sse(res_low, res_high); + } + template + XSIMD_INLINE __m256i fwd_to_sse(F f, __m256i self, int32_t other) noexcept + { + __m128i self_low, self_high; + split_avx(self, self_low, self_high); + __m128i res_low = f(self_low, other); + __m128i res_high = f(self_high, other); + return merge_sse(res_low, res_high); + } + } + + // abs + template + XSIMD_INLINE batch abs(batch const& self, requires_arch) noexcept + { + __m256 sign_mask = _mm256_set1_ps(-0.f); // -0.f = 1 << 31 + return _mm256_andnot_ps(sign_mask, self); + } + template + XSIMD_INLINE batch abs(batch const& self, requires_arch) noexcept + { + __m256d sign_mask = _mm256_set1_pd(-0.f); // -0.f = 1 << 31 + return _mm256_andnot_pd(sign_mask, self); + } + + // add + template ::value, void>::type> + XSIMD_INLINE batch add(batch const& self, batch const& other, requires_arch) noexcept + { + return detail::fwd_to_sse([](__m128i s, __m128i o) noexcept + { return add(batch(s), batch(o)); }, + self, other); + } + template + XSIMD_INLINE batch add(batch const& self, batch const& other, requires_arch) noexcept + { + return _mm256_add_ps(self, other); + } + template + XSIMD_INLINE batch add(batch const& self, batch const& other, requires_arch) noexcept + { + return _mm256_add_pd(self, other); + } + + // all + template + XSIMD_INLINE bool all(batch_bool const& self, requires_arch) noexcept + { + return _mm256_testc_ps(self, batch_bool(true)) != 0; + } + template + XSIMD_INLINE bool all(batch_bool const& self, requires_arch) noexcept + { + return _mm256_testc_pd(self, batch_bool(true)) != 0; + } + template ::value, void>::type> + XSIMD_INLINE 
bool all(batch_bool const& self, requires_arch) noexcept + { + return _mm256_testc_si256(self, batch_bool(true)) != 0; + } + + // any + template + XSIMD_INLINE bool any(batch_bool const& self, requires_arch) noexcept + { + return !_mm256_testz_ps(self, self); + } + template + XSIMD_INLINE bool any(batch_bool const& self, requires_arch) noexcept + { + return !_mm256_testz_pd(self, self); + } + template ::value, void>::type> + XSIMD_INLINE bool any(batch_bool const& self, requires_arch) noexcept + { + return !_mm256_testz_si256(self, self); + } + + // batch_bool_cast + template + XSIMD_INLINE batch_bool batch_bool_cast(batch_bool const& self, batch_bool const&, requires_arch) noexcept + { + return { bitwise_cast(batch(self.data)).data }; + } + + // bitwise_and + template + XSIMD_INLINE batch bitwise_and(batch const& self, batch const& other, requires_arch) noexcept + { + return _mm256_and_ps(self, other); + } + template + XSIMD_INLINE batch bitwise_and(batch const& self, batch const& other, requires_arch) noexcept + { + return _mm256_and_pd(self, other); + } + + template + XSIMD_INLINE batch_bool bitwise_and(batch_bool const& self, batch_bool const& other, requires_arch) noexcept + { + return _mm256_and_ps(self, other); + } + template + XSIMD_INLINE batch_bool bitwise_and(batch_bool const& self, batch_bool const& other, requires_arch) noexcept + { + return _mm256_and_pd(self, other); + } + + template ::value, void>::type> + XSIMD_INLINE batch bitwise_and(batch const& self, batch const& other, requires_arch) noexcept + { + return detail::fwd_to_sse([](__m128i s, __m128i o) noexcept + { return bitwise_and(batch(s), batch(o)); }, + self, other); + } + template ::value, void>::type> + XSIMD_INLINE batch_bool bitwise_and(batch_bool const& self, batch_bool const& other, requires_arch) noexcept + { + return detail::fwd_to_sse([](__m128i s, __m128i o) noexcept + { return bitwise_and(batch(s), batch(o)); }, + self, other); + } + + // bitwise_andnot + template + XSIMD_INLINE batch bitwise_andnot(batch const& self, batch const& other, requires_arch) noexcept + { + return _mm256_andnot_ps(other, self); + } + template + XSIMD_INLINE batch bitwise_andnot(batch const& self, batch const& other, requires_arch) noexcept + { + return _mm256_andnot_pd(other, self); + } + + template + XSIMD_INLINE batch_bool bitwise_andnot(batch_bool const& self, batch_bool const& other, requires_arch) noexcept + { + return _mm256_andnot_ps(other, self); + } + template + XSIMD_INLINE batch_bool bitwise_andnot(batch_bool const& self, batch_bool const& other, requires_arch) noexcept + { + return _mm256_andnot_pd(other, self); + } + + template ::value, void>::type> + XSIMD_INLINE batch bitwise_andnot(batch const& self, batch const& other, requires_arch) noexcept + { + return detail::fwd_to_sse([](__m128i s, __m128i o) noexcept + { return bitwise_andnot(batch(s), batch(o)); }, + self, other); + } + template ::value, void>::type> + XSIMD_INLINE batch_bool bitwise_andnot(batch_bool const& self, batch_bool const& other, requires_arch) noexcept + { + return detail::fwd_to_sse([](__m128i s, __m128i o) noexcept + { return bitwise_andnot(batch(s), batch(o)); }, + self, other); + } + + // bitwise_lshift + template ::value, void>::type> + XSIMD_INLINE batch bitwise_lshift(batch const& self, int32_t other, requires_arch) noexcept + { + return detail::fwd_to_sse([](__m128i s, int32_t o) noexcept + { return bitwise_lshift(batch(s), o, sse4_2 {}); }, + self, other); + } + + // bitwise_not + template ::value, void>::type> + XSIMD_INLINE batch 
bitwise_not(batch const& self, requires_arch) noexcept + { + return detail::fwd_to_sse([](__m128i s) noexcept + { return bitwise_not(batch(s), sse4_2 {}); }, + self); + } + template ::value, void>::type> + XSIMD_INLINE batch_bool bitwise_not(batch_bool const& self, requires_arch) noexcept + { + return detail::fwd_to_sse([](__m128i s) noexcept + { return bitwise_not(batch_bool(s), sse4_2 {}); }, + self); + } + + // bitwise_or + template + XSIMD_INLINE batch bitwise_or(batch const& self, batch const& other, requires_arch) noexcept + { + return _mm256_or_ps(self, other); + } + template + XSIMD_INLINE batch bitwise_or(batch const& self, batch const& other, requires_arch) noexcept + { + return _mm256_or_pd(self, other); + } + template + XSIMD_INLINE batch_bool bitwise_or(batch_bool const& self, batch_bool const& other, requires_arch) noexcept + { + return _mm256_or_ps(self, other); + } + template + XSIMD_INLINE batch_bool bitwise_or(batch_bool const& self, batch_bool const& other, requires_arch) noexcept + { + return _mm256_or_pd(self, other); + } + template ::value, void>::type> + XSIMD_INLINE batch bitwise_or(batch const& self, batch const& other, requires_arch) noexcept + { + return detail::fwd_to_sse([](__m128i s, __m128i o) noexcept + { return bitwise_or(batch(s), batch(o)); }, + self, other); + } + template ::value, void>::type> + XSIMD_INLINE batch_bool bitwise_or(batch_bool const& self, batch_bool const& other, requires_arch) noexcept + { + return detail::fwd_to_sse([](__m128i s, __m128i o) noexcept + { return bitwise_or(batch_bool(s), batch_bool(o)); }, + self, other); + } + + // bitwise_rshift + template ::value, void>::type> + XSIMD_INLINE batch bitwise_rshift(batch const& self, int32_t other, requires_arch) noexcept + { + return detail::fwd_to_sse([](__m128i s, int32_t o) noexcept + { return bitwise_rshift(batch(s), o, sse4_2 {}); }, + self, other); + } + + // bitwise_xor + template + XSIMD_INLINE batch bitwise_xor(batch const& self, batch const& other, requires_arch) noexcept + { + return _mm256_xor_ps(self, other); + } + template + XSIMD_INLINE batch bitwise_xor(batch const& self, batch const& other, requires_arch) noexcept + { + return _mm256_xor_pd(self, other); + } + template + XSIMD_INLINE batch_bool bitwise_xor(batch_bool const& self, batch_bool const& other, requires_arch) noexcept + { + return _mm256_xor_ps(self, other); + } + template + XSIMD_INLINE batch_bool bitwise_xor(batch_bool const& self, batch_bool const& other, requires_arch) noexcept + { + return _mm256_xor_pd(self, other); + } + template ::value, void>::type> + XSIMD_INLINE batch bitwise_xor(batch const& self, batch const& other, requires_arch) noexcept + { + return detail::fwd_to_sse([](__m128i s, __m128i o) noexcept + { return bitwise_xor(batch(s), batch(o), sse4_2 {}); }, + self, other); + } + template ::value, void>::type> + XSIMD_INLINE batch bitwise_xor(batch_bool const& self, batch_bool const& other, requires_arch) noexcept + { + return detail::fwd_to_sse([](__m128i s, __m128i o) noexcept + { return bitwise_xor(batch_bool(s), batch_bool(o), sse4_2 {}); }, + self, other); + } + + // bitwise_cast + template ::value, void>::type> + XSIMD_INLINE batch bitwise_cast(batch const& self, batch const&, requires_arch) noexcept + { + return _mm256_castsi256_ps(self); + } + template ::value, void>::type> + XSIMD_INLINE batch bitwise_cast(batch const& self, batch const&, requires_arch) noexcept + { + return _mm256_castsi256_pd(self); + } + template ::type>::value, void>::type> + XSIMD_INLINE batch bitwise_cast(batch 
const& self, batch const&, requires_arch) noexcept + { + return batch(self.data); + } + template + XSIMD_INLINE batch bitwise_cast(batch const& self, batch const&, requires_arch) noexcept + { + return _mm256_castps_pd(self); + } + template ::value, void>::type> + XSIMD_INLINE batch bitwise_cast(batch const& self, batch const&, requires_arch) noexcept + { + return _mm256_castps_si256(self); + } + template + XSIMD_INLINE batch bitwise_cast(batch const& self, batch const&, requires_arch) noexcept + { + return _mm256_castpd_ps(self); + } + template ::value, void>::type> + XSIMD_INLINE batch bitwise_cast(batch const& self, batch const&, requires_arch) noexcept + { + return _mm256_castpd_si256(self); + } + + // bitwise_not + template + XSIMD_INLINE batch bitwise_not(batch const& self, requires_arch) noexcept + { + return _mm256_xor_ps(self, _mm256_castsi256_ps(_mm256_set1_epi32(-1))); + } + template + XSIMD_INLINE batch bitwise_not(batch const& self, requires_arch) noexcept + { + return _mm256_xor_pd(self, _mm256_castsi256_pd(_mm256_set1_epi32(-1))); + } + template + XSIMD_INLINE batch_bool bitwise_not(batch_bool const& self, requires_arch) noexcept + { + return _mm256_xor_ps(self, _mm256_castsi256_ps(_mm256_set1_epi32(-1))); + } + template + XSIMD_INLINE batch_bool bitwise_not(batch_bool const& self, requires_arch) noexcept + { + return _mm256_xor_pd(self, _mm256_castsi256_pd(_mm256_set1_epi32(-1))); + } + + // broadcast + template ::value, void>::type> + XSIMD_INLINE batch broadcast(T val, requires_arch) noexcept + { + XSIMD_IF_CONSTEXPR(sizeof(T) == 1) + { + return _mm256_set1_epi8(val); + } + else XSIMD_IF_CONSTEXPR(sizeof(T) == 2) + { + return _mm256_set1_epi16(val); + } + else XSIMD_IF_CONSTEXPR(sizeof(T) == 4) + { + return _mm256_set1_epi32(val); + } + else XSIMD_IF_CONSTEXPR(sizeof(T) == 8) + { + return _mm256_set1_epi64x(val); + } + else + { + assert(false && "unsupported"); + return {}; + } + } + template + XSIMD_INLINE batch broadcast(float val, requires_arch) noexcept + { + return _mm256_set1_ps(val); + } + template + XSIMD_INLINE batch broadcast(double val, requires_arch) noexcept + { + return _mm256_set1_pd(val); + } + + // ceil + template + XSIMD_INLINE batch ceil(batch const& self, requires_arch) noexcept + { + return _mm256_ceil_ps(self); + } + template + XSIMD_INLINE batch ceil(batch const& self, requires_arch) noexcept + { + return _mm256_ceil_pd(self); + } + + namespace detail + { + // On clang, _mm256_extractf128_ps is built upon build_shufflevector + // which require index parameter to be a constant + template + XSIMD_INLINE B get_half_complex_f(const B& real, const B& imag) noexcept + { + __m128 tmp0 = _mm256_extractf128_ps(real, index); + __m128 tmp1 = _mm256_extractf128_ps(imag, index); + __m128 tmp2 = _mm_unpackhi_ps(tmp0, tmp1); + tmp0 = _mm_unpacklo_ps(tmp0, tmp1); + __m256 res = real; + res = _mm256_insertf128_ps(res, tmp0, 0); + res = _mm256_insertf128_ps(res, tmp2, 1); + return res; + } + template + XSIMD_INLINE B get_half_complex_d(const B& real, const B& imag) noexcept + { + __m128d tmp0 = _mm256_extractf128_pd(real, index); + __m128d tmp1 = _mm256_extractf128_pd(imag, index); + __m128d tmp2 = _mm_unpackhi_pd(tmp0, tmp1); + tmp0 = _mm_unpacklo_pd(tmp0, tmp1); + __m256d res = real; + res = _mm256_insertf128_pd(res, tmp0, 0); + res = _mm256_insertf128_pd(res, tmp2, 1); + return res; + } + + // complex_low + template + XSIMD_INLINE batch complex_low(batch, A> const& self, requires_arch) noexcept + { + return get_half_complex_f<0>(self.real(), self.imag()); + } + 
template + XSIMD_INLINE batch complex_low(batch, A> const& self, requires_arch) noexcept + { + return get_half_complex_d<0>(self.real(), self.imag()); + } + + // complex_high + template + XSIMD_INLINE batch complex_high(batch, A> const& self, requires_arch) noexcept + { + return get_half_complex_f<1>(self.real(), self.imag()); + } + template + XSIMD_INLINE batch complex_high(batch, A> const& self, requires_arch) noexcept + { + return get_half_complex_d<1>(self.real(), self.imag()); + } + } + + // fast_cast + namespace detail + { + template + XSIMD_INLINE batch fast_cast(batch const& self, batch const&, requires_arch) noexcept + { + return _mm256_cvtepi32_ps(self); + } + + template + XSIMD_INLINE batch fast_cast(batch const& self, batch const&, requires_arch) noexcept + { + return _mm256_cvttps_epi32(self); + } + } + + // decr_if + template ::value, void>::type> + XSIMD_INLINE batch decr_if(batch const& self, batch_bool const& mask, requires_arch) noexcept + { + return self + batch(mask.data); + } + + // div + template + XSIMD_INLINE batch div(batch const& self, batch const& other, requires_arch) noexcept + { + return _mm256_div_ps(self, other); + } + template + XSIMD_INLINE batch div(batch const& self, batch const& other, requires_arch) noexcept + { + return _mm256_div_pd(self, other); + } + + // eq + template + XSIMD_INLINE batch_bool eq(batch const& self, batch const& other, requires_arch) noexcept + { + return _mm256_cmp_ps(self, other, _CMP_EQ_OQ); + } + template + XSIMD_INLINE batch_bool eq(batch const& self, batch const& other, requires_arch) noexcept + { + return _mm256_cmp_pd(self, other, _CMP_EQ_OQ); + } + template + XSIMD_INLINE batch_bool eq(batch_bool const& self, batch_bool const& other, requires_arch) noexcept + { + return ~(self != other); + } + template + XSIMD_INLINE batch_bool eq(batch_bool const& self, batch_bool const& other, requires_arch) noexcept + { + return ~(self != other); + } + template ::value, void>::type> + XSIMD_INLINE batch_bool eq(batch const& self, batch const& other, requires_arch) noexcept + { + return detail::fwd_to_sse([](__m128i s, __m128i o) noexcept + { return eq(batch(s), batch(o), sse4_2 {}); }, + self, other); + } + + template ::value, void>::type> + XSIMD_INLINE batch_bool eq(batch_bool const& self, batch_bool const& other, requires_arch) noexcept + { + return ~(self != other); + } + + // floor + template + XSIMD_INLINE batch floor(batch const& self, requires_arch) noexcept + { + return _mm256_floor_ps(self); + } + template + XSIMD_INLINE batch floor(batch const& self, requires_arch) noexcept + { + return _mm256_floor_pd(self); + } + + // from_mask + template + XSIMD_INLINE batch_bool from_mask(batch_bool const&, uint64_t mask, requires_arch) noexcept + { + alignas(A::alignment()) static const uint64_t lut32[] = { + 0x0000000000000000ul, + 0x00000000FFFFFFFFul, + 0xFFFFFFFF00000000ul, + 0xFFFFFFFFFFFFFFFFul, + }; + assert(!(mask & ~0xFFul) && "inbound mask"); + return _mm256_castsi256_ps(_mm256_setr_epi64x(lut32[mask & 0x3], lut32[(mask >> 2) & 0x3], lut32[(mask >> 4) & 0x3], lut32[mask >> 6])); + } + template + XSIMD_INLINE batch_bool from_mask(batch_bool const&, uint64_t mask, requires_arch) noexcept + { + alignas(A::alignment()) static const uint64_t lut64[][4] = { + { 0x0000000000000000ul, 0x0000000000000000ul, 0x0000000000000000ul, 0x0000000000000000ul }, + { 0xFFFFFFFFFFFFFFFFul, 0x0000000000000000ul, 0x0000000000000000ul, 0x0000000000000000ul }, + { 0x0000000000000000ul, 0xFFFFFFFFFFFFFFFFul, 0x0000000000000000ul, 0x0000000000000000ul }, 
+ { 0xFFFFFFFFFFFFFFFFul, 0xFFFFFFFFFFFFFFFFul, 0x0000000000000000ul, 0x0000000000000000ul }, + { 0x0000000000000000ul, 0x0000000000000000ul, 0xFFFFFFFFFFFFFFFFul, 0x0000000000000000ul }, + { 0xFFFFFFFFFFFFFFFFul, 0x0000000000000000ul, 0xFFFFFFFFFFFFFFFFul, 0x0000000000000000ul }, + { 0x0000000000000000ul, 0xFFFFFFFFFFFFFFFFul, 0xFFFFFFFFFFFFFFFFul, 0x0000000000000000ul }, + { 0xFFFFFFFFFFFFFFFFul, 0xFFFFFFFFFFFFFFFFul, 0xFFFFFFFFFFFFFFFFul, 0x0000000000000000ul }, + { 0x0000000000000000ul, 0x0000000000000000ul, 0x0000000000000000ul, 0xFFFFFFFFFFFFFFFFul }, + { 0xFFFFFFFFFFFFFFFFul, 0x0000000000000000ul, 0x0000000000000000ul, 0xFFFFFFFFFFFFFFFFul }, + { 0x0000000000000000ul, 0xFFFFFFFFFFFFFFFFul, 0x0000000000000000ul, 0xFFFFFFFFFFFFFFFFul }, + { 0xFFFFFFFFFFFFFFFFul, 0xFFFFFFFFFFFFFFFFul, 0x0000000000000000ul, 0xFFFFFFFFFFFFFFFFul }, + { 0x0000000000000000ul, 0x0000000000000000ul, 0xFFFFFFFFFFFFFFFFul, 0xFFFFFFFFFFFFFFFFul }, + { 0xFFFFFFFFFFFFFFFFul, 0x0000000000000000ul, 0xFFFFFFFFFFFFFFFFul, 0xFFFFFFFFFFFFFFFFul }, + { 0x0000000000000000ul, 0xFFFFFFFFFFFFFFFFul, 0xFFFFFFFFFFFFFFFFul, 0xFFFFFFFFFFFFFFFFul }, + { 0xFFFFFFFFFFFFFFFFul, 0xFFFFFFFFFFFFFFFFul, 0xFFFFFFFFFFFFFFFFul, 0xFFFFFFFFFFFFFFFFul }, + }; + assert(!(mask & ~0xFul) && "inbound mask"); + return _mm256_castsi256_pd(_mm256_load_si256((const __m256i*)lut64[mask])); + } + template ::value, void>::type> + XSIMD_INLINE batch_bool from_mask(batch_bool const&, uint64_t mask, requires_arch) noexcept + { + alignas(A::alignment()) static const uint32_t lut32[] = { + 0x00000000, + 0x000000FF, + 0x0000FF00, + 0x0000FFFF, + 0x00FF0000, + 0x00FF00FF, + 0x00FFFF00, + 0x00FFFFFF, + 0xFF000000, + 0xFF0000FF, + 0xFF00FF00, + 0xFF00FFFF, + 0xFFFF0000, + 0xFFFF00FF, + 0xFFFFFF00, + 0xFFFFFFFF, + }; + alignas(A::alignment()) static const uint64_t lut64[] = { + 0x0000000000000000ul, + 0x000000000000FFFFul, + 0x00000000FFFF0000ul, + 0x00000000FFFFFFFFul, + 0x0000FFFF00000000ul, + 0x0000FFFF0000FFFFul, + 0x0000FFFFFFFF0000ul, + 0x0000FFFFFFFFFFFFul, + 0xFFFF000000000000ul, + 0xFFFF00000000FFFFul, + 0xFFFF0000FFFF0000ul, + 0xFFFF0000FFFFFFFFul, + 0xFFFFFFFF00000000ul, + 0xFFFFFFFF0000FFFFul, + 0xFFFFFFFFFFFF0000ul, + 0xFFFFFFFFFFFFFFFFul, + }; + XSIMD_IF_CONSTEXPR(sizeof(T) == 1) + { + assert(!(mask & ~0xFFFFFFFFul) && "inbound mask"); + return _mm256_setr_epi32(lut32[mask & 0xF], lut32[(mask >> 4) & 0xF], + lut32[(mask >> 8) & 0xF], lut32[(mask >> 12) & 0xF], + lut32[(mask >> 16) & 0xF], lut32[(mask >> 20) & 0xF], + lut32[(mask >> 24) & 0xF], lut32[mask >> 28]); + } + else XSIMD_IF_CONSTEXPR(sizeof(T) == 2) + { + assert(!(mask & ~0xFFFFul) && "inbound mask"); + return _mm256_setr_epi64x(lut64[mask & 0xF], lut64[(mask >> 4) & 0xF], lut64[(mask >> 8) & 0xF], lut64[(mask >> 12) & 0xF]); + } + else XSIMD_IF_CONSTEXPR(sizeof(T) == 4) + { + return _mm256_castps_si256(from_mask(batch_bool {}, mask, avx {})); + } + else XSIMD_IF_CONSTEXPR(sizeof(T) == 8) + { + return _mm256_castpd_si256(from_mask(batch_bool {}, mask, avx {})); + } + } + + // haddp + template + XSIMD_INLINE batch haddp(batch const* row, requires_arch) noexcept + { + // row = (a,b,c,d,e,f,g,h) + // tmp0 = (a0+a1, a2+a3, b0+b1, b2+b3, a4+a5, a6+a7, b4+b5, b6+b7) + __m256 tmp0 = _mm256_hadd_ps(row[0], row[1]); + // tmp1 = (c0+c1, c2+c3, d1+d2, d2+d3, c4+c5, c6+c7, d4+d5, d6+d7) + __m256 tmp1 = _mm256_hadd_ps(row[2], row[3]); + // tmp1 = (a0+a1+a2+a3, b0+b1+b2+b3, c0+c1+c2+c3, d0+d1+d2+d3, + // a4+a5+a6+a7, b4+b5+b6+b7, c4+c5+c6+c7, d4+d5+d6+d7) + tmp1 = _mm256_hadd_ps(tmp0, tmp1); + // tmp0 = 
(e0+e1, e2+e3, f0+f1, f2+f3, e4+e5, e6+e7, f4+f5, f6+f7) + tmp0 = _mm256_hadd_ps(row[4], row[5]); + // tmp2 = (g0+g1, g2+g3, h0+h1, h2+h3, g4+g5, g6+g7, h4+h5, h6+h7) + __m256 tmp2 = _mm256_hadd_ps(row[6], row[7]); + // tmp2 = (e0+e1+e2+e3, f0+f1+f2+f3, g0+g1+g2+g3, h0+h1+h2+h3, + // e4+e5+e6+e7, f4+f5+f6+f7, g4+g5+g6+g7, h4+h5+h6+h7) + tmp2 = _mm256_hadd_ps(tmp0, tmp2); + // tmp0 = (a0+a1+a2+a3, b0+b1+b2+b3, c0+c1+c2+c3, d0+d1+d2+d3, + // e4+e5+e6+e7, f4+f5+f6+f7, g4+g5+g6+g7, h4+h5+h6+h7) + tmp0 = _mm256_blend_ps(tmp1, tmp2, 0b11110000); + // tmp1 = (a4+a5+a6+a7, b4+b5+b6+b7, c4+c5+c6+c7, d4+d5+d6+d7, + // e0+e1+e2+e3, f0+f1+f2+f3, g0+g1+g2+g3, h0+h1+h2+h3) + tmp1 = _mm256_permute2f128_ps(tmp1, tmp2, 0x21); + return _mm256_add_ps(tmp0, tmp1); + } + template + XSIMD_INLINE batch haddp(batch const* row, requires_arch) noexcept + { + // row = (a,b,c,d) + // tmp0 = (a0+a1, b0+b1, a2+a3, b2+b3) + __m256d tmp0 = _mm256_hadd_pd(row[0], row[1]); + // tmp1 = (c0+c1, d0+d1, c2+c3, d2+d3) + __m256d tmp1 = _mm256_hadd_pd(row[2], row[3]); + // tmp2 = (a0+a1, b0+b1, c2+c3, d2+d3) + __m256d tmp2 = _mm256_blend_pd(tmp0, tmp1, 0b1100); + // tmp1 = (a2+a3, b2+b3, c2+c3, d2+d3) + tmp1 = _mm256_permute2f128_pd(tmp0, tmp1, 0x21); + return _mm256_add_pd(tmp1, tmp2); + } + + // incr_if + template ::value, void>::type> + XSIMD_INLINE batch incr_if(batch const& self, batch_bool const& mask, requires_arch) noexcept + { + return self - batch(mask.data); + } + + // insert + template ::value, void>::type> + XSIMD_INLINE batch insert(batch const& self, T val, index pos, requires_arch) noexcept + { +#if !defined(_MSC_VER) || _MSC_VER > 1900 + XSIMD_IF_CONSTEXPR(sizeof(T) == 1) + { + return _mm256_insert_epi8(self, val, I); + } + else XSIMD_IF_CONSTEXPR(sizeof(T) == 2) + { + return _mm256_insert_epi16(self, val, I); + } + else XSIMD_IF_CONSTEXPR(sizeof(T) == 4) + { + return _mm256_insert_epi32(self, val, I); + } + else + { + return insert(self, val, pos, generic {}); + } +#endif + return insert(self, val, pos, generic {}); + } + + // isnan + template + XSIMD_INLINE batch_bool isnan(batch const& self, requires_arch) noexcept + { + return _mm256_cmp_ps(self, self, _CMP_UNORD_Q); + } + template + XSIMD_INLINE batch_bool isnan(batch const& self, requires_arch) noexcept + { + return _mm256_cmp_pd(self, self, _CMP_UNORD_Q); + } + + // le + template + XSIMD_INLINE batch_bool le(batch const& self, batch const& other, requires_arch) noexcept + { + return _mm256_cmp_ps(self, other, _CMP_LE_OQ); + } + template + XSIMD_INLINE batch_bool le(batch const& self, batch const& other, requires_arch) noexcept + { + return _mm256_cmp_pd(self, other, _CMP_LE_OQ); + } + + // load_aligned + template ::value, void>::type> + XSIMD_INLINE batch load_aligned(T const* mem, convert, requires_arch) noexcept + { + return _mm256_load_si256((__m256i const*)mem); + } + template + XSIMD_INLINE batch load_aligned(float const* mem, convert, requires_arch) noexcept + { + return _mm256_load_ps(mem); + } + template + XSIMD_INLINE batch load_aligned(double const* mem, convert, requires_arch) noexcept + { + return _mm256_load_pd(mem); + } + + namespace detail + { + // load_complex + template + XSIMD_INLINE batch, A> load_complex(batch const& hi, batch const& lo, requires_arch) noexcept + { + using batch_type = batch; + __m128 tmp0 = _mm256_extractf128_ps(hi, 0); + __m128 tmp1 = _mm256_extractf128_ps(hi, 1); + __m128 tmp_real = _mm_shuffle_ps(tmp0, tmp1, _MM_SHUFFLE(2, 0, 2, 0)); + __m128 tmp_imag = _mm_shuffle_ps(tmp0, tmp1, _MM_SHUFFLE(3, 1, 3, 1)); + 
batch_type real = _mm256_castps128_ps256(tmp_real); + batch_type imag = _mm256_castps128_ps256(tmp_imag); + + tmp0 = _mm256_extractf128_ps(lo, 0); + tmp1 = _mm256_extractf128_ps(lo, 1); + tmp_real = _mm_shuffle_ps(tmp0, tmp1, _MM_SHUFFLE(2, 0, 2, 0)); + tmp_imag = _mm_shuffle_ps(tmp0, tmp1, _MM_SHUFFLE(3, 1, 3, 1)); + real = _mm256_insertf128_ps(real, tmp_real, 1); + imag = _mm256_insertf128_ps(imag, tmp_imag, 1); + return { real, imag }; + } + template + XSIMD_INLINE batch, A> load_complex(batch const& hi, batch const& lo, requires_arch) noexcept + { + using batch_type = batch; + __m128d tmp0 = _mm256_extractf128_pd(hi, 0); + __m128d tmp1 = _mm256_extractf128_pd(hi, 1); + batch_type real = _mm256_castpd128_pd256(_mm_unpacklo_pd(tmp0, tmp1)); + batch_type imag = _mm256_castpd128_pd256(_mm_unpackhi_pd(tmp0, tmp1)); + + tmp0 = _mm256_extractf128_pd(lo, 0); + tmp1 = _mm256_extractf128_pd(lo, 1); + __m256d re_tmp1 = _mm256_insertf128_pd(real, _mm_unpacklo_pd(tmp0, tmp1), 1); + __m256d im_tmp1 = _mm256_insertf128_pd(imag, _mm_unpackhi_pd(tmp0, tmp1), 1); + real = _mm256_blend_pd(real, re_tmp1, 12); + imag = _mm256_blend_pd(imag, im_tmp1, 12); + return { real, imag }; + } + } + + // load_unaligned + template ::value, void>::type> + XSIMD_INLINE batch load_unaligned(T const* mem, convert, requires_arch) noexcept + { + return _mm256_loadu_si256((__m256i const*)mem); + } + template + XSIMD_INLINE batch load_unaligned(float const* mem, convert, requires_arch) noexcept + { + return _mm256_loadu_ps(mem); + } + template + XSIMD_INLINE batch load_unaligned(double const* mem, convert, requires_arch) noexcept + { + return _mm256_loadu_pd(mem); + } + + // lt + template + XSIMD_INLINE batch_bool lt(batch const& self, batch const& other, requires_arch) noexcept + { + return _mm256_cmp_ps(self, other, _CMP_LT_OQ); + } + template + XSIMD_INLINE batch_bool lt(batch const& self, batch const& other, requires_arch) noexcept + { + return _mm256_cmp_pd(self, other, _CMP_LT_OQ); + } + + template ::value, void>::type> + XSIMD_INLINE batch_bool lt(batch const& self, batch const& other, requires_arch) noexcept + { + return detail::fwd_to_sse([](__m128i s, __m128i o) noexcept + { return lt(batch(s), batch(o)); }, + self, other); + } + + // mask + template ::value, void>::type> + XSIMD_INLINE uint64_t mask(batch_bool const& self, requires_arch) noexcept + { + XSIMD_IF_CONSTEXPR(sizeof(T) == 1 || sizeof(T) == 2) + { + __m128i self_low, self_high; + detail::split_avx(self, self_low, self_high); + return mask(batch_bool(self_low), sse4_2 {}) | (mask(batch_bool(self_high), sse4_2 {}) << (128 / (8 * sizeof(T)))); + } + else XSIMD_IF_CONSTEXPR(sizeof(T) == 4) + { + return _mm256_movemask_ps(_mm256_castsi256_ps(self)); + } + else XSIMD_IF_CONSTEXPR(sizeof(T) == 8) + { + return _mm256_movemask_pd(_mm256_castsi256_pd(self)); + } + else + { + assert(false && "unsupported arch/op combination"); + return {}; + } + } + template + XSIMD_INLINE uint64_t mask(batch_bool const& self, requires_arch) noexcept + { + return _mm256_movemask_ps(self); + } + + template + XSIMD_INLINE uint64_t mask(batch_bool const& self, requires_arch) noexcept + { + return _mm256_movemask_pd(self); + } + + // max + template + XSIMD_INLINE batch max(batch const& self, batch const& other, requires_arch) noexcept + { + return _mm256_max_ps(self, other); + } + template + XSIMD_INLINE batch max(batch const& self, batch const& other, requires_arch) noexcept + { + return _mm256_max_pd(self, other); + } + template ::value, void>::type> + XSIMD_INLINE batch max(batch 
const& self, batch const& other, requires_arch) noexcept + { + return select(self > other, self, other); + } + + // min + template + XSIMD_INLINE batch min(batch const& self, batch const& other, requires_arch) noexcept + { + return _mm256_min_ps(self, other); + } + template + XSIMD_INLINE batch min(batch const& self, batch const& other, requires_arch) noexcept + { + return _mm256_min_pd(self, other); + } + template ::value, void>::type> + XSIMD_INLINE batch min(batch const& self, batch const& other, requires_arch) noexcept + { + return select(self <= other, self, other); + } + + // mul + template + XSIMD_INLINE batch mul(batch const& self, batch const& other, requires_arch) noexcept + { + return _mm256_mul_ps(self, other); + } + template + XSIMD_INLINE batch mul(batch const& self, batch const& other, requires_arch) noexcept + { + return _mm256_mul_pd(self, other); + } + + // nearbyint + template + XSIMD_INLINE batch nearbyint(batch const& self, requires_arch) noexcept + { + return _mm256_round_ps(self, _MM_FROUND_TO_NEAREST_INT); + } + template + XSIMD_INLINE batch nearbyint(batch const& self, requires_arch) noexcept + { + return _mm256_round_pd(self, _MM_FROUND_TO_NEAREST_INT); + } + + // nearbyint_as_int + template + XSIMD_INLINE batch nearbyint_as_int(batch const& self, + requires_arch) noexcept + { + return _mm256_cvtps_epi32(self); + } + + // neg + template ::value, void>::type> + XSIMD_INLINE batch neg(batch const& self, requires_arch) noexcept + { + return 0 - self; + } + template + batch neg(batch const& self, requires_arch) + { + return _mm256_xor_ps(self, _mm256_castsi256_ps(_mm256_set1_epi32(0x80000000))); + } + template + XSIMD_INLINE batch neg(batch const& self, requires_arch) noexcept + { + return _mm256_xor_pd(self, _mm256_castsi256_pd(_mm256_set1_epi64x(0x8000000000000000))); + } + + // neq + template + XSIMD_INLINE batch_bool neq(batch const& self, batch const& other, requires_arch) noexcept + { + return _mm256_cmp_ps(self, other, _CMP_NEQ_UQ); + } + template + XSIMD_INLINE batch_bool neq(batch const& self, batch const& other, requires_arch) noexcept + { + return _mm256_cmp_pd(self, other, _CMP_NEQ_UQ); + } + template ::value, void>::type> + XSIMD_INLINE batch_bool neq(batch const& self, batch const& other, requires_arch) noexcept + { + return ~(self == other); + } + + template + XSIMD_INLINE batch_bool neq(batch_bool const& self, batch_bool const& other, requires_arch) noexcept + { + return _mm256_xor_ps(self, other); + } + template + XSIMD_INLINE batch_bool neq(batch_bool const& self, batch_bool const& other, requires_arch) noexcept + { + return _mm256_xor_pd(self, other); + } + template ::value, void>::type> + XSIMD_INLINE batch_bool neq(batch_bool const& self, batch_bool const& other, requires_arch) noexcept + { + return _mm256_castps_si256(_mm256_xor_ps(_mm256_castsi256_ps(self.data), _mm256_castsi256_ps(other.data))); + } + + // reciprocal + template + XSIMD_INLINE batch reciprocal(batch const& self, + kernel::requires_arch) noexcept + { + return _mm256_rcp_ps(self); + } + + // reduce_add + template + XSIMD_INLINE float reduce_add(batch const& rhs, requires_arch) noexcept + { + // Warning about _mm256_hadd_ps: + // _mm256_hadd_ps(a,b) gives + // (a0+a1,a2+a3,b0+b1,b2+b3,a4+a5,a6+a7,b4+b5,b6+b7). 
Hence we can't + // rely on a naive use of this method + // rhs = (x0, x1, x2, x3, x4, x5, x6, x7) + // tmp = (x4, x5, x6, x7, x0, x1, x2, x3) + __m256 tmp = _mm256_permute2f128_ps(rhs, rhs, 1); + // tmp = (x4+x0, x5+x1, x6+x2, x7+x3, x0+x4, x1+x5, x2+x6, x3+x7) + tmp = _mm256_add_ps(rhs, tmp); + // tmp = (x4+x0+x5+x1, x6+x2+x7+x3, -, -, -, -, -, -) + tmp = _mm256_hadd_ps(tmp, tmp); + // tmp = (x4+x0+x5+x1+x6+x2+x7+x3, -, -, -, -, -, -, -) + tmp = _mm256_hadd_ps(tmp, tmp); + return _mm_cvtss_f32(_mm256_extractf128_ps(tmp, 0)); + } + template + XSIMD_INLINE double reduce_add(batch const& rhs, requires_arch) noexcept + { + // rhs = (x0, x1, x2, x3) + // tmp = (x2, x3, x0, x1) + __m256d tmp = _mm256_permute2f128_pd(rhs, rhs, 1); + // tmp = (x2+x0, x3+x1, -, -) + tmp = _mm256_add_pd(rhs, tmp); + // tmp = (x2+x0+x3+x1, -, -, -) + tmp = _mm256_hadd_pd(tmp, tmp); + return _mm_cvtsd_f64(_mm256_extractf128_pd(tmp, 0)); + } + template ::value, void>::type> + XSIMD_INLINE T reduce_add(batch const& self, requires_arch) noexcept + { + __m128i low, high; + detail::split_avx(self, low, high); + batch blow(low), bhigh(high); + return reduce_add(blow) + reduce_add(bhigh); + } + + // reduce_max + template ::type> + XSIMD_INLINE T reduce_max(batch const& self, requires_arch) noexcept + { + constexpr auto mask = detail::shuffle(1, 0); + batch step = _mm256_permute2f128_si256(self, self, mask); + batch acc = max(self, step); + __m128i low = _mm256_castsi256_si128(acc); + return reduce_max(batch(low)); + } + + // reduce_min + template ::type> + XSIMD_INLINE T reduce_min(batch const& self, requires_arch) noexcept + { + constexpr auto mask = detail::shuffle(1, 0); + batch step = _mm256_permute2f128_si256(self, self, mask); + batch acc = min(self, step); + __m128i low = _mm256_castsi256_si128(acc); + return reduce_min(batch(low)); + } + + // rsqrt + template + XSIMD_INLINE batch rsqrt(batch const& val, requires_arch) noexcept + { + return _mm256_rsqrt_ps(val); + } + template + XSIMD_INLINE batch rsqrt(batch const& val, requires_arch) noexcept + { + return _mm256_cvtps_pd(_mm_rsqrt_ps(_mm256_cvtpd_ps(val))); + } + + // sadd + template ::value, void>::type> + XSIMD_INLINE batch sadd(batch const& self, batch const& other, requires_arch) noexcept + { + if (std::is_signed::value) + { + auto mask = (other >> (8 * sizeof(T) - 1)); + auto self_pos_branch = min(std::numeric_limits::max() - other, self); + auto self_neg_branch = max(std::numeric_limits::min() - other, self); + return other + select(batch_bool(mask.data), self_neg_branch, self_pos_branch); + } + else + { + const auto diffmax = std::numeric_limits::max() - self; + const auto mindiff = min(diffmax, other); + return self + mindiff; + } + } + + // select + template + XSIMD_INLINE batch select(batch_bool const& cond, batch const& true_br, batch const& false_br, requires_arch) noexcept + { + return _mm256_blendv_ps(false_br, true_br, cond); + } + template + XSIMD_INLINE batch select(batch_bool const& cond, batch const& true_br, batch const& false_br, requires_arch) noexcept + { + return _mm256_blendv_pd(false_br, true_br, cond); + } + template ::value, void>::type> + XSIMD_INLINE batch select(batch_bool const& cond, batch const& true_br, batch const& false_br, requires_arch) noexcept + { + __m128i cond_low, cond_hi; + detail::split_avx(cond, cond_low, cond_hi); + + __m128i true_low, true_hi; + detail::split_avx(true_br, true_low, true_hi); + + __m128i false_low, false_hi; + detail::split_avx(false_br, false_low, false_hi); + + __m128i res_low = 
select(batch_bool(cond_low), batch(true_low), batch(false_low), sse4_2 {}); + __m128i res_hi = select(batch_bool(cond_hi), batch(true_hi), batch(false_hi), sse4_2 {}); + return detail::merge_sse(res_low, res_hi); + } + template ::value, void>::type> + XSIMD_INLINE batch select(batch_bool_constant const&, batch const& true_br, batch const& false_br, requires_arch) noexcept + { + return select(batch_bool { Values... }, true_br, false_br, avx2 {}); + } + + template + XSIMD_INLINE batch select(batch_bool_constant const&, batch const& true_br, batch const& false_br, requires_arch) noexcept + { + constexpr auto mask = batch_bool_constant::mask(); + return _mm256_blend_ps(false_br, true_br, mask); + } + + template + XSIMD_INLINE batch select(batch_bool_constant const&, batch const& true_br, batch const& false_br, requires_arch) noexcept + { + constexpr auto mask = batch_bool_constant::mask(); + return _mm256_blend_pd(false_br, true_br, mask); + } + + // set + template + XSIMD_INLINE batch set(batch const&, requires_arch, Values... values) noexcept + { + static_assert(sizeof...(Values) == batch::size, "consistent init"); + return _mm256_setr_ps(values...); + } + + template + XSIMD_INLINE batch set(batch const&, requires_arch, Values... values) noexcept + { + static_assert(sizeof...(Values) == batch::size, "consistent init"); + return _mm256_setr_pd(values...); + } + template ::value, void>::type> + XSIMD_INLINE batch set(batch const&, requires_arch, T v0, T v1, T v2, T v3) noexcept + { + return _mm256_set_epi64x(v3, v2, v1, v0); + } + template ::value, void>::type> + XSIMD_INLINE batch set(batch const&, requires_arch, T v0, T v1, T v2, T v3, T v4, T v5, T v6, T v7) noexcept + { + return _mm256_setr_epi32(v0, v1, v2, v3, v4, v5, v6, v7); + } + template ::value, void>::type> + XSIMD_INLINE batch set(batch const&, requires_arch, T v0, T v1, T v2, T v3, T v4, T v5, T v6, T v7, T v8, T v9, T v10, T v11, T v12, T v13, T v14, T v15) noexcept + { + return _mm256_setr_epi16(v0, v1, v2, v3, v4, v5, v6, v7, v8, v9, v10, v11, v12, v13, v14, v15); + } + template ::value, void>::type> + XSIMD_INLINE batch set(batch const&, requires_arch, T v0, T v1, T v2, T v3, T v4, T v5, T v6, T v7, T v8, T v9, T v10, T v11, T v12, T v13, T v14, T v15, + T v16, T v17, T v18, T v19, T v20, T v21, T v22, T v23, T v24, T v25, T v26, T v27, T v28, T v29, T v30, T v31) noexcept + { + return _mm256_setr_epi8(v0, v1, v2, v3, v4, v5, v6, v7, v8, v9, v10, v11, v12, v13, v14, v15, v16, v17, v18, v19, v20, v21, v22, v23, v24, v25, v26, v27, v28, v29, v30, v31); + } + + template ::value, void>::type> + XSIMD_INLINE batch_bool set(batch_bool const&, requires_arch, Values... values) noexcept + { + return set(batch(), A {}, static_cast(values ? -1LL : 0LL)...).data; + } + + template + XSIMD_INLINE batch_bool set(batch_bool const&, requires_arch, Values... values) noexcept + { + static_assert(sizeof...(Values) == batch_bool::size, "consistent init"); + return _mm256_castsi256_ps(set(batch(), A {}, static_cast(values ? -1LL : 0LL)...).data); + } + + template + XSIMD_INLINE batch_bool set(batch_bool const&, requires_arch, Values... values) noexcept + { + static_assert(sizeof...(Values) == batch_bool::size, "consistent init"); + return _mm256_castsi256_pd(set(batch(), A {}, static_cast(values ? 
-1LL : 0LL)...).data); + } + + // shuffle + template + XSIMD_INLINE batch shuffle(batch const& x, batch const& y, batch_constant mask, requires_arch) noexcept + { + constexpr uint32_t smask = detail::mod_shuffle(I0, I1, I2, I3); + // shuffle within lane + if (I4 == (I0 + 4) && I5 == (I1 + 4) && I6 == (I2 + 4) && I7 == (I3 + 4) && I0 < 4 && I1 < 4 && I2 >= 8 && I2 < 12 && I3 >= 8 && I3 < 12) + return _mm256_shuffle_ps(x, y, smask); + + // shuffle within opposite lane + if (I4 == (I0 + 4) && I5 == (I1 + 4) && I6 == (I2 + 4) && I7 == (I3 + 4) && I2 < 4 && I3 < 4 && I0 >= 8 && I0 < 12 && I1 >= 8 && I1 < 12) + return _mm256_shuffle_ps(y, x, smask); + + return shuffle(x, y, mask, generic {}); + } + + template + XSIMD_INLINE batch shuffle(batch const& x, batch const& y, batch_constant mask, requires_arch) noexcept + { + constexpr uint32_t smask = (I0 & 0x1) | ((I1 & 0x1) << 1) | ((I2 & 0x1) << 2) | ((I3 & 0x1) << 3); + // shuffle within lane + if (I0 < 2 && I1 >= 4 && I1 < 6 && I2 >= 2 && I2 < 4 && I3 >= 6) + return _mm256_shuffle_pd(x, y, smask); + + // shuffle within opposite lane + if (I1 < 2 && I0 >= 4 && I0 < 6 && I3 >= 2 && I3 < 4 && I2 >= 6) + return _mm256_shuffle_pd(y, x, smask); + + return shuffle(x, y, mask, generic {}); + } + + // slide_left + template + XSIMD_INLINE batch slide_left(batch const& x, requires_arch) noexcept + { + constexpr unsigned BitCount = N * 8; + if (BitCount == 0) + { + return x; + } + if (BitCount >= 256) + { + return batch(T(0)); + } + if (BitCount > 128) + { + constexpr unsigned M = (BitCount - 128) / 8; + __m128i low = _mm256_castsi256_si128(x); + auto y = _mm_slli_si128(low, M); + __m256i zero = _mm256_setzero_si256(); + return _mm256_insertf128_si256(zero, y, 1); + } + if (BitCount == 128) + { + __m128i low = _mm256_castsi256_si128(x); + __m256i zero = _mm256_setzero_si256(); + return _mm256_insertf128_si256(zero, low, 1); + } + // shifting by [0, 128[ bits + constexpr unsigned M = BitCount / 8; + + __m128i low = _mm256_castsi256_si128(x); + auto ylow = _mm_slli_si128(low, M); + auto zlow = _mm_srli_si128(low, 16 - M); + + __m128i high = _mm256_extractf128_si256(x, 1); + auto yhigh = _mm_slli_si128(high, M); + + __m256i res = _mm256_castsi128_si256(ylow); + return _mm256_insertf128_si256(res, _mm_or_si128(yhigh, zlow), 1); + } + + // slide_right + template + XSIMD_INLINE batch slide_right(batch const& x, requires_arch) noexcept + { + constexpr unsigned BitCount = N * 8; + if (BitCount == 0) + { + return x; + } + if (BitCount >= 256) + { + return batch(T(0)); + } + if (BitCount > 128) + { + constexpr unsigned M = (BitCount - 128) / 8; + __m128i high = _mm256_extractf128_si256(x, 1); + __m128i y = _mm_srli_si128(high, M); + __m256i zero = _mm256_setzero_si256(); + return _mm256_insertf128_si256(zero, y, 0); + } + if (BitCount == 128) + { + __m128i high = _mm256_extractf128_si256(x, 1); + return _mm256_castsi128_si256(high); + } + // shifting by [0, 128[ bits + constexpr unsigned M = BitCount / 8; + + __m128i low = _mm256_castsi256_si128(x); + auto ylow = _mm_srli_si128(low, M); + + __m128i high = _mm256_extractf128_si256(x, 1); + auto yhigh = _mm_srli_si128(high, M); + auto zhigh = _mm_slli_si128(high, 16 - M); + + __m256i res = _mm256_castsi128_si256(_mm_or_si128(ylow, zhigh)); + return _mm256_insertf128_si256(res, yhigh, 1); + } + + // sqrt + template + XSIMD_INLINE batch sqrt(batch const& val, requires_arch) noexcept + { + return _mm256_sqrt_ps(val); + } + template + XSIMD_INLINE batch sqrt(batch const& val, requires_arch) noexcept + { + return 
_mm256_sqrt_pd(val); + } + + // ssub + template ::value, void>::type> + XSIMD_INLINE batch ssub(batch const& self, batch const& other, requires_arch) noexcept + { + if (std::is_signed::value) + { + return sadd(self, -other); + } + else + { + const auto diff = min(self, other); + return self - diff; + } + } + + // store_aligned + template ::value, void>::type> + XSIMD_INLINE void store_aligned(T* mem, batch const& self, requires_arch) noexcept + { + return _mm256_store_si256((__m256i*)mem, self); + } + template ::value, void>::type> + XSIMD_INLINE void store_aligned(T* mem, batch_bool const& self, requires_arch) noexcept + { + return _mm256_store_si256((__m256i*)mem, self); + } + template + XSIMD_INLINE void store_aligned(float* mem, batch const& self, requires_arch) noexcept + { + return _mm256_store_ps(mem, self); + } + template + XSIMD_INLINE void store_aligned(double* mem, batch const& self, requires_arch) noexcept + { + return _mm256_store_pd(mem, self); + } + + // store_unaligned + template ::value, void>::type> + XSIMD_INLINE void store_unaligned(T* mem, batch const& self, requires_arch) noexcept + { + return _mm256_storeu_si256((__m256i*)mem, self); + } + template ::value, void>::type> + XSIMD_INLINE void store_unaligned(T* mem, batch_bool const& self, requires_arch) noexcept + { + return _mm256_storeu_si256((__m256i*)mem, self); + } + template + XSIMD_INLINE void store_unaligned(float* mem, batch const& self, requires_arch) noexcept + { + return _mm256_storeu_ps(mem, self); + } + template + XSIMD_INLINE void store_unaligned(double* mem, batch const& self, requires_arch) noexcept + { + return _mm256_storeu_pd(mem, self); + } + + // sub + template ::value, void>::type> + XSIMD_INLINE batch sub(batch const& self, batch const& other, requires_arch) noexcept + { + return detail::fwd_to_sse([](__m128i s, __m128i o) noexcept + { return sub(batch(s), batch(o)); }, + self, other); + } + template + XSIMD_INLINE batch sub(batch const& self, batch const& other, requires_arch) noexcept + { + return _mm256_sub_ps(self, other); + } + template + XSIMD_INLINE batch sub(batch const& self, batch const& other, requires_arch) noexcept + { + return _mm256_sub_pd(self, other); + } + + // swizzle (dynamic mask) + template + XSIMD_INLINE batch swizzle(batch const& self, batch mask, requires_arch) noexcept + { + // duplicate low and high part of input + __m256 hi = _mm256_castps128_ps256(_mm256_extractf128_ps(self, 1)); + __m256 hi_hi = _mm256_insertf128_ps(self, _mm256_castps256_ps128(hi), 0); + + __m256 low = _mm256_castps128_ps256(_mm256_castps256_ps128(self)); + __m256 low_low = _mm256_insertf128_ps(self, _mm256_castps256_ps128(low), 1); + + // normalize mask + batch half_mask = mask % 4; + + // permute within each lane + __m256 r0 = _mm256_permutevar_ps(low_low, half_mask); + __m256 r1 = _mm256_permutevar_ps(hi_hi, half_mask); + + // mask to choose the right lane + batch_bool blend_mask = mask >= 4; + + // blend the two permutes + return _mm256_blendv_ps(r0, r1, batch_bool_cast(blend_mask)); + } + + template + XSIMD_INLINE batch swizzle(batch const& self, batch mask, requires_arch) noexcept + { + // duplicate low and high part of input + __m256d hi = _mm256_castpd128_pd256(_mm256_extractf128_pd(self, 1)); + __m256d hi_hi = _mm256_insertf128_pd(self, _mm256_castpd256_pd128(hi), 0); + + __m256d low = _mm256_castpd128_pd256(_mm256_castpd256_pd128(self)); + __m256d low_low = _mm256_insertf128_pd(self, _mm256_castpd256_pd128(low), 1); + + // normalize mask + batch half_mask = -(mask & 1); + + // permute 
within each lane + __m256d r0 = _mm256_permutevar_pd(low_low, half_mask); + __m256d r1 = _mm256_permutevar_pd(hi_hi, half_mask); + + // mask to choose the right lane + batch_bool blend_mask = mask >= 2; + + // blend the two permutes + return _mm256_blendv_pd(r0, r1, batch_bool_cast(blend_mask)); + } + + template = 0> + XSIMD_INLINE batch swizzle(batch const& self, batch const& mask, requires_arch) noexcept + { + return bitwise_cast( + swizzle(bitwise_cast(self), mask)); + } + + template = 0> + XSIMD_INLINE batch + swizzle(batch const& self, batch const& mask, requires_arch) noexcept + { + return bitwise_cast( + swizzle(bitwise_cast(self), mask)); + } + + // swizzle (constant mask) + template + XSIMD_INLINE batch swizzle(batch const& self, batch_constant, requires_arch) noexcept + { + // duplicate low and high part of input + __m256 hi = _mm256_castps128_ps256(_mm256_extractf128_ps(self, 1)); + __m256 hi_hi = _mm256_insertf128_ps(self, _mm256_castps256_ps128(hi), 0); + + __m256 low = _mm256_castps128_ps256(_mm256_castps256_ps128(self)); + __m256 low_low = _mm256_insertf128_ps(self, _mm256_castps256_ps128(low), 1); + + // normalize mask + batch_constant half_mask; + + // permute within each lane + __m256 r0 = _mm256_permutevar_ps(low_low, half_mask.as_batch()); + __m256 r1 = _mm256_permutevar_ps(hi_hi, half_mask.as_batch()); + + // mask to choose the right lane + batch_bool_constant= 4), (V1 >= 4), (V2 >= 4), (V3 >= 4), (V4 >= 4), (V5 >= 4), (V6 >= 4), (V7 >= 4)> blend_mask; + + // blend the two permutes + constexpr auto mask = blend_mask.mask(); + return _mm256_blend_ps(r0, r1, mask); + } + + template + XSIMD_INLINE batch swizzle(batch const& self, batch_constant, requires_arch) noexcept + { + // duplicate low and high part of input + __m256d hi = _mm256_castpd128_pd256(_mm256_extractf128_pd(self, 1)); + __m256d hi_hi = _mm256_insertf128_pd(self, _mm256_castpd256_pd128(hi), 0); + + __m256d low = _mm256_castpd128_pd256(_mm256_castpd256_pd128(self)); + __m256d low_low = _mm256_insertf128_pd(self, _mm256_castpd256_pd128(low), 1); + + // normalize mask + batch_constant half_mask; + + // permute within each lane + __m256d r0 = _mm256_permutevar_pd(low_low, half_mask.as_batch()); + __m256d r1 = _mm256_permutevar_pd(hi_hi, half_mask.as_batch()); + + // mask to choose the right lane + batch_bool_constant= 2), (V1 >= 2), (V2 >= 2), (V3 >= 2)> blend_mask; + + // blend the two permutes + constexpr auto mask = blend_mask.mask(); + return _mm256_blend_pd(r0, r1, mask); + } + template = 0> + XSIMD_INLINE batch swizzle(batch const& self, + batch_constant const& mask, + requires_arch) noexcept + { + return bitwise_cast( + swizzle(bitwise_cast(self), mask)); + } + + template = 0> + XSIMD_INLINE batch + swizzle(batch const& self, + batch_constant const& mask, + requires_arch) noexcept + { + return bitwise_cast( + swizzle(bitwise_cast(self), mask)); + } + // transpose + template + XSIMD_INLINE void transpose(batch* matrix_begin, batch* matrix_end, requires_arch) noexcept + { + assert((matrix_end - matrix_begin == batch::size) && "correctly sized matrix"); + (void)matrix_end; + // See + // https://stackoverflow.com/questions/25622745/transpose-an-8x8-float-using-avx-avx2 + auto r0 = matrix_begin[0], r1 = matrix_begin[1], + r2 = matrix_begin[2], r3 = matrix_begin[3], + r4 = matrix_begin[4], r5 = matrix_begin[5], + r6 = matrix_begin[6], r7 = matrix_begin[7]; + + auto t0 = _mm256_unpacklo_ps(r0, r1); + auto t1 = _mm256_unpackhi_ps(r0, r1); + auto t2 = _mm256_unpacklo_ps(r2, r3); + auto t3 = _mm256_unpackhi_ps(r2, 
r3); + auto t4 = _mm256_unpacklo_ps(r4, r5); + auto t5 = _mm256_unpackhi_ps(r4, r5); + auto t6 = _mm256_unpacklo_ps(r6, r7); + auto t7 = _mm256_unpackhi_ps(r6, r7); + + r0 = _mm256_shuffle_ps(t0, t2, _MM_SHUFFLE(1, 0, 1, 0)); + r1 = _mm256_shuffle_ps(t0, t2, _MM_SHUFFLE(3, 2, 3, 2)); + r2 = _mm256_shuffle_ps(t1, t3, _MM_SHUFFLE(1, 0, 1, 0)); + r3 = _mm256_shuffle_ps(t1, t3, _MM_SHUFFLE(3, 2, 3, 2)); + r4 = _mm256_shuffle_ps(t4, t6, _MM_SHUFFLE(1, 0, 1, 0)); + r5 = _mm256_shuffle_ps(t4, t6, _MM_SHUFFLE(3, 2, 3, 2)); + r6 = _mm256_shuffle_ps(t5, t7, _MM_SHUFFLE(1, 0, 1, 0)); + r7 = _mm256_shuffle_ps(t5, t7, _MM_SHUFFLE(3, 2, 3, 2)); + + matrix_begin[0] = _mm256_permute2f128_ps(r0, r4, 0x20); + matrix_begin[1] = _mm256_permute2f128_ps(r1, r5, 0x20); + matrix_begin[2] = _mm256_permute2f128_ps(r2, r6, 0x20); + matrix_begin[3] = _mm256_permute2f128_ps(r3, r7, 0x20); + matrix_begin[4] = _mm256_permute2f128_ps(r0, r4, 0x31); + matrix_begin[5] = _mm256_permute2f128_ps(r1, r5, 0x31); + matrix_begin[6] = _mm256_permute2f128_ps(r2, r6, 0x31); + matrix_begin[7] = _mm256_permute2f128_ps(r3, r7, 0x31); + } + + template + XSIMD_INLINE void transpose(batch* matrix_begin, batch* matrix_end, requires_arch) noexcept + { + return transpose(reinterpret_cast*>(matrix_begin), reinterpret_cast*>(matrix_end), A {}); + } + template + XSIMD_INLINE void transpose(batch* matrix_begin, batch* matrix_end, requires_arch) noexcept + { + return transpose(reinterpret_cast*>(matrix_begin), reinterpret_cast*>(matrix_end), A {}); + } + + template + XSIMD_INLINE void transpose(batch* matrix_begin, batch* matrix_end, requires_arch) noexcept + { + assert((matrix_end - matrix_begin == batch::size) && "correctly sized matrix"); + (void)matrix_end; + auto r0 = matrix_begin[0], r1 = matrix_begin[1], + r2 = matrix_begin[2], r3 = matrix_begin[3]; + + auto t0 = _mm256_unpacklo_pd(r0, r1); // r00 r10 r01 r11 + auto t1 = _mm256_unpackhi_pd(r0, r1); // r02 r12 r03 r13 + auto t2 = _mm256_unpacklo_pd(r2, r3); // r20 r30 r21 r31 + auto t3 = _mm256_unpackhi_pd(r2, r3); // r22 r32 r23 r33 + + matrix_begin[0] = _mm256_permute2f128_pd(t0, t2, 0x20); + matrix_begin[1] = _mm256_permute2f128_pd(t1, t3, 0x20); + matrix_begin[2] = _mm256_permute2f128_pd(t0, t2, 0x31); + matrix_begin[3] = _mm256_permute2f128_pd(t1, t3, 0x31); + } + + template + XSIMD_INLINE void transpose(batch* matrix_begin, batch* matrix_end, requires_arch) noexcept + { + return transpose(reinterpret_cast*>(matrix_begin), reinterpret_cast*>(matrix_end), A {}); + } + template + XSIMD_INLINE void transpose(batch* matrix_begin, batch* matrix_end, requires_arch) noexcept + { + return transpose(reinterpret_cast*>(matrix_begin), reinterpret_cast*>(matrix_end), A {}); + } + + // trunc + template + XSIMD_INLINE batch trunc(batch const& self, requires_arch) noexcept + { + return _mm256_round_ps(self, _MM_FROUND_TO_ZERO); + } + template + XSIMD_INLINE batch trunc(batch const& self, requires_arch) noexcept + { + return _mm256_round_pd(self, _MM_FROUND_TO_ZERO); + } + + // zip_hi + template ::value, void>::type> + XSIMD_INLINE batch zip_hi(batch const& self, batch const& other, requires_arch) noexcept + { + XSIMD_IF_CONSTEXPR(sizeof(T) == 1 || sizeof(T) == 2) + { + // extract high word + __m128i self_hi = _mm256_extractf128_si256(self, 1); + __m128i other_hi = _mm256_extractf128_si256(other, 1); + + // interleave + __m128i res_lo, res_hi; + XSIMD_IF_CONSTEXPR(sizeof(T) == 1) + { + res_lo = _mm_unpacklo_epi8(self_hi, other_hi); + res_hi = _mm_unpackhi_epi8(self_hi, other_hi); + } + else + { + 
res_lo = _mm_unpacklo_epi16(self_hi, other_hi); + res_hi = _mm_unpackhi_epi16(self_hi, other_hi); + } + + // fuse + return _mm256_castps_si256( + _mm256_insertf128_ps( + _mm256_castsi256_ps(_mm256_castsi128_si256(res_lo)), + _mm_castsi128_ps(res_hi), + 1)); + } + else XSIMD_IF_CONSTEXPR(sizeof(T) == 4) + { + auto lo = _mm256_unpacklo_ps(_mm256_castsi256_ps(self), _mm256_castsi256_ps(other)); + auto hi = _mm256_unpackhi_ps(_mm256_castsi256_ps(self), _mm256_castsi256_ps(other)); + return _mm256_castps_si256(_mm256_permute2f128_ps(lo, hi, 0x31)); + } + else XSIMD_IF_CONSTEXPR(sizeof(T) == 8) + { + auto lo = _mm256_unpacklo_pd(_mm256_castsi256_pd(self), _mm256_castsi256_pd(other)); + auto hi = _mm256_unpackhi_pd(_mm256_castsi256_pd(self), _mm256_castsi256_pd(other)); + return _mm256_castpd_si256(_mm256_permute2f128_pd(lo, hi, 0x31)); + } + else + { + assert(false && "unsupported arch/op combination"); + return {}; + } + } + template + XSIMD_INLINE batch zip_hi(batch const& self, batch const& other, requires_arch) noexcept + { + auto lo = _mm256_unpacklo_ps(self, other); + auto hi = _mm256_unpackhi_ps(self, other); + return _mm256_permute2f128_ps(lo, hi, 0x31); + } + template + XSIMD_INLINE batch zip_hi(batch const& self, batch const& other, requires_arch) noexcept + { + auto lo = _mm256_unpacklo_pd(self, other); + auto hi = _mm256_unpackhi_pd(self, other); + return _mm256_permute2f128_pd(lo, hi, 0x31); + } + + // zip_lo + template ::value, void>::type> + XSIMD_INLINE batch zip_lo(batch const& self, batch const& other, requires_arch) noexcept + { + XSIMD_IF_CONSTEXPR(sizeof(T) == 1 || sizeof(T) == 2) + { + // extract low word + __m128i self_lo = _mm256_extractf128_si256(self, 0); + __m128i other_lo = _mm256_extractf128_si256(other, 0); + + // interleave + __m128i res_lo, res_hi; + XSIMD_IF_CONSTEXPR(sizeof(T) == 1) + { + res_lo = _mm_unpacklo_epi8(self_lo, other_lo); + res_hi = _mm_unpackhi_epi8(self_lo, other_lo); + } + else + { + res_lo = _mm_unpacklo_epi16(self_lo, other_lo); + res_hi = _mm_unpackhi_epi16(self_lo, other_lo); + } + + // fuse + return _mm256_castps_si256( + _mm256_insertf128_ps( + _mm256_castsi256_ps(_mm256_castsi128_si256(res_lo)), + _mm_castsi128_ps(res_hi), + 1)); + } + else XSIMD_IF_CONSTEXPR(sizeof(T) == 4) + { + auto lo = _mm256_unpacklo_ps(_mm256_castsi256_ps(self), _mm256_castsi256_ps(other)); + auto hi = _mm256_unpackhi_ps(_mm256_castsi256_ps(self), _mm256_castsi256_ps(other)); + return _mm256_castps_si256(_mm256_insertf128_ps(lo, _mm256_castps256_ps128(hi), 1)); + } + else XSIMD_IF_CONSTEXPR(sizeof(T) == 8) + { + auto lo = _mm256_unpacklo_pd(_mm256_castsi256_pd(self), _mm256_castsi256_pd(other)); + auto hi = _mm256_unpackhi_pd(_mm256_castsi256_pd(self), _mm256_castsi256_pd(other)); + return _mm256_castpd_si256(_mm256_insertf128_pd(lo, _mm256_castpd256_pd128(hi), 1)); + } + else + { + assert(false && "unsupported arch/op combination"); + return {}; + } + } + + template + XSIMD_INLINE batch zip_lo(batch const& self, batch const& other, requires_arch) noexcept + { + auto lo = _mm256_unpacklo_ps(self, other); + auto hi = _mm256_unpackhi_ps(self, other); + return _mm256_insertf128_ps(lo, _mm256_castps256_ps128(hi), 1); + } + template + XSIMD_INLINE batch zip_lo(batch const& self, batch const& other, requires_arch) noexcept + { + auto lo = _mm256_unpacklo_pd(self, other); + auto hi = _mm256_unpackhi_pd(self, other); + return _mm256_insertf128_pd(lo, _mm256_castpd256_pd128(hi), 1); + } + } +} + +#endif diff --git a/include/onnxruntime/xsimd/arch/xsimd_avx2.hpp 
b/include/onnxruntime/xsimd/arch/xsimd_avx2.hpp new file mode 100644 index 0000000000000..506299a0dd8e8 --- /dev/null +++ b/include/onnxruntime/xsimd/arch/xsimd_avx2.hpp @@ -0,0 +1,1021 @@ +/*************************************************************************** + * Copyright (c) Johan Mabille, Sylvain Corlay, Wolf Vollprecht and * + * Martin Renou * + * Copyright (c) QuantStack * + * Copyright (c) Serge Guelton * + * * + * Distributed under the terms of the BSD 3-Clause License. * + * * + * The full license is in the file LICENSE, distributed with this software. * + ****************************************************************************/ + +#ifndef XSIMD_AVX2_HPP +#define XSIMD_AVX2_HPP + +#include +#include + +#include "../types/xsimd_avx2_register.hpp" + +namespace xsimd +{ + + namespace kernel + { + using namespace types; + + // abs + template ::value, void>::type> + XSIMD_INLINE batch abs(batch const& self, requires_arch) noexcept + { + if (std::is_signed::value) + { + XSIMD_IF_CONSTEXPR(sizeof(T) == 1) + { + return _mm256_abs_epi8(self); + } + else XSIMD_IF_CONSTEXPR(sizeof(T) == 2) + { + return _mm256_abs_epi16(self); + } + else XSIMD_IF_CONSTEXPR(sizeof(T) == 4) + { + return _mm256_abs_epi32(self); + } + else + { + return abs(self, avx {}); + } + } + return self; + } + + // add + template ::value, void>::type> + XSIMD_INLINE batch add(batch const& self, batch const& other, requires_arch) noexcept + { + XSIMD_IF_CONSTEXPR(sizeof(T) == 1) + { + return _mm256_add_epi8(self, other); + } + else XSIMD_IF_CONSTEXPR(sizeof(T) == 2) + { + return _mm256_add_epi16(self, other); + } + else XSIMD_IF_CONSTEXPR(sizeof(T) == 4) + { + return _mm256_add_epi32(self, other); + } + else XSIMD_IF_CONSTEXPR(sizeof(T) == 8) + { + return _mm256_add_epi64(self, other); + } + else + { + return add(self, other, avx {}); + } + } + + // avgr + template ::value, void>::type> + XSIMD_INLINE batch avgr(batch const& self, batch const& other, requires_arch) noexcept + { + XSIMD_IF_CONSTEXPR(sizeof(T) == 1) + { + return _mm256_avg_epu8(self, other); + } + else XSIMD_IF_CONSTEXPR(sizeof(T) == 2) + { + return _mm256_avg_epu16(self, other); + } + else + { + return avgr(self, other, generic {}); + } + } + + // avg + template ::value, void>::type> + XSIMD_INLINE batch avg(batch const& self, batch const& other, requires_arch) noexcept + { + XSIMD_IF_CONSTEXPR(sizeof(T) == 1) + { + auto adj = ((self ^ other) << 7) >> 7; + return avgr(self, other, A {}) - adj; + } + else XSIMD_IF_CONSTEXPR(sizeof(T) == 2) + { + auto adj = ((self ^ other) << 15) >> 15; + return avgr(self, other, A {}) - adj; + } + else + { + return avg(self, other, generic {}); + } + } + + // bitwise_and + template ::value, void>::type> + XSIMD_INLINE batch bitwise_and(batch const& self, batch const& other, requires_arch) noexcept + { + return _mm256_and_si256(self, other); + } + template ::value, void>::type> + XSIMD_INLINE batch_bool bitwise_and(batch_bool const& self, batch_bool const& other, requires_arch) noexcept + { + return _mm256_and_si256(self, other); + } + + // bitwise_andnot + template ::value, void>::type> + XSIMD_INLINE batch bitwise_andnot(batch const& self, batch const& other, requires_arch) noexcept + { + return _mm256_andnot_si256(other, self); + } + template ::value, void>::type> + XSIMD_INLINE batch_bool bitwise_andnot(batch_bool const& self, batch_bool const& other, requires_arch) noexcept + { + return _mm256_andnot_si256(other, self); + } + + // bitwise_not + template ::value, void>::type> + XSIMD_INLINE batch bitwise_not(batch 
const& self, requires_arch) noexcept + { + return _mm256_xor_si256(self, _mm256_set1_epi32(-1)); + } + template ::value, void>::type> + XSIMD_INLINE batch_bool bitwise_not(batch_bool const& self, requires_arch) noexcept + { + return _mm256_xor_si256(self, _mm256_set1_epi32(-1)); + } + + // bitwise_lshift + template ::value, void>::type> + XSIMD_INLINE batch bitwise_lshift(batch const& self, int32_t other, requires_arch) noexcept + { + XSIMD_IF_CONSTEXPR(sizeof(T) == 2) + { + return _mm256_slli_epi16(self, other); + } + else XSIMD_IF_CONSTEXPR(sizeof(T) == 4) + { + return _mm256_slli_epi32(self, other); + } + else XSIMD_IF_CONSTEXPR(sizeof(T) == 8) + { + return _mm256_slli_epi64(self, other); + } + else + { + return bitwise_lshift(self, other, avx {}); + } + } + + template ::value, void>::type> + XSIMD_INLINE batch bitwise_lshift(batch const& self, batch const& other, requires_arch) noexcept + { + XSIMD_IF_CONSTEXPR(sizeof(T) == 4) + { + return _mm256_sllv_epi32(self, other); + } + else XSIMD_IF_CONSTEXPR(sizeof(T) == 8) + { + return _mm256_sllv_epi64(self, other); + } + else + { + return bitwise_lshift(self, other, avx {}); + } + } + + // bitwise_or + template ::value, void>::type> + XSIMD_INLINE batch bitwise_or(batch const& self, batch const& other, requires_arch) noexcept + { + return _mm256_or_si256(self, other); + } + template ::value, void>::type> + XSIMD_INLINE batch_bool bitwise_or(batch_bool const& self, batch_bool const& other, requires_arch) noexcept + { + return _mm256_or_si256(self, other); + } + + // bitwise_rshift + template ::value, void>::type> + XSIMD_INLINE batch bitwise_rshift(batch const& self, int32_t other, requires_arch) noexcept + { + if (std::is_signed::value) + { + XSIMD_IF_CONSTEXPR(sizeof(T) == 1) + { + __m256i sign_mask = _mm256_set1_epi16((0xFF00 >> other) & 0x00FF); + __m256i cmp_is_negative = _mm256_cmpgt_epi8(_mm256_setzero_si256(), self); + __m256i res = _mm256_srai_epi16(self, other); + return _mm256_or_si256( + detail::fwd_to_sse([](__m128i s, __m128i o) noexcept + { return bitwise_and(batch(s), batch(o), sse4_2 {}); }, + sign_mask, cmp_is_negative), + _mm256_andnot_si256(sign_mask, res)); + } + else XSIMD_IF_CONSTEXPR(sizeof(T) == 2) + { + return _mm256_srai_epi16(self, other); + } + else XSIMD_IF_CONSTEXPR(sizeof(T) == 4) + { + return _mm256_srai_epi32(self, other); + } + else + { + return bitwise_rshift(self, other, avx {}); + } + } + else + { + XSIMD_IF_CONSTEXPR(sizeof(T) == 2) + { + return _mm256_srli_epi16(self, other); + } + else XSIMD_IF_CONSTEXPR(sizeof(T) == 4) + { + return _mm256_srli_epi32(self, other); + } + else XSIMD_IF_CONSTEXPR(sizeof(T) == 8) + { + return _mm256_srli_epi64(self, other); + } + else + { + return bitwise_rshift(self, other, avx {}); + } + } + } + + template ::value, void>::type> + XSIMD_INLINE batch bitwise_rshift(batch const& self, batch const& other, requires_arch) noexcept + { + if (std::is_signed::value) + { + XSIMD_IF_CONSTEXPR(sizeof(T) == 4) + { + return _mm256_srav_epi32(self, other); + } + else + { + return bitwise_rshift(self, other, avx {}); + } + } + else + { + XSIMD_IF_CONSTEXPR(sizeof(T) == 4) + { + return _mm256_srlv_epi32(self, other); + } + else XSIMD_IF_CONSTEXPR(sizeof(T) == 8) + { + return _mm256_srlv_epi64(self, other); + } + else + { + return bitwise_rshift(self, other, avx {}); + } + } + } + + // bitwise_xor + template ::value, void>::type> + XSIMD_INLINE batch bitwise_xor(batch const& self, batch const& other, requires_arch) noexcept + { + return _mm256_xor_si256(self, other); + } + template 
::value, void>::type> + XSIMD_INLINE batch bitwise_xor(batch_bool const& self, batch_bool const& other, requires_arch) noexcept + { + return _mm256_xor_si256(self, other); + } + + // complex_low + template + XSIMD_INLINE batch complex_low(batch, A> const& self, requires_arch) noexcept + { + __m256d tmp0 = _mm256_permute4x64_pd(self.real(), _MM_SHUFFLE(3, 1, 1, 0)); + __m256d tmp1 = _mm256_permute4x64_pd(self.imag(), _MM_SHUFFLE(1, 2, 0, 0)); + return _mm256_blend_pd(tmp0, tmp1, 10); + } + + // complex_high + template + XSIMD_INLINE batch complex_high(batch, A> const& self, requires_arch) noexcept + { + __m256d tmp0 = _mm256_permute4x64_pd(self.real(), _MM_SHUFFLE(3, 3, 1, 2)); + __m256d tmp1 = _mm256_permute4x64_pd(self.imag(), _MM_SHUFFLE(3, 2, 2, 0)); + return _mm256_blend_pd(tmp0, tmp1, 10); + } + + // fast_cast + namespace detail + { + + template + XSIMD_INLINE batch fast_cast(batch const& x, batch const&, requires_arch) noexcept + { + // from https://stackoverflow.com/questions/41144668/how-to-efficiently-perform-double-int64-conversions-with-sse-avx + // adapted to avx + __m256i xH = _mm256_srli_epi64(x, 32); + xH = _mm256_or_si256(xH, _mm256_castpd_si256(_mm256_set1_pd(19342813113834066795298816.))); // 2^84 + __m256i mask = _mm256_setr_epi16(0xFFFF, 0xFFFF, 0x0000, 0x0000, 0xFFFF, 0xFFFF, 0x0000, 0x0000, + 0xFFFF, 0xFFFF, 0x0000, 0x0000, 0xFFFF, 0xFFFF, 0x0000, 0x0000); + __m256i xL = _mm256_or_si256(_mm256_and_si256(mask, x), _mm256_andnot_si256(mask, _mm256_castpd_si256(_mm256_set1_pd(0x0010000000000000)))); // 2^52 + __m256d f = _mm256_sub_pd(_mm256_castsi256_pd(xH), _mm256_set1_pd(19342813118337666422669312.)); // 2^84 + 2^52 + return _mm256_add_pd(f, _mm256_castsi256_pd(xL)); + } + + template + XSIMD_INLINE batch fast_cast(batch const& x, batch const&, requires_arch) noexcept + { + // from https://stackoverflow.com/questions/41144668/how-to-efficiently-perform-double-int64-conversions-with-sse-avx + // adapted to avx + __m256i xH = _mm256_srai_epi32(x, 16); + xH = _mm256_and_si256(xH, _mm256_setr_epi16(0x0000, 0x0000, 0xFFFF, 0xFFFF, 0x0000, 0x0000, 0xFFFF, 0xFFFF, 0x0000, 0x0000, 0xFFFF, 0xFFFF, 0x0000, 0x0000, 0xFFFF, 0xFFFF)); + xH = _mm256_add_epi64(xH, _mm256_castpd_si256(_mm256_set1_pd(442721857769029238784.))); // 3*2^67 + __m256i mask = _mm256_setr_epi16(0xFFFF, 0xFFFF, 0xFFFF, 0x0000, 0xFFFF, 0xFFFF, 0xFFFF, 0x0000, + 0xFFFF, 0xFFFF, 0xFFFF, 0x0000, 0xFFFF, 0xFFFF, 0xFFFF, 0x0000); + __m256i xL = _mm256_or_si256(_mm256_and_si256(mask, x), _mm256_andnot_si256(mask, _mm256_castpd_si256(_mm256_set1_pd(0x0010000000000000)))); // 2^52 + __m256d f = _mm256_sub_pd(_mm256_castsi256_pd(xH), _mm256_set1_pd(442726361368656609280.)); // 3*2^67 + 2^52 + return _mm256_add_pd(f, _mm256_castsi256_pd(xL)); + } + } + + // eq + template ::value, void>::type> + XSIMD_INLINE batch_bool eq(batch const& self, batch const& other, requires_arch) noexcept + { + XSIMD_IF_CONSTEXPR(sizeof(T) == 1) + { + return _mm256_cmpeq_epi8(self, other); + } + else XSIMD_IF_CONSTEXPR(sizeof(T) == 2) + { + return _mm256_cmpeq_epi16(self, other); + } + else XSIMD_IF_CONSTEXPR(sizeof(T) == 4) + { + return _mm256_cmpeq_epi32(self, other); + } + else XSIMD_IF_CONSTEXPR(sizeof(T) == 8) + { + return _mm256_cmpeq_epi64(self, other); + } + else + { + return eq(self, other, avx {}); + } + } + + // gather + template = 0, detail::enable_sized_integral_t = 0> + XSIMD_INLINE batch gather(batch const&, T const* src, batch const& index, + kernel::requires_arch) noexcept + { + // scatter for this one is AVX512F+AVX512VL + 
return _mm256_i32gather_epi32(reinterpret_cast(src), index, sizeof(T)); + } + + template = 0, detail::enable_sized_integral_t = 0> + XSIMD_INLINE batch gather(batch const&, T const* src, batch const& index, + kernel::requires_arch) noexcept + { + // scatter for this one is AVX512F+AVX512VL + return _mm256_i64gather_epi64(reinterpret_cast(src), index, sizeof(T)); + } + + template = 0> + XSIMD_INLINE batch gather(batch const&, float const* src, + batch const& index, + kernel::requires_arch) noexcept + { + // scatter for this one is AVX512F+AVX512VL + return _mm256_i32gather_ps(src, index, sizeof(float)); + } + + template = 0> + XSIMD_INLINE batch gather(batch const&, double const* src, + batch const& index, + requires_arch) noexcept + { + // scatter for this one is AVX512F+AVX512VL + return _mm256_i64gather_pd(src, index, sizeof(double)); + } + + // gather: handmade conversions + template = 0> + XSIMD_INLINE batch gather(batch const&, double const* src, + batch const& index, + requires_arch) noexcept + { + const batch low(_mm256_i32gather_pd(src, _mm256_castsi256_si128(index.data), sizeof(double))); + const batch high(_mm256_i32gather_pd(src, _mm256_extractf128_si256(index.data, 1), sizeof(double))); + return detail::merge_sse(_mm256_cvtpd_ps(low.data), _mm256_cvtpd_ps(high.data)); + } + + template = 0> + XSIMD_INLINE batch gather(batch const&, double const* src, + batch const& index, + requires_arch) noexcept + { + const batch low(_mm256_i32gather_pd(src, _mm256_castsi256_si128(index.data), sizeof(double))); + const batch high(_mm256_i32gather_pd(src, _mm256_extractf128_si256(index.data, 1), sizeof(double))); + return detail::merge_sse(_mm256_cvtpd_epi32(low.data), _mm256_cvtpd_epi32(high.data)); + } + + // lt + template ::value, void>::type> + XSIMD_INLINE batch_bool lt(batch const& self, batch const& other, requires_arch) noexcept + { + if (std::is_signed::value) + { + XSIMD_IF_CONSTEXPR(sizeof(T) == 1) + { + return _mm256_cmpgt_epi8(other, self); + } + else XSIMD_IF_CONSTEXPR(sizeof(T) == 2) + { + return _mm256_cmpgt_epi16(other, self); + } + else XSIMD_IF_CONSTEXPR(sizeof(T) == 4) + { + return _mm256_cmpgt_epi32(other, self); + } + else XSIMD_IF_CONSTEXPR(sizeof(T) == 8) + { + return _mm256_cmpgt_epi64(other, self); + } + else + { + return lt(self, other, avx {}); + } + } + else + { + return lt(self, other, avx {}); + } + } + + // load_complex + template + XSIMD_INLINE batch, A> load_complex(batch const& hi, batch const& lo, requires_arch) noexcept + { + using batch_type = batch; + batch_type real = _mm256_castpd_ps( + _mm256_permute4x64_pd( + _mm256_castps_pd(_mm256_shuffle_ps(hi, lo, _MM_SHUFFLE(2, 0, 2, 0))), + _MM_SHUFFLE(3, 1, 2, 0))); + batch_type imag = _mm256_castpd_ps( + _mm256_permute4x64_pd( + _mm256_castps_pd(_mm256_shuffle_ps(hi, lo, _MM_SHUFFLE(3, 1, 3, 1))), + _MM_SHUFFLE(3, 1, 2, 0))); + return { real, imag }; + } + template + XSIMD_INLINE batch, A> load_complex(batch const& hi, batch const& lo, requires_arch) noexcept + { + using batch_type = batch; + batch_type real = _mm256_permute4x64_pd(_mm256_unpacklo_pd(hi, lo), _MM_SHUFFLE(3, 1, 2, 0)); + batch_type imag = _mm256_permute4x64_pd(_mm256_unpackhi_pd(hi, lo), _MM_SHUFFLE(3, 1, 2, 0)); + return { real, imag }; + } + // mask + template ::value, void>::type> + XSIMD_INLINE uint64_t mask(batch_bool const& self, requires_arch) noexcept + { + XSIMD_IF_CONSTEXPR(sizeof(T) == 1) + { + return 0xFFFFFFFF & (uint64_t)_mm256_movemask_epi8(self); + } + else XSIMD_IF_CONSTEXPR(sizeof(T) == 2) + { + uint64_t mask8 = 0xFFFFFFFF & 
(uint64_t)_mm256_movemask_epi8(self); + return detail::mask_lut(mask8) | (detail::mask_lut(mask8 >> 8) << 4) | (detail::mask_lut(mask8 >> 16) << 8) | (detail::mask_lut(mask8 >> 24) << 12); + } + else + { + return mask(self, avx {}); + } + } + + // max + template ::value, void>::type> + XSIMD_INLINE batch max(batch const& self, batch const& other, requires_arch) noexcept + { + if (std::is_signed::value) + { + XSIMD_IF_CONSTEXPR(sizeof(T) == 1) + { + return _mm256_max_epi8(self, other); + } + else XSIMD_IF_CONSTEXPR(sizeof(T) == 2) + { + return _mm256_max_epi16(self, other); + } + else XSIMD_IF_CONSTEXPR(sizeof(T) == 4) + { + return _mm256_max_epi32(self, other); + } + else + { + return max(self, other, avx {}); + } + } + else + { + XSIMD_IF_CONSTEXPR(sizeof(T) == 1) + { + return _mm256_max_epu8(self, other); + } + else XSIMD_IF_CONSTEXPR(sizeof(T) == 2) + { + return _mm256_max_epu16(self, other); + } + else XSIMD_IF_CONSTEXPR(sizeof(T) == 4) + { + return _mm256_max_epu32(self, other); + } + else + { + return max(self, other, avx {}); + } + } + } + + // min + template ::value, void>::type> + XSIMD_INLINE batch min(batch const& self, batch const& other, requires_arch) noexcept + { + if (std::is_signed::value) + { + XSIMD_IF_CONSTEXPR(sizeof(T) == 1) + { + return _mm256_min_epi8(self, other); + } + else XSIMD_IF_CONSTEXPR(sizeof(T) == 2) + { + return _mm256_min_epi16(self, other); + } + else XSIMD_IF_CONSTEXPR(sizeof(T) == 4) + { + return _mm256_min_epi32(self, other); + } + else + { + return min(self, other, avx {}); + } + } + else + { + XSIMD_IF_CONSTEXPR(sizeof(T) == 1) + { + return _mm256_min_epu8(self, other); + } + else XSIMD_IF_CONSTEXPR(sizeof(T) == 2) + { + return _mm256_min_epu16(self, other); + } + else XSIMD_IF_CONSTEXPR(sizeof(T) == 4) + { + return _mm256_min_epu32(self, other); + } + else + { + return min(self, other, avx {}); + } + } + } + + // mul + template ::value, void>::type> + XSIMD_INLINE batch mul(batch const& self, batch const& other, requires_arch) noexcept + { + XSIMD_IF_CONSTEXPR(sizeof(T) == 1) + { + __m256i mask_hi = _mm256_set1_epi32(0xFF00FF00); + __m256i res_lo = _mm256_mullo_epi16(self, other); + __m256i other_hi = _mm256_srli_epi16(other, 8); + __m256i self_hi = _mm256_and_si256(self, mask_hi); + __m256i res_hi = _mm256_mullo_epi16(self_hi, other_hi); + __m256i res = _mm256_blendv_epi8(res_lo, res_hi, mask_hi); + return res; + } + else XSIMD_IF_CONSTEXPR(sizeof(T) == 2) + { + return _mm256_mullo_epi16(self, other); + } + else XSIMD_IF_CONSTEXPR(sizeof(T) == 4) + { + return _mm256_mullo_epi32(self, other); + } + else + { + return mul(self, other, avx {}); + } + } + + // reduce_add + template ::value, void>::type> + XSIMD_INLINE T reduce_add(batch const& self, requires_arch) noexcept + { + XSIMD_IF_CONSTEXPR(sizeof(T) == 4) + { + __m256i tmp1 = _mm256_hadd_epi32(self, self); + __m256i tmp2 = _mm256_hadd_epi32(tmp1, tmp1); + __m128i tmp3 = _mm256_extracti128_si256(tmp2, 1); + __m128i tmp4 = _mm_add_epi32(_mm256_castsi256_si128(tmp2), tmp3); + return _mm_cvtsi128_si32(tmp4); + } + else XSIMD_IF_CONSTEXPR(sizeof(T) == 8) + { + __m256i tmp1 = _mm256_shuffle_epi32(self, 0x0E); + __m256i tmp2 = _mm256_add_epi64(self, tmp1); + __m128i tmp3 = _mm256_extracti128_si256(tmp2, 1); + __m128i res = _mm_add_epi64(_mm256_castsi256_si128(tmp2), tmp3); +#if defined(__x86_64__) + return _mm_cvtsi128_si64(res); +#else + __m128i m; + _mm_storel_epi64(&m, res); + int64_t i; + std::memcpy(&i, &m, sizeof(i)); + return i; +#endif + } + else + { + return reduce_add(self, avx {}); + } + } 
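The integer reduce_add kernel just above uses the same two-step idea as the float kernel in xsimd_avx.hpp: a pair of hadd passes collapses each 128-bit lane, and a final 128-bit add combines the two lane totals. A minimal standalone sketch of that strategy follows (illustration only, not part of the vendored header; the helper name hsum_epi32_avx2 is invented here and an AVX2-capable compiler is assumed):

#include <immintrin.h>
#include <cstdint>
#include <cstdio>

// Horizontal sum of eight int32 lanes, mirroring the hadd-based kernel above.
static int32_t hsum_epi32_avx2(__m256i v)
{
    __m256i t1 = _mm256_hadd_epi32(v, v);   // per lane: (x0+x1, x2+x3, x0+x1, x2+x3)
    __m256i t2 = _mm256_hadd_epi32(t1, t1); // per lane: lane total in every slot
    __m128i hi = _mm256_extracti128_si256(t2, 1);
    __m128i s  = _mm_add_epi32(_mm256_castsi256_si128(t2), hi); // combine both lanes
    return _mm_cvtsi128_si32(s);
}

int main()
{
    alignas(32) int32_t data[8] = { 1, 2, 3, 4, 5, 6, 7, 8 };
    __m256i v = _mm256_load_si256(reinterpret_cast<const __m256i*>(data));
    int32_t scalar = 0;
    for (int32_t x : data)
        scalar += x;
    std::printf("simd=%d scalar=%d\n", hsum_epi32_avx2(v), scalar); // both print 36
}

Because _mm256_hadd_epi32, like _mm256_hadd_ps, only adds within each 128-bit lane, the trailing cross-lane _mm_add_epi32 is what completes the reduction; the float kernel in xsimd_avx.hpp addresses the same limitation by permuting the lanes before its hadd passes.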
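The fast_cast specialisations earlier in this file implement the magic-constant trick from the linked Stack Overflow answer: the high and low 32-bit halves of each 64-bit integer are stamped with the exponent patterns of 2^84 and 2^52, and the combined constant 2^84 + 2^52 is subtracted to cancel both tags. A scalar sketch of the unsigned variant (illustration only; u64_to_double is a hypothetical name, not an xsimd entry point):

#include <cstdint>
#include <cstring>
#include <cstdio>

// Scalar restatement of the uint64 -> double conversion used by fast_cast above.
static double u64_to_double(uint64_t x)
{
    uint64_t hi_bits = (x >> 32) | 0x4530000000000000ull;           // tag high half with 2^84
    uint64_t lo_bits = (x & 0xFFFFFFFFull) | 0x4330000000000000ull; // tag low half with 2^52
    double hi, lo;
    std::memcpy(&hi, &hi_bits, sizeof hi); // hi == 2^84 + (x >> 32) * 2^32
    std::memcpy(&lo, &lo_bits, sizeof lo); // lo == 2^52 + (x & 0xFFFFFFFF)
    // Subtracting 2^84 + 2^52 cancels both tags, leaving the value of x
    // (correctly rounded once the two halves are added back together).
    return (hi - 19342813118337666422669312.0) + lo;
}

int main()
{
    uint64_t samples[] = { 0u, 1u, 0xFFFFFFFFull, 0x123456789ABCDEFull, UINT64_MAX };
    for (uint64_t s : samples)
        std::printf("%llu -> %.17g (expected %.17g)\n",
                    (unsigned long long)s, u64_to_double(s), (double)s);
}

The signed variant in the kernel follows the same layout but tags the sign-extended high half with 3 * 2^67, which is why its magic constants differ.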
+ + // rotate_left + template + XSIMD_INLINE batch rotate_left(batch const& self, requires_arch) noexcept + { + return _mm256_alignr_epi8(self, self, N); + } + template + XSIMD_INLINE batch rotate_left(batch const& self, requires_arch) noexcept + { + return bitwise_cast(rotate_left(bitwise_cast(self), avx2 {})); + } + + // sadd + template ::value, void>::type> + XSIMD_INLINE batch sadd(batch const& self, batch const& other, requires_arch) noexcept + { + if (std::is_signed::value) + { + XSIMD_IF_CONSTEXPR(sizeof(T) == 1) + { + return _mm256_adds_epi8(self, other); + } + else XSIMD_IF_CONSTEXPR(sizeof(T) == 2) + { + return _mm256_adds_epi16(self, other); + } + else + { + return sadd(self, other, avx {}); + } + } + else + { + XSIMD_IF_CONSTEXPR(sizeof(T) == 1) + { + return _mm256_adds_epu8(self, other); + } + else XSIMD_IF_CONSTEXPR(sizeof(T) == 2) + { + return _mm256_adds_epu16(self, other); + } + else + { + return sadd(self, other, avx {}); + } + } + } + + // select + template ::value, void>::type> + XSIMD_INLINE batch select(batch_bool const& cond, batch const& true_br, batch const& false_br, requires_arch) noexcept + { + XSIMD_IF_CONSTEXPR(sizeof(T) == 1) + { + return _mm256_blendv_epi8(false_br, true_br, cond); + } + else XSIMD_IF_CONSTEXPR(sizeof(T) == 2) + { + return _mm256_blendv_epi8(false_br, true_br, cond); + } + else XSIMD_IF_CONSTEXPR(sizeof(T) == 4) + { + return _mm256_blendv_epi8(false_br, true_br, cond); + } + else XSIMD_IF_CONSTEXPR(sizeof(T) == 8) + { + return _mm256_blendv_epi8(false_br, true_br, cond); + } + else + { + return select(cond, true_br, false_br, avx {}); + } + } + template ::value, void>::type> + XSIMD_INLINE batch select(batch_bool_constant const&, batch const& true_br, batch const& false_br, requires_arch) noexcept + { + constexpr int mask = batch_bool_constant::mask(); + // FIXME: for some reason mask here is not considered as an immediate, + // but it's okay for _mm256_blend_epi32 + // case 2: return _mm256_blend_epi16(false_br, true_br, mask); + XSIMD_IF_CONSTEXPR(sizeof(T) == 4) + { + return _mm256_blend_epi32(false_br, true_br, mask); + } + else XSIMD_IF_CONSTEXPR(sizeof(T) == 8) + { + constexpr int imask = detail::interleave(mask); + return _mm256_blend_epi32(false_br, true_br, imask); + } + else + { + return select(batch_bool { Values... 
}, true_br, false_br, avx2 {}); + } + } + + // slide_left + template + XSIMD_INLINE batch slide_left(batch const& x, requires_arch) noexcept + { + constexpr unsigned BitCount = N * 8; + if (BitCount == 0) + { + return x; + } + if (BitCount >= 256) + { + return batch(T(0)); + } + if (BitCount > 128) + { + constexpr unsigned M = (BitCount - 128) / 8; + auto y = _mm256_bslli_epi128(x, M); + return _mm256_permute2x128_si256(y, y, 0x28); + } + if (BitCount == 128) + { + return _mm256_permute2x128_si256(x, x, 0x28); + } + // shifting by [0, 128[ bits + constexpr unsigned M = BitCount / 8; + auto y = _mm256_bslli_epi128(x, M); + auto z = _mm256_bsrli_epi128(x, 16 - M); + auto w = _mm256_permute2x128_si256(z, z, 0x28); + return _mm256_or_si256(y, w); + } + + // slide_right + template + XSIMD_INLINE batch slide_right(batch const& x, requires_arch) noexcept + { + constexpr unsigned BitCount = N * 8; + if (BitCount == 0) + { + return x; + } + if (BitCount >= 256) + { + return batch(T(0)); + } + if (BitCount > 128) + { + constexpr unsigned M = (BitCount - 128) / 8; + auto y = _mm256_bsrli_epi128(x, M); + return _mm256_permute2x128_si256(y, y, 0x81); + } + if (BitCount == 128) + { + return _mm256_permute2x128_si256(x, x, 0x81); + } + // shifting by [0, 128[ bits + constexpr unsigned M = BitCount / 8; + auto y = _mm256_bsrli_epi128(x, M); + auto z = _mm256_bslli_epi128(x, 16 - M); + auto w = _mm256_permute2x128_si256(z, z, 0x81); + return _mm256_or_si256(y, w); + } + + // ssub + template ::value, void>::type> + XSIMD_INLINE batch ssub(batch const& self, batch const& other, requires_arch) noexcept + { + if (std::is_signed::value) + { + XSIMD_IF_CONSTEXPR(sizeof(T) == 1) + { + return _mm256_subs_epi8(self, other); + } + else XSIMD_IF_CONSTEXPR(sizeof(T) == 2) + { + return _mm256_subs_epi16(self, other); + } + else + { + return ssub(self, other, avx {}); + } + } + else + { + XSIMD_IF_CONSTEXPR(sizeof(T) == 1) + { + return _mm256_subs_epu8(self, other); + } + else XSIMD_IF_CONSTEXPR(sizeof(T) == 2) + { + return _mm256_subs_epu16(self, other); + } + else + { + return ssub(self, other, avx {}); + } + } + } + + // sub + template ::value, void>::type> + XSIMD_INLINE batch sub(batch const& self, batch const& other, requires_arch) noexcept + { + XSIMD_IF_CONSTEXPR(sizeof(T) == 1) + { + return _mm256_sub_epi8(self, other); + } + else XSIMD_IF_CONSTEXPR(sizeof(T) == 2) + { + return _mm256_sub_epi16(self, other); + } + else XSIMD_IF_CONSTEXPR(sizeof(T) == 4) + { + return _mm256_sub_epi32(self, other); + } + else XSIMD_IF_CONSTEXPR(sizeof(T) == 8) + { + return _mm256_sub_epi64(self, other); + } + else + { + return sub(self, other, avx {}); + } + } + + // swizzle (dynamic mask) + template + XSIMD_INLINE batch swizzle(batch const& self, batch mask, requires_arch) noexcept + { + return _mm256_permutevar8x32_ps(self, mask); + } + + template + XSIMD_INLINE batch swizzle(batch const& self, batch mask, requires_arch) noexcept + { + batch broadcaster = { 0, 1, 0, 1, 0, 1, 0, 1 }; + constexpr uint64_t comb = 0x0000000100000001ul * 2; + return bitwise_cast(swizzle(bitwise_cast(self), bitwise_cast(mask * comb) + broadcaster, avx2 {})); + } + + template + XSIMD_INLINE batch swizzle(batch const& self, batch mask, requires_arch) noexcept + { + return bitwise_cast(swizzle(bitwise_cast(self), mask, avx2 {})); + } + template + XSIMD_INLINE batch swizzle(batch const& self, batch mask, requires_arch) noexcept + { + return bitwise_cast(swizzle(bitwise_cast(self), mask, avx2 {})); + } + template + XSIMD_INLINE batch swizzle(batch const& 
self, batch mask, requires_arch) noexcept + { + return _mm256_permutevar8x32_epi32(self, mask); + } + template + XSIMD_INLINE batch swizzle(batch const& self, batch mask, requires_arch) noexcept + { + return bitwise_cast(swizzle(bitwise_cast(self), mask, avx2 {})); + } + + // swizzle (constant mask) + template + XSIMD_INLINE batch swizzle(batch const& self, batch_constant mask, requires_arch) noexcept + { + return _mm256_permutevar8x32_ps(self, mask.as_batch()); + } + + template + XSIMD_INLINE batch swizzle(batch const& self, batch_constant, requires_arch) noexcept + { + constexpr auto mask = detail::shuffle(V0, V1, V2, V3); + return _mm256_permute4x64_pd(self, mask); + } + + template + XSIMD_INLINE batch swizzle(batch const& self, batch_constant, requires_arch) noexcept + { + constexpr auto mask = detail::shuffle(V0, V1, V2, V3); + return _mm256_permute4x64_epi64(self, mask); + } + template + XSIMD_INLINE batch swizzle(batch const& self, batch_constant mask, requires_arch) noexcept + { + return bitwise_cast(swizzle(bitwise_cast(self), mask, avx2 {})); + } + template + XSIMD_INLINE batch swizzle(batch const& self, batch_constant mask, requires_arch) noexcept + { + return _mm256_permutevar8x32_epi32(self, mask.as_batch()); + } + template + XSIMD_INLINE batch swizzle(batch const& self, batch_constant mask, requires_arch) noexcept + { + return bitwise_cast(swizzle(bitwise_cast(self), mask, avx2 {})); + } + + // zip_hi + template ::value, void>::type> + XSIMD_INLINE batch zip_hi(batch const& self, batch const& other, requires_arch) noexcept + { + XSIMD_IF_CONSTEXPR(sizeof(T) == 1) + { + auto lo = _mm256_unpacklo_epi8(self, other); + auto hi = _mm256_unpackhi_epi8(self, other); + return _mm256_permute2f128_si256(lo, hi, 0x31); + } + else XSIMD_IF_CONSTEXPR(sizeof(T) == 2) + { + auto lo = _mm256_unpacklo_epi16(self, other); + auto hi = _mm256_unpackhi_epi16(self, other); + return _mm256_permute2f128_si256(lo, hi, 0x31); + } + else XSIMD_IF_CONSTEXPR(sizeof(T) == 4) + { + auto lo = _mm256_unpacklo_epi32(self, other); + auto hi = _mm256_unpackhi_epi32(self, other); + return _mm256_permute2f128_si256(lo, hi, 0x31); + } + else XSIMD_IF_CONSTEXPR(sizeof(T) == 8) + { + auto lo = _mm256_unpacklo_epi64(self, other); + auto hi = _mm256_unpackhi_epi64(self, other); + return _mm256_permute2f128_si256(lo, hi, 0x31); + } + else + { + assert(false && "unsupported arch/op combination"); + return {}; + } + } + + // zip_lo + template ::value, void>::type> + XSIMD_INLINE batch zip_lo(batch const& self, batch const& other, requires_arch) noexcept + { + XSIMD_IF_CONSTEXPR(sizeof(T) == 1) + { + auto lo = _mm256_unpacklo_epi8(self, other); + auto hi = _mm256_unpackhi_epi8(self, other); + return _mm256_inserti128_si256(lo, _mm256_castsi256_si128(hi), 1); + } + else XSIMD_IF_CONSTEXPR(sizeof(T) == 2) + { + auto lo = _mm256_unpacklo_epi16(self, other); + auto hi = _mm256_unpackhi_epi16(self, other); + return _mm256_inserti128_si256(lo, _mm256_castsi256_si128(hi), 1); + } + else XSIMD_IF_CONSTEXPR(sizeof(T) == 4) + { + auto lo = _mm256_unpacklo_epi32(self, other); + auto hi = _mm256_unpackhi_epi32(self, other); + return _mm256_inserti128_si256(lo, _mm256_castsi256_si128(hi), 1); + } + else XSIMD_IF_CONSTEXPR(sizeof(T) == 8) + { + auto lo = _mm256_unpacklo_epi64(self, other); + auto hi = _mm256_unpackhi_epi64(self, other); + return _mm256_inserti128_si256(lo, _mm256_castsi256_si128(hi), 1); + } + else + { + assert(false && "unsupported arch/op combination"); + return {}; + } + } + } +} + +#endif diff --git 
a/include/onnxruntime/xsimd/arch/xsimd_avx512bw.hpp b/include/onnxruntime/xsimd/arch/xsimd_avx512bw.hpp new file mode 100644 index 0000000000000..724ced08776ef --- /dev/null +++ b/include/onnxruntime/xsimd/arch/xsimd_avx512bw.hpp @@ -0,0 +1,701 @@ +/*************************************************************************** + * Copyright (c) Johan Mabille, Sylvain Corlay, Wolf Vollprecht and * + * Martin Renou * + * Copyright (c) QuantStack * + * Copyright (c) Serge Guelton * + * * + * Distributed under the terms of the BSD 3-Clause License. * + * * + * The full license is in the file LICENSE, distributed with this software. * + ****************************************************************************/ + +#ifndef XSIMD_AVX512BW_HPP +#define XSIMD_AVX512BW_HPP + +#include +#include + +#include "../types/xsimd_avx512bw_register.hpp" + +namespace xsimd +{ + + namespace kernel + { + using namespace types; + + namespace detail + { + template + XSIMD_INLINE batch_bool compare_int_avx512bw(batch const& self, batch const& other) noexcept + { + using register_type = typename batch_bool::register_type; + if (std::is_signed::value) + { + XSIMD_IF_CONSTEXPR(sizeof(T) == 1) + { + return (register_type)_mm512_cmp_epi8_mask(self, other, Cmp); + } + else XSIMD_IF_CONSTEXPR(sizeof(T) == 2) + { + return (register_type)_mm512_cmp_epi16_mask(self, other, Cmp); + } + else XSIMD_IF_CONSTEXPR(sizeof(T) == 4) + { + return (register_type)_mm512_cmp_epi32_mask(self, other, Cmp); + } + else XSIMD_IF_CONSTEXPR(sizeof(T) == 8) + { + return (register_type)_mm512_cmp_epi64_mask(self, other, Cmp); + } + } + else + { + XSIMD_IF_CONSTEXPR(sizeof(T) == 1) + { + return (register_type)_mm512_cmp_epu8_mask(self, other, Cmp); + } + else XSIMD_IF_CONSTEXPR(sizeof(T) == 2) + { + return (register_type)_mm512_cmp_epu16_mask(self, other, Cmp); + } + else XSIMD_IF_CONSTEXPR(sizeof(T) == 4) + { + return (register_type)_mm512_cmp_epu32_mask(self, other, Cmp); + } + else XSIMD_IF_CONSTEXPR(sizeof(T) == 8) + { + return (register_type)_mm512_cmp_epu64_mask(self, other, Cmp); + } + } + } + } + + // abs + template ::value, void>::type> + XSIMD_INLINE batch abs(batch const& self, requires_arch) noexcept + { + if (std::is_unsigned::value) + { + return self; + } + + XSIMD_IF_CONSTEXPR(sizeof(T) == 1) + { + return _mm512_abs_epi8(self); + } + else XSIMD_IF_CONSTEXPR(sizeof(T) == 2) + { + return _mm512_abs_epi16(self); + } + else + { + return abs(self, avx512dq {}); + } + } + + // add + template ::value, void>::type> + XSIMD_INLINE batch add(batch const& self, batch const& other, requires_arch) noexcept + { + XSIMD_IF_CONSTEXPR(sizeof(T) == 1) + { + return _mm512_add_epi8(self, other); + } + else XSIMD_IF_CONSTEXPR(sizeof(T) == 2) + { + return _mm512_add_epi16(self, other); + } + else + { + return add(self, other, avx512dq {}); + } + } + + // avgr + template ::value, void>::type> + XSIMD_INLINE batch avgr(batch const& self, batch const& other, requires_arch) noexcept + { + XSIMD_IF_CONSTEXPR(sizeof(T) == 1) + { + return _mm512_avg_epu8(self, other); + } + else XSIMD_IF_CONSTEXPR(sizeof(T) == 2) + { + return _mm512_avg_epu16(self, other); + } + else + { + return avgr(self, other, generic {}); + } + } + + // avg + template ::value, void>::type> + XSIMD_INLINE batch avg(batch const& self, batch const& other, requires_arch) noexcept + { + XSIMD_IF_CONSTEXPR(sizeof(T) == 1) + { + auto adj = ((self ^ other) << 7) >> 7; + return avgr(self, other, A {}) - adj; + } + else XSIMD_IF_CONSTEXPR(sizeof(T) == 2) + { + auto adj = ((self ^ other) << 15) >> 
15; + return avgr(self, other, A {}) - adj; + } + else + { + return avg(self, other, generic {}); + } + } + + // bitwise_lshift + template ::value, void>::type> + XSIMD_INLINE batch bitwise_lshift(batch const& self, int32_t other, requires_arch) noexcept + { +#if defined(XSIMD_AVX512_SHIFT_INTRINSICS_IMM_ONLY) + XSIMD_IF_CONSTEXPR(sizeof(T) == 2) + { + return _mm512_sllv_epi16(self, _mm512_set1_epi16(other)); +#else + XSIMD_IF_CONSTEXPR(sizeof(T) == 2) + { + return _mm512_slli_epi16(self, other); +#endif + } + else + { + return bitwise_lshift(self, other, avx512dq {}); + } + } + + // bitwise_rshift + template ::value, void>::type> + XSIMD_INLINE batch bitwise_rshift(batch const& self, int32_t other, requires_arch) noexcept + { + if (std::is_signed::value) + { + XSIMD_IF_CONSTEXPR(sizeof(T) == 1) + { + __m512i sign_mask = _mm512_set1_epi16((0xFF00 >> other) & 0x00FF); + __m512i zeros = _mm512_setzero_si512(); + __mmask64 cmp_is_negative_mask = _mm512_cmpgt_epi8_mask(zeros, self); + __m512i cmp_sign_mask = _mm512_mask_blend_epi8(cmp_is_negative_mask, zeros, sign_mask); +#if defined(XSIMD_AVX512_SHIFT_INTRINSICS_IMM_ONLY) + __m512i res = _mm512_srav_epi16(self, _mm512_set1_epi16(other)); +#else + __m512i res = _mm512_srai_epi16(self, other); +#endif + return _mm512_or_si512(cmp_sign_mask, _mm512_andnot_si512(sign_mask, res)); +#if defined(XSIMD_AVX512_SHIFT_INTRINSICS_IMM_ONLY) + } + else XSIMD_IF_CONSTEXPR(sizeof(T) == 2) + { + return _mm512_srav_epi16(self, _mm512_set1_epi16(other)); +#else + } + else XSIMD_IF_CONSTEXPR(sizeof(T) == 2) + { + return _mm512_srai_epi16(self, other); +#endif + } + else + { + return bitwise_rshift(self, other, avx512dq {}); + } + } + else + { +#if defined(XSIMD_AVX512_SHIFT_INTRINSICS_IMM_ONLY) + XSIMD_IF_CONSTEXPR(sizeof(T) == 2) + { + return _mm512_srlv_epi16(self, _mm512_set1_epi16(other)); +#else + XSIMD_IF_CONSTEXPR(sizeof(T) == 2) + { + return _mm512_srli_epi16(self, other); +#endif + } + else + { + return bitwise_rshift(self, other, avx512dq {}); + } + } + } + + // eq + template ::value, void>::type> + XSIMD_INLINE batch_bool eq(batch const& self, batch const& other, requires_arch) noexcept + { + return detail::compare_int_avx512bw(self, other); + } + + // ge + template ::value, void>::type> + XSIMD_INLINE batch_bool ge(batch const& self, batch const& other, requires_arch) noexcept + { + return detail::compare_int_avx512bw(self, other); + } + + // gt + template ::value, void>::type> + XSIMD_INLINE batch_bool gt(batch const& self, batch const& other, requires_arch) noexcept + { + return detail::compare_int_avx512bw(self, other); + } + + // le + template ::value, void>::type> + XSIMD_INLINE batch_bool le(batch const& self, batch const& other, requires_arch) noexcept + { + return detail::compare_int_avx512bw(self, other); + } + + // lt + template ::value, void>::type> + XSIMD_INLINE batch_bool lt(batch const& self, batch const& other, requires_arch) noexcept + { + return detail::compare_int_avx512bw(self, other); + } + + // max + template ::value, void>::type> + XSIMD_INLINE batch max(batch const& self, batch const& other, requires_arch) noexcept + { + if (std::is_signed::value) + { + XSIMD_IF_CONSTEXPR(sizeof(T) == 1) + { + return _mm512_max_epi8(self, other); + } + else XSIMD_IF_CONSTEXPR(sizeof(T) == 2) + { + return _mm512_max_epi16(self, other); + } + else + { + return max(self, other, avx512dq {}); + } + } + else + { + XSIMD_IF_CONSTEXPR(sizeof(T) == 1) + { + return _mm512_max_epu8(self, other); + } + else XSIMD_IF_CONSTEXPR(sizeof(T) == 2) + { + return 
_mm512_max_epu16(self, other); + } + else + { + return max(self, other, avx512dq {}); + } + } + } + + // min + template ::value, void>::type> + XSIMD_INLINE batch min(batch const& self, batch const& other, requires_arch) noexcept + { + if (std::is_signed::value) + { + XSIMD_IF_CONSTEXPR(sizeof(T) == 1) + { + return _mm512_min_epi8(self, other); + } + else XSIMD_IF_CONSTEXPR(sizeof(T) == 2) + { + return _mm512_min_epi16(self, other); + } + else + { + return min(self, other, avx512dq {}); + } + } + else + { + XSIMD_IF_CONSTEXPR(sizeof(T) == 1) + { + return _mm512_min_epu8(self, other); + } + else XSIMD_IF_CONSTEXPR(sizeof(T) == 2) + { + return _mm512_min_epu16(self, other); + } + else + { + return min(self, other, avx512dq {}); + } + } + } + + // mul + template ::value, void>::type> + XSIMD_INLINE batch mul(batch const& self, batch const& other, requires_arch) noexcept + { + XSIMD_IF_CONSTEXPR(sizeof(T) == 1) + { + __m512i upper = _mm512_and_si512(_mm512_mullo_epi16(self, other), _mm512_srli_epi16(_mm512_set1_epi16(-1), 8)); + __m512i lower = _mm512_slli_epi16(_mm512_mullo_epi16(_mm512_srli_epi16(self, 8), _mm512_srli_epi16(other, 8)), 8); + return _mm512_or_si512(upper, lower); + } + else XSIMD_IF_CONSTEXPR(sizeof(T) == 2) + { + return _mm512_mullo_epi16(self, other); + } + else + { + return mul(self, other, avx512dq {}); + } + } + + // neq + template ::value, void>::type> + XSIMD_INLINE batch_bool neq(batch const& self, batch const& other, requires_arch) noexcept + { + return detail::compare_int_avx512bw(self, other); + } + + // rotate_left + template + XSIMD_INLINE batch rotate_left(batch const& self, requires_arch) noexcept + { + return _mm512_alignr_epi8(self, self, N); + } + template + XSIMD_INLINE batch rotate_left(batch const& self, requires_arch) noexcept + { + return bitwise_cast(rotate_left(bitwise_cast(self), avx2 {})); + } + + // sadd + template ::value, void>::type> + XSIMD_INLINE batch sadd(batch const& self, batch const& other, requires_arch) noexcept + { + if (std::is_signed::value) + { + XSIMD_IF_CONSTEXPR(sizeof(T) == 1) + { + return _mm512_adds_epi8(self, other); + } + else XSIMD_IF_CONSTEXPR(sizeof(T) == 2) + { + return _mm512_adds_epi16(self, other); + } + else + { + return sadd(self, other, avx512dq {}); + } + } + else + { + XSIMD_IF_CONSTEXPR(sizeof(T) == 1) + { + return _mm512_adds_epu8(self, other); + } + else XSIMD_IF_CONSTEXPR(sizeof(T) == 2) + { + return _mm512_adds_epu16(self, other); + } + else + { + return sadd(self, other, avx512dq {}); + } + } + } + + // select + template ::value, void>::type> + XSIMD_INLINE batch select(batch_bool const& cond, batch const& true_br, batch const& false_br, requires_arch) noexcept + { + XSIMD_IF_CONSTEXPR(sizeof(T) == 1) + { + return _mm512_mask_blend_epi8(cond, false_br.data, true_br.data); + } + else XSIMD_IF_CONSTEXPR(sizeof(T) == 2) + { + return _mm512_mask_blend_epi16(cond, false_br.data, true_br.data); + } + else + { + return select(cond, true_br, false_br, avx512dq {}); + } + } + + // slide_left + namespace detail + { + template + constexpr std::array make_slide_perm_hi(::xsimd::detail::index_sequence) + { + return { (Is == 0 ? 8 : Is - 1)... }; + } + + template + constexpr std::array make_slide_left_pattern(::xsimd::detail::index_sequence) + { + return { (Is >= N ? Is - N : 0)... }; + } + template + constexpr std::array make_slide_left_mask(::xsimd::detail::index_sequence) + { + return { (Is >= N ? 0xFFFF : 0x0000)... 
}; + } + } + + template + XSIMD_INLINE batch slide_left(batch const& x, requires_arch) noexcept + { + constexpr unsigned BitCount = N * 8; + if (BitCount == 0) + { + return x; + } + if (BitCount >= 512) + { + return batch(T(0)); + } + batch xx; + if (N & 1) + { + alignas(A::alignment()) uint64_t buffer[8]; + _mm512_store_epi64(&buffer[0], x); + for (int i = 7; i > 0; --i) + buffer[i] = (buffer[i] << 8) | (buffer[i - 1] >> 56); + buffer[0] = buffer[0] << 8; + xx = _mm512_load_epi64(&buffer[0]); + + alignas(A::alignment()) auto slide_perm = detail::make_slide_perm_hi(::xsimd::detail::make_index_sequence<512 / 64>()); + __m512i xl = _mm512_slli_epi64(x, 8); + __m512i xr = _mm512_srli_epi64(x, 56); + xr = _mm512_permutex2var_epi64(xr, _mm512_load_epi64(slide_perm.data()), _mm512_setzero_si512()); + xx = _mm512_or_si512(xr, xl); + if (N == 1) + return xx; + } + else + { + xx = x; + } + alignas(A::alignment()) auto slide_pattern = detail::make_slide_left_pattern(::xsimd::detail::make_index_sequence<512 / 16>()); + alignas(A::alignment()) auto slide_mask = detail::make_slide_left_mask(::xsimd::detail::make_index_sequence<512 / 16>()); + return _mm512_and_si512(_mm512_permutexvar_epi16(_mm512_load_epi32(slide_pattern.data()), xx), _mm512_load_epi32(slide_mask.data())); + } + + // slide_right + namespace detail + { + template + constexpr std::array make_slide_perm_low(::xsimd::detail::index_sequence) + { + return { (Is + 1)... }; + } + + template + constexpr std::array make_slide_right_pattern(::xsimd::detail::index_sequence) + { + return { (Is < (32 - N) ? Is + N : 0)... }; + } + template + constexpr std::array make_slide_right_mask(::xsimd::detail::index_sequence) + { + return { (Is < 32 - N ? 0xFFFF : 0x0000)... }; + } + } + template + XSIMD_INLINE batch slide_right(batch const& x, requires_arch) noexcept + { + constexpr unsigned BitCount = N * 8; + if (BitCount == 0) + { + return x; + } + if (BitCount >= 512) + { + return batch(T(0)); + } + batch xx; + if (N & 1) + { + alignas(A::alignment()) auto slide_perm = detail::make_slide_perm_low(::xsimd::detail::make_index_sequence<512 / 64>()); + __m512i xr = _mm512_srli_epi64(x, 8); + __m512i xl = _mm512_slli_epi64(x, 56); + xl = _mm512_permutex2var_epi64(xl, _mm512_load_epi64(slide_perm.data()), _mm512_setzero_si512()); + xx = _mm512_or_si512(xr, xl); + if (N == 1) + return xx; + } + else + { + xx = x; + } + alignas(A::alignment()) auto slide_pattern = detail::make_slide_right_pattern(::xsimd::detail::make_index_sequence<512 / 16>()); + alignas(A::alignment()) auto slide_mask = detail::make_slide_right_mask(::xsimd::detail::make_index_sequence<512 / 16>()); + return _mm512_and_si512(_mm512_permutexvar_epi16(_mm512_load_epi32(slide_pattern.data()), xx), _mm512_load_epi32(slide_mask.data())); + } + + // ssub + template ::value, void>::type> + XSIMD_INLINE batch ssub(batch const& self, batch const& other, requires_arch) noexcept + { + if (std::is_signed::value) + { + XSIMD_IF_CONSTEXPR(sizeof(T) == 1) + { + return _mm512_subs_epi8(self, other); + } + else XSIMD_IF_CONSTEXPR(sizeof(T) == 2) + { + return _mm512_subs_epi16(self, other); + } + else + { + return ssub(self, other, avx512dq {}); + } + } + else + { + XSIMD_IF_CONSTEXPR(sizeof(T) == 1) + { + return _mm512_subs_epu8(self, other); + } + else XSIMD_IF_CONSTEXPR(sizeof(T) == 2) + { + return _mm512_subs_epu16(self, other); + } + else + { + return ssub(self, other, avx512dq {}); + } + } + } + + // sub + template ::value, void>::type> + XSIMD_INLINE batch sub(batch const& self, batch const& other, 
requires_arch) noexcept + { + XSIMD_IF_CONSTEXPR(sizeof(T) == 1) + { + return _mm512_sub_epi8(self, other); + } + else XSIMD_IF_CONSTEXPR(sizeof(T) == 2) + { + return _mm512_sub_epi16(self, other); + } + else + { + return sub(self, other, avx512dq {}); + } + } + + // swizzle (dynamic version) + template + XSIMD_INLINE batch swizzle(batch const& self, batch mask, requires_arch) noexcept + { + return _mm512_permutexvar_epi16(mask, self); + } + + template + XSIMD_INLINE batch swizzle(batch const& self, batch mask, requires_arch) noexcept + { + return bitwise_cast(swizzle(bitwise_cast(self), mask, avx512bw {})); + } + + template + XSIMD_INLINE batch swizzle(batch const& self, batch mask, requires_arch) noexcept + { + return _mm512_shuffle_epi8(self, mask); + } + + template + XSIMD_INLINE batch swizzle(batch const& self, batch mask, requires_arch) noexcept + { + return bitwise_cast(swizzle(bitwise_cast(self), mask, avx512bw {})); + } + + // swizzle (static version) + template + XSIMD_INLINE batch swizzle(batch const& self, batch_constant mask, requires_arch) noexcept + { + return swizzle(self, mask.as_batch(), avx512bw {}); + } + + template + XSIMD_INLINE batch swizzle(batch const& self, batch_constant mask, requires_arch) noexcept + { + return swizzle(self, mask.as_batch(), avx512bw {}); + } + + template + XSIMD_INLINE batch swizzle(batch const& self, batch_constant mask, requires_arch) noexcept + { + return swizzle(self, mask.as_batch(), avx512bw {}); + } + + template + XSIMD_INLINE batch swizzle(batch const& self, batch_constant mask, requires_arch) noexcept + { + return swizzle(self, mask.as_batch(), avx512bw {}); + } + + // zip_hi + template ::value, void>::type> + XSIMD_INLINE batch zip_hi(batch const& self, batch const& other, requires_arch) noexcept + { + __m512i lo, hi; + XSIMD_IF_CONSTEXPR(sizeof(T) == 1) + { + lo = _mm512_unpacklo_epi8(self, other); + hi = _mm512_unpackhi_epi8(self, other); + } + else XSIMD_IF_CONSTEXPR(sizeof(T) == 2) + { + lo = _mm512_unpacklo_epi16(self, other); + hi = _mm512_unpackhi_epi16(self, other); + } + else + { + return zip_hi(self, other, avx512f {}); + } + return _mm512_inserti32x4( + _mm512_inserti32x4( + _mm512_inserti32x4(hi, _mm512_extracti32x4_epi32(lo, 2), 0), + _mm512_extracti32x4_epi32(lo, 3), + 2), + _mm512_extracti32x4_epi32(hi, 2), + 1); + } + + // zip_lo + template ::value, void>::type> + XSIMD_INLINE batch zip_lo(batch const& self, batch const& other, requires_arch) noexcept + { + __m512i lo, hi; + XSIMD_IF_CONSTEXPR(sizeof(T) == 1) + { + lo = _mm512_unpacklo_epi8(self, other); + hi = _mm512_unpackhi_epi8(self, other); + } + else XSIMD_IF_CONSTEXPR(sizeof(T) == 2) + { + lo = _mm512_unpacklo_epi16(self, other); + hi = _mm512_unpackhi_epi16(self, other); + } + else + { + return zip_lo(self, other, avx512f {}); + } + return _mm512_inserti32x4( + _mm512_inserti32x4( + _mm512_inserti32x4(lo, _mm512_extracti32x4_epi32(hi, 0), 1), + _mm512_extracti32x4_epi32(hi, 1), + 3), + _mm512_extracti32x4_epi32(lo, 1), + 2); + } + } +} + +#endif diff --git a/include/onnxruntime/xsimd/arch/xsimd_avx512cd.hpp b/include/onnxruntime/xsimd/arch/xsimd_avx512cd.hpp new file mode 100644 index 0000000000000..95f3f1df8f6de --- /dev/null +++ b/include/onnxruntime/xsimd/arch/xsimd_avx512cd.hpp @@ -0,0 +1,28 @@ +/*************************************************************************** + * Copyright (c) Johan Mabille, Sylvain Corlay, Wolf Vollprecht and * + * Martin Renou * + * Copyright (c) QuantStack * + * Copyright (c) Serge Guelton * + * * + * Distributed under the 
terms of the BSD 3-Clause License. * + * * + * The full license is in the file LICENSE, distributed with this software. * + ****************************************************************************/ + +#ifndef XSIMD_AVX512CD_HPP +#define XSIMD_AVX512CD_HPP + +#include "../types/xsimd_avx512cd_register.hpp" + +namespace xsimd +{ + + namespace kernel + { + // Nothing there yet. + + } + +} + +#endif diff --git a/include/onnxruntime/xsimd/arch/xsimd_avx512dq.hpp b/include/onnxruntime/xsimd/arch/xsimd_avx512dq.hpp new file mode 100644 index 0000000000000..4788d19e94823 --- /dev/null +++ b/include/onnxruntime/xsimd/arch/xsimd_avx512dq.hpp @@ -0,0 +1,212 @@ +/*************************************************************************** + * Copyright (c) Johan Mabille, Sylvain Corlay, Wolf Vollprecht and * + * Martin Renou * + * Copyright (c) QuantStack * + * Copyright (c) Serge Guelton * + * * + * Distributed under the terms of the BSD 3-Clause License. * + * * + * The full license is in the file LICENSE, distributed with this software. * + ****************************************************************************/ + +#ifndef XSIMD_AVX512_DQHPP +#define XSIMD_AVX512_D_HPP + +#include "../types/xsimd_avx512dq_register.hpp" + +namespace xsimd +{ + + namespace kernel + { + using namespace types; + + // bitwise_and + template + XSIMD_INLINE batch bitwise_and(batch const& self, batch const& other, requires_arch) noexcept + { + return _mm512_and_ps(self, other); + } + template + XSIMD_INLINE batch bitwise_and(batch const& self, batch const& other, requires_arch) noexcept + { + return _mm512_and_pd(self, other); + } + + // bitwise_andnot + template + XSIMD_INLINE batch bitwise_andnot(batch const& self, batch const& other, requires_arch) noexcept + { + return _mm512_andnot_ps(other, self); + } + template + XSIMD_INLINE batch bitwise_andnot(batch const& self, batch const& other, requires_arch) noexcept + { + return _mm512_andnot_pd(other, self); + } + + // bitwise_not + template + XSIMD_INLINE batch bitwise_not(batch const& self, requires_arch) noexcept + { + return _mm512_xor_ps(self, _mm512_castsi512_ps(_mm512_set1_epi32(-1))); + } + template + XSIMD_INLINE batch bitwise_not(batch const& self, requires_arch) noexcept + { + return _mm512_xor_pd(self, _mm512_castsi512_pd(_mm512_set1_epi32(-1))); + } + + // bitwise_or + template + XSIMD_INLINE batch bitwise_or(batch const& self, batch const& other, requires_arch) noexcept + { + return _mm512_or_ps(self, other); + } + template + XSIMD_INLINE batch bitwise_or(batch const& self, batch const& other, requires_arch) noexcept + { + return _mm512_or_pd(self, other); + } + + template + XSIMD_INLINE batch_bool bitwise_or(batch_bool const& self, batch_bool const& other, requires_arch) noexcept + { + using register_type = typename batch_bool::register_type; + return register_type(self.data | other.data); + } + + // bitwise_xor + template + XSIMD_INLINE batch bitwise_xor(batch const& self, batch const& other, requires_arch) noexcept + { + return _mm512_xor_ps(self, other); + } + template + XSIMD_INLINE batch bitwise_xor(batch const& self, batch const& other, requires_arch) noexcept + { + return _mm512_xor_pd(self, other); + } + + // haddp + template + XSIMD_INLINE batch haddp(batch const* row, requires_arch) noexcept + { + // The following folds over the vector once: + // tmp1 = [a0..8, b0..8] + // tmp2 = [a8..f, b8..f] +#define XSIMD_AVX512_HADDP_STEP1(I, a, b) \ + batch res##I; \ + { \ + auto tmp1 = _mm512_shuffle_f32x4(a, b, _MM_SHUFFLE(1, 0, 1, 0)); \ + auto 
tmp2 = _mm512_shuffle_f32x4(a, b, _MM_SHUFFLE(3, 2, 3, 2)); \ + res##I = _mm512_add_ps(tmp1, tmp2); \ + } + + XSIMD_AVX512_HADDP_STEP1(0, row[0], row[2]); + XSIMD_AVX512_HADDP_STEP1(1, row[4], row[6]); + XSIMD_AVX512_HADDP_STEP1(2, row[1], row[3]); + XSIMD_AVX512_HADDP_STEP1(3, row[5], row[7]); + XSIMD_AVX512_HADDP_STEP1(4, row[8], row[10]); + XSIMD_AVX512_HADDP_STEP1(5, row[12], row[14]); + XSIMD_AVX512_HADDP_STEP1(6, row[9], row[11]); + XSIMD_AVX512_HADDP_STEP1(7, row[13], row[15]); + +#undef XSIMD_AVX512_HADDP_STEP1 + + // The following flds the code and shuffles so that hadd_ps produces the correct result + // tmp1 = [a0..4, a8..12, b0..4, b8..12] (same for tmp3) + // tmp2 = [a5..8, a12..16, b5..8, b12..16] (same for tmp4) + // tmp5 = [r1[0], r1[2], r2[0], r2[2], r1[4], r1[6] ... +#define XSIMD_AVX512_HADDP_STEP2(I, a, b, c, d) \ + batch halfx##I; \ + { \ + auto tmp1 = _mm512_shuffle_f32x4(a, b, _MM_SHUFFLE(2, 0, 2, 0)); \ + auto tmp2 = _mm512_shuffle_f32x4(a, b, _MM_SHUFFLE(3, 1, 3, 1)); \ + \ + auto resx1 = _mm512_add_ps(tmp1, tmp2); \ + \ + auto tmp3 = _mm512_shuffle_f32x4(c, d, _MM_SHUFFLE(2, 0, 2, 0)); \ + auto tmp4 = _mm512_shuffle_f32x4(c, d, _MM_SHUFFLE(3, 1, 3, 1)); \ + \ + auto resx2 = _mm512_add_ps(tmp3, tmp4); \ + \ + auto tmp5 = _mm512_shuffle_ps(resx1, resx2, _MM_SHUFFLE(2, 0, 2, 0)); \ + auto tmp6 = _mm512_shuffle_ps(resx1, resx2, _MM_SHUFFLE(3, 1, 3, 1)); \ + \ + auto resx3 = _mm512_add_ps(tmp5, tmp6); \ + \ + halfx##I = _mm256_hadd_ps(_mm512_extractf32x8_ps(resx3, 0), \ + _mm512_extractf32x8_ps(resx3, 1)); \ + } + + XSIMD_AVX512_HADDP_STEP2(0, res0, res1, res2, res3); + XSIMD_AVX512_HADDP_STEP2(1, res4, res5, res6, res7); + +#undef XSIMD_AVX512_HADDP_STEP2 + + auto concat = _mm512_castps256_ps512(halfx0); + concat = _mm512_insertf32x8(concat, halfx1, 1); + return concat; + } + + // ldexp + template + XSIMD_INLINE batch ldexp(const batch& self, const batch, A>& other, requires_arch) noexcept + { + return _mm512_scalef_pd(self, _mm512_cvtepi64_pd(other)); + } + + // mul + template + XSIMD_INLINE batch mul(batch const& self, batch const& other, requires_arch) noexcept + { + return _mm512_mullo_epi64(self, other); + } + + template + XSIMD_INLINE batch mul(batch const& self, batch const& other, requires_arch) noexcept + { + return _mm512_mullo_epi64(self, other); + } + + // nearbyint_as_int + template + XSIMD_INLINE batch nearbyint_as_int(batch const& self, + requires_arch) noexcept + { + return _mm512_cvtpd_epi64(self); + } + + // reduce_add + template + XSIMD_INLINE float reduce_add(batch const& rhs, requires_arch) noexcept + { + __m256 tmp1 = _mm512_extractf32x8_ps(rhs, 1); + __m256 tmp2 = _mm512_extractf32x8_ps(rhs, 0); + __m256 res1 = _mm256_add_ps(tmp1, tmp2); + return reduce_add(batch(res1), avx2 {}); + } + + // convert + namespace detail + { + template + XSIMD_INLINE batch fast_cast(batch const& x, batch const&, requires_arch) noexcept + { + return _mm512_cvtepi64_pd(self); + } + + template + XSIMD_INLINE batch fast_cast(batch const& self, batch const&, requires_arch) noexcept + { + return _mm512_cvttpd_epi64(self); + } + + } + + } + +} + +#endif diff --git a/include/onnxruntime/xsimd/arch/xsimd_avx512er.hpp b/include/onnxruntime/xsimd/arch/xsimd_avx512er.hpp new file mode 100644 index 0000000000000..be02f9850b113 --- /dev/null +++ b/include/onnxruntime/xsimd/arch/xsimd_avx512er.hpp @@ -0,0 +1,20 @@ +/*************************************************************************** + * Copyright (c) Johan Mabille, Sylvain Corlay, Wolf Vollprecht and * + * Martin Renou * + 
* Copyright (c) QuantStack * + * Copyright (c) Serge Guelton * + * * + * Distributed under the terms of the BSD 3-Clause License. * + * * + * The full license is in the file LICENSE, distributed with this software. * + ****************************************************************************/ + +#ifndef XSIMD_AVX512ER_HPP +#define XSIMD_AVX512ER_HPP + +#include +#include + +#include "../types/xsimd_avx512er_register.hpp" + +#endif diff --git a/include/onnxruntime/xsimd/arch/xsimd_avx512f.hpp b/include/onnxruntime/xsimd/arch/xsimd_avx512f.hpp new file mode 100644 index 0000000000000..c2b485a30e3d3 --- /dev/null +++ b/include/onnxruntime/xsimd/arch/xsimd_avx512f.hpp @@ -0,0 +1,2167 @@ +/*************************************************************************** + * Copyright (c) Johan Mabille, Sylvain Corlay, Wolf Vollprecht and * + * Martin Renou * + * Copyright (c) QuantStack * + * Copyright (c) Serge Guelton * + * * + * Distributed under the terms of the BSD 3-Clause License. * + * * + * The full license is in the file LICENSE, distributed with this software. * + ****************************************************************************/ + +#ifndef XSIMD_AVX512F_HPP +#define XSIMD_AVX512F_HPP + +#include +#include +#include + +#include "../types/xsimd_avx512f_register.hpp" + +namespace xsimd +{ + + namespace kernel + { + using namespace types; + + namespace detail + { + XSIMD_INLINE void split_avx512(__m512 val, __m256& low, __m256& high) noexcept + { + low = _mm512_castps512_ps256(val); + high = _mm512_extractf32x8_ps(val, 1); + } + XSIMD_INLINE void split_avx512(__m512d val, __m256d& low, __m256d& high) noexcept + { + low = _mm512_castpd512_pd256(val); + high = _mm512_extractf64x4_pd(val, 1); + } + XSIMD_INLINE void split_avx512(__m512i val, __m256i& low, __m256i& high) noexcept + { + low = _mm512_castsi512_si256(val); + high = _mm512_extracti64x4_epi64(val, 1); + } + XSIMD_INLINE __m512i merge_avx(__m256i low, __m256i high) noexcept + { + return _mm512_inserti64x4(_mm512_castsi256_si512(low), high, 1); + } + XSIMD_INLINE __m512 merge_avx(__m256 low, __m256 high) noexcept + { + return _mm512_castpd_ps(_mm512_insertf64x4(_mm512_castpd256_pd512(_mm256_castps_pd(low)), _mm256_castps_pd(high), 1)); + } + XSIMD_INLINE __m512d merge_avx(__m256d low, __m256d high) noexcept + { + return _mm512_insertf64x4(_mm512_castpd256_pd512(low), high, 1); + } + template + __m512i fwd_to_avx(F f, __m512i self) + { + __m256i self_low, self_high; + split_avx512(self, self_low, self_high); + __m256i res_low = f(self_low); + __m256i res_high = f(self_high); + return merge_avx(res_low, res_high); + } + template + __m512i fwd_to_avx(F f, __m512i self, __m512i other) + { + __m256i self_low, self_high, other_low, other_high; + split_avx512(self, self_low, self_high); + split_avx512(other, other_low, other_high); + __m256i res_low = f(self_low, other_low); + __m256i res_high = f(self_high, other_high); + return merge_avx(res_low, res_high); + } + template + __m512i fwd_to_avx(F f, __m512i self, int32_t other) + { + __m256i self_low, self_high; + split_avx512(self, self_low, self_high); + __m256i res_low = f(self_low, other); + __m256i res_high = f(self_high, other); + return merge_avx(res_low, res_high); + } + } + namespace detail + { + + XSIMD_INLINE uint32_t morton(uint16_t x, uint16_t y) noexcept + { + + static const unsigned short MortonTable256[256] = { + 0x0000, 0x0001, 0x0004, 0x0005, 0x0010, 0x0011, 0x0014, 0x0015, + 0x0040, 0x0041, 0x0044, 0x0045, 0x0050, 0x0051, 0x0054, 0x0055, + 0x0100, 0x0101, 
0x0104, 0x0105, 0x0110, 0x0111, 0x0114, 0x0115, + 0x0140, 0x0141, 0x0144, 0x0145, 0x0150, 0x0151, 0x0154, 0x0155, + 0x0400, 0x0401, 0x0404, 0x0405, 0x0410, 0x0411, 0x0414, 0x0415, + 0x0440, 0x0441, 0x0444, 0x0445, 0x0450, 0x0451, 0x0454, 0x0455, + 0x0500, 0x0501, 0x0504, 0x0505, 0x0510, 0x0511, 0x0514, 0x0515, + 0x0540, 0x0541, 0x0544, 0x0545, 0x0550, 0x0551, 0x0554, 0x0555, + 0x1000, 0x1001, 0x1004, 0x1005, 0x1010, 0x1011, 0x1014, 0x1015, + 0x1040, 0x1041, 0x1044, 0x1045, 0x1050, 0x1051, 0x1054, 0x1055, + 0x1100, 0x1101, 0x1104, 0x1105, 0x1110, 0x1111, 0x1114, 0x1115, + 0x1140, 0x1141, 0x1144, 0x1145, 0x1150, 0x1151, 0x1154, 0x1155, + 0x1400, 0x1401, 0x1404, 0x1405, 0x1410, 0x1411, 0x1414, 0x1415, + 0x1440, 0x1441, 0x1444, 0x1445, 0x1450, 0x1451, 0x1454, 0x1455, + 0x1500, 0x1501, 0x1504, 0x1505, 0x1510, 0x1511, 0x1514, 0x1515, + 0x1540, 0x1541, 0x1544, 0x1545, 0x1550, 0x1551, 0x1554, 0x1555, + 0x4000, 0x4001, 0x4004, 0x4005, 0x4010, 0x4011, 0x4014, 0x4015, + 0x4040, 0x4041, 0x4044, 0x4045, 0x4050, 0x4051, 0x4054, 0x4055, + 0x4100, 0x4101, 0x4104, 0x4105, 0x4110, 0x4111, 0x4114, 0x4115, + 0x4140, 0x4141, 0x4144, 0x4145, 0x4150, 0x4151, 0x4154, 0x4155, + 0x4400, 0x4401, 0x4404, 0x4405, 0x4410, 0x4411, 0x4414, 0x4415, + 0x4440, 0x4441, 0x4444, 0x4445, 0x4450, 0x4451, 0x4454, 0x4455, + 0x4500, 0x4501, 0x4504, 0x4505, 0x4510, 0x4511, 0x4514, 0x4515, + 0x4540, 0x4541, 0x4544, 0x4545, 0x4550, 0x4551, 0x4554, 0x4555, + 0x5000, 0x5001, 0x5004, 0x5005, 0x5010, 0x5011, 0x5014, 0x5015, + 0x5040, 0x5041, 0x5044, 0x5045, 0x5050, 0x5051, 0x5054, 0x5055, + 0x5100, 0x5101, 0x5104, 0x5105, 0x5110, 0x5111, 0x5114, 0x5115, + 0x5140, 0x5141, 0x5144, 0x5145, 0x5150, 0x5151, 0x5154, 0x5155, + 0x5400, 0x5401, 0x5404, 0x5405, 0x5410, 0x5411, 0x5414, 0x5415, + 0x5440, 0x5441, 0x5444, 0x5445, 0x5450, 0x5451, 0x5454, 0x5455, + 0x5500, 0x5501, 0x5504, 0x5505, 0x5510, 0x5511, 0x5514, 0x5515, + 0x5540, 0x5541, 0x5544, 0x5545, 0x5550, 0x5551, 0x5554, 0x5555 + }; + + uint32_t z = MortonTable256[y >> 8] << 17 | MortonTable256[x >> 8] << 16 | MortonTable256[y & 0xFF] << 1 | MortonTable256[x & 0xFF]; + return z; + } + + template + XSIMD_INLINE batch_bool compare_int_avx512f(batch const& self, batch const& other) noexcept + { + using register_type = typename batch_bool::register_type; + if (std::is_signed::value) + { + XSIMD_IF_CONSTEXPR(sizeof(T) == 1) + { + // shifting to take sign into account + uint64_t mask_low0 = _mm512_cmp_epi32_mask((batch(self.data) & batch(0x000000FF)) << 24, + (batch(other.data) & batch(0x000000FF)) << 24, + Cmp); + uint64_t mask_low1 = _mm512_cmp_epi32_mask((batch(self.data) & batch(0x0000FF00)) << 16, + (batch(other.data) & batch(0x0000FF00)) << 16, + Cmp); + uint64_t mask_high0 = _mm512_cmp_epi32_mask((batch(self.data) & batch(0x00FF0000)) << 8, + (batch(other.data) & batch(0x00FF0000)) << 8, + Cmp); + uint64_t mask_high1 = _mm512_cmp_epi32_mask((batch(self.data) & batch(0xFF000000)), + (batch(other.data) & batch(0xFF000000)), + Cmp); + uint64_t mask = 0; + for (unsigned i = 0; i < 16; ++i) + { + mask |= (mask_low0 & (uint64_t(1) << i)) << (3 * i + 0); + mask |= (mask_low1 & (uint64_t(1) << i)) << (3 * i + 1); + mask |= (mask_high0 & (uint64_t(1) << i)) << (3 * i + 2); + mask |= (mask_high1 & (uint64_t(1) << i)) << (3 * i + 3); + } + return (register_type)mask; + } + else XSIMD_IF_CONSTEXPR(sizeof(T) == 2) + { + // shifting to take sign into account + uint16_t mask_low = _mm512_cmp_epi32_mask((batch(self.data) & batch(0x0000FFFF)) << 16, + (batch(other.data) & batch(0x0000FFFF)) << 16, + Cmp); 
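// Illustrative note (not part of the vendored xsimd sources): AVX-512F has no
// 8/16-bit compare-to-mask instructions (those require AVX512BW), so this
// helper emulates a 16-bit compare with two 32-bit compares -- one on the low
// half of each 32-bit lane, shifted left by 16 so the sign bit is honoured for
// signed Cmp, and one on the high half -- then bit-interleaves the two 16-lane
// masks so that bit i of the result corresponds to element i. A minimal scalar
// sketch of that interleave, matching the even/odd convention of the
// MortonTable256 lookup above:
//
//   uint32_t interleave16(uint16_t lo, uint16_t hi) {
//       uint32_t r = 0;
//       for (int i = 0; i < 16; ++i)
//           r |= ((lo >> i) & 1u) << (2 * i)        // even bits: elements 0, 2, 4, ...
//              | ((hi >> i) & 1u) << (2 * i + 1);   // odd bits: elements 1, 3, 5, ...
//       return r;
//   }
//   // e.g. interleave16(0b11, 0b01) == 0b0111, the same value as morton(3, 1).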
+ uint16_t mask_high = _mm512_cmp_epi32_mask((batch(self.data) & batch(0xFFFF0000)), + (batch(other.data) & batch(0xFFFF0000)), + Cmp); + return static_cast(morton(mask_low, mask_high)); + } + else XSIMD_IF_CONSTEXPR(sizeof(T) == 4) + { + return (register_type)_mm512_cmp_epi32_mask(self, other, Cmp); + } + else XSIMD_IF_CONSTEXPR(sizeof(T) == 8) + { + return (register_type)_mm512_cmp_epi64_mask(self, other, Cmp); + } + } + else + { + XSIMD_IF_CONSTEXPR(sizeof(T) == 1) + { + uint64_t mask_low0 = _mm512_cmp_epu32_mask((batch(self.data) & batch(0x000000FF)), (batch(other.data) & batch(0x000000FF)), Cmp); + uint64_t mask_low1 = _mm512_cmp_epu32_mask((batch(self.data) & batch(0x0000FF00)), (batch(other.data) & batch(0x0000FF00)), Cmp); + uint64_t mask_high0 = _mm512_cmp_epu32_mask((batch(self.data) & batch(0x00FF0000)), (batch(other.data) & batch(0x00FF0000)), Cmp); + uint64_t mask_high1 = _mm512_cmp_epu32_mask((batch(self.data) & batch(0xFF000000)), (batch(other.data) & batch(0xFF000000)), Cmp); + uint64_t mask = 0; + for (unsigned i = 0; i < 16; ++i) + { + mask |= (mask_low0 & (uint64_t(1) << i)) << (3 * i + 0); + mask |= (mask_low1 & (uint64_t(1) << i)) << (3 * i + 1); + mask |= (mask_high0 & (uint64_t(1) << i)) << (3 * i + 2); + mask |= (mask_high1 & (uint64_t(1) << i)) << (3 * i + 3); + } + return (register_type)mask; + } + else XSIMD_IF_CONSTEXPR(sizeof(T) == 2) + { + uint16_t mask_low = _mm512_cmp_epu32_mask((batch(self.data) & batch(0x0000FFFF)), (batch(other.data) & batch(0x0000FFFF)), Cmp); + uint16_t mask_high = _mm512_cmp_epu32_mask((batch(self.data) & batch(0xFFFF0000)), (batch(other.data) & batch(0xFFFF0000)), Cmp); + return static_cast(morton(mask_low, mask_high)); + } + else XSIMD_IF_CONSTEXPR(sizeof(T) == 4) + { + return (register_type)_mm512_cmp_epu32_mask(self, other, Cmp); + } + else XSIMD_IF_CONSTEXPR(sizeof(T) == 8) + { + return (register_type)_mm512_cmp_epu64_mask(self, other, Cmp); + } + } + } + } + + // abs + template + XSIMD_INLINE batch abs(batch const& self, requires_arch) noexcept + { + __m512 self_asf = (__m512)self; + __m512i self_asi = *reinterpret_cast<__m512i*>(&self_asf); + __m512i res_asi = _mm512_and_epi32(_mm512_set1_epi32(0x7FFFFFFF), self_asi); + return *reinterpret_cast<__m512*>(&res_asi); + } + template + XSIMD_INLINE batch abs(batch const& self, requires_arch) noexcept + { + __m512d self_asd = (__m512d)self; + __m512i self_asi = *reinterpret_cast<__m512i*>(&self_asd); + __m512i res_asi = _mm512_and_epi64(_mm512_set1_epi64(0x7FFFFFFFFFFFFFFF), + self_asi); + return *reinterpret_cast<__m512d*>(&res_asi); + } + template ::value, void>::type> + XSIMD_INLINE batch abs(batch const& self, requires_arch) noexcept + { + if (std::is_unsigned::value) + { + return self; + } + + XSIMD_IF_CONSTEXPR(sizeof(T) == 1) + { + return detail::fwd_to_avx([](__m256i s) noexcept + { return abs(batch(s)); }, + self); + } + else XSIMD_IF_CONSTEXPR(sizeof(T) == 2) + { + return detail::fwd_to_avx([](__m256i s) noexcept + { return abs(batch(s)); }, + self); + } + else XSIMD_IF_CONSTEXPR(sizeof(T) == 4) + { + return _mm512_abs_epi32(self); + } + else XSIMD_IF_CONSTEXPR(sizeof(T) == 8) + { + return _mm512_abs_epi64(self); + } + else + { + assert(false && "unsupported arch/op combination"); + return {}; + } + } + + // add + template ::value, void>::type> + XSIMD_INLINE batch add(batch const& self, batch const& other, requires_arch) noexcept + { + XSIMD_IF_CONSTEXPR(sizeof(T) == 1) + { + return detail::fwd_to_avx([](__m256i s, __m256i o) noexcept + { return add(batch(s), batch(o)); }, 
+ self, other); + } + else XSIMD_IF_CONSTEXPR(sizeof(T) == 2) + { + return detail::fwd_to_avx([](__m256i s, __m256i o) noexcept + { return add(batch(s), batch(o)); }, + self, other); + } + else XSIMD_IF_CONSTEXPR(sizeof(T) == 4) + { + return _mm512_add_epi32(self, other); + } + else XSIMD_IF_CONSTEXPR(sizeof(T) == 8) + { + return _mm512_add_epi64(self, other); + } + else + { + assert(false && "unsupported arch/op combination"); + return {}; + } + } + template + XSIMD_INLINE batch add(batch const& self, batch const& other, requires_arch) noexcept + { + return _mm512_add_ps(self, other); + } + template + XSIMD_INLINE batch add(batch const& self, batch const& other, requires_arch) noexcept + { + return _mm512_add_pd(self, other); + } + + // all + template + XSIMD_INLINE bool all(batch_bool const& self, requires_arch) noexcept + { + using register_type = typename batch_bool::register_type; + return self.data == register_type(-1); + } + + // any + template + XSIMD_INLINE bool any(batch_bool const& self, requires_arch) noexcept + { + using register_type = typename batch_bool::register_type; + return self.data != register_type(0); + } + + // batch_bool_cast + template + XSIMD_INLINE batch_bool batch_bool_cast(batch_bool const& self, batch_bool const&, requires_arch) noexcept + { + return self.data; + } + + // bitwise_and + template + XSIMD_INLINE batch bitwise_and(batch const& self, batch const& other, requires_arch) noexcept + { +#if defined(_MSC_VER) + return _mm512_and_ps(self, other); +#else + return _mm512_castsi512_ps(_mm512_and_si512(_mm512_castps_si512(self), _mm512_castps_si512(other))); +#endif + } + template + XSIMD_INLINE batch bitwise_and(batch const& self, batch const& other, requires_arch) noexcept + { + return _mm512_castsi512_pd(_mm512_and_si512(_mm512_castpd_si512(self), _mm512_castpd_si512(other))); + } + + template ::value, void>::type> + XSIMD_INLINE batch bitwise_and(batch const& self, batch const& other, requires_arch) noexcept + { + return _mm512_and_si512(self, other); + } + + template + XSIMD_INLINE batch_bool bitwise_and(batch_bool const& self, batch_bool const& other, requires_arch) noexcept + { + using register_type = typename batch_bool::register_type; + return register_type(self.data & other.data); + } + + // bitwise_andnot + template + XSIMD_INLINE batch bitwise_andnot(batch const& self, batch const& other, requires_arch) noexcept + { + return _mm512_castsi512_ps(_mm512_andnot_si512(_mm512_castps_si512(other), _mm512_castps_si512(self))); + } + template + XSIMD_INLINE batch bitwise_andnot(batch const& self, batch const& other, requires_arch) noexcept + { + return _mm512_castsi512_pd(_mm512_andnot_si512(_mm512_castpd_si512(other), _mm512_castpd_si512(self))); + } + + template ::value, void>::type> + XSIMD_INLINE batch bitwise_andnot(batch const& self, batch const& other, requires_arch) noexcept + { + return _mm512_andnot_si512(other, self); + } + + template + XSIMD_INLINE batch_bool bitwise_andnot(batch_bool const& self, batch_bool const& other, requires_arch) noexcept + { + using register_type = typename batch_bool::register_type; + return register_type(self.data & ~other.data); + } + + // bitwise_lshift + template ::value, void>::type> + XSIMD_INLINE batch bitwise_lshift(batch const& self, int32_t other, requires_arch) noexcept + { + XSIMD_IF_CONSTEXPR(sizeof(T) == 1) + { +#if defined(XSIMD_AVX512_SHIFT_INTRINSICS_IMM_ONLY) + __m512i tmp = _mm512_sllv_epi32(self, _mm512_set1_epi32(other)); +#else + __m512i tmp = _mm512_slli_epi32(self, other); +#endif + return 
_mm512_and_si512(_mm512_set1_epi8(0xFF << other), tmp); + } + else XSIMD_IF_CONSTEXPR(sizeof(T) == 2) + { + return detail::fwd_to_avx([](__m256i s, int32_t o) noexcept + { return bitwise_lshift(batch(s), o, avx2 {}); }, + self, other); +#if defined(XSIMD_AVX512_SHIFT_INTRINSICS_IMM_ONLY) + } + else XSIMD_IF_CONSTEXPR(sizeof(T) == 4) + { + return _mm512_sllv_epi32(self, _mm512_set1_epi32(other)); + } + else XSIMD_IF_CONSTEXPR(sizeof(T) == 8) + { + return _mm512_sllv_epi64(self, _mm512_set1_epi64(other)); +#else + } + else XSIMD_IF_CONSTEXPR(sizeof(T) == 4) + { + return _mm512_slli_epi32(self, other); + } + else XSIMD_IF_CONSTEXPR(sizeof(T) == 8) + { + return _mm512_slli_epi64(self, other); +#endif + } + else + { + assert(false && "unsupported arch/op combination"); + return {}; + } + } + + // bitwise_not + template ::value, void>::type> + XSIMD_INLINE batch bitwise_not(batch const& self, requires_arch) noexcept + { + return _mm512_xor_si512(self, _mm512_set1_epi32(-1)); + } + template + XSIMD_INLINE batch_bool bitwise_not(batch_bool const& self, requires_arch) noexcept + { + using register_type = typename batch_bool::register_type; + return register_type(~self.data); + } + + template + XSIMD_INLINE batch bitwise_not(batch const& self, requires_arch) noexcept + { + return _mm512_castsi512_ps(_mm512_xor_si512(_mm512_castps_si512(self), _mm512_set1_epi32(-1))); + } + template + XSIMD_INLINE batch bitwise_not(batch const& self, requires_arch) noexcept + { + return _mm512_castsi512_pd(_mm512_xor_si512(_mm512_castpd_si512(self), _mm512_set1_epi32(-1))); + } + + // bitwise_or + template + XSIMD_INLINE batch bitwise_or(batch const& self, batch const& other, requires_arch) noexcept + { + return _mm512_castsi512_ps(_mm512_or_si512(_mm512_castps_si512(self), _mm512_castps_si512(other))); + } + template + XSIMD_INLINE batch bitwise_or(batch const& self, batch const& other, requires_arch) noexcept + { + return _mm512_castsi512_pd(_mm512_or_si512(_mm512_castpd_si512(self), _mm512_castpd_si512(other))); + } + + template + XSIMD_INLINE batch_bool bitwise_or(batch_bool const& self, batch_bool const& other, requires_arch) noexcept + { + using register_type = typename batch_bool::register_type; + return register_type(self.data | other.data); + } + + template ::value, void>::type> + XSIMD_INLINE batch bitwise_or(batch const& self, batch const& other, requires_arch) noexcept + { + return _mm512_or_si512(self, other); + } + + // bitwise_rshift + template ::value, void>::type> + XSIMD_INLINE batch bitwise_rshift(batch const& self, int32_t other, requires_arch) noexcept + { + if (std::is_signed::value) + { +#if defined(XSIMD_AVX512_SHIFT_INTRINSICS_IMM_ONLY) + XSIMD_IF_CONSTEXPR(sizeof(T) == 4) + { + return _mm512_srav_epi32(self, _mm512_set1_epi32(other)); + } + else XSIMD_IF_CONSTEXPR(sizeof(T) == 8) + { + return _mm512_srav_epi64(self, _mm512_set1_epi64(other)); +#else + XSIMD_IF_CONSTEXPR(sizeof(T) == 4) + { + return _mm512_srai_epi32(self, other); + } + else XSIMD_IF_CONSTEXPR(sizeof(T) == 8) + { + return _mm512_srai_epi64(self, other); +#endif + } + else + { + return detail::fwd_to_avx([](__m256i s, int32_t o) noexcept + { return bitwise_rshift(batch(s), o, avx2 {}); }, + self, other); + } + } + else + { + XSIMD_IF_CONSTEXPR(sizeof(T) == 1) + { +#if defined(XSIMD_AVX512_SHIFT_INTRINSICS_IMM_ONLY) + __m512i tmp = _mm512_srlv_epi32(self, _mm512_set1_epi32(other)); +#else + __m512i tmp = _mm512_srli_epi32(self, other); +#endif + return _mm512_and_si512(_mm512_set1_epi8(0xFF >> other), tmp); +#if 
defined(XSIMD_AVX512_SHIFT_INTRINSICS_IMM_ONLY) + } + else XSIMD_IF_CONSTEXPR(sizeof(T) == 4) + { + return _mm512_srlv_epi32(self, _mm512_set1_epi32(other)); + } + else XSIMD_IF_CONSTEXPR(sizeof(T) == 8) + { + return _mm512_srlv_epi64(self, _mm512_set1_epi64(other)); +#else + } + else XSIMD_IF_CONSTEXPR(sizeof(T) == 4) + { + return _mm512_srli_epi32(self, other); + } + else XSIMD_IF_CONSTEXPR(sizeof(T) == 8) + { + return _mm512_srli_epi64(self, other); +#endif + } + else + { + return detail::fwd_to_avx([](__m256i s, int32_t o) noexcept + { return bitwise_rshift(batch(s), o, avx2 {}); }, + self, other); + } + } + } + + // bitwise_xor + template + XSIMD_INLINE batch bitwise_xor(batch const& self, batch const& other, requires_arch) noexcept + { + return _mm512_castsi512_ps(_mm512_xor_si512(_mm512_castps_si512(self), _mm512_castps_si512(other))); + } + template + XSIMD_INLINE batch bitwise_xor(batch const& self, batch const& other, requires_arch) noexcept + { + return _mm512_castsi512_pd(_mm512_xor_si512(_mm512_castpd_si512(self), _mm512_castpd_si512(other))); + } + + template + XSIMD_INLINE batch_bool bitwise_xor(batch_bool const& self, batch_bool const& other, requires_arch) noexcept + { + using register_type = typename batch_bool::register_type; + return register_type(self.data | other.data); + } + + template ::value, void>::type> + XSIMD_INLINE batch bitwise_xor(batch const& self, batch const& other, requires_arch) noexcept + { + return _mm512_xor_si512(self, other); + } + + // bitwise_cast + template ::value, void>::type> + XSIMD_INLINE batch bitwise_cast(batch const& self, batch const&, requires_arch) noexcept + { + return _mm512_castsi512_ps(self); + } + template ::value, void>::type> + XSIMD_INLINE batch bitwise_cast(batch const& self, batch const&, requires_arch) noexcept + { + return _mm512_castsi512_pd(self); + } + template ::type>::value, void>::type> + XSIMD_INLINE batch bitwise_cast(batch const& self, batch const&, requires_arch) noexcept + { + return batch(self.data); + } + template + XSIMD_INLINE batch bitwise_cast(batch const& self, batch const&, requires_arch) noexcept + { + return _mm512_castps_pd(self); + } + template ::value, void>::type> + XSIMD_INLINE batch bitwise_cast(batch const& self, batch const&, requires_arch) noexcept + { + return _mm512_castps_si512(self); + } + template + XSIMD_INLINE batch bitwise_cast(batch const& self, batch const&, requires_arch) noexcept + { + return _mm512_castpd_ps(self); + } + template ::value, void>::type> + XSIMD_INLINE batch bitwise_cast(batch const& self, batch const&, requires_arch) noexcept + { + return _mm512_castpd_si512(self); + } + + // broadcast + template ::value, void>::type> + XSIMD_INLINE batch broadcast(T val, requires_arch) noexcept + { + XSIMD_IF_CONSTEXPR(sizeof(T) == 1) + { + return _mm512_set1_epi8(val); + } + else XSIMD_IF_CONSTEXPR(sizeof(T) == 2) + { + return _mm512_set1_epi16(val); + } + else XSIMD_IF_CONSTEXPR(sizeof(T) == 4) + { + return _mm512_set1_epi32(val); + } + else XSIMD_IF_CONSTEXPR(sizeof(T) == 8) + { + return _mm512_set1_epi64(val); + } + else + { + assert(false && "unsupported"); + return {}; + } + } + template + XSIMD_INLINE batch broadcast(float val, requires_arch) noexcept + { + return _mm512_set1_ps(val); + } + template + batch XSIMD_INLINE broadcast(double val, requires_arch) noexcept + { + return _mm512_set1_pd(val); + } + + // ceil + template + XSIMD_INLINE batch ceil(batch const& self, requires_arch) noexcept + { + return _mm512_roundscale_ps(self, _MM_FROUND_TO_POS_INF); + } + template + 
XSIMD_INLINE batch ceil(batch const& self, requires_arch) noexcept + { + return _mm512_roundscale_pd(self, _MM_FROUND_TO_POS_INF); + } + + // compress + template + XSIMD_INLINE batch compress(batch const& self, batch_bool const& mask, requires_arch) noexcept + { + return _mm512_maskz_compress_ps(mask.mask(), self); + } + template + XSIMD_INLINE batch compress(batch const& self, batch_bool const& mask, requires_arch) noexcept + { + return _mm512_maskz_compress_pd(mask.mask(), self); + } + template + XSIMD_INLINE batch compress(batch const& self, batch_bool const& mask, requires_arch) noexcept + { + return _mm512_maskz_compress_epi32(mask.mask(), self); + } + template + XSIMD_INLINE batch compress(batch const& self, batch_bool const& mask, requires_arch) noexcept + { + return _mm512_maskz_compress_epi32(mask.mask(), self); + } + template + XSIMD_INLINE batch compress(batch const& self, batch_bool const& mask, requires_arch) noexcept + { + return _mm512_maskz_compress_epi64(mask.mask(), self); + } + template + XSIMD_INLINE batch compress(batch const& self, batch_bool const& mask, requires_arch) noexcept + { + return _mm512_maskz_compress_epi64(mask.mask(), self); + } + + // convert + namespace detail + { + template + XSIMD_INLINE batch fast_cast(batch const& self, batch const&, requires_arch) noexcept + { + return _mm512_cvtepi32_ps(self); + } + + template + XSIMD_INLINE batch fast_cast(batch const& self, batch const&, requires_arch) noexcept + { + return _mm512_cvttps_epi32(self); + } + + template + XSIMD_INLINE batch fast_cast(batch const& self, batch const&, requires_arch) noexcept + { + return _mm512_cvtepu32_ps(self); + } + + template + batch fast_cast(batch const& self, batch const&, requires_arch) + { + return _mm512_cvttps_epu32(self); + } + } + + namespace detail + { + // complex_low + template + XSIMD_INLINE batch complex_low(batch, A> const& self, requires_arch) noexcept + { + __m512i idx = _mm512_setr_epi32(0, 16, 1, 17, 2, 18, 3, 19, 4, 20, 5, 21, 6, 22, 7, 23); + return _mm512_permutex2var_ps(self.real(), idx, self.imag()); + } + template + XSIMD_INLINE batch complex_low(batch, A> const& self, requires_arch) noexcept + { + __m512i idx = _mm512_setr_epi64(0, 8, 1, 9, 2, 10, 3, 11); + return _mm512_permutex2var_pd(self.real(), idx, self.imag()); + } + + // complex_high + template + XSIMD_INLINE batch complex_high(batch, A> const& self, requires_arch) noexcept + { + __m512i idx = _mm512_setr_epi32(8, 24, 9, 25, 10, 26, 11, 27, 12, 28, 13, 29, 14, 30, 15, 31); + return _mm512_permutex2var_ps(self.real(), idx, self.imag()); + } + template + XSIMD_INLINE batch complex_high(batch, A> const& self, requires_arch) noexcept + { + __m512i idx = _mm512_setr_epi64(4, 12, 5, 13, 6, 14, 7, 15); + return _mm512_permutex2var_pd(self.real(), idx, self.imag()); + } + } + + // div + template + XSIMD_INLINE batch div(batch const& self, batch const& other, requires_arch) noexcept + { + return _mm512_div_ps(self, other); + } + template + XSIMD_INLINE batch div(batch const& self, batch const& other, requires_arch) noexcept + { + return _mm512_div_pd(self, other); + } + + // eq + template + XSIMD_INLINE batch_bool eq(batch const& self, batch const& other, requires_arch) noexcept + { + return _mm512_cmp_ps_mask(self, other, _CMP_EQ_OQ); + } + template + XSIMD_INLINE batch_bool eq(batch const& self, batch const& other, requires_arch) noexcept + { + return _mm512_cmp_pd_mask(self, other, _CMP_EQ_OQ); + } + + template ::value, void>::type> + XSIMD_INLINE batch_bool eq(batch const& self, batch const& other, 
requires_arch) noexcept + { + return detail::compare_int_avx512f(self, other); + } + template + XSIMD_INLINE batch_bool eq(batch_bool const& self, batch_bool const& other, requires_arch) noexcept + { + using register_type = typename batch_bool::register_type; + return register_type(~self.data ^ other.data); + } + + // expand + template + XSIMD_INLINE batch expand(batch const& self, batch_bool const& mask, requires_arch) noexcept + { + return _mm512_maskz_expand_ps(mask.mask(), self); + } + template + XSIMD_INLINE batch expand(batch const& self, batch_bool const& mask, requires_arch) noexcept + { + return _mm512_maskz_expand_pd(mask.mask(), self); + } + template + XSIMD_INLINE batch expand(batch const& self, batch_bool const& mask, requires_arch) noexcept + { + return _mm512_maskz_expand_epi32(mask.mask(), self); + } + template + XSIMD_INLINE batch expand(batch const& self, batch_bool const& mask, requires_arch) noexcept + { + return _mm512_maskz_expand_epi32(mask.mask(), self); + } + template + XSIMD_INLINE batch expand(batch const& self, batch_bool const& mask, requires_arch) noexcept + { + return _mm512_maskz_expand_epi64(mask.mask(), self); + } + template + XSIMD_INLINE batch expand(batch const& self, batch_bool const& mask, requires_arch) noexcept + { + return _mm512_maskz_expand_epi64(mask.mask(), self); + } + + // floor + template + XSIMD_INLINE batch floor(batch const& self, requires_arch) noexcept + { + return _mm512_roundscale_ps(self, _MM_FROUND_TO_NEG_INF); + } + template + XSIMD_INLINE batch floor(batch const& self, requires_arch) noexcept + { + return _mm512_roundscale_pd(self, _MM_FROUND_TO_NEG_INF); + } + + // fnma + template + XSIMD_INLINE batch fnma(batch const& x, batch const& y, batch const& z, requires_arch) noexcept + { + return _mm512_fnmadd_ps(x, y, z); + } + + template + XSIMD_INLINE batch fnma(batch const& x, batch const& y, batch const& z, requires_arch) noexcept + { + return _mm512_fnmadd_pd(x, y, z); + } + + // fma + template + XSIMD_INLINE batch fma(batch const& x, batch const& y, batch const& z, requires_arch) noexcept + { + return _mm512_fmadd_ps(x, y, z); + } + + template + XSIMD_INLINE batch fma(batch const& x, batch const& y, batch const& z, requires_arch) noexcept + { + return _mm512_fmadd_pd(x, y, z); + } + + // fms + template + XSIMD_INLINE batch fms(batch const& x, batch const& y, batch const& z, requires_arch) noexcept + { + return _mm512_fmsub_ps(x, y, z); + } + + template + XSIMD_INLINE batch fms(batch const& x, batch const& y, batch const& z, requires_arch) noexcept + { + return _mm512_fmsub_pd(x, y, z); + } + + // from bool + template + XSIMD_INLINE batch from_bool(batch_bool const& self, requires_arch) noexcept + { + return select(self, batch(1), batch(0)); + } + + // from_mask + template + XSIMD_INLINE batch_bool from_mask(batch_bool const&, uint64_t mask, requires_arch) noexcept + { + return static_cast::register_type>(mask); + } + + // gather + template = 0, detail::enable_sized_integral_t = 0> + XSIMD_INLINE batch gather(batch const&, T const* src, batch const& index, + kernel::requires_arch) noexcept + { + return _mm512_i32gather_epi32(index, static_cast(src), sizeof(T)); + } + + template = 0, detail::enable_sized_integral_t = 0> + XSIMD_INLINE batch gather(batch const&, T const* src, batch const& index, + kernel::requires_arch) noexcept + { + return _mm512_i64gather_epi64(index, static_cast(src), sizeof(T)); + } + + template = 0> + XSIMD_INLINE batch gather(batch const&, float const* src, + batch const& index, + kernel::requires_arch) 
noexcept + { + return _mm512_i32gather_ps(index, src, sizeof(float)); + } + + template = 0> + XSIMD_INLINE batch + gather(batch const&, double const* src, batch const& index, + kernel::requires_arch) noexcept + { + return _mm512_i64gather_pd(index, src, sizeof(double)); + } + + // gather: handmade conversions + template = 0> + XSIMD_INLINE batch gather(batch const&, double const* src, + batch const& index, + requires_arch) noexcept + { + const batch low(_mm512_i32gather_pd(_mm512_castsi512_si256(index.data), src, sizeof(double))); + const batch high(_mm512_i32gather_pd(_mm256_castpd_si256(_mm512_extractf64x4_pd(_mm512_castsi512_pd(index.data), 1)), src, sizeof(double))); + return detail::merge_avx(_mm512_cvtpd_ps(low.data), _mm512_cvtpd_ps(high.data)); + } + + template = 0> + XSIMD_INLINE batch gather(batch const&, double const* src, + batch const& index, + requires_arch) noexcept + { + const batch low(_mm512_i32gather_pd(_mm512_castsi512_si256(index.data), src, sizeof(double))); + const batch high(_mm512_i32gather_pd(_mm256_castpd_si256(_mm512_extractf64x4_pd(_mm512_castsi512_pd(index.data), 1)), src, sizeof(double))); + return detail::merge_avx(_mm512_cvtpd_epi32(low.data), _mm512_cvtpd_epi32(high.data)); + } + + // ge + template + XSIMD_INLINE batch_bool ge(batch const& self, batch const& other, requires_arch) noexcept + { + return _mm512_cmp_ps_mask(self, other, _CMP_GE_OQ); + } + template + XSIMD_INLINE batch_bool ge(batch const& self, batch const& other, requires_arch) noexcept + { + return _mm512_cmp_pd_mask(self, other, _CMP_GE_OQ); + } + template ::value, void>::type> + XSIMD_INLINE batch_bool ge(batch const& self, batch const& other, requires_arch) noexcept + { + return detail::compare_int_avx512f(self, other); + } + + // gt + template + XSIMD_INLINE batch_bool gt(batch const& self, batch const& other, requires_arch) noexcept + { + return _mm512_cmp_ps_mask(self, other, _CMP_GT_OQ); + } + template + XSIMD_INLINE batch_bool gt(batch const& self, batch const& other, requires_arch) noexcept + { + return _mm512_cmp_pd_mask(self, other, _CMP_GT_OQ); + } + template ::value, void>::type> + XSIMD_INLINE batch_bool gt(batch const& self, batch const& other, requires_arch) noexcept + { + return detail::compare_int_avx512f(self, other); + } + + // haddp + template + XSIMD_INLINE batch haddp(batch const* row, requires_arch) noexcept + { + // The following folds over the vector once: + // tmp1 = [a0..8, b0..8] + // tmp2 = [a8..f, b8..f] +#define XSIMD_AVX512_HADDP_STEP1(I, a, b) \ + batch res##I; \ + { \ + auto tmp1 = _mm512_shuffle_f32x4(a, b, _MM_SHUFFLE(1, 0, 1, 0)); \ + auto tmp2 = _mm512_shuffle_f32x4(a, b, _MM_SHUFFLE(3, 2, 3, 2)); \ + res##I = _mm512_add_ps(tmp1, tmp2); \ + } + + XSIMD_AVX512_HADDP_STEP1(0, row[0], row[2]); + XSIMD_AVX512_HADDP_STEP1(1, row[4], row[6]); + XSIMD_AVX512_HADDP_STEP1(2, row[1], row[3]); + XSIMD_AVX512_HADDP_STEP1(3, row[5], row[7]); + XSIMD_AVX512_HADDP_STEP1(4, row[8], row[10]); + XSIMD_AVX512_HADDP_STEP1(5, row[12], row[14]); + XSIMD_AVX512_HADDP_STEP1(6, row[9], row[11]); + XSIMD_AVX512_HADDP_STEP1(7, row[13], row[15]); + +#undef XSIMD_AVX512_HADDP_STEP1 + + // The following flds the code and shuffles so that hadd_ps produces the correct result + // tmp1 = [a0..4, a8..12, b0..4, b8..12] (same for tmp3) + // tmp2 = [a5..8, a12..16, b5..8, b12..16] (same for tmp4) + // tmp5 = [r1[0], r1[2], r2[0], r2[2], r1[4], r1[6] ... 
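// Illustrative note (not part of the vendored xsimd sources): haddp(row)
// returns a batch whose i-th lane holds the horizontal sum of row[i]. Rather
// than doing 16 separate reductions, the XSIMD_AVX512_HADDP_STEP macros in
// this function keep shuffling pairs of rows together and adding, so partial
// sums stay in full-width registers until a final _mm256_hadd_ps. A scalar
// reference of the intended result, assuming 16 rows of 16 floats:
//
//   void haddp_ref(const float row[16][16], float out[16]) {
//       for (int i = 0; i < 16; ++i) {
//           float s = 0.0f;
//           for (int j = 0; j < 16; ++j) s += row[i][j];
//           out[i] = s;
//       }
//   }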
+#define XSIMD_AVX512_HADDP_STEP2(I, a, b, c, d) \ + batch halfx##I; \ + { \ + auto tmp1 = _mm512_shuffle_f32x4(a, b, _MM_SHUFFLE(2, 0, 2, 0)); \ + auto tmp2 = _mm512_shuffle_f32x4(a, b, _MM_SHUFFLE(3, 1, 3, 1)); \ + \ + auto resx1 = _mm512_add_ps(tmp1, tmp2); \ + \ + auto tmp3 = _mm512_shuffle_f32x4(c, d, _MM_SHUFFLE(2, 0, 2, 0)); \ + auto tmp4 = _mm512_shuffle_f32x4(c, d, _MM_SHUFFLE(3, 1, 3, 1)); \ + \ + auto resx2 = _mm512_add_ps(tmp3, tmp4); \ + \ + auto tmp5 = _mm512_shuffle_ps(resx1, resx2, _MM_SHUFFLE(2, 0, 2, 0)); \ + auto tmp6 = _mm512_shuffle_ps(resx1, resx2, _MM_SHUFFLE(3, 1, 3, 1)); \ + \ + auto resx3 = _mm512_add_ps(tmp5, tmp6); \ + \ + halfx##I = _mm256_hadd_ps(_mm256_insertf128_ps(_mm256_castps128_ps256(_mm512_extractf32x4_ps(resx3, 0)), _mm512_extractf32x4_ps(resx3, 1), 1), \ + _mm256_insertf128_ps(_mm256_castps128_ps256(_mm512_extractf32x4_ps(resx3, 2)), _mm512_extractf32x4_ps(resx3, 3), 1)); \ + } + + XSIMD_AVX512_HADDP_STEP2(0, res0, res1, res2, res3); + XSIMD_AVX512_HADDP_STEP2(1, res4, res5, res6, res7); + +#undef XSIMD_AVX512_HADDP_STEP2 + + auto concat = _mm512_castps256_ps512(halfx0); + concat = _mm512_castpd_ps(_mm512_insertf64x4(_mm512_castps_pd(concat), _mm256_castps_pd(halfx1), 1)); + return concat; + } + + template + XSIMD_INLINE batch haddp(batch const* row, requires_arch) noexcept + { +#define step1(I, a, b) \ + batch res##I; \ + { \ + auto tmp1 = _mm512_shuffle_f64x2(a, b, _MM_SHUFFLE(1, 0, 1, 0)); \ + auto tmp2 = _mm512_shuffle_f64x2(a, b, _MM_SHUFFLE(3, 2, 3, 2)); \ + res##I = _mm512_add_pd(tmp1, tmp2); \ + } + + step1(1, row[0], row[2]); + step1(2, row[4], row[6]); + step1(3, row[1], row[3]); + step1(4, row[5], row[7]); + +#undef step1 + + auto tmp5 = _mm512_shuffle_f64x2(res1, res2, _MM_SHUFFLE(2, 0, 2, 0)); + auto tmp6 = _mm512_shuffle_f64x2(res1, res2, _MM_SHUFFLE(3, 1, 3, 1)); + + auto resx1 = _mm512_add_pd(tmp5, tmp6); + + auto tmp7 = _mm512_shuffle_f64x2(res3, res4, _MM_SHUFFLE(2, 0, 2, 0)); + auto tmp8 = _mm512_shuffle_f64x2(res3, res4, _MM_SHUFFLE(3, 1, 3, 1)); + + auto resx2 = _mm512_add_pd(tmp7, tmp8); + + auto tmpx = _mm512_shuffle_pd(resx1, resx2, 0b00000000); + auto tmpy = _mm512_shuffle_pd(resx1, resx2, 0b11111111); + + return _mm512_add_pd(tmpx, tmpy); + } + + // isnan + template + XSIMD_INLINE batch_bool isnan(batch const& self, requires_arch) noexcept + { + return _mm512_cmp_ps_mask(self, self, _CMP_UNORD_Q); + } + template + XSIMD_INLINE batch_bool isnan(batch const& self, requires_arch) noexcept + { + return _mm512_cmp_pd_mask(self, self, _CMP_UNORD_Q); + } + + // ldexp + template + XSIMD_INLINE batch ldexp(const batch& self, const batch, A>& other, requires_arch) noexcept + { + return _mm512_scalef_ps(self, _mm512_cvtepi32_ps(other)); + } + + template + XSIMD_INLINE batch ldexp(const batch& self, const batch, A>& other, requires_arch) noexcept + { + // FIXME: potential data loss here when converting other elements to + // int32 before converting them back to double. 
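// Illustrative note (not part of the vendored xsimd sources): ldexp(x, e)
// computes x * 2^e, and _mm512_scalef_pd does exactly that once the exponents
// are available as doubles. The int64 -> int32 narrowing flagged in the FIXME
// above only loses information for |e| > INT32_MAX, far beyond the exponent
// range an IEEE double can represent (roughly -1074..1023), so for meaningful
// inputs the kernel behaves like this scalar sketch:
//
//   #include <cmath>
//   double ldexp_ref(double x, long long e) {
//       return std::ldexp(x, static_cast<int>(e));
//   }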
+ __m512d adjusted_index = _mm512_cvtepi32_pd(_mm512_cvtepi64_epi32(other)); + return _mm512_scalef_pd(self, adjusted_index); + } + + // le + template + XSIMD_INLINE batch_bool le(batch const& self, batch const& other, requires_arch) noexcept + { + return _mm512_cmp_ps_mask(self, other, _CMP_LE_OQ); + } + template + XSIMD_INLINE batch_bool le(batch const& self, batch const& other, requires_arch) noexcept + { + return _mm512_cmp_pd_mask(self, other, _CMP_LE_OQ); + } + template ::value, void>::type> + XSIMD_INLINE batch_bool le(batch const& self, batch const& other, requires_arch) noexcept + { + return detail::compare_int_avx512f(self, other); + } + + // load_aligned + template ::value, void>::type> + XSIMD_INLINE batch load_aligned(T const* mem, convert, requires_arch) noexcept + { + return _mm512_load_si512((__m512i const*)mem); + } + template + XSIMD_INLINE batch load_aligned(float const* mem, convert, requires_arch) noexcept + { + return _mm512_load_ps(mem); + } + template + XSIMD_INLINE batch load_aligned(double const* mem, convert, requires_arch) noexcept + { + return _mm512_load_pd(mem); + } + + // load_complex + namespace detail + { + template + XSIMD_INLINE batch, A> load_complex(batch const& hi, batch const& lo, requires_arch) noexcept + { + __m512i real_idx = _mm512_setr_epi32(0, 2, 4, 6, 8, 10, 12, 14, 16, 18, 20, 22, 24, 26, 28, 30); + __m512i imag_idx = _mm512_setr_epi32(1, 3, 5, 7, 9, 11, 13, 15, 17, 19, 21, 23, 25, 27, 29, 31); + auto real = _mm512_permutex2var_ps(hi, real_idx, lo); + auto imag = _mm512_permutex2var_ps(hi, imag_idx, lo); + return { real, imag }; + } + template + XSIMD_INLINE batch, A> load_complex(batch const& hi, batch const& lo, requires_arch) noexcept + { + __m512i real_idx = _mm512_setr_epi64(0, 2, 4, 6, 8, 10, 12, 14); + __m512i imag_idx = _mm512_setr_epi64(1, 3, 5, 7, 9, 11, 13, 15); + auto real = _mm512_permutex2var_pd(hi, real_idx, lo); + auto imag = _mm512_permutex2var_pd(hi, imag_idx, lo); + return { real, imag }; + } + } + + // load_unaligned + template ::value, void>::type> + XSIMD_INLINE batch load_unaligned(T const* mem, convert, requires_arch) noexcept + { + return _mm512_loadu_si512((__m512i const*)mem); + } + template + XSIMD_INLINE batch load_unaligned(float const* mem, convert, requires_arch) noexcept + { + return _mm512_loadu_ps(mem); + } + template + XSIMD_INLINE batch load_unaligned(double const* mem, convert, requires_arch) noexcept + { + return _mm512_loadu_pd(mem); + } + + // lt + template + XSIMD_INLINE batch_bool lt(batch const& self, batch const& other, requires_arch) noexcept + { + return _mm512_cmp_ps_mask(self, other, _CMP_LT_OQ); + } + template + XSIMD_INLINE batch_bool lt(batch const& self, batch const& other, requires_arch) noexcept + { + return _mm512_cmp_pd_mask(self, other, _CMP_LT_OQ); + } + + template ::value, void>::type> + XSIMD_INLINE batch_bool lt(batch const& self, batch const& other, requires_arch) noexcept + { + return detail::compare_int_avx512f(self, other); + } + + // mask + template + XSIMD_INLINE uint64_t mask(batch_bool const& self, requires_arch) noexcept + { + return self.data; + } + + // max + template + XSIMD_INLINE batch max(batch const& self, batch const& other, requires_arch) noexcept + { + return _mm512_max_ps(self, other); + } + template + XSIMD_INLINE batch max(batch const& self, batch const& other, requires_arch) noexcept + { + return _mm512_max_pd(self, other); + } + template ::value, void>::type> + XSIMD_INLINE batch max(batch const& self, batch const& other, requires_arch) noexcept + { + if 
(std::is_signed::value) + { + XSIMD_IF_CONSTEXPR(sizeof(T) == 4) + { + return _mm512_max_epi32(self, other); + } + else XSIMD_IF_CONSTEXPR(sizeof(T) == 8) + { + return _mm512_max_epi64(self, other); + } + else + { + return detail::fwd_to_avx([](__m256i s, __m256i o) noexcept + { return max(batch(s), batch(o)); }, + self, other); + } + } + else + { + XSIMD_IF_CONSTEXPR(sizeof(T) == 4) + { + return _mm512_max_epu32(self, other); + } + else XSIMD_IF_CONSTEXPR(sizeof(T) == 8) + { + return _mm512_max_epu64(self, other); + } + else + { + return detail::fwd_to_avx([](__m256i s, __m256i o) noexcept + { return max(batch(s), batch(o)); }, + self, other); + } + } + } + + // min + template + XSIMD_INLINE batch min(batch const& self, batch const& other, requires_arch) noexcept + { + return _mm512_min_ps(self, other); + } + template + XSIMD_INLINE batch min(batch const& self, batch const& other, requires_arch) noexcept + { + return _mm512_min_pd(self, other); + } + template ::value, void>::type> + XSIMD_INLINE batch min(batch const& self, batch const& other, requires_arch) noexcept + { + if (std::is_signed::value) + { + XSIMD_IF_CONSTEXPR(sizeof(T) == 4) + { + return _mm512_min_epi32(self, other); + } + else XSIMD_IF_CONSTEXPR(sizeof(T) == 8) + { + return _mm512_min_epi64(self, other); + } + else + { + return detail::fwd_to_avx([](__m256i s, __m256i o) noexcept + { return min(batch(s), batch(o)); }, + self, other); + } + } + else + { + XSIMD_IF_CONSTEXPR(sizeof(T) == 4) + { + return _mm512_min_epu32(self, other); + } + else XSIMD_IF_CONSTEXPR(sizeof(T) == 8) + { + return _mm512_min_epu64(self, other); + } + else + { + return detail::fwd_to_avx([](__m256i s, __m256i o) noexcept + { return min(batch(s), batch(o)); }, + self, other); + } + } + } + + // mul + template + XSIMD_INLINE batch mul(batch const& self, batch const& other, requires_arch) noexcept + { + return _mm512_mul_ps(self, other); + } + template + XSIMD_INLINE batch mul(batch const& self, batch const& other, requires_arch) noexcept + { + return _mm512_mul_pd(self, other); + } + template ::value, void>::type> + XSIMD_INLINE batch mul(batch const& self, batch const& other, requires_arch) noexcept + { + XSIMD_IF_CONSTEXPR(sizeof(T) == 4) + { + return _mm512_mullo_epi32(self, other); + } + else + { + return detail::fwd_to_avx([](__m256i s, __m256i o) noexcept + { return mul(batch(s), batch(o)); }, + self, other); + } + } + + // nearbyint + template + XSIMD_INLINE batch nearbyint(batch const& self, requires_arch) noexcept + { + return _mm512_roundscale_round_ps(self, _MM_FROUND_TO_NEAREST_INT, _MM_FROUND_CUR_DIRECTION); + } + template + XSIMD_INLINE batch nearbyint(batch const& self, requires_arch) noexcept + { + return _mm512_roundscale_round_pd(self, _MM_FROUND_TO_NEAREST_INT, _MM_FROUND_CUR_DIRECTION); + } + + // nearbyint_as_int + template + XSIMD_INLINE batch nearbyint_as_int(batch const& self, + requires_arch) noexcept + { + return _mm512_cvtps_epi32(self); + } + + // neg + template + XSIMD_INLINE batch neg(batch const& self, requires_arch) noexcept + { + return 0 - self; + } + + // neq + template + XSIMD_INLINE batch_bool neq(batch const& self, batch const& other, requires_arch) noexcept + { + return _mm512_cmp_ps_mask(self, other, _CMP_NEQ_UQ); + } + template + XSIMD_INLINE batch_bool neq(batch const& self, batch const& other, requires_arch) noexcept + { + return _mm512_cmp_pd_mask(self, other, _CMP_NEQ_UQ); + } + template ::value, void>::type> + XSIMD_INLINE batch_bool neq(batch const& self, batch const& other, requires_arch) noexcept + { 
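// Illustrative note (not part of the vendored xsimd sources): on AVX-512 a
// batch_bool is a k-mask with one bit per lane, so "not equal" can be derived
// from the eq kernel above by flipping every lane's bit, which is what the
// next statement does. The same idea on a plain 16-lane mask:
//
//   uint16_t neq_from_eq(uint16_t eq_mask) {
//       return static_cast<uint16_t>(~eq_mask);   // complement each lane's bit
//   }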
+ return ~(self == other); + } + + template + XSIMD_INLINE batch_bool neq(batch_bool const& self, batch_bool const& other, requires_arch) noexcept + { + using register_type = typename batch_bool::register_type; + return register_type(self.data ^ other.data); + } + + // reciprocal + template + XSIMD_INLINE batch + reciprocal(batch const& self, + kernel::requires_arch) noexcept + { + return _mm512_rcp14_ps(self); + } + + template + XSIMD_INLINE batch + reciprocal(batch const& self, + kernel::requires_arch) noexcept + { + return _mm512_rcp14_pd(self); + } + + // reduce_add + template + XSIMD_INLINE float reduce_add(batch const& rhs, requires_arch) noexcept + { + __m128 tmp1 = _mm512_extractf32x4_ps(rhs, 0); + __m128 tmp2 = _mm512_extractf32x4_ps(rhs, 1); + __m128 tmp3 = _mm512_extractf32x4_ps(rhs, 2); + __m128 tmp4 = _mm512_extractf32x4_ps(rhs, 3); + __m128 res1 = _mm_add_ps(tmp1, tmp2); + __m128 res2 = _mm_add_ps(tmp3, tmp4); + __m128 res3 = _mm_add_ps(res1, res2); + return reduce_add(batch(res3), sse4_2 {}); + } + template + XSIMD_INLINE double reduce_add(batch const& rhs, requires_arch) noexcept + { + __m256d tmp1 = _mm512_extractf64x4_pd(rhs, 1); + __m256d tmp2 = _mm512_extractf64x4_pd(rhs, 0); + __m256d res1 = _mm256_add_pd(tmp1, tmp2); + return reduce_add(batch(res1), avx2 {}); + } + template ::value, void>::type> + XSIMD_INLINE T reduce_add(batch const& self, requires_arch) noexcept + { + __m256i low, high; + detail::split_avx512(self, low, high); + batch blow(low), bhigh(high); + return reduce_add(blow, avx2 {}) + reduce_add(bhigh, avx2 {}); + } + + // reduce_max + template ::type> + XSIMD_INLINE T reduce_max(batch const& self, requires_arch) noexcept + { + constexpr batch_constant mask; + batch step = _mm512_permutexvar_epi64(mask.as_batch(), self); + batch acc = max(self, step); + __m256i low = _mm512_castsi512_si256(acc); + return reduce_max(batch(low)); + } + + // reduce_min + template ::type> + XSIMD_INLINE T reduce_min(batch const& self, requires_arch) noexcept + { + constexpr batch_constant mask; + batch step = _mm512_permutexvar_epi64(mask.as_batch(), self); + batch acc = min(self, step); + __m256i low = _mm512_castsi512_si256(acc); + return reduce_min(batch(low)); + } + + // rsqrt + template + XSIMD_INLINE batch rsqrt(batch const& val, requires_arch) noexcept + { + return _mm512_rsqrt14_ps(val); + } + template + XSIMD_INLINE batch rsqrt(batch const& val, requires_arch) noexcept + { + return _mm512_rsqrt14_pd(val); + } + + // sadd + template ::value, void>::type> + XSIMD_INLINE batch sadd(batch const& self, batch const& other, requires_arch) noexcept + { + if (std::is_signed::value) + { + auto mask = other < 0; + auto self_pos_branch = min(std::numeric_limits::max() - other, self); + auto self_neg_branch = max(std::numeric_limits::min() - other, self); + return other + select(mask, self_neg_branch, self_pos_branch); + } + else + { + const auto diffmax = std::numeric_limits::max() - self; + const auto mindiff = min(diffmax, other); + return self + mindiff; + } + } + + // scatter + template ::value || std::is_same::value, void>::type> + XSIMD_INLINE void scatter(batch const& src, T* dst, + batch const& index, + kernel::requires_arch) noexcept + { + _mm512_i32scatter_epi32(dst, index, src, sizeof(T)); + } + + template ::value || std::is_same::value, void>::type> + XSIMD_INLINE void scatter(batch const& src, T* dst, + batch const& index, + kernel::requires_arch) noexcept + { + _mm512_i64scatter_epi64(dst, index, src, sizeof(T)); + } + + template + XSIMD_INLINE void scatter(batch 
const& src, float* dst, + batch const& index, + kernel::requires_arch) noexcept + { + _mm512_i32scatter_ps(dst, index, src, sizeof(float)); + } + + template + XSIMD_INLINE void scatter(batch const& src, double* dst, + batch const& index, + kernel::requires_arch) noexcept + { + _mm512_i64scatter_pd(dst, index, src, sizeof(double)); + } + + // select + template + XSIMD_INLINE batch select(batch_bool const& cond, batch const& true_br, batch const& false_br, requires_arch) noexcept + { + return _mm512_mask_blend_ps(cond, false_br, true_br); + } + template + XSIMD_INLINE batch select(batch_bool const& cond, batch const& true_br, batch const& false_br, requires_arch) noexcept + { + return _mm512_mask_blend_pd(cond, false_br, true_br); + } + + template ::value, void>::type> + XSIMD_INLINE batch select(batch_bool const& cond, batch const& true_br, batch const& false_br, requires_arch) noexcept + { + XSIMD_IF_CONSTEXPR(sizeof(T) == 1) + { + alignas(avx2::alignment()) uint8_t buffer[64]; + // FIXME: ultra inefficient + for (int i = 0; i < 64; ++i) + buffer[i] = cond.data & (1ull << i) ? 0xFF : 0; + __m256i cond_low = batch::load_aligned(&buffer[0]); + __m256i cond_hi = batch::load_aligned(&buffer[32]); + + __m256i true_low, true_hi; + detail::split_avx512(true_br, true_low, true_hi); + + __m256i false_low, false_hi; + detail::split_avx512(false_br, false_low, false_hi); + + __m256i res_low = select(batch_bool(cond_low), batch(true_low), batch(false_low), avx2 {}); + __m256i res_hi = select(batch_bool(cond_hi), batch(true_hi), batch(false_hi), avx2 {}); + return detail::merge_avx(res_low, res_hi); + } + else XSIMD_IF_CONSTEXPR(sizeof(T) == 2) + { + __m256i cond_low = _mm512_maskz_cvtepi32_epi16((uint64_t)cond.data & 0xFFFF, _mm512_set1_epi32(~0)); + __m256i cond_hi = _mm512_maskz_cvtepi32_epi16((uint64_t)cond.data >> 16, _mm512_set1_epi32(~0)); + + __m256i true_low, true_hi; + detail::split_avx512(true_br, true_low, true_hi); + + __m256i false_low, false_hi; + detail::split_avx512(false_br, false_low, false_hi); + + __m256i res_low = select(batch_bool(cond_low), batch(true_low), batch(false_low), avx2 {}); + __m256i res_hi = select(batch_bool(cond_hi), batch(true_hi), batch(false_hi), avx2 {}); + return detail::merge_avx(res_low, res_hi); + } + else XSIMD_IF_CONSTEXPR(sizeof(T) == 4) + { + return _mm512_mask_blend_epi32(cond, false_br, true_br); + } + else XSIMD_IF_CONSTEXPR(sizeof(T) == 8) + { + return _mm512_mask_blend_epi64(cond, false_br, true_br); + } + else + { + assert(false && "unsupported arch/type combination"); + return {}; + } + } + + template ::value, void>::type> + XSIMD_INLINE batch select(batch_bool_constant const&, batch const& true_br, batch const& false_br, requires_arch) noexcept + { + return select(batch_bool { Values... 
}, true_br, false_br, avx512f {}); + } + + namespace detail + { + template + using enable_signed_integer_t = typename std::enable_if::value && std::is_signed::value, + int>::type; + + template + using enable_unsigned_integer_t = typename std::enable_if::value && std::is_unsigned::value, + int>::type; + } + + // set + template + XSIMD_INLINE batch set(batch const&, requires_arch, float v0, float v1, float v2, float v3, float v4, float v5, float v6, float v7, float v8, float v9, float v10, float v11, float v12, float v13, float v14, float v15) noexcept + { + return _mm512_setr_ps(v0, v1, v2, v3, v4, v5, v6, v7, v8, v9, v10, v11, v12, v13, v14, v15); + } + + template + XSIMD_INLINE batch set(batch const&, requires_arch, double v0, double v1, double v2, double v3, double v4, double v5, double v6, double v7) noexcept + { + return _mm512_setr_pd(v0, v1, v2, v3, v4, v5, v6, v7); + } + template ::value, void>::type> + XSIMD_INLINE batch set(batch const&, requires_arch, T v0, T v1, T v2, T v3, T v4, T v5, T v6, T v7) noexcept + { + return _mm512_set_epi64(v7, v6, v5, v4, v3, v2, v1, v0); + } + template ::value, void>::type> + XSIMD_INLINE batch set(batch const&, requires_arch, T v0, T v1, T v2, T v3, T v4, T v5, T v6, T v7, + T v8, T v9, T v10, T v11, T v12, T v13, T v14, T v15) noexcept + { + return _mm512_setr_epi32(v0, v1, v2, v3, v4, v5, v6, v7, v8, v9, v10, v11, v12, v13, v14, v15); + } + template = 0> + XSIMD_INLINE batch set(batch const&, requires_arch, T v0, T v1, T v2, T v3, T v4, T v5, T v6, T v7, + T v8, T v9, T v10, T v11, T v12, T v13, T v14, T v15, + T v16, T v17, T v18, T v19, T v20, T v21, T v22, T v23, + T v24, T v25, T v26, T v27, T v28, T v29, T v30, T v31) noexcept + { +#if defined(__clang__) || __GNUC__ + return __extension__(__m512i)(__v32hi) { + v0, v1, v2, v3, v4, v5, v6, v7, v8, v9, v10, v11, v12, v13, v14, v15, + v16, v17, v18, v19, v20, v21, v22, v23, v24, v25, v26, v27, v28, v29, v30, v31 + }; +#else + return _mm512_set_epi16(v0, v1, v2, v3, v4, v5, v6, v7, v8, v9, v10, v11, v12, v13, v14, v15, + v16, v17, v18, v19, v20, v21, v22, v23, v24, v25, v26, v27, v28, v29, v30, v31); +#endif + } + + template = 0> + XSIMD_INLINE batch set(batch const&, requires_arch, T v0, T v1, T v2, T v3, T v4, T v5, T v6, T v7, + T v8, T v9, T v10, T v11, T v12, T v13, T v14, T v15, + T v16, T v17, T v18, T v19, T v20, T v21, T v22, T v23, + T v24, T v25, T v26, T v27, T v28, T v29, T v30, T v31) noexcept + { +#if defined(__clang__) || __GNUC__ + return __extension__(__m512i)(__v32hu) { + v0, v1, v2, v3, v4, v5, v6, v7, v8, v9, v10, v11, v12, v13, v14, v15, + v16, v17, v18, v19, v20, v21, v22, v23, v24, v25, v26, v27, v28, v29, v30, v31 + }; +#else + return _mm512_set_epi16(v0, v1, v2, v3, v4, v5, v6, v7, v8, v9, v10, v11, v12, v13, v14, v15, + v16, v17, v18, v19, v20, v21, v22, v23, v24, v25, v26, v27, v28, v29, v30, v31); +#endif + } + + template = 0> + XSIMD_INLINE batch set(batch const&, requires_arch, T v0, T v1, T v2, T v3, T v4, T v5, T v6, T v7, + T v8, T v9, T v10, T v11, T v12, T v13, T v14, T v15, + T v16, T v17, T v18, T v19, T v20, T v21, T v22, T v23, + T v24, T v25, T v26, T v27, T v28, T v29, T v30, T v31, + T v32, T v33, T v34, T v35, T v36, T v37, T v38, T v39, + T v40, T v41, T v42, T v43, T v44, T v45, T v46, T v47, + T v48, T v49, T v50, T v51, T v52, T v53, T v54, T v55, + T v56, T v57, T v58, T v59, T v60, T v61, T v62, T v63) noexcept + { + +#if defined(__clang__) || __GNUC__ + return __extension__(__m512i)(__v64qi) { + v0, v1, v2, v3, v4, v5, v6, v7, v8, v9, v10, v11, 
v12, v13, v14, v15, + v16, v17, v18, v19, v20, v21, v22, v23, v24, v25, v26, v27, v28, v29, v30, v31, + v32, v33, v34, v35, v36, v37, v38, v39, v40, v41, v42, v43, v44, v45, v46, v47, + v48, v49, v50, v51, v52, v53, v54, v55, v56, v57, v58, v59, v60, v61, v62, v63 + }; +#else + return _mm512_set_epi8(v0, v1, v2, v3, v4, v5, v6, v7, v8, v9, v10, v11, v12, v13, v14, v15, + v16, v17, v18, v19, v20, v21, v22, v23, v24, v25, v26, v27, v28, v29, v30, v31, + v32, v33, v34, v35, v36, v37, v38, v39, v40, v41, v42, v43, v44, v45, v46, v47, + v48, v49, v50, v51, v52, v53, v54, v55, v56, v57, v58, v59, v60, v61, v62, v63); +#endif + } + template = 0> + XSIMD_INLINE batch set(batch const&, requires_arch, T v0, T v1, T v2, T v3, T v4, T v5, T v6, T v7, + T v8, T v9, T v10, T v11, T v12, T v13, T v14, T v15, + T v16, T v17, T v18, T v19, T v20, T v21, T v22, T v23, + T v24, T v25, T v26, T v27, T v28, T v29, T v30, T v31, + T v32, T v33, T v34, T v35, T v36, T v37, T v38, T v39, + T v40, T v41, T v42, T v43, T v44, T v45, T v46, T v47, + T v48, T v49, T v50, T v51, T v52, T v53, T v54, T v55, + T v56, T v57, T v58, T v59, T v60, T v61, T v62, T v63) noexcept + { + +#if defined(__clang__) || __GNUC__ + return __extension__(__m512i)(__v64qu) { + v0, v1, v2, v3, v4, v5, v6, v7, v8, v9, v10, v11, v12, v13, v14, v15, + v16, v17, v18, v19, v20, v21, v22, v23, v24, v25, v26, v27, v28, v29, v30, v31, + v32, v33, v34, v35, v36, v37, v38, v39, v40, v41, v42, v43, v44, v45, v46, v47, + v48, v49, v50, v51, v52, v53, v54, v55, v56, v57, v58, v59, v60, v61, v62, v63 + }; +#else + return _mm512_set_epi8(v0, v1, v2, v3, v4, v5, v6, v7, v8, v9, v10, v11, v12, v13, v14, v15, + v16, v17, v18, v19, v20, v21, v22, v23, v24, v25, v26, v27, v28, v29, v30, v31, + v32, v33, v34, v35, v36, v37, v38, v39, v40, v41, v42, v43, v44, v45, v46, v47, + v48, v49, v50, v51, v52, v53, v54, v55, v56, v57, v58, v59, v60, v61, v62, v63); +#endif + } + + template + XSIMD_INLINE batch_bool set(batch_bool const&, requires_arch, Values... values) noexcept + { + static_assert(sizeof...(Values) == batch_bool::size, "consistent init"); + using register_type = typename batch_bool::register_type; + register_type r = 0; + unsigned shift = 0; + (void)std::initializer_list { (r |= register_type(values ? 1 : 0) << (shift++))... 
}; + return r; + } + + // shuffle + template + XSIMD_INLINE batch shuffle(batch const& x, batch const& y, + batch_constant mask, + requires_arch) noexcept + { + constexpr uint32_t smask = (I0 & 0x3) | ((I1 & 0x3) << 2) | ((I2 & 0x3) << 4) | ((I3 & 0x3) << 6); + + // shuffle within lane + if ((I4 == I0 + 4) && (I5 == I1 + 4) && (I6 == I2 + 4) && (I7 == I3 + 4) && (I8 == I0 + 8) && (I9 == I1 + 8) && (I10 == I2 + 8) && (I11 == I3 + 8) && (I12 == I0 + 12) && (I13 == I1 + 12) && (I14 == I2 + 12) && (I15 == I3 + 12) && I0 < 4 && I1 < 4 && I2 >= 16 && I2 < 20 && I3 >= 16 && I3 < 20) + return _mm512_shuffle_ps(x, y, smask); + + // shuffle within opposite lane + if ((I4 == I0 + 4) && (I5 == I1 + 4) && (I6 == I2 + 4) && (I7 == I3 + 4) && (I8 == I0 + 8) && (I9 == I1 + 8) && (I10 == I2 + 8) && (I11 == I3 + 8) && (I12 == I0 + 12) && (I13 == I1 + 12) && (I14 == I2 + 12) && (I15 == I3 + 12) && I2 < 4 && I3 < 4 && I0 >= 16 && I0 < 20 && I1 >= 16 && I1 < 20) + return _mm512_shuffle_ps(y, x, smask); + + return shuffle(x, y, mask, generic {}); + } + + template + XSIMD_INLINE batch shuffle(batch const& x, batch const& y, batch_constant mask, requires_arch) noexcept + { + constexpr uint32_t smask = (I0 & 0x1) | ((I1 & 0x1) << 1) | ((I2 & 0x1) << 2) | ((I3 & 0x1) << 3) | ((I4 & 0x1) << 4) | ((I5 & 0x1) << 5) | ((I6 & 0x1) << 6) | ((I7 & 0x1) << 7); + // shuffle within lane + if (I0 < 2 && I1 >= 8 && I1 < 10 && I2 >= 2 && I2 < 4 && I3 >= 10 && I3 < 12 && I4 >= 4 && I4 < 6 && I5 >= 12 && I5 < 14 && I6 >= 6 && I6 < 8 && I7 >= 14) + return _mm512_shuffle_pd(x, y, smask); + + // shuffle within opposite lane + if (I1 < 2 && I0 >= 8 && I0 < 10 && I3 >= 2 && I3 < 4 && I2 >= 10 && I2 < 12 && I5 >= 4 && I5 < 6 && I4 >= 12 && I4 < 14 && I7 >= 6 && I7 < 8 && I6 >= 14) + return _mm512_shuffle_pd(y, x, smask); + + return shuffle(x, y, mask, generic {}); + } + + // slide_left + template + XSIMD_INLINE batch slide_left(batch const&, requires_arch) noexcept + { + static_assert(N == 0xDEAD, "not implemented yet"); + return {}; + } + + // slide_right + template + XSIMD_INLINE batch slide_right(batch const&, requires_arch) noexcept + { + static_assert(N == 0xDEAD, "not implemented yet"); + return {}; + } + + // sqrt + template + XSIMD_INLINE batch sqrt(batch const& val, requires_arch) noexcept + { + return _mm512_sqrt_ps(val); + } + template + XSIMD_INLINE batch sqrt(batch const& val, requires_arch) noexcept + { + return _mm512_sqrt_pd(val); + } + + // ssub + template ::value, void>::type> + XSIMD_INLINE batch ssub(batch const& self, batch const& other, requires_arch) noexcept + { + if (std::is_signed::value) + { + return sadd(self, -other); + } + else + { + const auto diff = min(self, other); + return self - diff; + } + } + + // store + template + XSIMD_INLINE void store(batch_bool const& self, bool* mem, requires_arch) noexcept + { + using register_type = typename batch_bool::register_type; + constexpr auto size = batch_bool::size; + for (std::size_t i = 0; i < size; ++i) + mem[i] = self.data & (register_type(1) << i); + } + + // store_aligned + template ::value, void>::type> + XSIMD_INLINE void store_aligned(T* mem, batch const& self, requires_arch) noexcept + { + return _mm512_store_si512((__m512i*)mem, self); + } + template ::value, void>::type> + XSIMD_INLINE void store_aligned(T* mem, batch_bool const& self, requires_arch) noexcept + { + return _mm512_store_si512((__m512i*)mem, self); + } + template + XSIMD_INLINE void store_aligned(float* mem, batch const& self, requires_arch) noexcept + { + return _mm512_store_ps(mem, 
self); + } + template + XSIMD_INLINE void store_aligned(double* mem, batch const& self, requires_arch) noexcept + { + return _mm512_store_pd(mem, self); + } + + // store_unaligned + template ::value, void>::type> + XSIMD_INLINE void store_unaligned(T* mem, batch const& self, requires_arch) noexcept + { + return _mm512_storeu_si512((__m512i*)mem, self); + } + template ::value, void>::type> + XSIMD_INLINE void store_unaligned(T* mem, batch_bool const& self, requires_arch) noexcept + { + return _mm512_storeu_si512((__m512i*)mem, self); + } + template + XSIMD_INLINE void store_unaligned(float* mem, batch const& self, requires_arch) noexcept + { + return _mm512_storeu_ps(mem, self); + } + template + XSIMD_INLINE void store_unaligned(double* mem, batch const& self, requires_arch) noexcept + { + return _mm512_storeu_pd(mem, self); + } + + // sub + template ::value, void>::type> + XSIMD_INLINE batch sub(batch const& self, batch const& other, requires_arch) noexcept + { + XSIMD_IF_CONSTEXPR(sizeof(T) == 1) + { + return detail::fwd_to_avx([](__m256i s, __m256i o) noexcept + { return sub(batch(s), batch(o)); }, + self, other); + } + else XSIMD_IF_CONSTEXPR(sizeof(T) == 2) + { + return detail::fwd_to_avx([](__m256i s, __m256i o) noexcept + { return sub(batch(s), batch(o)); }, + self, other); + } + else XSIMD_IF_CONSTEXPR(sizeof(T) == 4) + { + return _mm512_sub_epi32(self, other); + } + else XSIMD_IF_CONSTEXPR(sizeof(T) == 8) + { + return _mm512_sub_epi64(self, other); + } + else + { + assert(false && "unsupported arch/op combination"); + return {}; + } + } + template + XSIMD_INLINE batch sub(batch const& self, batch const& other, requires_arch) noexcept + { + return _mm512_sub_ps(self, other); + } + template + XSIMD_INLINE batch sub(batch const& self, batch const& other, requires_arch) noexcept + { + return _mm512_sub_pd(self, other); + } + + // swizzle (dynamic version) + template + XSIMD_INLINE batch swizzle(batch const& self, batch mask, requires_arch) noexcept + { + return _mm512_permutexvar_ps(mask, self); + } + + template + XSIMD_INLINE batch swizzle(batch const& self, batch mask, requires_arch) noexcept + { + return _mm512_permutexvar_pd(mask, self); + } + + template + XSIMD_INLINE batch swizzle(batch const& self, batch mask, requires_arch) noexcept + { + return _mm512_permutexvar_epi64(mask, self); + } + + template + XSIMD_INLINE batch swizzle(batch const& self, batch mask, requires_arch) noexcept + { + return bitwise_cast(swizzle(bitwise_cast(self), mask, avx512f {})); + } + + template + XSIMD_INLINE batch swizzle(batch const& self, batch mask, requires_arch) noexcept + { + return _mm512_permutexvar_epi32(mask, self); + } + + template + XSIMD_INLINE batch swizzle(batch const& self, batch mask, requires_arch) noexcept + { + return bitwise_cast(swizzle(bitwise_cast(self), mask, avx512f {})); + } + + // swizzle (constant version) + template + XSIMD_INLINE batch swizzle(batch const& self, batch_constant mask, requires_arch) noexcept + { + return swizzle(self, mask.as_batch(), avx512f {}); + } + + template + XSIMD_INLINE batch swizzle(batch const& self, batch_constant mask, requires_arch) noexcept + { + return swizzle(self, mask.as_batch(), avx512f {}); + } + + template + XSIMD_INLINE batch swizzle(batch const& self, batch_constant mask, requires_arch) noexcept + { + return swizzle(self, mask.as_batch(), avx512f {}); + } + + template + XSIMD_INLINE batch swizzle(batch const& self, batch_constant mask, requires_arch) noexcept + { + return swizzle(self, mask.as_batch(), avx512f {}); + } + + template 
+ XSIMD_INLINE batch swizzle(batch const& self, batch_constant mask, requires_arch) noexcept + { + return swizzle(self, mask.as_batch(), avx512f {}); + } + + template + XSIMD_INLINE batch swizzle(batch const& self, batch_constant mask, requires_arch) noexcept + { + return swizzle(self, mask.as_batch(), avx512f {}); + } + + namespace detail + { + template + struct is_pair_of_contiguous_indices; + + template + struct is_pair_of_contiguous_indices : std::true_type + { + }; + + template + struct is_pair_of_contiguous_indices : std::conditional<(Idx0 % 2 == 0) && (Idx0 + 1 == Idx1), is_pair_of_contiguous_indices, std::false_type>::type + { + }; + + template + struct fold_batch_constant + { + using type = batch_constant; + }; + + } + + template ::value, void>::type> + XSIMD_INLINE batch swizzle(batch const& self, batch_constant, requires_arch) noexcept + { + constexpr typename detail::fold_batch_constant::type mask32; + return _mm512_permutexvar_epi32(static_cast>(mask32), self); + } + + template + XSIMD_INLINE batch + swizzle(batch const& self, batch_constant, requires_arch) noexcept + { + // FIXME: this sequence is very inefficient, but it's here to catch + // a pattern generated by detail::reduce from xsimd_generic_math.hpp. + // The whole pattern is actually decently folded by GCC and Clang, + // so bare with it. + constexpr batch_constant mask32; + auto tmp = _mm512_permutexvar_epi32(static_cast>(mask32), self); + + alignas(A::alignment()) uint16_t buffer[32]; + _mm512_store_si512((__m512i*)&buffer[0], tmp); + buffer[0] = buffer[1]; + return _mm512_load_si512(&buffer[0]); + } + + template + XSIMD_INLINE batch + swizzle(batch const& self, batch_constant mask, requires_arch) noexcept + { + return bitwise_cast(swizzle(bitwise_cast(self), mask, avx512f {})); + } + + // trunc + template + XSIMD_INLINE batch + trunc(batch const& self, requires_arch) noexcept + { + return _mm512_roundscale_round_ps(self, _MM_FROUND_TO_ZERO, _MM_FROUND_CUR_DIRECTION); + } + template + XSIMD_INLINE batch + trunc(batch const& self, requires_arch) noexcept + { + return _mm512_roundscale_round_pd(self, _MM_FROUND_TO_ZERO, _MM_FROUND_CUR_DIRECTION); + } + + // zip_hi + template ::value, void>::type> + XSIMD_INLINE batch + zip_hi(batch const& self, batch const& other, requires_arch) noexcept + { + __m512i lo, hi; + XSIMD_IF_CONSTEXPR(sizeof(T) == 1) + { + assert(false && "not implemented yet"); + return {}; + } + else XSIMD_IF_CONSTEXPR(sizeof(T) == 2) + { + assert(false && "not implemented yet"); + return {}; + } + else XSIMD_IF_CONSTEXPR(sizeof(T) == 4) + { + lo = _mm512_unpacklo_epi32(self, other); + hi = _mm512_unpackhi_epi32(self, other); + } + else XSIMD_IF_CONSTEXPR(sizeof(T) == 8) + { + lo = _mm512_unpacklo_epi64(self, other); + hi = _mm512_unpackhi_epi64(self, other); + } + else + { + assert(false && "unsupported arch/op combination"); + return {}; + } + return _mm512_inserti32x4( + _mm512_inserti32x4( + _mm512_inserti32x4(hi, _mm512_extracti32x4_epi32(lo, 2), 0), + _mm512_extracti32x4_epi32(lo, 3), + 2), + _mm512_extracti32x4_epi32(hi, 2), + 1); + } + template + XSIMD_INLINE batch + zip_hi(batch const& self, batch const& other, requires_arch) noexcept + { + auto lo = _mm512_unpacklo_ps(self, other); + auto hi = _mm512_unpackhi_ps(self, other); + return _mm512_insertf32x4( + _mm512_insertf32x4( + _mm512_insertf32x4(hi, _mm512_extractf32x4_ps(lo, 2), 0), + _mm512_extractf32x4_ps(lo, 3), + 2), + _mm512_extractf32x4_ps(hi, 2), + 1); + } + template + XSIMD_INLINE batch + zip_hi(batch const& self, batch const& other, 
requires_arch) noexcept + { + auto lo = _mm512_castpd_ps(_mm512_unpacklo_pd(self, other)); + auto hi = _mm512_castpd_ps(_mm512_unpackhi_pd(self, other)); + return _mm512_castps_pd(_mm512_insertf32x4( + _mm512_insertf32x4( + _mm512_insertf32x4(hi, _mm512_extractf32x4_ps(lo, 2), 0), + _mm512_extractf32x4_ps(lo, 3), + 2), + _mm512_extractf32x4_ps(hi, 2), + 1)); + } + + // zip_lo + template ::value, void>::type> + XSIMD_INLINE batch + zip_lo(batch const& self, batch const& other, requires_arch) noexcept + { + __m512i lo, hi; + XSIMD_IF_CONSTEXPR(sizeof(T) == 1) + { + assert(false && "not implemented yet"); + return {}; + } + else XSIMD_IF_CONSTEXPR(sizeof(T) == 2) + { + assert(false && "not implemented yet"); + return {}; + } + else XSIMD_IF_CONSTEXPR(sizeof(T) == 4) + { + lo = _mm512_unpacklo_epi32(self, other); + hi = _mm512_unpackhi_epi32(self, other); + } + else XSIMD_IF_CONSTEXPR(sizeof(T) == 8) + { + lo = _mm512_unpacklo_epi64(self, other); + hi = _mm512_unpackhi_epi64(self, other); + } + else + { + assert(false && "unsupported arch/op combination"); + return {}; + } + return _mm512_inserti32x4( + _mm512_inserti32x4( + _mm512_inserti32x4(lo, _mm512_extracti32x4_epi32(hi, 0), 1), + _mm512_extracti32x4_epi32(hi, 1), + 3), + _mm512_extracti32x4_epi32(lo, 1), + 2); + } + template + XSIMD_INLINE batch + zip_lo(batch const& self, batch const& other, requires_arch) noexcept + { + auto lo = _mm512_unpacklo_ps(self, other); + auto hi = _mm512_unpackhi_ps(self, other); + return _mm512_insertf32x4( + _mm512_insertf32x4( + _mm512_insertf32x4(lo, _mm512_extractf32x4_ps(hi, 0), 1), + _mm512_extractf32x4_ps(hi, 1), + 3), + _mm512_extractf32x4_ps(lo, 1), + 2); + } + template + XSIMD_INLINE batch + zip_lo(batch const& self, batch const& other, requires_arch) noexcept + { + auto lo = _mm512_castpd_ps(_mm512_unpacklo_pd(self, other)); + auto hi = _mm512_castpd_ps(_mm512_unpackhi_pd(self, other)); + return _mm512_castps_pd(_mm512_insertf32x4( + _mm512_insertf32x4( + _mm512_insertf32x4(lo, _mm512_extractf32x4_ps(hi, 0), 1), + _mm512_extractf32x4_ps(hi, 1), + 3), + _mm512_extractf32x4_ps(lo, 1), + 2)); + } + + } + +} + +#endif diff --git a/include/onnxruntime/xsimd/arch/xsimd_avx512ifma.hpp b/include/onnxruntime/xsimd/arch/xsimd_avx512ifma.hpp new file mode 100644 index 0000000000000..df382881b0b2e --- /dev/null +++ b/include/onnxruntime/xsimd/arch/xsimd_avx512ifma.hpp @@ -0,0 +1,20 @@ +/*************************************************************************** + * Copyright (c) Johan Mabille, Sylvain Corlay, Wolf Vollprecht and * + * Martin Renou * + * Copyright (c) QuantStack * + * Copyright (c) Serge Guelton * + * * + * Distributed under the terms of the BSD 3-Clause License. * + * * + * The full license is in the file LICENSE, distributed with this software. 
* + ****************************************************************************/ + +#ifndef XSIMD_AVX512IFMA_HPP +#define XSIMD_AVX512IFMA_HPP + +#include +#include + +#include "../types/xsimd_avx512ifma_register.hpp" + +#endif diff --git a/include/onnxruntime/xsimd/arch/xsimd_avx512pf.hpp b/include/onnxruntime/xsimd/arch/xsimd_avx512pf.hpp new file mode 100644 index 0000000000000..6265c91718fb0 --- /dev/null +++ b/include/onnxruntime/xsimd/arch/xsimd_avx512pf.hpp @@ -0,0 +1,20 @@ +/*************************************************************************** + * Copyright (c) Johan Mabille, Sylvain Corlay, Wolf Vollprecht and * + * Martin Renou * + * Copyright (c) QuantStack * + * Copyright (c) Serge Guelton * + * * + * Distributed under the terms of the BSD 3-Clause License. * + * * + * The full license is in the file LICENSE, distributed with this software. * + ****************************************************************************/ + +#ifndef XSIMD_AVX512PF_HPP +#define XSIMD_AVX512PF_HPP + +#include +#include + +#include "../types/xsimd_avx512pf_register.hpp" + +#endif diff --git a/include/onnxruntime/xsimd/arch/xsimd_avx512vbmi.hpp b/include/onnxruntime/xsimd/arch/xsimd_avx512vbmi.hpp new file mode 100644 index 0000000000000..df382881b0b2e --- /dev/null +++ b/include/onnxruntime/xsimd/arch/xsimd_avx512vbmi.hpp @@ -0,0 +1,20 @@ +/*************************************************************************** + * Copyright (c) Johan Mabille, Sylvain Corlay, Wolf Vollprecht and * + * Martin Renou * + * Copyright (c) QuantStack * + * Copyright (c) Serge Guelton * + * * + * Distributed under the terms of the BSD 3-Clause License. * + * * + * The full license is in the file LICENSE, distributed with this software. * + ****************************************************************************/ + +#ifndef XSIMD_AVX512VBMI_HPP +#define XSIMD_AVX512VBMI_HPP + +#include +#include + +#include "../types/xsimd_avx512vbmi_register.hpp" + +#endif diff --git a/include/onnxruntime/xsimd/arch/xsimd_avx512vnni_avx512bw.hpp b/include/onnxruntime/xsimd/arch/xsimd_avx512vnni_avx512bw.hpp new file mode 100644 index 0000000000000..b285623d02f69 --- /dev/null +++ b/include/onnxruntime/xsimd/arch/xsimd_avx512vnni_avx512bw.hpp @@ -0,0 +1,20 @@ +/*************************************************************************** + * Copyright (c) Johan Mabille, Sylvain Corlay, Wolf Vollprecht and * + * Martin Renou * + * Copyright (c) QuantStack * + * Copyright (c) Serge Guelton * + * * + * Distributed under the terms of the BSD 3-Clause License. * + * * + * The full license is in the file LICENSE, distributed with this software. * + ****************************************************************************/ + +#ifndef XSIMD_AVX512VNNI_AVX512_BW_HPP +#define XSIMD_AVX512VNNI_AVX512_BW_HPP + +#include +#include + +#include "../types/xsimd_avx512vnni_avx512bw_register.hpp" + +#endif diff --git a/include/onnxruntime/xsimd/arch/xsimd_avx512vnni_avx512vbmi.hpp b/include/onnxruntime/xsimd/arch/xsimd_avx512vnni_avx512vbmi.hpp new file mode 100644 index 0000000000000..a70d30fad5985 --- /dev/null +++ b/include/onnxruntime/xsimd/arch/xsimd_avx512vnni_avx512vbmi.hpp @@ -0,0 +1,20 @@ +/*************************************************************************** + * Copyright (c) Johan Mabille, Sylvain Corlay, Wolf Vollprecht and * + * Martin Renou * + * Copyright (c) QuantStack * + * Copyright (c) Serge Guelton * + * * + * Distributed under the terms of the BSD 3-Clause License. 
* + * * + * The full license is in the file LICENSE, distributed with this software. * + ****************************************************************************/ + +#ifndef XSIMD_AVX512VNNI_AVX512VBMI_HPP +#define XSIMD_AVX512VNNI_AVX512VBMI_HPP + +#include +#include + +#include "../types/xsimd_avx512vnni_avx512vbmi_register.hpp" + +#endif diff --git a/include/onnxruntime/xsimd/arch/xsimd_avxvnni.hpp b/include/onnxruntime/xsimd/arch/xsimd_avxvnni.hpp new file mode 100644 index 0000000000000..a97ba9296c516 --- /dev/null +++ b/include/onnxruntime/xsimd/arch/xsimd_avxvnni.hpp @@ -0,0 +1,20 @@ +/*************************************************************************** + * Copyright (c) Johan Mabille, Sylvain Corlay, Wolf Vollprecht and * + * Martin Renou * + * Copyright (c) QuantStack * + * Copyright (c) Serge Guelton * + * * + * Distributed under the terms of the BSD 3-Clause License. * + * * + * The full license is in the file LICENSE, distributed with this software. * + ****************************************************************************/ + +#ifndef XSIMD_AVXVNNI_HPP +#define XSIMD_AVXVNNI_HPP + +#include +#include + +#include "../types/xsimd_avxvnni_register.hpp" + +#endif diff --git a/include/onnxruntime/xsimd/arch/xsimd_constants.hpp b/include/onnxruntime/xsimd/arch/xsimd_constants.hpp new file mode 100644 index 0000000000000..51411d2877465 --- /dev/null +++ b/include/onnxruntime/xsimd/arch/xsimd_constants.hpp @@ -0,0 +1,391 @@ +/*************************************************************************** + * Copyright (c) Johan Mabille, Sylvain Corlay, Wolf Vollprecht and * + * Martin Renou * + * Copyright (c) QuantStack * + * Copyright (c) Serge Guelton * + * * + * Distributed under the terms of the BSD 3-Clause License. * + * * + * The full license is in the file LICENSE, distributed with this software. 
* + ****************************************************************************/ + +#ifndef XSIMD_NUMERICAL_CONSTANT_HPP +#define XSIMD_NUMERICAL_CONSTANT_HPP + +#include + +#include "../types/xsimd_utils.hpp" + +namespace xsimd +{ + + namespace constants + { + +#define XSIMD_DEFINE_CONSTANT(NAME, SINGLE, DOUBLE) \ + template \ + XSIMD_INLINE T NAME() noexcept \ + { \ + return T(NAME()); \ + } \ + template <> \ + XSIMD_INLINE float NAME() noexcept \ + { \ + return SINGLE; \ + } \ + template <> \ + XSIMD_INLINE double NAME() noexcept \ + { \ + return DOUBLE; \ + } + +#define XSIMD_DEFINE_CONSTANT_HEX(NAME, SINGLE, DOUBLE) \ + template \ + XSIMD_INLINE T NAME() noexcept \ + { \ + return T(NAME()); \ + } \ + template <> \ + XSIMD_INLINE float NAME() noexcept \ + { \ + return bit_cast((uint32_t)SINGLE); \ + } \ + template <> \ + XSIMD_INLINE double NAME() noexcept \ + { \ + return bit_cast((uint64_t)DOUBLE); \ + } + +// Under fast-math, GCC might replace signmask (minus zero) by zero +#if defined(__FAST_MATH__) && defined(__GNUC__) && !defined(__clang__) +#pragma GCC push_options +#pragma GCC optimize("signed-zeros") +#endif + XSIMD_DEFINE_CONSTANT(infinity, (std::numeric_limits::infinity()), (std::numeric_limits::infinity())) + XSIMD_DEFINE_CONSTANT(invlog_2, 1.442695040888963407359924681001892137426645954152986f, 1.442695040888963407359924681001892137426645954152986) + XSIMD_DEFINE_CONSTANT_HEX(invlog_2hi, 0x3fb8b000, 0x3ff7154765200000) + XSIMD_DEFINE_CONSTANT_HEX(invlog_2lo, 0xb9389ad4, 0x3de705fc2eefa200) + XSIMD_DEFINE_CONSTANT(invlog10_2, 3.32192809488736234787031942949f, 3.32192809488736234787031942949) + XSIMD_DEFINE_CONSTANT_HEX(invpi, 0x3ea2f983, 0x3fd45f306dc9c883) + XSIMD_DEFINE_CONSTANT(log_2, 0.6931471805599453094172321214581765680755001343602553f, 0.6931471805599453094172321214581765680755001343602553) + XSIMD_DEFINE_CONSTANT_HEX(log_2hi, 0x3f318000, 0x3fe62e42fee00000) + XSIMD_DEFINE_CONSTANT_HEX(log_2lo, 0xb95e8083, 0x3dea39ef35793c76) + XSIMD_DEFINE_CONSTANT_HEX(log10_2hi, 0x3e9a0000, 0x3fd3440000000000) + XSIMD_DEFINE_CONSTANT_HEX(log10_2lo, 0x39826a14, 0x3ed3509f79fef312) + XSIMD_DEFINE_CONSTANT_HEX(logeps, 0xc17f1402, 0xc04205966f2b4f12) + XSIMD_DEFINE_CONSTANT_HEX(logpi, 0x3f928682, 0x3ff250d048e7a1bd) + XSIMD_DEFINE_CONSTANT_HEX(logsqrt2pi, 0x3f6b3f8e, 0x3fed67f1c864beb5) + XSIMD_DEFINE_CONSTANT(maxflint, 16777216.0f, 9007199254740992.0) + XSIMD_DEFINE_CONSTANT(maxlog, 88.3762626647949f, 709.78271289338400) + XSIMD_DEFINE_CONSTANT(maxlog2, 127.0f, 1023.) + XSIMD_DEFINE_CONSTANT(maxlog10, 38.23080825805664f, 308.2547155599167) + XSIMD_DEFINE_CONSTANT_HEX(mediumpi, 0x43490fdb, 0x412921fb54442d18) + XSIMD_DEFINE_CONSTANT(minlog, -88.3762626647949f, -708.3964185322641) + XSIMD_DEFINE_CONSTANT(minlog2, -127.0f, -1023.) 
+ XSIMD_DEFINE_CONSTANT(minlog10, -37.89999771118164f, -308.2547155599167) + XSIMD_DEFINE_CONSTANT(minusinfinity, (-infinity()), (-infinity())) + XSIMD_DEFINE_CONSTANT_HEX(nan, 0xffffffff, 0xffffffffffffffff) + XSIMD_DEFINE_CONSTANT_HEX(oneosqrteps, 0x453504f3, 0x4190000000000000) + XSIMD_DEFINE_CONSTANT_HEX(oneotwoeps, 0x4a800000, 0x4320000000000000) + XSIMD_DEFINE_CONSTANT_HEX(pi, 0x40490fdb, 0x400921fb54442d18) + XSIMD_DEFINE_CONSTANT_HEX(pio_2lo, 0xb33bbd2e, 0x3c91a62633145c07) + XSIMD_DEFINE_CONSTANT_HEX(pio_4lo, 0xb2bbbd2e, 0x3c81a62633145c07) + XSIMD_DEFINE_CONSTANT_HEX(pio2, 0x3fc90fdb, 0x3ff921fb54442d18) + XSIMD_DEFINE_CONSTANT_HEX(pio2_1, 0x3fc90f80, 0x3ff921fb54400000) + XSIMD_DEFINE_CONSTANT_HEX(pio2_1t, 0x37354443, 0x3dd0b4611a626331) + XSIMD_DEFINE_CONSTANT_HEX(pio2_2, 0x37354400, 0x3dd0b4611a600000) + XSIMD_DEFINE_CONSTANT_HEX(pio2_2t, 0x2e85a308, 0x3ba3198a2e037073) + XSIMD_DEFINE_CONSTANT_HEX(pio2_3, 0x2e85a300, 0x3ba3198a2e000000) + XSIMD_DEFINE_CONSTANT_HEX(pio2_3t, 0x248d3132, 0x397b839a252049c1) + XSIMD_DEFINE_CONSTANT_HEX(pio4, 0x3f490fdb, 0x3fe921fb54442d18) + XSIMD_DEFINE_CONSTANT_HEX(signmask, 0x80000000, 0x8000000000000000) + XSIMD_DEFINE_CONSTANT(smallestposval, std::numeric_limits::min(), std::numeric_limits::min()) + XSIMD_DEFINE_CONSTANT_HEX(sqrt_2pi, 0x40206c99, 0x40040d931ff62704) + XSIMD_DEFINE_CONSTANT_HEX(sqrteps, 0x39b504f3, 0x3e50000000000000) + XSIMD_DEFINE_CONSTANT_HEX(tanpio8, 0x3ed413cd, 0x3fda827999fcef31) + XSIMD_DEFINE_CONSTANT_HEX(tan3pio8, 0x401a827a, 0x4003504f333f9de6) + XSIMD_DEFINE_CONSTANT_HEX(twentypi, 0x427b53d1, 0x404f6a7a2955385e) + XSIMD_DEFINE_CONSTANT_HEX(twoopi, 0x3f22f983, 0x3fe45f306dc9c883) + XSIMD_DEFINE_CONSTANT(twotonmb, 8388608.0f, 4503599627370496.0) + XSIMD_DEFINE_CONSTANT_HEX(twotonmbo3, 0x3ba14518, 0x3ed428a2f98d7286) +#if defined(__FAST_MATH__) && defined(__GNUC__) && !defined(__clang__) +#pragma GCC pop_options +#endif + +#undef XSIMD_DEFINE_CONSTANT +#undef XSIMD_DEFINE_CONSTANT_HEX + + template + constexpr T allbits() noexcept; + + template + constexpr as_integer_t mask1frexp() noexcept; + + template + constexpr as_integer_t mask2frexp() noexcept; + + template + constexpr as_integer_t maxexponent() noexcept; + + template + constexpr as_integer_t maxexponentm1() noexcept; + + template + constexpr int32_t nmb() noexcept; + + template + constexpr T zero() noexcept; + + template + constexpr T minvalue() noexcept; + + template + constexpr T maxvalue() noexcept; + + /************************** + * allbits implementation * + **************************/ + + namespace detail + { + template ::value> + struct allbits_impl + { + static constexpr T get_value() noexcept + { + return T(~0); + } + }; + + template + struct allbits_impl + { + static constexpr T get_value() noexcept + { + return nan(); + } + }; + } + + template + XSIMD_INLINE constexpr T allbits() noexcept + { + return T(detail::allbits_impl::get_value()); + } + + /***************************** + * mask1frexp implementation * + *****************************/ + + template + XSIMD_INLINE constexpr as_integer_t mask1frexp() noexcept + { + return as_integer_t(mask1frexp()); + } + + template <> + XSIMD_INLINE constexpr int32_t mask1frexp() noexcept + { + return 0x7f800000; + } + + template <> + XSIMD_INLINE constexpr int64_t mask1frexp() noexcept + { + return 0x7ff0000000000000; + } + + /***************************** + * mask2frexp implementation * + *****************************/ + + template + XSIMD_INLINE constexpr as_integer_t mask2frexp() noexcept + { + return 
as_integer_t(mask2frexp()); + } + + template <> + XSIMD_INLINE constexpr int32_t mask2frexp() noexcept + { + return 0x3f000000; + } + + template <> + XSIMD_INLINE constexpr int64_t mask2frexp() noexcept + { + return 0x3fe0000000000000; + } + + /****************************** + * maxexponent implementation * + ******************************/ + + template + XSIMD_INLINE constexpr as_integer_t maxexponent() noexcept + { + return as_integer_t(maxexponent()); + } + + template <> + XSIMD_INLINE constexpr int32_t maxexponent() noexcept + { + return 127; + } + + template <> + XSIMD_INLINE constexpr int64_t maxexponent() noexcept + { + return 1023; + } + + /****************************** + * maxexponent implementation * + ******************************/ + + template + XSIMD_INLINE constexpr as_integer_t maxexponentm1() noexcept + { + return as_integer_t(maxexponentm1()); + } + + template <> + XSIMD_INLINE constexpr int32_t maxexponentm1() noexcept + { + return 126; + } + + template <> + XSIMD_INLINE constexpr int64_t maxexponentm1() noexcept + { + return 1022; + } + + /********************** + * nmb implementation * + **********************/ + + template + XSIMD_INLINE constexpr int32_t nmb() noexcept + { + return nmb(); + } + + template <> + XSIMD_INLINE constexpr int32_t nmb() noexcept + { + return 23; + } + + template <> + XSIMD_INLINE constexpr int32_t nmb() noexcept + { + return 52; + } + + /*********************** + * zero implementation * + ***********************/ + + template + XSIMD_INLINE constexpr T zero() noexcept + { + return T(typename T::value_type(0)); + } + + /*************************** + * minvalue implementation * + ***************************/ + + namespace detail + { + template + struct minvalue_impl + { + static constexpr T get_value() noexcept + { + return std::numeric_limits::min(); + } + }; + + template + struct minvalue_common + { + static constexpr T get_value() noexcept + { + return std::numeric_limits::min(); + } + }; + + template <> + struct minvalue_impl : minvalue_common + { + }; + template <> + struct minvalue_impl : minvalue_common + { + }; + template <> + struct minvalue_impl : minvalue_common + { + }; + template <> + struct minvalue_impl : minvalue_common + { + }; + template <> + struct minvalue_impl : minvalue_common + { + }; + template <> + struct minvalue_impl : minvalue_common + { + }; + template <> + struct minvalue_impl : minvalue_common + { + }; + template <> + struct minvalue_impl : minvalue_common + { + }; + + template <> + struct minvalue_impl + { + XSIMD_INLINE static float get_value() noexcept + { + return bit_cast((uint32_t)0xff7fffff); + } + }; + + template <> + struct minvalue_impl + { + XSIMD_INLINE static double get_value() noexcept + { + return bit_cast((uint64_t)0xffefffffffffffff); + } + }; + } + + template + constexpr T minvalue() noexcept + { + return T(detail::minvalue_impl::get_value()); + } + + /*************************** + * maxvalue implementation * + ***************************/ + + template + constexpr T maxvalue() noexcept + { + return T(std::numeric_limits::max()); + } + } + +} + +#endif diff --git a/include/onnxruntime/xsimd/arch/xsimd_emulated.hpp b/include/onnxruntime/xsimd/arch/xsimd_emulated.hpp new file mode 100644 index 0000000000000..2f4585bbb3a0c --- /dev/null +++ b/include/onnxruntime/xsimd/arch/xsimd_emulated.hpp @@ -0,0 +1,771 @@ +/*************************************************************************** + * Copyright (c) Johan Mabille, Sylvain Corlay, Wolf Vollprecht and * + * Martin Renou * + * Copyright (c) 
QuantStack * + * Copyright (c) Serge Guelton * + * * + * Distributed under the terms of the BSD 3-Clause License. * + * * + * The full license is in the file LICENSE, distributed with this software. * + ****************************************************************************/ + +#ifndef XSIMD_EMULATED_HPP +#define XSIMD_EMULATED_HPP + +#include +#include +#include +#include + +#include "../arch/xsimd_scalar.hpp" + +#include "../types/xsimd_emulated_register.hpp" +#include "../types/xsimd_utils.hpp" + +namespace xsimd +{ + template + struct batch_bool_constant; + + template + XSIMD_INLINE batch bitwise_cast(batch const& x) noexcept; + + template + struct batch_constant; + + namespace kernel + { + using namespace types; + + // fwd + template + XSIMD_INLINE batch insert(batch const& self, T val, index, requires_arch) noexcept; + template + XSIMD_INLINE batch shuffle(batch const& x, batch const& y, batch_constant, requires_arch) noexcept; + + namespace detail + { + template + auto emulated_apply(F func, Bs const&... bs) -> decltype(func(bs.data[I]...)) + { + return func(bs.data[I]...); + } + + template + auto emulated_apply(F func, ::xsimd::detail::index_sequence, B const& b, Bs const&... bs) -> std::array + { + return { emulated_apply(func, b, bs...)... }; + } + + template + auto emulated_apply(F func, B const& b, Bs const&... bs) -> std::array + { + return emulated_apply(func, ::xsimd::detail::make_index_sequence(), b, bs...); + } + } + + // abs + template ::size> + XSIMD_INLINE batch abs(batch const& self, requires_arch>) noexcept + { + return detail::emulated_apply([](T v) + { return xsimd::abs(v); }, + self); + } + + // add + template ::size> + XSIMD_INLINE batch add(batch const& self, batch const& other, requires_arch>) noexcept + { + return detail::emulated_apply([](T v0, T v1) + { return xsimd::add(v0, v1); }, + self, other); + } + + // all + template ::size> + XSIMD_INLINE bool all(batch_bool const& self, requires_arch>) noexcept + { + return std::all_of(self.data.begin(), self.data.end(), [](T v) + { return bool(v); }); + } + + // any + template ::size> + XSIMD_INLINE bool any(batch_bool const& self, requires_arch>) noexcept + { + return std::any_of(self.data.begin(), self.data.end(), [](T v) + { return bool(v); }); + } + + // batch_bool_cast + template ::size> + XSIMD_INLINE batch_bool batch_bool_cast(batch_bool const& self, batch_bool const&, requires_arch>) noexcept + { + return { self.data }; + } + + // bitwise_and + template ::size> + XSIMD_INLINE batch bitwise_and(batch const& self, batch const& other, requires_arch>) noexcept + { + return detail::emulated_apply([](T v0, T v1) + { return xsimd::bitwise_and(v0, v1); }, + self, other); + } + + template ::size> + XSIMD_INLINE batch_bool bitwise_and(batch_bool const& self, batch_bool const& other, requires_arch>) noexcept + { + return detail::emulated_apply([](bool v0, bool v1) + { return xsimd::bitwise_and(v0, v1); }, + self, other); + } + + // bitwise_andnot + template ::size> + XSIMD_INLINE batch bitwise_andnot(batch const& self, batch const& other, requires_arch>) noexcept + { + return detail::emulated_apply([](T v0, T v1) + { return xsimd::bitwise_andnot(v0, v1); }, + self, other); + } + + template ::size> + XSIMD_INLINE batch_bool bitwise_andnot(batch_bool const& self, batch_bool const& other, requires_arch>) noexcept + { + return detail::emulated_apply([](bool v0, bool v1) + { return xsimd::bitwise_andnot(v0, v1); }, + self, other); + } + + // bitwise_lshift + template ::size> + XSIMD_INLINE batch bitwise_lshift(batch 
const& self, int32_t other, requires_arch>) noexcept + { + return detail::emulated_apply([other](T v) + { return xsimd::bitwise_lshift(v, other); }, + self); + } + + // bitwise_not + template ::size> + XSIMD_INLINE batch bitwise_not(batch const& self, requires_arch>) noexcept + { + return detail::emulated_apply([](T v) + { return xsimd::bitwise_not(v); }, + self); + } + + template ::size> + XSIMD_INLINE batch_bool bitwise_not(batch_bool const& self, requires_arch>) noexcept + { + return detail::emulated_apply([](bool v) + { return xsimd::bitwise_not(v); }, + self); + } + + // bitwise_or + template ::size> + XSIMD_INLINE batch bitwise_or(batch const& self, batch const& other, requires_arch>) noexcept + { + return detail::emulated_apply([](T v0, T v1) + { return xsimd::bitwise_or(v0, v1); }, + self, other); + } + + template ::size> + XSIMD_INLINE batch_bool bitwise_or(batch_bool const& self, batch_bool const& other, requires_arch>) noexcept + { + return detail::emulated_apply([](bool v0, bool v1) + { return xsimd::bitwise_or(v0, v1); }, + self, other); + } + + // bitwise_rshift + template ::size> + XSIMD_INLINE batch bitwise_rshift(batch const& self, int32_t other, requires_arch>) noexcept + { + return detail::emulated_apply([other](T v) + { return xsimd::bitwise_rshift(v, other); }, + self); + } + + // bitwise_xor + template ::size> + XSIMD_INLINE batch bitwise_xor(batch const& self, batch const& other, requires_arch>) noexcept + { + return detail::emulated_apply([](T v0, T v1) + { return xsimd::bitwise_xor(v0, v1); }, + self, other); + } + + template ::size> + XSIMD_INLINE batch_bool bitwise_xor(batch_bool const& self, batch_bool const& other, requires_arch>) noexcept + { + return detail::emulated_apply([](bool v0, bool v1) + { return xsimd::bitwise_xor(v0, v1); }, + self, other); + } + + // bitwise_cast + template ::size> + XSIMD_INLINE batch bitwise_cast(batch const& self, batch const&, requires_arch>) noexcept + { + constexpr size_t size = batch::size; + std::array result; + char* raw_data = reinterpret_cast(result.data()); + const char* raw_input = reinterpret_cast(self.data.data()); + memcpy(raw_data, raw_input, size * sizeof(T_out)); + return result; + } + + // broadcast + template ::size> + batch XSIMD_INLINE broadcast(T val, requires_arch>) noexcept + { + constexpr size_t size = batch::size; + std::array r; + std::fill(r.begin(), r.end(), val); + return r; + } + +#if 0 + // count + template ::size> + XSIMD_INLINE size_t count(batch_bool const& x, requires_arch>) noexcept + { + uint64_t m = x.mask(); + // https://graphics.stanford.edu/~seander/bithacks.html#CountBitsSetParallel + m = m - ((m >> 1) & (uint64_t) ~(uint64_t)0 / 3); // temp + m = (m & (uint64_t) ~(uint64_t)0 / 15 * 3) + ((m >> 2) & (uint64_t) ~(uint64_t)0 / 15 * 3); // temp + m = (m + (m >> 4)) & (uint64_t) ~(uint64_t)0 / 255 * 15; // temp + return (m * ((uint64_t) ~(uint64_t)0 / 255)) >> (sizeof(uint64_t) - 1) * CHAR_BIT; // count + } +#endif + + // store_complex + namespace detail + { + // complex_low + template ::size> + XSIMD_INLINE batch complex_low(batch, A> const& self, requires_arch>) noexcept + { + constexpr size_t size = batch::size; + std::array result; + for (size_t i = 0; i < size / 2; ++i) + { + result[2 * i] = self.real().data[i]; + result[1 + 2 * i] = self.imag().data[i]; + } + return result; + } + // complex_high + template ::size> + XSIMD_INLINE batch complex_high(batch, A> const& self, requires_arch>) noexcept + { + constexpr size_t size = batch::size; + std::array result; + for (size_t i = 0; i < 
size / 2; ++i) + { + result[2 * i] = self.real().data[i + size / 2]; + result[1 + 2 * i] = self.imag().data[i + size / 2]; + } + return result; + } + } + + // decr_if + template ::size> + XSIMD_INLINE batch decr_if(batch const& self, batch_bool const& mask, requires_arch>) noexcept + { + return self - batch(mask.data); + } + + // div + template ::size> + XSIMD_INLINE batch div(batch const& self, batch const& other, requires_arch>) noexcept + { + return detail::emulated_apply([](T v0, T v1) + { return xsimd::div(v0, v1); }, + self, other); + } + + // fast_cast + namespace detail + { + template ::size> + XSIMD_INLINE batch fast_cast(batch const& self, batch const&, requires_arch>) noexcept + { + return detail::emulated_apply([](int32_t v) + { return float(v); }, + self); + } + + template ::size> + XSIMD_INLINE batch fast_cast(batch const& self, batch const&, requires_arch>) noexcept + { + return detail::emulated_apply([](uint32_t v) + { return float(v); }, + self); + } + + template ::size> + XSIMD_INLINE batch fast_cast(batch const& self, batch const&, requires_arch>) noexcept + { + return detail::emulated_apply([](int64_t v) + { return double(v); }, + self); + } + + template ::size> + XSIMD_INLINE batch fast_cast(batch const& self, batch const&, requires_arch>) noexcept + { + return detail::emulated_apply([](uint64_t v) + { return double(v); }, + self); + } + + template ::size> + XSIMD_INLINE batch fast_cast(batch const& self, batch const&, requires_arch>) noexcept + { + return detail::emulated_apply([](float v) + { return int32_t(v); }, + self); + } + + template ::size> + XSIMD_INLINE batch fast_cast(batch const& self, batch const&, requires_arch>) noexcept + { + return detail::emulated_apply([](double v) + { return int64_t(v); }, + self); + } + } + + // eq + template ::size> + XSIMD_INLINE batch_bool> eq(batch> const& self, batch> const& other, requires_arch>) noexcept + { + return detail::emulated_apply([](T v0, T v1) + { return xsimd::eq(v0, v1); }, + self, other); + } + + template ::size> + XSIMD_INLINE batch_bool> eq(batch_bool> const& self, batch_bool> const& other, requires_arch>) noexcept + { + return detail::emulated_apply([](bool v0, bool v1) + { return xsimd::eq(v0, v1); }, + self, other); + } + + // from_bool + template ::size> + XSIMD_INLINE batch from_bool(batch_bool const& self, requires_arch>) noexcept + { + return detail::emulated_apply([](bool v) + { return T(v); }, + self); + } + + // from_mask + template ::size> + XSIMD_INLINE batch_bool from_mask(batch_bool const&, uint64_t mask, requires_arch>) noexcept + { + constexpr size_t size = batch::size; + std::array vmask; + for (size_t i = 0; i < size; ++i) + vmask[i] = (mask >> i) & 1u; + return vmask; + } + + // ge + template ::size> + XSIMD_INLINE batch_bool> ge(batch> const& self, batch> const& other, requires_arch>) noexcept + { + return detail::emulated_apply([](T v0, T v1) + { return xsimd::ge(v0, v1); }, + self, other); + } + + // gt + template ::size> + XSIMD_INLINE batch_bool> gt(batch> const& self, batch> const& other, requires_arch>) noexcept + { + return detail::emulated_apply([](T v0, T v1) + { return xsimd::gt(v0, v1); }, + self, other); + } + + // haddp + template ::size> + XSIMD_INLINE batch haddp(batch const* row, requires_arch>) noexcept + { + constexpr size_t size = batch::size; + std::array r; + for (size_t i = 0; i < size; ++i) + r[i] = std::accumulate(row[i].data.begin() + 1, row[i].data.end(), row[i].data.front()); + return r; + } + + // incr_if + template ::size> + XSIMD_INLINE batch incr_if(batch 
const& self, batch_bool const& mask, requires_arch>) noexcept + { + return self + batch(mask.data); + } + + // insert + template ::size> + XSIMD_INLINE batch insert(batch const& self, T val, index, requires_arch>) noexcept + { + batch other = self; + other.data[I] = val; + return other; + } + + // isnan + template ::size, class = typename std::enable_if::value, void>::type> + XSIMD_INLINE batch_bool isnan(batch const& self, requires_arch>) noexcept + { + return detail::emulated_apply([](T v) + { return xsimd::isnan(v); }, + self); + } + + // load_aligned + template ::size> + XSIMD_INLINE batch load_aligned(T const* mem, convert, requires_arch>) noexcept + { + constexpr size_t size = batch::size; + std::array res; + std::copy(mem, mem + size, res.begin()); + return res; + } + + // load_unaligned + template ::size> + XSIMD_INLINE batch load_unaligned(T const* mem, convert, requires_arch>) noexcept + { + constexpr size_t size = batch::size; + std::array res; + std::copy(mem, mem + size, res.begin()); + return res; + } + + // load_complex + namespace detail + { + template ::size> + XSIMD_INLINE batch, A> load_complex(batch const& hi, batch const& lo, requires_arch>) noexcept + { + constexpr size_t size = batch::size; + std::array real, imag; + for (size_t i = 0; i < size / 2; ++i) + { + real[i] = hi.data[2 * i]; + imag[i] = hi.data[1 + 2 * i]; + } + for (size_t i = 0; i < size / 2; ++i) + { + real[size / 2 + i] = lo.data[2 * i]; + imag[size / 2 + i] = lo.data[1 + 2 * i]; + } + return { real, imag }; + } + } + + // le + template ::size> + XSIMD_INLINE batch_bool> le(batch> const& self, batch> const& other, requires_arch>) noexcept + { + return detail::emulated_apply([](T v0, T v1) + { return xsimd::le(v0, v1); }, + self, other); + } + + // lt + template ::size> + XSIMD_INLINE batch_bool> lt(batch> const& self, batch> const& other, requires_arch>) noexcept + { + return detail::emulated_apply([](T v0, T v1) + { return xsimd::lt(v0, v1); }, + self, other); + } + + // mask + template ::size> + XSIMD_INLINE uint64_t mask(batch_bool const& self, requires_arch>) noexcept + { + constexpr size_t size = batch::size; + uint64_t res = 0; + for (size_t i = 0; i < size; ++i) + res |= (self.data[i] ? 
1u : 0u) << i; + return res; + } + + // max + template ::size> + XSIMD_INLINE batch max(batch const& self, batch const& other, requires_arch>) noexcept + { + return detail::emulated_apply([](T v0, T v1) + { return xsimd::max(v0, v1); }, + self, other); + } + + // min + template ::size> + XSIMD_INLINE batch min(batch const& self, batch const& other, requires_arch>) noexcept + { + return detail::emulated_apply([](T v0, T v1) + { return xsimd::min(v0, v1); }, + self, other); + } + + // mul + template ::size> + XSIMD_INLINE batch mul(batch const& self, batch const& other, requires_arch>) noexcept + { + return detail::emulated_apply([](T v0, T v1) + { return xsimd::mul(v0, v1); }, + self, other); + } + + // nearbyint_as_int + template ::size> + XSIMD_INLINE batch, A> nearbyint_as_int(batch const& self, + requires_arch>) noexcept + { + return detail::emulated_apply([](T v) + { return xsimd::nearbyint_as_int(v); }, + self); + } + + // neg + template ::size> + XSIMD_INLINE batch neg(batch const& self, requires_arch>) noexcept + { + return detail::emulated_apply([](T v) + { return xsimd::neg(v); }, + self); + } + + // neq + template ::size> + XSIMD_INLINE batch_bool neq(batch const& self, batch const& other, requires_arch>) noexcept + { + return detail::emulated_apply([](T v0, T v1) + { return xsimd::neq(v0, v1); }, + self, other); + } + + template ::size> + XSIMD_INLINE batch_bool neq(batch_bool const& self, batch_bool const& other, requires_arch>) noexcept + { + return detail::emulated_apply([](bool v0, bool v1) + { return xsimd::neq(v0, v1); }, + self, other); + } + + // reduce_add + template ::size> + XSIMD_INLINE T reduce_add(batch const& self, requires_arch>) noexcept + { + constexpr size_t size = batch::size; + std::array buffer; + self.store_unaligned(buffer.data()); + return std::accumulate(buffer.begin() + 1, buffer.end(), *buffer.begin()); + } + + // reduce_max + template ::size> + XSIMD_INLINE T reduce_max(batch const& self, requires_arch>) noexcept + { + return std::accumulate(self.data.begin() + 1, self.data.end(), *self.data.begin(), [](T const& x, T const& y) + { return xsimd::max(x, y); }); + } + + // reduce_min + template ::size> + XSIMD_INLINE T reduce_min(batch const& self, requires_arch>) noexcept + { + return std::accumulate(self.data.begin() + 1, self.data.end(), *self.data.begin(), [](T const& x, T const& y) + { return xsimd::min(x, y); }); + } + + // rsqrt + template ::size> + XSIMD_INLINE batch rsqrt(batch const& self, requires_arch>) noexcept + { + return detail::emulated_apply([](T v) + { return xsimd::rsqrt(v); }, + self); + } + + // select + template ::size> + XSIMD_INLINE batch select(batch_bool const& cond, batch const& true_br, batch const& false_br, requires_arch>) noexcept + { + return detail::emulated_apply([](bool c, T t, T f) + { return xsimd::select(c, t, f); }, + cond, true_br, false_br); + } + + template + XSIMD_INLINE batch select(batch_bool_constant const& cond, batch const& true_br, batch const& false_br, requires_arch::size>>) noexcept + { + constexpr size_t size = batch::size; + static_assert(sizeof...(Values) == size, "consistent init"); + return select((batch_bool)cond, true_br, false_br, emulated<8 * sizeof(T) * size> {}); + } + + // shuffle + template + XSIMD_INLINE batch shuffle(batch const& x, batch const& y, batch_constant mask, requires_arch::size>>) noexcept + { + constexpr size_t size = batch::size; + batch bmask = mask; + std::array res; + for (size_t i = 0; i < size; ++i) + res[i] = bmask.data[i] < size ? 
x.data[bmask.data[i]] : y.data[bmask.data[i] - size]; + return res; + } + + // sqrt + template ::size> + XSIMD_INLINE batch sqrt(batch const& self, requires_arch>) noexcept + { + return detail::emulated_apply([](T v) + { return xsimd::sqrt(v); }, + self); + } + + // slide_left + template ::size> + XSIMD_INLINE batch slide_left(batch const& x, requires_arch>) noexcept + { + constexpr size_t size = batch::size; + std::array result; + char* raw_data = reinterpret_cast(result.data()); + memset(raw_data, 0, M); + memcpy(raw_data + M, reinterpret_cast(x.data.data()), sizeof(T) * result.size() - M); + return result; + } + + // slide_right + template ::size> + XSIMD_INLINE batch slide_right(batch const& x, requires_arch>) noexcept + { + constexpr size_t size = batch::size; + std::array result; + char* raw_data = reinterpret_cast(result.data()); + memcpy(raw_data, reinterpret_cast(x.data.data()) + M, sizeof(T) * result.size() - M); + memset(raw_data + sizeof(T) * result.size() - M, 0, M); + return result; + } + + // sadd + template ::size> + XSIMD_INLINE batch sadd(batch const& self, batch const& other, requires_arch>) noexcept + { + return detail::emulated_apply([](T v0, T v1) + { return xsimd::sadd(v0, v1); }, + self, other); + } + + // set + template + XSIMD_INLINE batch> set(batch> const&, requires_arch>, Values... values) noexcept + { + static_assert(sizeof...(Values) == batch>::size, "consistent init"); + return { typename batch>::register_type { static_cast(values)... } }; + } + + template + XSIMD_INLINE batch_bool> set(batch_bool> const&, requires_arch>, Values... values) noexcept + { + static_assert(sizeof...(Values) == batch>::size, "consistent init"); + return { std::array { static_cast(values)... } }; + } + + // ssub + template ::size> + XSIMD_INLINE batch ssub(batch const& self, batch const& other, requires_arch>) noexcept + { + return detail::emulated_apply([](T v0, T v1) + { return xsimd::ssub(v0, v1); }, + self, other); + } + + // store_aligned + template + XSIMD_INLINE void store_aligned(T* mem, batch> const& self, requires_arch>) noexcept + { + std::copy(self.data.begin(), self.data.end(), mem); + } + + // store_unaligned + template + XSIMD_INLINE void store_unaligned(T* mem, batch> const& self, requires_arch>) noexcept + { + std::copy(self.data.begin(), self.data.end(), mem); + } + + // sub + template ::size> + XSIMD_INLINE batch sub(batch const& self, batch const& other, requires_arch>) noexcept + { + return detail::emulated_apply([](T v0, T v1) + { return xsimd::sub(v0, v1); }, + self, other); + } + + // swizzle + + template + XSIMD_INLINE batch swizzle(batch const& self, batch_constant mask, requires_arch::size>>) noexcept + { + constexpr size_t size = batch::size; + batch bmask = mask; + std::array res; + for (size_t i = 0; i < size; ++i) + res[i] = self.data[bmask.data[i]]; + return res; + } + + // zip_hi + template ::size> + XSIMD_INLINE batch zip_hi(batch const& self, batch const& other, requires_arch>) noexcept + { + constexpr size_t size = batch::size; + // Note: irregular behavior for odd numbers. + std::array res; + if (size % 2) + { + for (size_t i = 0; i < size; ++i) + res[i] = (i % 2 ? self : other).data[size / 2 + i / 2]; + } + else + { + for (size_t i = 0; i < size; ++i) + res[i] = (i % 2 ? other : self).data[size / 2 + i / 2]; + } + return res; + } + + // zip_lo + template ::size> + XSIMD_INLINE batch zip_lo(batch const& self, batch const& other, requires_arch>) noexcept + { + constexpr size_t size = batch::size; + // Note: irregular behavior for odd numbers. 
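
Editor's note on the emulated kernels above: almost all of them funnel through detail::emulated_apply, which just runs a scalar callable over every lane of a std::array-backed batch. The standalone sketch below shows the shape of that pattern; the names lanewise_apply and Lanes are illustrative and not part of this patch.

    #include <array>
    #include <cstddef>

    // Lift a scalar binary operation over every lane of an array-backed "batch".
    template <class F, class T, std::size_t Lanes>
    std::array<T, Lanes> lanewise_apply(F&& f,
                                        std::array<T, Lanes> const& a,
                                        std::array<T, Lanes> const& b)
    {
        std::array<T, Lanes> out{};
        for (std::size_t i = 0; i < Lanes; ++i)
            out[i] = f(a[i], b[i]);   // one scalar call per lane, no SIMD needed
        return out;
    }

    // Usage: element-wise max over an 8-lane batch of floats.
    // std::array<float, 8> m = lanewise_apply(
    //     [](float x, float y) { return x < y ? y : x; }, lhs, rhs);
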
+ std::array res; + for (size_t i = 0; i < size; ++i) + res[i] = (i % 2 ? other : self).data[i / 2]; + return res; + } + } +} + +#endif diff --git a/include/onnxruntime/xsimd/arch/xsimd_fma3_avx.hpp b/include/onnxruntime/xsimd/arch/xsimd_fma3_avx.hpp new file mode 100644 index 0000000000000..99262531476a9 --- /dev/null +++ b/include/onnxruntime/xsimd/arch/xsimd_fma3_avx.hpp @@ -0,0 +1,80 @@ +/*************************************************************************** + * Copyright (c) Johan Mabille, Sylvain Corlay, Wolf Vollprecht and * + * Martin Renou * + * Copyright (c) QuantStack * + * Copyright (c) Serge Guelton * + * * + * Distributed under the terms of the BSD 3-Clause License. * + * * + * The full license is in the file LICENSE, distributed with this software. * + ****************************************************************************/ + +#ifndef XSIMD_FMA3_AVX_HPP +#define XSIMD_FMA3_AVX_HPP + +#include "../types/xsimd_fma3_avx_register.hpp" + +namespace xsimd +{ + + namespace kernel + { + using namespace types; + + // fnma + template + XSIMD_INLINE batch fnma(batch const& x, batch const& y, batch const& z, requires_arch>) noexcept + { + return _mm256_fnmadd_ps(x, y, z); + } + + template + XSIMD_INLINE batch fnma(batch const& x, batch const& y, batch const& z, requires_arch>) noexcept + { + return _mm256_fnmadd_pd(x, y, z); + } + + // fnms + template + XSIMD_INLINE batch fnms(batch const& x, batch const& y, batch const& z, requires_arch>) noexcept + { + return _mm256_fnmsub_ps(x, y, z); + } + + template + XSIMD_INLINE batch fnms(batch const& x, batch const& y, batch const& z, requires_arch>) noexcept + { + return _mm256_fnmsub_pd(x, y, z); + } + + // fma + template + XSIMD_INLINE batch fma(batch const& x, batch const& y, batch const& z, requires_arch>) noexcept + { + return _mm256_fmadd_ps(x, y, z); + } + + template + XSIMD_INLINE batch fma(batch const& x, batch const& y, batch const& z, requires_arch>) noexcept + { + return _mm256_fmadd_pd(x, y, z); + } + + // fms + template + XSIMD_INLINE batch fms(batch const& x, batch const& y, batch const& z, requires_arch>) noexcept + { + return _mm256_fmsub_ps(x, y, z); + } + + template + XSIMD_INLINE batch fms(batch const& x, batch const& y, batch const& z, requires_arch>) noexcept + { + return _mm256_fmsub_pd(x, y, z); + } + + } + +} + +#endif diff --git a/include/onnxruntime/xsimd/arch/xsimd_fma3_avx2.hpp b/include/onnxruntime/xsimd/arch/xsimd_fma3_avx2.hpp new file mode 100644 index 0000000000000..134053951ac63 --- /dev/null +++ b/include/onnxruntime/xsimd/arch/xsimd_fma3_avx2.hpp @@ -0,0 +1,46 @@ +/*************************************************************************** + * Copyright (c) Johan Mabille, Sylvain Corlay, Wolf Vollprecht and * + * Martin Renou * + * Copyright (c) QuantStack * + * Copyright (c) Serge Guelton * + * * + * Distributed under the terms of the BSD 3-Clause License. * + * * + * The full license is in the file LICENSE, distributed with this software. 
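
For reference, the four fused forms wired up in xsimd_fma3_avx.hpp differ only in the signs applied to the product and the addend, and they map one-to-one onto _mm256_fmadd_ps, _mm256_fmsub_ps, _mm256_fnmadd_ps and _mm256_fnmsub_ps (plus the _pd variants for double). A scalar cross-check using only std::fma; the *_ref names are ad hoc and not part of the patch:

    #include <cmath>

    inline float fma_ref(float x, float y, float z)  { return std::fma(x, y, z); }   //  x * y + z   -> _mm256_fmadd_ps
    inline float fms_ref(float x, float y, float z)  { return std::fma(x, y, -z); }  //  x * y - z   -> _mm256_fmsub_ps
    inline float fnma_ref(float x, float y, float z) { return std::fma(-x, y, z); }  // -(x * y) + z -> _mm256_fnmadd_ps
    inline float fnms_ref(float x, float y, float z) { return std::fma(-x, y, -z); } // -(x * y) - z -> _mm256_fnmsub_ps
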
* + ****************************************************************************/ + +#ifndef XSIMD_FMA3_AVX2_HPP +#define XSIMD_FMA3_AVX2_HPP + +#include "../types/xsimd_fma3_avx2_register.hpp" + +// Allow inclusion of xsimd_fma3_avx.hpp +#ifdef XSIMD_FMA3_AVX_HPP +#undef XSIMD_FMA3_AVX_HPP +#define XSIMD_FORCE_FMA3_AVX_HPP +#endif + +// Disallow inclusion of ./xsimd_fma3_avx_register.hpp +#ifndef XSIMD_FMA3_AVX_REGISTER_HPP +#define XSIMD_FMA3_AVX_REGISTER_HPP +#define XSIMD_FORCE_FMA3_AVX_REGISTER_HPP +#endif + +// Include ./xsimd_fma3_avx.hpp but s/avx/avx2 +#define avx avx2 +#include "./xsimd_fma3_avx.hpp" +#undef avx +#undef XSIMD_FMA3_AVX_HPP + +// Carefully restore guards +#ifdef XSIMD_FORCE_FMA3_AVX_HPP +#define XSIMD_FMA3_AVX_HPP +#undef XSIMD_FORCE_FMA3_AVX_HPP +#endif + +#ifdef XSIMD_FORCE_FMA3_AVX_REGISTER_HPP +#undef XSIMD_FMA3_AVX_REGISTER_HPP +#undef XSIMD_FORCE_FMA3_AVX_REGISTER_HPP +#endif + +#endif diff --git a/include/onnxruntime/xsimd/arch/xsimd_fma3_sse.hpp b/include/onnxruntime/xsimd/arch/xsimd_fma3_sse.hpp new file mode 100644 index 0000000000000..9b126166ac048 --- /dev/null +++ b/include/onnxruntime/xsimd/arch/xsimd_fma3_sse.hpp @@ -0,0 +1,79 @@ +/*************************************************************************** + * Copyright (c) Johan Mabille, Sylvain Corlay, Wolf Vollprecht and * + * Martin Renou * + * Copyright (c) QuantStack * + * Copyright (c) Serge Guelton * + * * + * Distributed under the terms of the BSD 3-Clause License. * + * * + * The full license is in the file LICENSE, distributed with this software. * + ****************************************************************************/ + +#ifndef XSIMD_FMA3_SSE_HPP +#define XSIMD_FMA3_SSE_HPP + +#include "../types/xsimd_fma3_sse_register.hpp" + +namespace xsimd +{ + + namespace kernel + { + using namespace types; + // fnma + template + XSIMD_INLINE batch fnma(batch const& x, batch const& y, batch const& z, requires_arch>) noexcept + { + return _mm_fnmadd_ps(x, y, z); + } + + template + XSIMD_INLINE batch fnma(batch const& x, batch const& y, batch const& z, requires_arch>) noexcept + { + return _mm_fnmadd_pd(x, y, z); + } + + // fnms + template + XSIMD_INLINE batch fnms(batch const& x, batch const& y, batch const& z, requires_arch>) noexcept + { + return _mm_fnmsub_ps(x, y, z); + } + + template + XSIMD_INLINE batch fnms(batch const& x, batch const& y, batch const& z, requires_arch>) noexcept + { + return _mm_fnmsub_pd(x, y, z); + } + + // fma + template + XSIMD_INLINE batch fma(batch const& x, batch const& y, batch const& z, requires_arch>) noexcept + { + return _mm_fmadd_ps(x, y, z); + } + + template + XSIMD_INLINE batch fma(batch const& x, batch const& y, batch const& z, requires_arch>) noexcept + { + return _mm_fmadd_pd(x, y, z); + } + + // fms + template + XSIMD_INLINE batch fms(batch const& x, batch const& y, batch const& z, requires_arch>) noexcept + { + return _mm_fmsub_ps(x, y, z); + } + + template + XSIMD_INLINE batch fms(batch const& x, batch const& y, batch const& z, requires_arch>) noexcept + { + return _mm_fmsub_pd(x, y, z); + } + + } + +} + +#endif diff --git a/include/onnxruntime/xsimd/arch/xsimd_fma4.hpp b/include/onnxruntime/xsimd/arch/xsimd_fma4.hpp new file mode 100644 index 0000000000000..e51c7c52a82c6 --- /dev/null +++ b/include/onnxruntime/xsimd/arch/xsimd_fma4.hpp @@ -0,0 +1,79 @@ +/*************************************************************************** + * Copyright (c) Johan Mabille, Sylvain Corlay, Wolf Vollprecht and * + * Martin Renou * + * Copyright (c) QuantStack * 
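
The include-guard juggling in xsimd_fma3_avx2.hpp above re-includes xsimd_fma3_avx.hpp with the token avx rewritten to avx2, so the same kernel bodies are stamped out a second time for the wider register type. A minimal, self-contained illustration of the same trick on hypothetical headers (foo_base.hpp / foo_wide.hpp, guard names invented for this sketch; the real header also pins the register header's guard, which is omitted here):

    // foo_wide.hpp -- hypothetical header reusing foo_base.hpp for a wider arch.
    #ifndef FOO_WIDE_HPP
    #define FOO_WIDE_HPP

    // If foo_base.hpp was already included, drop its guard so it can be
    // included a second time, and remember to restore the guard afterwards.
    #ifdef FOO_BASE_HPP
    #undef FOO_BASE_HPP
    #define FOO_FORCE_BASE_HPP
    #endif

    #define base wide            // textual substitution for the second inclusion
    #include "foo_base.hpp"      // stamps out the base kernels with `wide` types
    #undef base
    #undef FOO_BASE_HPP

    #ifdef FOO_FORCE_BASE_HPP    // carefully restore the original guard state
    #define FOO_BASE_HPP
    #undef FOO_FORCE_BASE_HPP
    #endif

    #endif // FOO_WIDE_HPP
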
+ * Copyright (c) Serge Guelton * + * * + * Distributed under the terms of the BSD 3-Clause License. * + * * + * The full license is in the file LICENSE, distributed with this software. * + ****************************************************************************/ + +#ifndef XSIMD_FMA4_HPP +#define XSIMD_FMA4_HPP + +#include "../types/xsimd_fma4_register.hpp" + +namespace xsimd +{ + + namespace kernel + { + using namespace types; + + // fnma + template + XSIMD_INLINE batch fnma(simd_register const& x, simd_register const& y, simd_register const& z, requires_arch) noexcept + { + return _mm_nmacc_ps(x, y, z); + } + + template + XSIMD_INLINE batch fnma(simd_register const& x, simd_register const& y, simd_register const& z, requires_arch) noexcept + { + return _mm_nmacc_pd(x, y, z); + } + + // fnms + template + XSIMD_INLINE batch fnms(simd_register const& x, simd_register const& y, simd_register const& z, requires_arch) noexcept + { + return _mm_nmsub_ps(x, y, z); + } + + template + XSIMD_INLINE batch fnms(simd_register const& x, simd_register const& y, simd_register const& z, requires_arch) noexcept + { + return _mm_nmsub_pd(x, y, z); + } + + // fma + template + XSIMD_INLINE batch fma(simd_register const& x, simd_register const& y, simd_register const& z, requires_arch) noexcept + { + return _mm_macc_ps(x, y, z); + } + + template + XSIMD_INLINE batch fma(simd_register const& x, simd_register const& y, simd_register const& z, requires_arch) noexcept + { + return _mm_macc_pd(x, y, z); + } + + // fms + template + XSIMD_INLINE batch fms(simd_register const& x, simd_register const& y, simd_register const& z, requires_arch) noexcept + { + return _mm_msub_ps(x, y, z); + } + + template + XSIMD_INLINE batch fms(simd_register const& x, simd_register const& y, simd_register const& z, requires_arch) noexcept + { + return _mm_msub_pd(x, y, z); + } + } + +} + +#endif diff --git a/include/onnxruntime/xsimd/arch/xsimd_generic.hpp b/include/onnxruntime/xsimd/arch/xsimd_generic.hpp new file mode 100644 index 0000000000000..6403cfb0fc138 --- /dev/null +++ b/include/onnxruntime/xsimd/arch/xsimd_generic.hpp @@ -0,0 +1,23 @@ +/*************************************************************************** + * Copyright (c) Johan Mabille, Sylvain Corlay, Wolf Vollprecht and * + * Martin Renou * + * Copyright (c) QuantStack * + * Copyright (c) Serge Guelton * + * * + * Distributed under the terms of the BSD 3-Clause License. * + * * + * The full license is in the file LICENSE, distributed with this software. 
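
xsimd_fma4.hpp above targets AMD's FMA4 encoding, whose intrinsics are spelled _mm_macc_ps / _mm_msub_ps / _mm_nmacc_ps / _mm_nmsub_ps rather than the FMA3 _mm_fmadd_ps family; the computed values are identical, only the instruction encodings differ. A rough sketch of choosing between them at compile time (fused_madd is an invented helper, and the predefined macros shown are the GCC/Clang ones):

    #if defined(__FMA4__)
    #include <x86intrin.h>   // the FMA4 intrinsics live in the x86 umbrella header
    #else
    #include <immintrin.h>
    #endif

    static inline __m128 fused_madd(__m128 x, __m128 y, __m128 z)
    {
    #if defined(__FMA4__)
        return _mm_macc_ps(x, y, z);             // FMA4 encoding (AMD)
    #elif defined(__FMA__)
        return _mm_fmadd_ps(x, y, z);            // FMA3 encoding
    #else
        return _mm_add_ps(_mm_mul_ps(x, y), z);  // unfused fallback
    #endif
    }
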
* + ****************************************************************************/ + +#ifndef XSIMD_GENERIC_HPP +#define XSIMD_GENERIC_HPP + +#include "./generic/xsimd_generic_arithmetic.hpp" +#include "./generic/xsimd_generic_complex.hpp" +#include "./generic/xsimd_generic_logical.hpp" +#include "./generic/xsimd_generic_math.hpp" +#include "./generic/xsimd_generic_memory.hpp" +#include "./generic/xsimd_generic_rounding.hpp" +#include "./generic/xsimd_generic_trigo.hpp" + +#endif diff --git a/include/onnxruntime/xsimd/arch/xsimd_generic_fwd.hpp b/include/onnxruntime/xsimd/arch/xsimd_generic_fwd.hpp new file mode 100644 index 0000000000000..02708d60f70b9 --- /dev/null +++ b/include/onnxruntime/xsimd/arch/xsimd_generic_fwd.hpp @@ -0,0 +1,44 @@ +/*************************************************************************** + * Copyright (c) Johan Mabille, Sylvain Corlay, Wolf Vollprecht and * + * Martin Renou * + * Copyright (c) QuantStack * + * Copyright (c) Serge Guelton * + * * + * Distributed under the terms of the BSD 3-Clause License. * + * * + * The full license is in the file LICENSE, distributed with this software. * + ****************************************************************************/ + +#ifndef XSIMD_GENERIC_FWD_HPP +#define XSIMD_GENERIC_FWD_HPP + +#include "../types/xsimd_batch_constant.hpp" + +#include + +namespace xsimd +{ + namespace kernel + { + // forward declaration + template ::value, void>::type> + XSIMD_INLINE batch abs(batch const& self, requires_arch) noexcept; + template ::value, void>::type> + XSIMD_INLINE batch bitwise_lshift(batch const& self, batch const& other, requires_arch) noexcept; + template ::value, void>::type> + XSIMD_INLINE batch bitwise_rshift(batch const& self, batch const& other, requires_arch) noexcept; + template + XSIMD_INLINE batch_bool gt(batch const& self, batch const& other, requires_arch) noexcept; + template ::value, void>::type> + XSIMD_INLINE batch mul(batch const& self, batch const& other, requires_arch) noexcept; + template ::value, void>::type> + XSIMD_INLINE batch sadd(batch const& self, batch const& other, requires_arch) noexcept; + template ::value, void>::type> + XSIMD_INLINE batch ssub(batch const& self, batch const& other, requires_arch) noexcept; + template ::value, void>::type> + XSIMD_INLINE T hadd(batch const& self, requires_arch) noexcept; + + } +} + +#endif diff --git a/include/onnxruntime/xsimd/arch/xsimd_i8mm_neon64.hpp b/include/onnxruntime/xsimd/arch/xsimd_i8mm_neon64.hpp new file mode 100644 index 0000000000000..5533923020363 --- /dev/null +++ b/include/onnxruntime/xsimd/arch/xsimd_i8mm_neon64.hpp @@ -0,0 +1,17 @@ +/*************************************************************************** + * Copyright (c) Johan Mabille, Sylvain Corlay, Wolf Vollprecht and * + * Martin Renou * + * Copyright (c) QuantStack * + * Copyright (c) Serge Guelton * + * * + * Distributed under the terms of the BSD 3-Clause License. * + * * + * The full license is in the file LICENSE, distributed with this software. 
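
xsimd_generic_fwd.hpp exists because the generic kernels, which are included last from xsimd_isa.hpp, call back into architecture-specific overloads that are only defined later; forward declarations plus tag dispatch on a trailing requires_arch parameter make that resolution work. A self-contained sketch of the dispatch idea with invented tag names (generic_arch, sse2_arch, avx_arch), not the xsimd definitions:

    // Architectures as tag types in an inheritance chain.
    struct generic_arch {};
    struct sse2_arch : generic_arch {};
    struct avx_arch : sse2_arch {};

    // Fallback kernel, written once against the base tag.
    inline int add_kernel(int a, int b, generic_arch) { return a + b; }

    // Specialised kernel: any tag derived from sse2_arch prefers this overload,
    // because conversion to a more derived base class ranks higher.
    inline int add_kernel(int a, int b, sse2_arch) { return a + b; /* intrinsics here */ }

    // Front end: forwards the preferred architecture tag.
    template <class Arch = avx_arch>
    int add(int a, int b) { return add_kernel(a, b, Arch{}); }

    // add(1, 2)               -> sse2_arch overload (reached via avx_arch)
    // add<generic_arch>(1, 2) -> generic fallback
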
* + ****************************************************************************/ + +#ifndef XSIMD_I8MM_NEON64_HPP +#define XSIMD_I8MM_NEON64_HPP + +#include "../types/xsimd_i8mm_neon64_register.hpp" + +#endif diff --git a/include/onnxruntime/xsimd/arch/xsimd_isa.hpp b/include/onnxruntime/xsimd/arch/xsimd_isa.hpp new file mode 100644 index 0000000000000..5b714b299182a --- /dev/null +++ b/include/onnxruntime/xsimd/arch/xsimd_isa.hpp @@ -0,0 +1,130 @@ +/*************************************************************************** + * Copyright (c) Johan Mabille, Sylvain Corlay, Wolf Vollprecht and * + * Martin Renou * + * Copyright (c) QuantStack * + * Copyright (c) Serge Guelton * + * * + * Distributed under the terms of the BSD 3-Clause License. * + * * + * The full license is in the file LICENSE, distributed with this software. * + ****************************************************************************/ + +#ifndef XSIMD_ISA_HPP +#define XSIMD_ISA_HPP + +#include "../config/xsimd_arch.hpp" + +#include "./xsimd_generic_fwd.hpp" + +#if XSIMD_WITH_EMULATED +#include "./xsimd_emulated.hpp" +#endif + +#if XSIMD_WITH_SSE2 +#include "./xsimd_sse2.hpp" +#endif + +#if XSIMD_WITH_SSE3 +#include "./xsimd_sse3.hpp" +#endif + +#if XSIMD_WITH_SSSE3 +#include "./xsimd_ssse3.hpp" +#endif + +#if XSIMD_WITH_SSE4_1 +#include "./xsimd_sse4_1.hpp" +#endif + +#if XSIMD_WITH_SSE4_2 +#include "./xsimd_sse4_2.hpp" +#endif + +#if XSIMD_WITH_FMA3_SSE +#include "./xsimd_fma3_sse.hpp" +#endif + +#if XSIMD_WITH_FMA4 +#include "./xsimd_fma4.hpp" +#endif + +#if XSIMD_WITH_AVX +#include "./xsimd_avx.hpp" +#endif + +#if XSIMD_WITH_FMA3_AVX +#include "./xsimd_fma3_avx.hpp" +#endif + +#if XSIMD_WITH_AVXVNNI +#include "./xsimd_avxvnni.hpp" +#endif + +#if XSIMD_WITH_AVX2 +#include "./xsimd_avx2.hpp" +#endif + +#if XSIMD_WITH_FMA3_AVX2 +#include "./xsimd_fma3_avx2.hpp" +#endif + +#if XSIMD_WITH_AVX512F +#include "./xsimd_avx512f.hpp" +#endif + +#if XSIMD_WITH_AVX512BW +#include "./xsimd_avx512bw.hpp" +#endif + +#if XSIMD_WITH_AVX512ER +#include "./xsimd_avx512er.hpp" +#endif + +#if XSIMD_WITH_AVX512PF +#include "./xsimd_avx512pf.hpp" +#endif + +#if XSIMD_WITH_AVX512IFMA +#include "./xsimd_avx512ifma.hpp" +#endif + +#if XSIMD_WITH_AVX512VBMI +#include "./xsimd_avx512vbmi.hpp" +#endif + +#if XSIMD_WITH_AVX512VNNI_AVX512BW +#include "./xsimd_avx512vnni_avx512bw.hpp" +#endif + +#if XSIMD_WITH_AVX512VNNI_AVX512VBMI +#include "./xsimd_avx512vnni_avx512vbmi.hpp" +#endif + +#if XSIMD_WITH_NEON +#include "./xsimd_neon.hpp" +#endif + +#if XSIMD_WITH_NEON64 +#include "./xsimd_neon64.hpp" +#endif + +#if XSIMD_WITH_I8MM_NEON64 +#include "./xsimd_i8mm_neon64.hpp" +#endif + +#if XSIMD_WITH_SVE +#include "./xsimd_sve.hpp" +#endif + +#if XSIMD_WITH_RVV +#include "./xsimd_rvv.hpp" +#endif + +#if XSIMD_WITH_WASM +#include "./xsimd_wasm.hpp" +#endif + +// Must come last to have access to all conversion specializations. +#include "./xsimd_generic.hpp" + +#endif diff --git a/include/onnxruntime/xsimd/arch/xsimd_neon.hpp b/include/onnxruntime/xsimd/arch/xsimd_neon.hpp new file mode 100644 index 0000000000000..2d0a244528667 --- /dev/null +++ b/include/onnxruntime/xsimd/arch/xsimd_neon.hpp @@ -0,0 +1,2813 @@ +/*************************************************************************** + * Copyright (c) Johan Mabille, Sylvain Corlay, Wolf Vollprecht and * + * Martin Renou * + * Copyright (c) QuantStack * + * Copyright (c) Serge Guelton * + * * + * Distributed under the terms of the BSD 3-Clause License. 
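
xsimd_isa.hpp pulls in exactly the backends enabled for the current target and includes the generic implementation last so it can see every conversion specialization. From the consumer side, for instance the gemmology kernels added by this patch, none of that is visible: code is written once against xsimd::batch and picks up whichever architecture was compiled in. A rough usage sketch; axpy is an invented example and the include path assumes the vendored include/onnxruntime layout is on the include path:

    #include <cstddef>
    #include <vector>

    #include "xsimd/xsimd.hpp"

    // y := a * x + y, written once against xsimd::batch with the default arch.
    void axpy(float a, std::vector<float> const& x, std::vector<float>& y)
    {
        using batch = xsimd::batch<float>;            // best architecture selected at compile time
        constexpr std::size_t lanes = batch::size;

        std::size_t i = 0;
        for (; i + lanes <= x.size(); i += lanes)
        {
            batch bx = batch::load_unaligned(&x[i]);
            batch by = batch::load_unaligned(&y[i]);
            (batch(a) * bx + by).store_unaligned(&y[i]);
        }
        for (; i < x.size(); ++i)                     // scalar tail
            y[i] = a * x[i] + y[i];
    }
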
* + * * + * The full license is in the file LICENSE, distributed with this software. * + ****************************************************************************/ + +#ifndef XSIMD_NEON_HPP +#define XSIMD_NEON_HPP + +#include +#include +#include +#include + +#include "../types/xsimd_neon_register.hpp" +#include "../types/xsimd_utils.hpp" + +// Wrap intrinsics so we can pass them as function pointers +// - OP: intrinsics name prefix, e.g., vorrq +// - RT: type traits to deduce intrinsics return types +#define WRAP_BINARY_UINT_EXCLUDING_64(OP, RT) \ + namespace wrap \ + { \ + XSIMD_INLINE RT OP##_u8(uint8x16_t a, uint8x16_t b) noexcept \ + { \ + return ::OP##_u8(a, b); \ + } \ + XSIMD_INLINE RT OP##_u16(uint16x8_t a, uint16x8_t b) noexcept \ + { \ + return ::OP##_u16(a, b); \ + } \ + XSIMD_INLINE RT OP##_u32(uint32x4_t a, uint32x4_t b) noexcept \ + { \ + return ::OP##_u32(a, b); \ + } \ + } + +#define WRAP_BINARY_INT_EXCLUDING_64(OP, RT) \ + WRAP_BINARY_UINT_EXCLUDING_64(OP, RT) \ + namespace wrap \ + { \ + XSIMD_INLINE RT OP##_s8(int8x16_t a, int8x16_t b) noexcept \ + { \ + return ::OP##_s8(a, b); \ + } \ + XSIMD_INLINE RT OP##_s16(int16x8_t a, int16x8_t b) noexcept \ + { \ + return ::OP##_s16(a, b); \ + } \ + XSIMD_INLINE RT OP##_s32(int32x4_t a, int32x4_t b) noexcept \ + { \ + return ::OP##_s32(a, b); \ + } \ + } + +#define WRAP_BINARY_INT(OP, RT) \ + WRAP_BINARY_INT_EXCLUDING_64(OP, RT) \ + namespace wrap \ + { \ + XSIMD_INLINE RT OP##_u64(uint64x2_t a, uint64x2_t b) noexcept \ + { \ + return ::OP##_u64(a, b); \ + } \ + XSIMD_INLINE RT OP##_s64(int64x2_t a, int64x2_t b) noexcept \ + { \ + return ::OP##_s64(a, b); \ + } \ + } + +#define WRAP_BINARY_FLOAT(OP, RT) \ + namespace wrap \ + { \ + XSIMD_INLINE RT OP##_f32(float32x4_t a, float32x4_t b) noexcept \ + { \ + return ::OP##_f32(a, b); \ + } \ + } + +#define WRAP_UNARY_INT_EXCLUDING_64(OP) \ + namespace wrap \ + { \ + XSIMD_INLINE uint8x16_t OP##_u8(uint8x16_t a) noexcept \ + { \ + return ::OP##_u8(a); \ + } \ + XSIMD_INLINE int8x16_t OP##_s8(int8x16_t a) noexcept \ + { \ + return ::OP##_s8(a); \ + } \ + XSIMD_INLINE uint16x8_t OP##_u16(uint16x8_t a) noexcept \ + { \ + return ::OP##_u16(a); \ + } \ + XSIMD_INLINE int16x8_t OP##_s16(int16x8_t a) noexcept \ + { \ + return ::OP##_s16(a); \ + } \ + XSIMD_INLINE uint32x4_t OP##_u32(uint32x4_t a) noexcept \ + { \ + return ::OP##_u32(a); \ + } \ + XSIMD_INLINE int32x4_t OP##_s32(int32x4_t a) noexcept \ + { \ + return ::OP##_s32(a); \ + } \ + } + +#define WRAP_UNARY_INT(OP) \ + WRAP_UNARY_INT_EXCLUDING_64(OP) \ + namespace wrap \ + { \ + XSIMD_INLINE uint64x2_t OP##_u64(uint64x2_t a) noexcept \ + { \ + return ::OP##_u64(a); \ + } \ + XSIMD_INLINE int64x2_t OP##_s64(int64x2_t a) noexcept \ + { \ + return ::OP##_s64(a); \ + } \ + } + +#define WRAP_UNARY_FLOAT(OP) \ + namespace wrap \ + { \ + XSIMD_INLINE float32x4_t OP##_f32(float32x4_t a) noexcept \ + { \ + return ::OP##_f32(a); \ + } \ + } + +// Dummy identity caster to ease coding +XSIMD_INLINE uint8x16_t vreinterpretq_u8_u8(uint8x16_t arg) noexcept { return arg; } +XSIMD_INLINE int8x16_t vreinterpretq_s8_s8(int8x16_t arg) noexcept { return arg; } +XSIMD_INLINE uint16x8_t vreinterpretq_u16_u16(uint16x8_t arg) noexcept { return arg; } +XSIMD_INLINE int16x8_t vreinterpretq_s16_s16(int16x8_t arg) noexcept { return arg; } +XSIMD_INLINE uint32x4_t vreinterpretq_u32_u32(uint32x4_t arg) noexcept { return arg; } +XSIMD_INLINE int32x4_t vreinterpretq_s32_s32(int32x4_t arg) noexcept { return arg; } +XSIMD_INLINE uint64x2_t 
vreinterpretq_u64_u64(uint64x2_t arg) noexcept { return arg; }
+XSIMD_INLINE int64x2_t vreinterpretq_s64_s64(int64x2_t arg) noexcept { return arg; }
+XSIMD_INLINE float32x4_t vreinterpretq_f32_f32(float32x4_t arg) noexcept { return arg; }
+
+namespace xsimd
+{
+    template
+    struct batch_bool_constant;
+
+    namespace kernel
+    {
+        using namespace types;
+
+        namespace detail
+        {
+            template
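
The wrap:: layer and the identity vreinterpretq_*_* casters above exist so that NEON intrinsics, which the header's own comment notes cannot always be passed as function pointers, behave like ordinary named functions that the dispatch code can hand around as callables. A standalone illustration; demo_wrap and apply3 are invented names, and this only builds on an AArch32/AArch64 target with NEON:

    #include <arm_neon.h>

    namespace demo_wrap
    {
        // Thin named wrapper around the intrinsic so it has a real address
        // and ordinary overload/conversion semantics.
        inline uint8x16_t vaddq_u8(uint8x16_t a, uint8x16_t b) noexcept
        {
            return ::vaddq_u8(a, b);
        }
    }

    // Higher-order helper that takes the operation as an ordinary callable.
    template <class Op>
    uint8x16_t apply3(Op op, uint8x16_t a, uint8x16_t b, uint8x16_t c)
    {
        return op(op(a, b), c);
    }

    // Usage: uint8x16_t s = apply3(demo_wrap::vaddq_u8, x, y, z);
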