From ed8a14e3c9aaae880ed746c4244e44cb948d58f6 Mon Sep 17 00:00:00 2001
From: Daniel Bevenius
Date: Thu, 20 Jun 2024 09:29:32 +0200
Subject: [PATCH] docs: add AVX and AVX2 instructions notes

Signed-off-by: Daniel Bevenius
---
 notes/simd.md | 289 ++++++++++++++++++++++++++++++++++++++++++++++++++
 1 file changed, 289 insertions(+)

diff --git a/notes/simd.md b/notes/simd.md
index 80358793..31ef9e19 100644
--- a/notes/simd.md
+++ b/notes/simd.md
@@ -82,6 +82,7 @@
 _mm<width>_<name>_<type>
 `_mm` is a prefix for all SIMD functions. The `<width>` is the width of the
 return type register. The `<name>` is the name of the function and the
 `<type>` is the data type of the function arguments:
+
 Data types:
 * __m128   128-bit register 4 floats
 * __m128d  128-bit register 2 doubles
@@ -169,6 +170,193 @@
 which are ZMM0-ZMM31. So if we see YMM2 in the disassembly then we know that
 AVX instructions are being used.
 
 Now AVX is mostly focused on floating point operations and AVX2 extends this
 to integer operations but still uses the YMM registers.
 
+### AVX instructions
+```
+__m256d _mm256_add_pd (__m256d a, __m256d b) vaddpd
+__m256 _mm256_add_ps (__m256 a, __m256 b) vaddps
+__m256d _mm256_addsub_pd (__m256d a, __m256d b) vaddsubpd
+__m256 _mm256_addsub_ps (__m256 a, __m256 b) vaddsubps
+__m256d _mm256_and_pd (__m256d a, __m256d b) vandpd
+__m256 _mm256_and_ps (__m256 a, __m256 b) vandps
+__m256d _mm256_andnot_pd (__m256d a, __m256d b) vandnpd
+__m256 _mm256_andnot_ps (__m256 a, __m256 b) vandnps
+__m256d _mm256_blend_pd (__m256d a, __m256d b, const int imm8) vblendpd
+__m256 _mm256_blend_ps (__m256 a, __m256 b, const int imm8) vblendps
+__m256d _mm256_blendv_pd (__m256d a, __m256d b, __m256d mask) vblendvpd
+__m256 _mm256_blendv_ps (__m256 a, __m256 b, __m256 mask) vblendvps
+__m256d _mm256_broadcast_pd (__m128d const * mem_addr) vbroadcastf128
+__m256 _mm256_broadcast_ps (__m128 const * mem_addr) vbroadcastf128
+__m256d _mm256_broadcast_sd (double const * mem_addr) vbroadcastsd
+__m128 _mm_broadcast_ss (float const * mem_addr) vbroadcastss
+__m256 _mm256_broadcast_ss (float const * mem_addr) vbroadcastss
+__m256 _mm256_castpd_ps (__m256d a)
+__m256i _mm256_castpd_si256 (__m256d a)
+__m256d _mm256_castpd128_pd256 (__m128d a)
+__m128d _mm256_castpd256_pd128 (__m256d a)
+__m256d _mm256_castps_pd (__m256 a)
+__m256i _mm256_castps_si256 (__m256 a)
+__m256 _mm256_castps128_ps256 (__m128 a)
+__m128 _mm256_castps256_ps128 (__m256 a)
+__m256i _mm256_castsi128_si256 (__m128i a)
+__m256d _mm256_castsi256_pd (__m256i a)
+__m256 _mm256_castsi256_ps (__m256i a)
+__m128i _mm256_castsi256_si128 (__m256i a)
+__m256d _mm256_ceil_pd (__m256d a) vroundpd
+__m256 _mm256_ceil_ps (__m256 a) vroundps
+__m128d _mm_cmp_pd (__m128d a, __m128d b, const int imm8) vcmppd
+__m256d _mm256_cmp_pd (__m256d a, __m256d b, const int imm8) vcmppd
+__m128 _mm_cmp_ps (__m128 a, __m128 b, const int imm8) vcmpps
+__m256 _mm256_cmp_ps (__m256 a, __m256 b, const int imm8) vcmpps
+__m128d _mm_cmp_sd (__m128d a, __m128d b, const int imm8) vcmpsd
+__m128 _mm_cmp_ss (__m128 a, __m128 b, const int imm8) vcmpss
+__m256d _mm256_cvtepi32_pd (__m128i a) vcvtdq2pd
+__m256 _mm256_cvtepi32_ps (__m256i a) vcvtdq2ps
+__m128i _mm256_cvtpd_epi32 (__m256d a) vcvtpd2dq
+__m128 _mm256_cvtpd_ps (__m256d a) vcvtpd2ps
+__m256i _mm256_cvtps_epi32 (__m256 a) vcvtps2dq
+__m256d _mm256_cvtps_pd (__m128 a) vcvtps2pd
+double _mm256_cvtsd_f64 (__m256d a) vmovsd
+int _mm256_cvtsi256_si32 (__m256i a) vmovd
+float _mm256_cvtss_f32 (__m256 a) vmovss
+__m128i _mm256_cvttpd_epi32 (__m256d a) vcvttpd2dq
+__m256i _mm256_cvttps_epi32 (__m256 a) vcvttps2dq
+__m256d _mm256_div_pd (__m256d a, __m256d b) vdivpd
+__m256 _mm256_div_ps (__m256 a, __m256 b) vdivps
+__m256 _mm256_dp_ps (__m256 a, __m256 b, const int imm8) vdpps
+__int32 _mm256_extract_epi32 (__m256i a, const int index)
+__int64 _mm256_extract_epi64 (__m256i a, const int index)
+__m128d _mm256_extractf128_pd (__m256d a, const int imm8) vextractf128
+__m128 _mm256_extractf128_ps (__m256 a, const int imm8) vextractf128
+__m128i _mm256_extractf128_si256 (__m256i a, const int imm8) vextractf128
+__m256d _mm256_floor_pd (__m256d a) vroundpd
+__m256 _mm256_floor_ps (__m256 a) vroundps
+__m256d _mm256_hadd_pd (__m256d a, __m256d b) vhaddpd
+__m256 _mm256_hadd_ps (__m256 a, __m256 b) vhaddps
+__m256d _mm256_hsub_pd (__m256d a, __m256d b) vhsubpd
+__m256 _mm256_hsub_ps (__m256 a, __m256 b) vhsubps
+__m256i _mm256_insert_epi16 (__m256i a, __int16 i, const int index)
+__m256i _mm256_insert_epi32 (__m256i a, __int32 i, const int index)
+__m256i _mm256_insert_epi64 (__m256i a, __int64 i, const int index)
+__m256i _mm256_insert_epi8 (__m256i a, __int8 i, const int index)
+__m256d _mm256_insertf128_pd (__m256d a, __m128d b, int imm8) vinsertf128
+__m256 _mm256_insertf128_ps (__m256 a, __m128 b, int imm8) vinsertf128
+__m256i _mm256_insertf128_si256 (__m256i a, __m128i b, int imm8) vinsertf128
+__m256i _mm256_lddqu_si256 (__m256i const * mem_addr) vlddqu
+__m256d _mm256_load_pd (double const * mem_addr) vmovapd
+__m256 _mm256_load_ps (float const * mem_addr) vmovaps
+__m256i _mm256_load_si256 (__m256i const * mem_addr) vmovdqa
+__m256d _mm256_loadu_pd (double const * mem_addr) vmovupd
+__m256 _mm256_loadu_ps (float const * mem_addr) vmovups
+__m256i _mm256_loadu_si256 (__m256i const * mem_addr) vmovdqu
+__m256 _mm256_loadu2_m128 (float const* hiaddr, float const* loaddr)
+__m256d _mm256_loadu2_m128d (double const* hiaddr, double const* loaddr)
+__m256i _mm256_loadu2_m128i (__m128i const* hiaddr, __m128i const* loaddr)
+__m128d _mm_maskload_pd (double const * mem_addr, __m128i mask) vmaskmovpd
+__m256d _mm256_maskload_pd (double const * mem_addr, __m256i mask) vmaskmovpd
+__m128 _mm_maskload_ps (float const * mem_addr, __m128i mask) vmaskmovps
+__m256 _mm256_maskload_ps (float const * mem_addr, __m256i mask) vmaskmovps
+void _mm_maskstore_pd (double * mem_addr, __m128i mask, __m128d a) vmaskmovpd
+void _mm256_maskstore_pd (double * mem_addr, __m256i mask, __m256d a) vmaskmovpd
+void _mm_maskstore_ps (float * mem_addr, __m128i mask, __m128 a) vmaskmovps
+void _mm256_maskstore_ps (float * mem_addr, __m256i mask, __m256 a) vmaskmovps
+__m256d _mm256_max_pd (__m256d a, __m256d b) vmaxpd
+__m256 _mm256_max_ps (__m256 a, __m256 b) vmaxps
+__m256d _mm256_min_pd (__m256d a, __m256d b) vminpd
+__m256 _mm256_min_ps (__m256 a, __m256 b) vminps
+__m256d _mm256_movedup_pd (__m256d a) vmovddup
+__m256 _mm256_movehdup_ps (__m256 a) vmovshdup
+__m256 _mm256_moveldup_ps (__m256 a) vmovsldup
+int _mm256_movemask_pd (__m256d a) vmovmskpd
+int _mm256_movemask_ps (__m256 a) vmovmskps
+__m256d _mm256_mul_pd (__m256d a, __m256d b) vmulpd
+__m256 _mm256_mul_ps (__m256 a, __m256 b) vmulps
+__m256d _mm256_or_pd (__m256d a, __m256d b) vorpd
+__m256 _mm256_or_ps (__m256 a, __m256 b) vorps
+__m128d _mm_permute_pd (__m128d a, int imm8) vpermilpd
+__m256d _mm256_permute_pd (__m256d a, int imm8) vpermilpd
+__m128 _mm_permute_ps (__m128 a, int imm8) vpermilps
+__m256 _mm256_permute_ps (__m256 a, int imm8) vpermilps
+__m256d _mm256_permute2f128_pd (__m256d a, __m256d b, int imm8) vperm2f128
+__m256 _mm256_permute2f128_ps (__m256 a, __m256 b, int imm8) vperm2f128
+__m256i _mm256_permute2f128_si256 (__m256i a, __m256i b, int imm8) vperm2f128
+__m128d _mm_permutevar_pd (__m128d a, __m128i b) vpermilpd
+__m256d _mm256_permutevar_pd (__m256d a, __m256i b) vpermilpd
+__m128 _mm_permutevar_ps (__m128 a, __m128i b) vpermilps
+__m256 _mm256_permutevar_ps (__m256 a, __m256i b) vpermilps
+__m256 _mm256_rcp_ps (__m256 a) vrcpps
+__m256d _mm256_round_pd (__m256d a, int rounding) vroundpd
+__m256 _mm256_round_ps (__m256 a, int rounding) vroundps
+__m256 _mm256_rsqrt_ps (__m256 a) vrsqrtps
+...
+__m256i _mm256_set_epi16 (short e15, short e14, short e13, short e12, short e11, short e10, short e9, short e8, short e7, short e6, short e5, short e4, short e3, short e2, short e1, short e0)
+__m256i _mm256_set_epi32 (int e7, int e6, int e5, int e4, int e3, int e2, int e1, int e0)
+__m256i _mm256_set_epi64x (__int64 e3, __int64 e2, __int64 e1, __int64 e0)
+__m256i _mm256_set_epi8 (char e31, char e30, char e29, char e28, char e27, char e26, char e25, char e24, char e23, char e22, char e21, char e20, char e19, char e18, char e17, char e16, char e15, char e14, char e13, char e12, char e11, char e10, char e9, char e8, char e7, char e6, char e5, char e4, char e3, char e2, char e1, char e0)
+__m256 _mm256_set_m128 (__m128 hi, __m128 lo) vinsertf128
+__m256d _mm256_set_m128d (__m128d hi, __m128d lo) vinsertf128
+__m256i _mm256_set_m128i (__m128i hi, __m128i lo) vinsertf128
+__m256d _mm256_set_pd (double e3, double e2, double e1, double e0)
+__m256 _mm256_set_ps (float e7, float e6, float e5, float e4, float e3, float e2, float e1, float e0)
+__m256i _mm256_set1_epi16 (short a)
+__m256i _mm256_set1_epi32 (int a)
+__m256i _mm256_set1_epi64x (long long a)
+__m256i _mm256_set1_epi8 (char a)
+__m256d _mm256_set1_pd (double a)
+__m256 _mm256_set1_ps (float a)
+__m256i _mm256_setr_epi16 (short e15, short e14, short e13, short e12, short e11, short e10, short e9, short e8, short e7, short e6, short e5, short e4, short e3, short e2, short e1, short e0)
+__m256i _mm256_setr_epi32 (int e7, int e6, int e5, int e4, int e3, int e2, int e1, int e0)
+__m256i _mm256_setr_epi64x (__int64 e3, __int64 e2, __int64 e1, __int64 e0)
+__m256i _mm256_setr_epi8 (char e31, char e30, char e29, char e28, char e27, char e26, char e25, char e24, char e23, char e22, char e21, char e20, char e19, char e18, char e17, char e16, char e15, char e14, char e13, char e12, char e11, char e10, char e9, char e8, char e7, char e6, char e5, char e4, char e3, char e2, char e1, char e0)
+__m256 _mm256_setr_m128 (__m128 lo, __m128 hi) vinsertf128
+__m256d _mm256_setr_m128d (__m128d lo, __m128d hi) vinsertf128
+__m256i _mm256_setr_m128i (__m128i lo, __m128i hi) vinsertf128
+__m256d _mm256_setr_pd (double e3, double e2, double e1, double e0)
+__m256 _mm256_setr_ps (float e7, float e6, float e5, float e4, float e3, float e2, float e1, float e0)
+__m256d _mm256_setzero_pd (void) vxorpd
+__m256 _mm256_setzero_ps (void) vxorps
+__m256i _mm256_setzero_si256 (void) vpxor
+__m256d _mm256_shuffle_pd (__m256d a, __m256d b, const int imm8) vshufpd
+__m256 _mm256_shuffle_ps (__m256 a, __m256 b, const int imm8) vshufps
+__m256d _mm256_sqrt_pd (__m256d a) vsqrtpd
+__m256 _mm256_sqrt_ps (__m256 a) vsqrtps
+void _mm256_store_pd (double * mem_addr, __m256d a) vmovapd
+void _mm256_store_ps (float * mem_addr, __m256 a) vmovaps
+void _mm256_store_si256 (__m256i * mem_addr, __m256i a) vmovdqa
+void _mm256_storeu_pd (double * mem_addr, __m256d a) vmovupd
+void _mm256_storeu_ps (float * mem_addr, __m256 a) vmovups
+void _mm256_storeu_si256 (__m256i * mem_addr, __m256i a) vmovdqu
+void _mm256_storeu2_m128 (float* hiaddr, float* loaddr, __m256 a)
+void _mm256_storeu2_m128d (double* hiaddr, double* loaddr, __m256d a)
+void _mm256_storeu2_m128i (__m128i* hiaddr, __m128i* loaddr, __m256i a)
+void _mm256_stream_pd (void* mem_addr, __m256d a) vmovntpd
+void _mm256_stream_ps (void* mem_addr, __m256 a) vmovntps
+void _mm256_stream_si256 (void* mem_addr, __m256i a) vmovntdq
+__m256d _mm256_sub_pd (__m256d a, __m256d b) vsubpd
+__m256 _mm256_sub_ps (__m256 a, __m256 b) vsubps
+int _mm_testc_pd (__m128d a, __m128d b) vtestpd
+int _mm256_testc_pd (__m256d a, __m256d b) vtestpd
+int _mm_testc_ps (__m128 a, __m128 b) vtestps
+int _mm256_testc_ps (__m256 a, __m256 b) vtestps
+int _mm256_testc_si256 (__m256i a, __m256i b) vptest
+int _mm_testnzc_pd (__m128d a, __m128d b) vtestpd
+int _mm256_testnzc_pd (__m256d a, __m256d b) vtestpd
+int _mm_testnzc_ps (__m128 a, __m128 b) vtestps
+int _mm256_testnzc_ps (__m256 a, __m256 b) vtestps
+int _mm256_testnzc_si256 (__m256i a, __m256i b) vptest
+int _mm_testz_pd (__m128d a, __m128d b) vtestpd
+int _mm256_testz_pd (__m256d a, __m256d b) vtestpd
+int _mm_testz_ps (__m128 a, __m128 b) vtestps
+int _mm256_testz_ps (__m256 a, __m256 b) vtestps
+int _mm256_testz_si256 (__m256i a, __m256i b) vptest
+__m256d _mm256_undefined_pd (void)
+__m256 _mm256_undefined_ps (void)
+__m256i _mm256_undefined_si256 (void)
+__m256d _mm256_unpackhi_pd (__m256d a, __m256d b) vunpckhpd
+__m256 _mm256_unpackhi_ps (__m256 a, __m256 b) vunpckhps
+__m256d _mm256_unpacklo_pd (__m256d a, __m256d b) vunpcklpd
+__m256 _mm256_unpacklo_ps (__m256 a, __m256 b) vunpcklps
+__m256d _mm256_xor_pd (__m256d a, __m256d b) vxorpd
+__m256 _mm256_xor_ps (__m256 a, __m256 b) vxorps
+void _mm256_zeroall (void) vzeroall
+void _mm256_zeroupper (void) vzeroupper
+__m256d _mm256_zextpd128_pd256 (__m128d a)
+__m256 _mm256_zextps128_ps256 (__m128 a)
+__m256i _mm256_zextsi128_si256 (__m128i a)
+
+```
 ### AVX2 instructions
 ```
@@ -370,3 +558,104 @@ __m256i _mm256_unpacklo_epi64 (__m256i a, __m256i b) vpunpcklqdq
 __m256i _mm256_unpacklo_epi8 (__m256i a, __m256i b) vpunpcklbw
 __m256i _mm256_xor_si256 (__m256i a, __m256i b) vpxor
 ```
+
+### Instructions common to AVX and AVX2
+```
+AVX  : __m256d _mm256_broadcast_sd (double const * mem_addr) vbroadcastsd
+AVX2 : __m256d _mm256_broadcastsd_pd (__m128d a) vbroadcastsd
+
+AVX  : __m128 _mm_broadcast_ss (float const * mem_addr) vbroadcastss
+AVX  : __m256 _mm256_broadcast_ss (float const * mem_addr) vbroadcastss
+AVX2 : __m128 _mm_broadcastss_ps (__m128 a) vbroadcastss
+AVX2 : __m256 _mm256_broadcastss_ps (__m128 a) vbroadcastss
+
+AVX  : __m256i _mm256_setzero_si256 (void) vpxor
+AVX2 : __m256i _mm256_xor_si256 (__m256i a, __m256i b) vpxor
+```
+
+### Order of instruction set extensions flags
+This section is about GCC and an issue that I have encountered related to the
+order of machine-specific options (`-m`).
+
+The intention of this test is to check that the `__SSSE3__` macro is being set
+correctly. This is sensitive to the order of the options specified to the
+compiler. For example, if `-mssse3` is specified and later `-mavx`, the
+`__SSSE3__` macro will still be defined, which can be somewhat surprising.
+This is not really an issue with this simple test and one Makefile, but in a
+larger project using CMake and including multiple directories, all with their
+own CMakeLists.txt files, it can be difficult to ensure that the correct flags
+are being passed to the compiler. What is even more concerning is that you
+probably won't notice this issue until you run the code or inspect the
+generated machine code.
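+
+To make the effect of the flag order visible, the macros can be checked with a
+small program (a minimal sketch; the file name `check_macros.c` and the flags
+shown are just for illustration):
+```c
+#include <stdio.h>
+
+int main(void) {
+#ifdef __SSSE3__
+    printf("__SSSE3__ is defined\n");
+#endif
+#ifdef __AVX__
+    printf("__AVX__ is defined\n");
+#endif
+    return 0;
+}
+```
+Compiling this with `gcc -mssse3 -mavx check_macros.c` should print both lines
+since `-mavx` implies SSSE3 support as well. GCC's `-dM -E` options can also
+be used to dump the predefined macros without running anything.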
+
+### Example: SSSE3 intrinsics compiled with AVX enabled
+```c
+#if defined(__AVX__) || defined(__AVX2__) || defined(__AVX512F__) || defined(__SSSE3__)
+// multiply int8_t, add results pairwise twice
+static inline __m128i mul_sum_i8_pairs(const __m128i x, const __m128i y) {
+    // Get absolute values of x vectors. This instruction is available in SSSE3
+    const __m128i ax = _mm_sign_epi8(x, x);
+
+    // Sign the values of the y vectors. This instruction is available in SSSE3
+    const __m128i sy = _mm_sign_epi8(y, x);
+
+    // Perform multiplication and create 16-bit values. This instruction is available in SSSE3
+    const __m128i dot = _mm_maddubs_epi16(ax, sy);
+
+    // Set each 16-bit element of the 128-bit vector to 1. From SSE2
+    const __m128i ones = _mm_set1_epi16(1);
+
+    // Multiply packed signed 16-bit integers, from SSE2
+    return _mm_madd_epi16(ones, dot);
+}
+#endif
+```
+If we dump the assembly code for this function we see something interesting:
+```console
+000000000048609f <mul_sum_i8_pairs>:
+  48609f: 55                      push   rbp
+  4860a0: 48 89 e5                mov    rbp,rsp
+  4860a3: 48 81 ec 88 00 00 00    sub    rsp,0x88
+  4860aa: c5 f9 7f 85 10 ff ff    vmovdqa XMMWORD PTR [rbp-0xf0],xmm0
+  4860b1: ff
+  4860b2: c5 f9 7f 8d 00 ff ff    vmovdqa XMMWORD PTR [rbp-0x100],xmm1
+  4860b9: ff
+  4860ba: c5 f9 6f 85 10 ff ff    vmovdqa xmm0,XMMWORD PTR [rbp-0xf0]
+  4860c1: ff
+  4860c2: c5 f9 7f 45 e0          vmovdqa XMMWORD PTR [rbp-0x20],xmm0
+  4860c7: c5 f9 6f 85 10 ff ff    vmovdqa xmm0,XMMWORD PTR [rbp-0xf0]
+  4860ce: ff
+  4860cf: c5 f9 7f 45 f0          vmovdqa XMMWORD PTR [rbp-0x10],xmm0
+  4860d4: c5 f9 6f 4d f0          vmovdqa xmm1,XMMWORD PTR [rbp-0x10]
+  4860d9: c5 f9 6f 45 e0          vmovdqa xmm0,XMMWORD PTR [rbp-0x20]
+  4860de: c4 e2 79 08 c1          vpsignb xmm0,xmm0,xmm1
+  4860e3: c5 f9 7f 85 40 ff ff    vmovdqa XMMWORD PTR [rbp-0xc0],xmm0
+  4860ea: ff
+  4860eb: c5 f9 6f 85 00 ff ff    vmovdqa xmm0,XMMWORD PTR [rbp-0x100]
+  4860f2: ff
+  4860f3: c5 f9 7f 45 c0          vmovdqa XMMWORD PTR [rbp-0x40],xmm0
+  4860f8: c5 f9 6f 85 10 ff ff    vmovdqa xmm0,XMMWORD PTR [rbp-0xf0]
+  4860ff: ff
+  486100: c5 f9 7f 45 d0          vmovdqa XMMWORD PTR [rbp-0x30],xmm0
+  486105: c5 f9 6f 4d d0          vmovdqa xmm1,XMMWORD PTR [rbp-0x30]
+  48610a: c5 f9 6f 45 c0          vmovdqa xmm0,XMMWORD PTR [rbp-0x40]
+  48610f: c4 e2 79 08 c1          vpsignb xmm0,xmm0,xmm1
+  486114: c5 f9 7f 85 50 ff ff    vmovdqa XMMWORD PTR [rbp-0xb0],xmm0
+  48611b: ff
+  48611c: c5 f9 6f 85 40 ff ff    vmovdqa xmm0,XMMWORD PTR [rbp-0xc0]
+  486123: ff
+  486124: c5 f9 7f 45 a0          vmovdqa XMMWORD PTR [rbp-0x60],xmm0
+  486129: c5 f9 6f 85 50 ff ff    vmovdqa xmm0,XMMWORD PTR [rbp-0xb0]
+  486130: ff
+  486131: c5 f9 7f 45 b0          vmovdqa XMMWORD PTR [rbp-0x50],xmm0
+  486136: c5 f9 6f 4d b0          vmovdqa xmm1,XMMWORD PTR [rbp-0x50]
+  48613b: c5 f9 6f 45 a0          vmovdqa xmm0,XMMWORD PTR [rbp-0x60]
+  486140: c4 e2 79 04 c1          vpmaddubsw xmm0,xmm0,xmm1
+  486145: c5 f9 7f 85 60 ff ff    vmovdqa XMMWORD PTR [rbp-0xa0],xmm0
+  48614c: ff
+```
+Notice the `vpmaddubsw` instruction which is only available in AVX and AVX2
+(the non-VEX encoded form, `pmaddubsw`, is the SSSE3 instruction):
+```
+486140: c4 e2 79 04 c1          vpmaddubsw xmm0,xmm0,xmm1
+```
+Now, `xmm0` and `xmm1` are 128-bit registers, while the AVX2 form of
+`vpmaddubsw` uses the 256-bit registers `ymm0`-`ymm15`. This is a clear
+indication that the function was compiled with AVX enabled (hence the
+VEX-encoded `vpmaddubsw`), and not as the plain SSSE3 version that was
+intended.
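+
+To verify the effect of the flag order on this function, a small driver can be
+used (a sketch; the driver below and the expected lane value are assumptions,
+only the intrinsics themselves come from the function above):
+```c
+#include <stdio.h>
+#include <immintrin.h>
+
+// Same function as above, repeated here so the file is self-contained.
+static inline __m128i mul_sum_i8_pairs(const __m128i x, const __m128i y) {
+    const __m128i ax   = _mm_sign_epi8(x, x);       // |x|, SSSE3
+    const __m128i sy   = _mm_sign_epi8(y, x);       // y with the signs of x, SSSE3
+    const __m128i dot  = _mm_maddubs_epi16(ax, sy); // u8*i8 pairs -> i16, SSSE3
+    const __m128i ones = _mm_set1_epi16(1);         // SSE2
+    return _mm_madd_epi16(ones, dot);               // i16 pairs -> i32 sums, SSE2
+}
+
+int main(void) {
+    const __m128i x = _mm_set1_epi8(2);
+    const __m128i y = _mm_set1_epi8(3);
+    int out[4];
+    _mm_storeu_si128((__m128i*)out, mul_sum_i8_pairs(x, y));
+    // Each 32-bit lane sums four 2*3 products, so every lane should be 24.
+    printf("%d %d %d %d\n", out[0], out[1], out[2], out[3]);
+    return 0;
+}
+```
+Building this with only `-mssse3` should produce the non-VEX `pmaddubsw` in
+the disassembly, while adding `-mavx` after it switches the encoding to the
+VEX-encoded `vpmaddubsw` seen in the dump above.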