Fix AVX2 int4pack_mm_kernel crash if weights are unaligned (pytorch#12…

…4433) Follow-up to pytorch#124128: `s/_mm_load_si128/_mm_loadu_si128/` (use the unaligned 128-bit load, since the weight pointer is not guaranteed to be 16-byte aligned). Pull Request resolved: pytorch#124433. Approved by: https://github.com/desertfire
masnesral · Apr 19, 2024 · b2f6cfd · b2f6cfd
1 parent a6f044a
commit b2f6cfd
Showing 1 changed file with 1 addition and 1 deletion.
diff --git a/aten/src/ATen/native/cpu/int4mm_kernel.cpp b/aten/src/ATen/native/cpu/int4mm_kernel.cpp
@@ -280,7 +280,7 @@ inline void tinygemm_kernel(
         // when BLOCK_N = 32, handle each row at a time
         if constexpr (col == 0) {
           __m256i mask = _mm256_set1_epi32(0xF);
-          __m128i b4 = _mm_load_si128((__m128i*)(B + k * ldb));
+          __m128i b4 = _mm_loadu_si128((__m128i*)(B + k * ldb));
           if (k + PREFETCH_SIZE_K < K) {
             _mm_prefetch(B + (k + PREFETCH_SIZE_K) * ldb, _MM_HINT_T0);
           }