Add Neon I8MM implementation of vpx_convolve12

Add an Armv8.6 Neon I8MM implementation of vpx_convolve12 and associated unit tests. Change-Id: Ib3891565bb2b64b6690ac1ee61d82a4906f81d07
webmproject · Jan 8, 2025 · dbd0d11 · dbd0d11
1 parent bc04738
commit dbd0d11
Show file tree

Hide file tree

Showing 3 changed files with 103 additions and 2 deletions.
diff --git a/test/convolve_test.cc b/test/convolve_test.cc
@@ -2165,7 +2165,7 @@ INSTANTIATE_TEST_SUITE_P(NEON_I8MM, ConvolveTest,
 #if !CONFIG_REALTIME_ONLY && CONFIG_VP9_ENCODER
 const ConvolveFunctions12Tap convolve12Tap_neon_i8mm(
     vpx_convolve12_horiz_neon_i8mm, vpx_convolve12_vert_neon_i8mm,
-    vpx_convolve12_neon, 0);
+    vpx_convolve12_neon_i8mm, 0);
 const Convolve12TapParam kArrayConvolve12Tap_neon_i8mm[] = { ALL_SIZES_12TAP(
     convolve12Tap_neon_i8mm) };
 INSTANTIATE_TEST_SUITE_P(NEON_I8MM, ConvolveTest12Tap,

diff --git a/vp9/common/vp9_rtcd_defs.pl b/vp9/common/vp9_rtcd_defs.pl
@@ -202,7 +202,7 @@ ()
   specialize qw/vpx_convolve12_horiz ssse3 avx2 neon neon_dotprod neon_i8mm/;
 
   add_proto qw/void vpx_convolve12/, "const uint8_t *src, ptrdiff_t src_stride, uint8_t *dst, ptrdiff_t dst_stride, const InterpKernel12 *filter, int x0_q4, int x_step_q4, int y0_q4, int y_step_q4, int w, int h";
-  specialize qw/vpx_convolve12 ssse3 avx2 neon neon_dotprod/;
+  specialize qw/vpx_convolve12 ssse3 avx2 neon neon_dotprod neon_i8mm/;
 
   if (vpx_config("CONFIG_VP9_HIGHBITDEPTH") eq "yes") {
     add_proto qw/void vpx_highbd_convolve12_vert/, "const uint16_t *src, ptrdiff_t src_stride, uint16_t *dst, ptrdiff_t dst_stride, const InterpKernel12 *filter, int x0_q4, int x_step_q4, int y0_q4, int y_step_q4, int w, int h, int bd";

diff --git a/vp9/encoder/arm/neon/vp9_temporal_filter_neon_i8mm.c b/vp9/encoder/arm/neon/vp9_temporal_filter_neon_i8mm.c
@@ -273,3 +273,104 @@ void vpx_convolve12_vert_neon_i8mm(const uint8_t *src, ptrdiff_t src_stride,
     w -= 8;
   } while (w != 0);
 }
+
+static INLINE void vpx_convolve12_2d_horiz_neon_i8mm(
+    const uint8_t *src, ptrdiff_t src_stride, uint8_t *dst,
+    ptrdiff_t dst_stride, const InterpKernel12 *filter, int x0_q4, int w,
+    int h) {
+  assert(w == 32 || w == 16 || w == 8);
+  assert(h % 4 == 3);
+
+  // Split 12-tap filter into two 6-tap filters, masking the top two elements.
+  // { 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0, 0 }
+  const int8x8_t mask = vcreate_s8(0x0000ffffffffffff);
+  const int8x8_t filter_0 = vand_s8(vmovn_s16(vld1q_s16(filter[x0_q4])), mask);
+  const int8x8_t filter_1 =
+      vext_s8(vmovn_s16(vld1q_s16(filter[x0_q4] + 4)), vdup_n_s8(0), 2);
+
+  // Stagger each 6-tap filter to enable use of matrix multiply instructions.
+  // { f0, f1, f2, f3, f4, f5,  0,  0,  0, f0, f1, f2, f3, f4, f5,  0 }
+  const int8x16_t x_filter[2] = {
+    vcombine_s8(filter_0, vext_s8(filter_0, filter_0, 7)),
+    vcombine_s8(filter_1, vext_s8(filter_1, filter_1, 7))
+  };
+
+  const uint8x16x2_t permute_tbl = vld1q_u8_x2(kMatMulPermuteTbl);
+
+  src -= MAX_FILTER_TAP / 2 - 1;
+
+  do {
+    const uint8_t *s = src;
+    uint8_t *d = dst;
+    int width = w;
+
+    do {
+      uint8x16_t s0[2], s1[2], s2[2], s3[2];
+      load_u8_16x4(s, src_stride, &s0[0], &s1[0], &s2[0], &s3[0]);
+      load_u8_16x4(s + 6, src_stride, &s0[1], &s1[1], &s2[1], &s3[1]);
+
+      uint8x8_t d0 = convolve12_8_h(s0, x_filter, permute_tbl);
+      uint8x8_t d1 = convolve12_8_h(s1, x_filter, permute_tbl);
+      uint8x8_t d2 = convolve12_8_h(s2, x_filter, permute_tbl);
+      uint8x8_t d3 = convolve12_8_h(s3, x_filter, permute_tbl);
+
+      store_u8_8x4(d, dst_stride, d0, d1, d2, d3);
+
+      s += 8;
+      d += 8;
+      width -= 8;
+    } while (width != 0);
+    src += 4 * src_stride;
+    dst += 4 * dst_stride;
+    h -= 4;
+  } while (h != 3);
+
+  do {
+    uint8x16_t s0[2], s1[2], s2[2];
+    load_u8_16x3(src, src_stride, &s0[0], &s1[0], &s2[0]);
+    load_u8_16x3(src + 6, src_stride, &s0[1], &s1[1], &s2[1]);
+
+    uint8x8_t d0 = convolve12_8_h(s0, x_filter, permute_tbl);
+    uint8x8_t d1 = convolve12_8_h(s1, x_filter, permute_tbl);
+    uint8x8_t d2 = convolve12_8_h(s2, x_filter, permute_tbl);
+
+    store_u8_8x3(dst, dst_stride, d0, d1, d2);
+
+    src += 8;
+    dst += 8;
+    w -= 8;
+  } while (w != 0);
+}
+
+void vpx_convolve12_neon_i8mm(const uint8_t *src, ptrdiff_t src_stride,
+                              uint8_t *dst, ptrdiff_t dst_stride,
+                              const InterpKernel12 *filter, int x0_q4,
+                              int x_step_q4, int y0_q4, int y_step_q4, int w,
+                              int h) {
+  // Scaling not supported by Neon implementation.
+  if (x_step_q4 != 16 || y_step_q4 != 16) {
+    vpx_convolve12_c(src, src_stride, dst, dst_stride, filter, x0_q4, x_step_q4,
+                     y0_q4, y_step_q4, w, h);
+    return;
+  }
+
+  assert(w == 32 || w == 16 || w == 8);
+  assert(h == 32 || h == 16 || h == 8);
+
+  DECLARE_ALIGNED(32, uint8_t, im_block[BW * (BH + MAX_FILTER_TAP)]);
+
+  const int im_stride = BW;
+  // Account for the vertical pass needing MAX_FILTER_TAP / 2 - 1 lines prior
+  // and MAX_FILTER_TAP / 2 lines post.
+  const int im_height = h + MAX_FILTER_TAP - 1;
+  const ptrdiff_t border_offset = MAX_FILTER_TAP / 2 - 1;
+
+  // Filter starting border_offset rows up.
+  vpx_convolve12_2d_horiz_neon_i8mm(src - src_stride * border_offset,
+                                    src_stride, im_block, im_stride, filter,
+                                    x0_q4, w, im_height);
+
+  vpx_convolve12_vert_neon_i8mm(im_block + im_stride * border_offset, im_stride,
+                                dst, dst_stride, filter, x0_q4, x_step_q4,
+                                y0_q4, y_step_q4, w, h);
+}