Skip to content

Commit

Permalink
Add Neon I8MM implementation of vpx_convolve12
Browse files Browse the repository at this point in the history
Add an Armv8.6 Neon I8MM implementation of vpx_convolve12 and
associated unit tests.

Change-Id: Ib3891565bb2b64b6690ac1ee61d82a4906f81d07
  • Loading branch information
jwright-arm committed Jan 8, 2025
1 parent bc04738 commit dbd0d11
Show file tree
Hide file tree
Showing 3 changed files with 103 additions and 2 deletions.
2 changes: 1 addition & 1 deletion test/convolve_test.cc
Original file line number Diff line number Diff line change
Expand Up @@ -2165,7 +2165,7 @@ INSTANTIATE_TEST_SUITE_P(NEON_I8MM, ConvolveTest,
#if !CONFIG_REALTIME_ONLY && CONFIG_VP9_ENCODER
const ConvolveFunctions12Tap convolve12Tap_neon_i8mm(
vpx_convolve12_horiz_neon_i8mm, vpx_convolve12_vert_neon_i8mm,
vpx_convolve12_neon, 0);
vpx_convolve12_neon_i8mm, 0);
const Convolve12TapParam kArrayConvolve12Tap_neon_i8mm[] = { ALL_SIZES_12TAP(
convolve12Tap_neon_i8mm) };
INSTANTIATE_TEST_SUITE_P(NEON_I8MM, ConvolveTest12Tap,
Expand Down
2 changes: 1 addition & 1 deletion vp9/common/vp9_rtcd_defs.pl
Original file line number Diff line number Diff line change
Expand Up @@ -202,7 +202,7 @@ ()
specialize qw/vpx_convolve12_horiz ssse3 avx2 neon neon_dotprod neon_i8mm/;

add_proto qw/void vpx_convolve12/, "const uint8_t *src, ptrdiff_t src_stride, uint8_t *dst, ptrdiff_t dst_stride, const InterpKernel12 *filter, int x0_q4, int x_step_q4, int y0_q4, int y_step_q4, int w, int h";
specialize qw/vpx_convolve12 ssse3 avx2 neon neon_dotprod/;
specialize qw/vpx_convolve12 ssse3 avx2 neon neon_dotprod neon_i8mm/;

if (vpx_config("CONFIG_VP9_HIGHBITDEPTH") eq "yes") {
add_proto qw/void vpx_highbd_convolve12_vert/, "const uint16_t *src, ptrdiff_t src_stride, uint16_t *dst, ptrdiff_t dst_stride, const InterpKernel12 *filter, int x0_q4, int x_step_q4, int y0_q4, int y_step_q4, int w, int h, int bd";
Expand Down
101 changes: 101 additions & 0 deletions vp9/encoder/arm/neon/vp9_temporal_filter_neon_i8mm.c
Original file line number Diff line number Diff line change
Expand Up @@ -273,3 +273,104 @@ void vpx_convolve12_vert_neon_i8mm(const uint8_t *src, ptrdiff_t src_stride,
w -= 8;
} while (w != 0);
}

static INLINE void vpx_convolve12_2d_horiz_neon_i8mm(
const uint8_t *src, ptrdiff_t src_stride, uint8_t *dst,
ptrdiff_t dst_stride, const InterpKernel12 *filter, int x0_q4, int w,
int h) {
assert(w == 32 || w == 16 || w == 8);
assert(h % 4 == 3);

// Split 12-tap filter into two 6-tap filters, masking the top two elements.
// { 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0, 0 }
const int8x8_t mask = vcreate_s8(0x0000ffffffffffff);
const int8x8_t filter_0 = vand_s8(vmovn_s16(vld1q_s16(filter[x0_q4])), mask);
const int8x8_t filter_1 =
vext_s8(vmovn_s16(vld1q_s16(filter[x0_q4] + 4)), vdup_n_s8(0), 2);

// Stagger each 6-tap filter to enable use of matrix multiply instructions.
// { f0, f1, f2, f3, f4, f5, 0, 0, 0, f0, f1, f2, f3, f4, f5, 0 }
const int8x16_t x_filter[2] = {
vcombine_s8(filter_0, vext_s8(filter_0, filter_0, 7)),
vcombine_s8(filter_1, vext_s8(filter_1, filter_1, 7))
};

const uint8x16x2_t permute_tbl = vld1q_u8_x2(kMatMulPermuteTbl);

src -= MAX_FILTER_TAP / 2 - 1;

do {
const uint8_t *s = src;
uint8_t *d = dst;
int width = w;

do {
uint8x16_t s0[2], s1[2], s2[2], s3[2];
load_u8_16x4(s, src_stride, &s0[0], &s1[0], &s2[0], &s3[0]);
load_u8_16x4(s + 6, src_stride, &s0[1], &s1[1], &s2[1], &s3[1]);

uint8x8_t d0 = convolve12_8_h(s0, x_filter, permute_tbl);
uint8x8_t d1 = convolve12_8_h(s1, x_filter, permute_tbl);
uint8x8_t d2 = convolve12_8_h(s2, x_filter, permute_tbl);
uint8x8_t d3 = convolve12_8_h(s3, x_filter, permute_tbl);

store_u8_8x4(d, dst_stride, d0, d1, d2, d3);

s += 8;
d += 8;
width -= 8;
} while (width != 0);
src += 4 * src_stride;
dst += 4 * dst_stride;
h -= 4;
} while (h != 3);

do {
uint8x16_t s0[2], s1[2], s2[2];
load_u8_16x3(src, src_stride, &s0[0], &s1[0], &s2[0]);
load_u8_16x3(src + 6, src_stride, &s0[1], &s1[1], &s2[1]);

uint8x8_t d0 = convolve12_8_h(s0, x_filter, permute_tbl);
uint8x8_t d1 = convolve12_8_h(s1, x_filter, permute_tbl);
uint8x8_t d2 = convolve12_8_h(s2, x_filter, permute_tbl);

store_u8_8x3(dst, dst_stride, d0, d1, d2);

src += 8;
dst += 8;
w -= 8;
} while (w != 0);
}

void vpx_convolve12_neon_i8mm(const uint8_t *src, ptrdiff_t src_stride,
uint8_t *dst, ptrdiff_t dst_stride,
const InterpKernel12 *filter, int x0_q4,
int x_step_q4, int y0_q4, int y_step_q4, int w,
int h) {
// Scaling not supported by Neon implementation.
if (x_step_q4 != 16 || y_step_q4 != 16) {
vpx_convolve12_c(src, src_stride, dst, dst_stride, filter, x0_q4, x_step_q4,
y0_q4, y_step_q4, w, h);
return;
}

assert(w == 32 || w == 16 || w == 8);
assert(h == 32 || h == 16 || h == 8);

DECLARE_ALIGNED(32, uint8_t, im_block[BW * (BH + MAX_FILTER_TAP)]);

const int im_stride = BW;
// Account for the vertical pass needing MAX_FILTER_TAP / 2 - 1 lines prior
// and MAX_FILTER_TAP / 2 lines post.
const int im_height = h + MAX_FILTER_TAP - 1;
const ptrdiff_t border_offset = MAX_FILTER_TAP / 2 - 1;

// Filter starting border_offset rows up.
vpx_convolve12_2d_horiz_neon_i8mm(src - src_stride * border_offset,
src_stride, im_block, im_stride, filter,
x0_q4, w, im_height);

vpx_convolve12_vert_neon_i8mm(im_block + im_stride * border_offset, im_stride,
dst, dst_stride, filter, x0_q4, x_step_q4,
y0_q4, y_step_q4, w, h);
}

0 comments on commit dbd0d11

Please sign in to comment.