tor-browser

The Tor Browser
git clone https://git.dasho.dev/tor-browser.git
Log | Files | Refs | README | LICENSE

convolve_neon_i8mm.c (57512B)


      1 /*
      2 * Copyright (c) 2023, Alliance for Open Media. All rights reserved.
      3 *
      4 * This source code is subject to the terms of the BSD 2 Clause License and
      5 * the Alliance for Open Media Patent License 1.0. If the BSD 2 Clause License
      6 * was not distributed with this source code in the LICENSE file, you can
      7 * obtain it at www.aomedia.org/license/software. If the Alliance for Open
      8 * Media Patent License 1.0 was not distributed with this source code in the
      9 * PATENTS file, you can obtain it at www.aomedia.org/license/patent.
     10 */
     11 
     12 #include <arm_neon.h>
     13 
     14 #include "config/aom_config.h"
     15 #include "config/av1_rtcd.h"
     16 
     17 #include "aom_dsp/aom_dsp_common.h"
     18 #include "aom_dsp/arm/mem_neon.h"
     19 #include "aom_dsp/arm/transpose_neon.h"
     20 #include "aom_ports/mem.h"
     21 #include "av1/common/arm/convolve_neon.h"
     22 #include "av1/common/arm/convolve_neon_i8mm.h"
     23 #include "av1/common/convolve.h"
     24 #include "av1/common/filter.h"
     25 
// Table indices for vqtbl2q_u8 used by the vertical kernels to merge a
// freshly-transposed 4x4 block of new rows into the transposed block kept
// from the previous loop iteration. Indices 0-15 select bytes from the first
// table register (old block); indices 16+ select from the second (new block).
DECLARE_ALIGNED(16, static const uint8_t, kDotProdMergeBlockTbl[48]) = {
  // Shift left and insert new last column in transposed 4x4 block.
  1, 2, 3, 16, 5, 6, 7, 20, 9, 10, 11, 24, 13, 14, 15, 28,
  // Shift left and insert two new columns in transposed 4x4 block.
  2, 3, 16, 17, 6, 7, 20, 21, 10, 11, 24, 25, 14, 15, 28, 29,
  // Shift left and insert three new columns in transposed 4x4 block.
  3, 16, 17, 18, 7, 20, 21, 22, 11, 24, 25, 26, 15, 28, 29, 30
};
     34 
     35 static inline int16x4_t convolve12_4_x(uint8x16_t samples[2],
     36                                       const int8x16_t filter[2],
     37                                       const uint8x16_t permute_tbl,
     38                                       const int32x4_t horiz_const) {
     39  // Permute samples ready for matrix multiply.
     40  // {  0,  1,  2,  3,  4,  5,  6,  7,  2,  3,  4,  5,  6,  7,  8,  9 }
     41  // {  4,  5,  6,  7,  8,  9, 10, 11,  6,  7,  8,  9, 10, 11, 12, 13 }
     42  uint8x16_t perm_samples[2] = { vqtbl1q_u8(samples[0], permute_tbl),
     43                                 vqtbl1q_u8(samples[1], permute_tbl) };
     44 
     45  // These instructions multiply a 2x8 matrix (samples) by an 8x2 matrix
     46  // (filter), destructively accumulating into the destination register.
     47  int32x4_t sum = vusmmlaq_s32(horiz_const, perm_samples[0], filter[0]);
     48  sum = vusmmlaq_s32(sum, perm_samples[1], filter[1]);
     49 
     50  return vshrn_n_s32(sum, 1);
     51 }
     52 
     53 static inline uint8x8_t convolve12_8_x(uint8x16_t samples[2],
     54                                       const int8x16_t filter[2],
     55                                       const uint8x16x2_t permute_tbl,
     56                                       const int32x4_t horiz_const) {
     57  // Permute samples ready for matrix multiply.
     58  // {  0,  1,  2,  3,  4,  5,  6,  7,  2,  3,  4,  5,  6,  7,  8,  9 }
     59  // {  4,  5,  6,  7,  8,  9, 10, 11,  6,  7,  8,  9, 10, 11, 12, 13 }
     60  // {  6,  7,  8,  9, 10, 11, 12, 13,  8,  9, 10, 11, 12, 13, 14, 15 }
     61  // { 10, 11, 12, 13, 14, 15, 16, 17, 12, 13, 14, 15, 16, 17, 18, 19 }
     62  uint8x16_t perm_samples[4] = { vqtbl1q_u8(samples[0], permute_tbl.val[0]),
     63                                 vqtbl1q_u8(samples[0], permute_tbl.val[1]),
     64                                 vqtbl1q_u8(samples[1], permute_tbl.val[0]),
     65                                 vqtbl1q_u8(samples[1], permute_tbl.val[1]) };
     66 
     67  // These instructions multiply a 2x8 matrix (samples) by an 8x2 matrix
     68  // (filter), destructively accumulating into the destination register.
     69  int32x4_t sum0123 = vusmmlaq_s32(horiz_const, perm_samples[0], filter[0]);
     70  int32x4_t sum4567 = vusmmlaq_s32(horiz_const, perm_samples[1], filter[0]);
     71  sum0123 = vusmmlaq_s32(sum0123, perm_samples[2], filter[1]);
     72  sum4567 = vusmmlaq_s32(sum4567, perm_samples[3], filter[1]);
     73 
     74  // Narrow and re-pack.
     75  int16x8_t sum_s16 =
     76      vcombine_s16(vshrn_n_s32(sum0123, 1), vshrn_n_s32(sum4567, 1));
     77  return vqrshrun_n_s16(sum_s16, FILTER_BITS - 1);
     78 }
     79 
// Horizontal single-reference convolution with a 12-tap filter, implemented
// via the i8mm USMMLA matrix-multiply path. The 12-tap filter is split into
// two staggered 6-tap halves applied to two overlapping 16-byte sample loads
// (the second offset by 6 samples).
static inline void convolve_x_sr_12tap_neon_i8mm(const uint8_t *src,
                                                int src_stride, uint8_t *dst,
                                                int dst_stride, int w, int h,
                                                const int16_t *x_filter_ptr) {
  // The no-op filter should never be used here.
  assert(x_filter_ptr[5] != 128);

  // Split 12-tap filter into two 6-tap filters, masking the top two elements.
  // { 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0, 0 }
  const int8x8_t mask = vcreate_s8(0x0000ffffffffffff);
  const int8x8_t filter_0 = vand_s8(vmovn_s16(vld1q_s16(x_filter_ptr)), mask);
  const int8x8_t filter_1 =
      vext_s8(vmovn_s16(vld1q_s16(x_filter_ptr + 4)), vdup_n_s8(0), 2);

  // Stagger each 6-tap filter to enable use of matrix multiply instructions.
  // { f0, f1, f2, f3, f4, f5,  0,  0,  0, f0, f1, f2, f3, f4, f5,  0 }
  const int8x16_t filter[2] = {
    vcombine_s8(filter_0, vext_s8(filter_0, filter_0, 7)),
    vcombine_s8(filter_1, vext_s8(filter_1, filter_1, 7))
  };

  // A shim of 1 << (ROUND0_BITS - 1) enables us to simplify computation in the
  // convolution kernels: Adding this shim enables us to use a single rounding
  // right shift by FILTER_BITS instead of two rounding right shifts: first by
  // ROUND0_BITS, and then subsequently by FILTER_BITS - ROUND0_BITS.
  const int32x4_t horiz_const = vdupq_n_s32(1 << (ROUND0_BITS - 1));

  if (w <= 4) {
    const uint8x16_t permute_tbl = vld1q_u8(kMatMulPermuteTbl);

    // Narrow-block path: process 4 rows per iteration, 4 pixels per row.
    do {
      uint8x16_t s0[2], s1[2], s2[2], s3[2];
      // Second load window starts 6 samples in, for the second 6-tap half.
      load_u8_16x4(src, src_stride, &s0[0], &s1[0], &s2[0], &s3[0]);
      load_u8_16x4(src + 6, src_stride, &s0[1], &s1[1], &s2[1], &s3[1]);

      int16x4_t d0 = convolve12_4_x(s0, filter, permute_tbl, horiz_const);
      int16x4_t d1 = convolve12_4_x(s1, filter, permute_tbl, horiz_const);
      int16x4_t d2 = convolve12_4_x(s2, filter, permute_tbl, horiz_const);
      int16x4_t d3 = convolve12_4_x(s3, filter, permute_tbl, horiz_const);

      uint8x8_t d01 = vqrshrun_n_s16(vcombine_s16(d0, d1), FILTER_BITS - 1);
      uint8x8_t d23 = vqrshrun_n_s16(vcombine_s16(d2, d3), FILTER_BITS - 1);

      store_u8x4_strided_x2(dst + 0 * dst_stride, dst_stride, d01);
      store_u8x4_strided_x2(dst + 2 * dst_stride, dst_stride, d23);

      dst += 4 * dst_stride;
      src += 4 * src_stride;
      h -= 4;
    } while (h != 0);
  } else {
    const uint8x16x2_t permute_tbl = vld1q_u8_x2(kMatMulPermuteTbl);

    // Wide-block path: 4 rows per outer iteration, 8 pixels per inner step.
    do {
      const uint8_t *s = src;
      uint8_t *d = dst;
      int width = w;

      do {
        uint8x16_t s0[2], s1[2], s2[2], s3[2];
        load_u8_16x4(s, src_stride, &s0[0], &s1[0], &s2[0], &s3[0]);
        load_u8_16x4(s + 6, src_stride, &s0[1], &s1[1], &s2[1], &s3[1]);

        uint8x8_t d0 = convolve12_8_x(s0, filter, permute_tbl, horiz_const);
        uint8x8_t d1 = convolve12_8_x(s1, filter, permute_tbl, horiz_const);
        uint8x8_t d2 = convolve12_8_x(s2, filter, permute_tbl, horiz_const);
        uint8x8_t d3 = convolve12_8_x(s3, filter, permute_tbl, horiz_const);

        store_u8_8x4(d, dst_stride, d0, d1, d2, d3);

        s += 8;
        d += 8;
        width -= 8;
      } while (width != 0);
      src += 4 * src_stride;
      dst += 4 * dst_stride;
      h -= 4;
    } while (h != 0);
  }
}
    160 
    161 static inline uint8x8_t convolve8_8_x(uint8x16_t samples, const int8x8_t filter,
    162                                      const uint8x16x3_t permute_tbl,
    163                                      const int32x4_t horiz_const) {
    164  // Permute samples ready for dot product.
    165  // { 0,  1,  2,  3,  1,  2,  3,  4,  2,  3,  4,  5,  3,  4,  5,  6 }
    166  // { 4,  5,  6,  7,  5,  6,  7,  8,  6,  7,  8,  9,  7,  8,  9, 10 }
    167  // { 8,  9, 10, 11,  9, 10, 11, 12, 10, 11, 12, 13, 11, 12, 13, 14 }
    168  uint8x16_t perm_samples[3] = { vqtbl1q_u8(samples, permute_tbl.val[0]),
    169                                 vqtbl1q_u8(samples, permute_tbl.val[1]),
    170                                 vqtbl1q_u8(samples, permute_tbl.val[2]) };
    171 
    172  int32x4_t sum0123 = vusdotq_lane_s32(horiz_const, perm_samples[0], filter, 0);
    173  sum0123 = vusdotq_lane_s32(sum0123, perm_samples[1], filter, 1);
    174 
    175  int32x4_t sum4567 = vusdotq_lane_s32(horiz_const, perm_samples[1], filter, 0);
    176  sum4567 = vusdotq_lane_s32(sum4567, perm_samples[2], filter, 1);
    177 
    178  int16x8_t sum_s16 = vcombine_s16(vmovn_s32(sum0123), vmovn_s32(sum4567));
    179  // We halved the convolution filter values so - 1 from the right shift.
    180  return vqrshrun_n_s16(sum_s16, FILTER_BITS - 1);
    181 }
    182 
    183 static inline void convolve_x_sr_8tap_neon_i8mm(
    184    const uint8_t *src, ptrdiff_t src_stride, uint8_t *dst,
    185    ptrdiff_t dst_stride, int width, int height, const int16_t *filter_x,
    186    const int32x4_t horiz_const) {
    187  // Filter values are even, so halve to reduce intermediate precision reqs.
    188  const int8x8_t x_filter = vshrn_n_s16(vld1q_s16(filter_x), 1);
    189  const uint8x16x3_t permute_tbl = vld1q_u8_x3(kDotProdPermuteTbl);
    190 
    191  do {
    192    const uint8_t *s = src;
    193    uint8_t *d = dst;
    194    int w = width;
    195 
    196    do {
    197      uint8x16_t s0, s1, s2, s3;
    198      load_u8_16x4(s, src_stride, &s0, &s1, &s2, &s3);
    199 
    200      uint8x8_t d0 = convolve8_8_x(s0, x_filter, permute_tbl, horiz_const);
    201      uint8x8_t d1 = convolve8_8_x(s1, x_filter, permute_tbl, horiz_const);
    202      uint8x8_t d2 = convolve8_8_x(s2, x_filter, permute_tbl, horiz_const);
    203      uint8x8_t d3 = convolve8_8_x(s3, x_filter, permute_tbl, horiz_const);
    204 
    205      store_u8_8x4(d, dst_stride, d0, d1, d2, d3);
    206 
    207      s += 8;
    208      d += 8;
    209      w -= 8;
    210    } while (w != 0);
    211    src += 4 * src_stride;
    212    dst += 4 * dst_stride;
    213    height -= 4;
    214  } while (height != 0);
    215 }
    216 
    217 static inline int16x4_t convolve6_4_x(uint8x16_t samples,
    218                                      const int8x16_t filter,
    219                                      const uint8x16_t permute_tbl,
    220                                      const int32x4_t horiz_const) {
    221  // Permute samples ready for matrix multiply.
    222  // { 0,  1,  2,  3,  4,  5,  6,  7,  2,  3,  4,  5,  6,  7,  8,  9 }
    223  uint8x16_t perm_samples = vqtbl1q_u8(samples, permute_tbl);
    224 
    225  // These instructions multiply a 2x8 matrix (samples) by an 8x2 matrix
    226  // (filter), destructively accumulating into the destination register.
    227  int32x4_t sum = vusmmlaq_s32(horiz_const, perm_samples, filter);
    228 
    229  // Further narrowing and packing is performed by the caller.
    230  return vmovn_s32(sum);
    231 }
    232 
    233 static inline uint8x8_t convolve6_8_x(uint8x16_t samples,
    234                                      const int8x16_t filter,
    235                                      const uint8x16x2_t permute_tbl,
    236                                      const int32x4_t horiz_const) {
    237  // Permute samples ready for matrix multiply.
    238  // { 0,  1,  2,  3,  4,  5,  6,  7,  2,  3,  4,  5,  6,  7,  8,  9 }
    239  // { 4,  5,  6,  7,  8,  9, 10, 11,  6,  7,  8,  9, 10, 11, 12, 13 }
    240  uint8x16_t perm_samples[2] = { vqtbl1q_u8(samples, permute_tbl.val[0]),
    241                                 vqtbl1q_u8(samples, permute_tbl.val[1]) };
    242 
    243  // These instructions multiply a 2x8 matrix (samples) by an 8x2 matrix
    244  // (filter), destructively accumulating into the destination register.
    245  int32x4_t sum0123 = vusmmlaq_s32(horiz_const, perm_samples[0], filter);
    246  int32x4_t sum4567 = vusmmlaq_s32(horiz_const, perm_samples[1], filter);
    247 
    248  int16x8_t sum = vcombine_s16(vmovn_s32(sum0123), vmovn_s32(sum4567));
    249  // We halved the convolution filter values so - 1 from the right shift.
    250  return vqrshrun_n_s16(sum, FILTER_BITS - 1);
    251 }
    252 
// Horizontal single-reference convolution with a 6-tap filter (stored in an
// 8-tap array; the caller passes src offset by +1 so the active taps line up).
static inline void convolve_x_sr_6tap_neon_i8mm(
    const uint8_t *src, ptrdiff_t src_stride, uint8_t *dst,
    ptrdiff_t dst_stride, int width, int height, const int16_t *filter_x,
    const int32x4_t horiz_const) {
  // Filter values are even, so halve to reduce intermediate precision reqs.
  const int8x8_t x_filter_s8 = vshrn_n_s16(vld1q_s16(filter_x), 1);
  // Stagger the filter for use with the matrix multiply instructions.
  // { f0, f1, f2, f3, f4, f5,  0,  0,  0, f0, f1, f2, f3, f4, f5,  0 }
  const int8x16_t x_filter =
      vcombine_s8(vext_s8(x_filter_s8, x_filter_s8, 1), x_filter_s8);

  if (width == 4) {
    const uint8x16_t permute_tbl = vld1q_u8(kMatMulPermuteTbl);
    // Narrow-block path: 4 rows of 4 pixels per iteration.
    do {
      uint8x16_t s0, s1, s2, s3;
      load_u8_16x4(src, src_stride, &s0, &s1, &s2, &s3);

      int16x4_t t0 = convolve6_4_x(s0, x_filter, permute_tbl, horiz_const);
      int16x4_t t1 = convolve6_4_x(s1, x_filter, permute_tbl, horiz_const);
      int16x4_t t2 = convolve6_4_x(s2, x_filter, permute_tbl, horiz_const);
      int16x4_t t3 = convolve6_4_x(s3, x_filter, permute_tbl, horiz_const);
      // We halved the filter values so -1 from right shift.
      uint8x8_t d01 = vqrshrun_n_s16(vcombine_s16(t0, t1), FILTER_BITS - 1);
      uint8x8_t d23 = vqrshrun_n_s16(vcombine_s16(t2, t3), FILTER_BITS - 1);

      store_u8x4_strided_x2(dst + 0 * dst_stride, dst_stride, d01);
      store_u8x4_strided_x2(dst + 2 * dst_stride, dst_stride, d23);

      src += 4 * src_stride;
      dst += 4 * dst_stride;
      height -= 4;
    } while (height != 0);
  } else {
    const uint8x16x2_t permute_tbl = vld1q_u8_x2(kMatMulPermuteTbl);
    // Wide-block path: 4 rows per outer iteration, 8 pixels per inner step.
    do {
      const uint8_t *s = src;
      uint8_t *d = dst;
      int w = width;

      do {
        uint8x16_t s0, s1, s2, s3;
        load_u8_16x4(s, src_stride, &s0, &s1, &s2, &s3);

        uint8x8_t d0 = convolve6_8_x(s0, x_filter, permute_tbl, horiz_const);
        uint8x8_t d1 = convolve6_8_x(s1, x_filter, permute_tbl, horiz_const);
        uint8x8_t d2 = convolve6_8_x(s2, x_filter, permute_tbl, horiz_const);
        uint8x8_t d3 = convolve6_8_x(s3, x_filter, permute_tbl, horiz_const);

        store_u8_8x4(d, dst_stride, d0, d1, d2, d3);

        s += 8;
        d += 8;
        w -= 8;
      } while (w != 0);
      src += 4 * src_stride;
      dst += 4 * dst_stride;
      height -= 4;
    } while (height != 0);
  }
}
    313 
    314 void av1_convolve_x_sr_neon_i8mm(const uint8_t *src, int src_stride,
    315                                 uint8_t *dst, int dst_stride, int w, int h,
    316                                 const InterpFilterParams *filter_params_x,
    317                                 const int subpel_x_qn,
    318                                 ConvolveParams *conv_params) {
    319  if (w == 2 || h == 2) {
    320    av1_convolve_x_sr_c(src, src_stride, dst, dst_stride, w, h, filter_params_x,
    321                        subpel_x_qn, conv_params);
    322    return;
    323  }
    324 
    325  const uint8_t horiz_offset = filter_params_x->taps / 2 - 1;
    326  src -= horiz_offset;
    327 
    328  const int16_t *x_filter_ptr = av1_get_interp_filter_subpel_kernel(
    329      filter_params_x, subpel_x_qn & SUBPEL_MASK);
    330 
    331  int filter_taps = get_filter_tap(filter_params_x, subpel_x_qn & SUBPEL_MASK);
    332 
    333  // A shim of 1 << (ROUND0_BITS - 1) enables us to simplify computation in the
    334  // convolution kernels: Adding this shim enables us to use a single rounding
    335  // right shift by FILTER_BITS instead of two rounding right shifts: first by
    336  // ROUND0_BITS, and then subsequently by FILTER_BITS - ROUND0_BITS.
    337  // Halve the total because we will halve the filter values.
    338  const int32x4_t horiz_const = vdupq_n_s32((1 << ((ROUND0_BITS - 1)) / 2));
    339 
    340  if (filter_taps <= 6) {
    341    convolve_x_sr_6tap_neon_i8mm(src + 1, src_stride, dst, dst_stride, w, h,
    342                                 x_filter_ptr, horiz_const);
    343    return;
    344  }
    345 
    346  if (filter_taps > 8) {
    347    convolve_x_sr_12tap_neon_i8mm(src, src_stride, dst, dst_stride, w, h,
    348                                  x_filter_ptr);
    349    return;
    350  }
    351 
    352  convolve_x_sr_8tap_neon_i8mm(src, src_stride, dst, dst_stride, w, h,
    353                               x_filter_ptr, horiz_const);
    354 }
    355 
    356 static inline int16x4_t convolve12_4_y(const uint8x16_t s0, const uint8x16_t s1,
    357                                       const uint8x16_t s2,
    358                                       const int8x8_t filters_0_7,
    359                                       const int8x8_t filters_4_11) {
    360  int32x4_t sum = vusdotq_lane_s32(vdupq_n_s32(0), s0, filters_0_7, 0);
    361  sum = vusdotq_lane_s32(sum, s1, filters_0_7, 1);
    362  sum = vusdotq_lane_s32(sum, s2, filters_4_11, 1);
    363 
    364  // Further narrowing and packing is performed by the caller.
    365  return vshrn_n_s32(sum, 1);
    366 }
    367 
    368 static inline uint8x8_t convolve12_8_y(
    369    const uint8x16_t s0_lo, const uint8x16_t s0_hi, const uint8x16_t s1_lo,
    370    const uint8x16_t s1_hi, const uint8x16_t s2_lo, const uint8x16_t s2_hi,
    371    const int8x8_t filters_0_7, const int8x8_t filters_4_11) {
    372  int32x4_t sum0123 = vusdotq_lane_s32(vdupq_n_s32(0), s0_lo, filters_0_7, 0);
    373  sum0123 = vusdotq_lane_s32(sum0123, s1_lo, filters_0_7, 1);
    374  sum0123 = vusdotq_lane_s32(sum0123, s2_lo, filters_4_11, 1);
    375 
    376  int32x4_t sum4567 = vusdotq_lane_s32(vdupq_n_s32(0), s0_hi, filters_0_7, 0);
    377  sum4567 = vusdotq_lane_s32(sum4567, s1_hi, filters_0_7, 1);
    378  sum4567 = vusdotq_lane_s32(sum4567, s2_hi, filters_4_11, 1);
    379 
    380  // Narrow and re-pack.
    381  int16x8_t sum =
    382      vcombine_s16(vshrn_n_s32(sum0123, 1), vshrn_n_s32(sum4567, 1));
    383  return vqrshrun_n_s16(sum, FILTER_BITS - 1);
    384 }
    385 
// Vertical single-reference convolution with a 12-tap filter. Input rows are
// transposed into per-column byte vectors so USDOT can accumulate down the
// columns; each iteration re-uses the previous transposed block, merging in
// four new rows via kDotProdMergeBlockTbl lookups.
static inline void convolve_y_sr_12tap_neon_i8mm(const uint8_t *src_ptr,
                                                int src_stride,
                                                uint8_t *dst_ptr,
                                                int dst_stride, int w, int h,
                                                const int16_t *y_filter_ptr) {
  // The no-op filter should never be used here.
  assert(y_filter_ptr[5] != 128);

  // NOTE(review): 12-tap filter values are not halved here (unlike the 6/8-tap
  // paths); the kernels instead pre-shift sums right by 1 before the final
  // FILTER_BITS - 1 rounding shift.
  const int8x8_t filter_0_7 = vmovn_s16(vld1q_s16(y_filter_ptr));
  const int8x8_t filter_4_11 = vmovn_s16(vld1q_s16(y_filter_ptr + 4));

  const uint8x16x3_t merge_block_tbl = vld1q_u8_x3(kDotProdMergeBlockTbl);

  if (w == 4) {
    // Prime the pipeline with the first 11 rows (s0..sA).
    uint8x8_t s0, s1, s2, s3, s4, s5, s6, s7, s8, s9, sA;
    load_u8_8x11(src_ptr, src_stride, &s0, &s1, &s2, &s3, &s4, &s5, &s6, &s7,
                 &s8, &s9, &sA);
    src_ptr += 11 * src_stride;

    // This operation combines a conventional transpose and the sample permute
    // (see horizontal case) required before computing the dot product.
    uint8x16_t s0123, s1234, s2345, s3456, s4567, s5678, s6789, s789A;
    transpose_concat_elems_u8_4x4(s0, s1, s2, s3, &s0123);
    transpose_concat_elems_u8_4x4(s1, s2, s3, s4, &s1234);
    transpose_concat_elems_u8_4x4(s2, s3, s4, s5, &s2345);
    transpose_concat_elems_u8_4x4(s3, s4, s5, s6, &s3456);
    transpose_concat_elems_u8_4x4(s4, s5, s6, s7, &s4567);
    transpose_concat_elems_u8_4x4(s5, s6, s7, s8, &s5678);
    transpose_concat_elems_u8_4x4(s6, s7, s8, s9, &s6789);
    transpose_concat_elems_u8_4x4(s7, s8, s9, sA, &s789A);

    do {
      // Load and transpose the next four rows.
      uint8x8_t sB, sC, sD, sE;
      load_u8_8x4(src_ptr, src_stride, &sB, &sC, &sD, &sE);

      uint8x16_t s89AB, s9ABC, sABCD, sBCDE;
      transpose_concat_elems_u8_4x4(sB, sC, sD, sE, &sBCDE);

      // Merge new data into block from previous iteration.
      uint8x16x2_t samples_LUT = { { s789A, sBCDE } };
      s89AB = vqtbl2q_u8(samples_LUT, merge_block_tbl.val[0]);
      s9ABC = vqtbl2q_u8(samples_LUT, merge_block_tbl.val[1]);
      sABCD = vqtbl2q_u8(samples_LUT, merge_block_tbl.val[2]);

      int16x4_t d0 =
          convolve12_4_y(s0123, s4567, s89AB, filter_0_7, filter_4_11);
      int16x4_t d1 =
          convolve12_4_y(s1234, s5678, s9ABC, filter_0_7, filter_4_11);
      int16x4_t d2 =
          convolve12_4_y(s2345, s6789, sABCD, filter_0_7, filter_4_11);
      int16x4_t d3 =
          convolve12_4_y(s3456, s789A, sBCDE, filter_0_7, filter_4_11);
      uint8x8_t d01 = vqrshrun_n_s16(vcombine_s16(d0, d1), FILTER_BITS - 1);
      uint8x8_t d23 = vqrshrun_n_s16(vcombine_s16(d2, d3), FILTER_BITS - 1);

      store_u8x4_strided_x2(dst_ptr + 0 * dst_stride, dst_stride, d01);
      store_u8x4_strided_x2(dst_ptr + 2 * dst_stride, dst_stride, d23);

      // Prepare block for next iteration - re-using as much as possible.
      // Shuffle everything up four rows.
      s0123 = s4567;
      s1234 = s5678;
      s2345 = s6789;
      s3456 = s789A;
      s4567 = s89AB;
      s5678 = s9ABC;
      s6789 = sABCD;
      s789A = sBCDE;

      src_ptr += 4 * src_stride;
      dst_ptr += 4 * dst_stride;
      h -= 4;
    } while (h != 0);
  } else {
    // Wide blocks: process 8 columns at a time, split into lo/hi halves.
    do {
      int height = h;
      const uint8_t *s = src_ptr;
      uint8_t *d = dst_ptr;

      uint8x8_t s0, s1, s2, s3, s4, s5, s6, s7, s8, s9, sA;
      load_u8_8x11(s, src_stride, &s0, &s1, &s2, &s3, &s4, &s5, &s6, &s7, &s8,
                   &s9, &sA);
      s += 11 * src_stride;

      // This operation combines a conventional transpose and the sample
      // permute (see horizontal case) required before computing the dot
      // product.
      uint8x16_t s0123_lo, s0123_hi, s1234_lo, s1234_hi, s2345_lo, s2345_hi,
          s3456_lo, s3456_hi, s4567_lo, s4567_hi, s5678_lo, s5678_hi, s6789_lo,
          s6789_hi, s789A_lo, s789A_hi;
      transpose_concat_elems_u8_8x4(s0, s1, s2, s3, &s0123_lo, &s0123_hi);
      transpose_concat_elems_u8_8x4(s1, s2, s3, s4, &s1234_lo, &s1234_hi);
      transpose_concat_elems_u8_8x4(s2, s3, s4, s5, &s2345_lo, &s2345_hi);
      transpose_concat_elems_u8_8x4(s3, s4, s5, s6, &s3456_lo, &s3456_hi);
      transpose_concat_elems_u8_8x4(s4, s5, s6, s7, &s4567_lo, &s4567_hi);
      transpose_concat_elems_u8_8x4(s5, s6, s7, s8, &s5678_lo, &s5678_hi);
      transpose_concat_elems_u8_8x4(s6, s7, s8, s9, &s6789_lo, &s6789_hi);
      transpose_concat_elems_u8_8x4(s7, s8, s9, sA, &s789A_lo, &s789A_hi);

      do {
        uint8x8_t sB, sC, sD, sE;
        load_u8_8x4(s, src_stride, &sB, &sC, &sD, &sE);

        uint8x16_t s89AB_lo, s89AB_hi, s9ABC_lo, s9ABC_hi, sABCD_lo, sABCD_hi,
            sBCDE_lo, sBCDE_hi;
        transpose_concat_elems_u8_8x4(sB, sC, sD, sE, &sBCDE_lo, &sBCDE_hi);

        // Merge new data into block from previous iteration.
        uint8x16x2_t samples_LUT_lo = { { s789A_lo, sBCDE_lo } };
        s89AB_lo = vqtbl2q_u8(samples_LUT_lo, merge_block_tbl.val[0]);
        s9ABC_lo = vqtbl2q_u8(samples_LUT_lo, merge_block_tbl.val[1]);
        sABCD_lo = vqtbl2q_u8(samples_LUT_lo, merge_block_tbl.val[2]);

        uint8x16x2_t samples_LUT_hi = { { s789A_hi, sBCDE_hi } };
        s89AB_hi = vqtbl2q_u8(samples_LUT_hi, merge_block_tbl.val[0]);
        s9ABC_hi = vqtbl2q_u8(samples_LUT_hi, merge_block_tbl.val[1]);
        sABCD_hi = vqtbl2q_u8(samples_LUT_hi, merge_block_tbl.val[2]);

        uint8x8_t d0 =
            convolve12_8_y(s0123_lo, s0123_hi, s4567_lo, s4567_hi, s89AB_lo,
                           s89AB_hi, filter_0_7, filter_4_11);
        uint8x8_t d1 =
            convolve12_8_y(s1234_lo, s1234_hi, s5678_lo, s5678_hi, s9ABC_lo,
                           s9ABC_hi, filter_0_7, filter_4_11);
        uint8x8_t d2 =
            convolve12_8_y(s2345_lo, s2345_hi, s6789_lo, s6789_hi, sABCD_lo,
                           sABCD_hi, filter_0_7, filter_4_11);
        uint8x8_t d3 =
            convolve12_8_y(s3456_lo, s3456_hi, s789A_lo, s789A_hi, sBCDE_lo,
                           sBCDE_hi, filter_0_7, filter_4_11);

        store_u8_8x4(d, dst_stride, d0, d1, d2, d3);

        // Prepare block for next iteration - re-using as much as possible.
        // Shuffle everything up four rows.
        s0123_lo = s4567_lo;
        s0123_hi = s4567_hi;
        s1234_lo = s5678_lo;
        s1234_hi = s5678_hi;
        s2345_lo = s6789_lo;
        s2345_hi = s6789_hi;
        s3456_lo = s789A_lo;
        s3456_hi = s789A_hi;
        s4567_lo = s89AB_lo;
        s4567_hi = s89AB_hi;
        s5678_lo = s9ABC_lo;
        s5678_hi = s9ABC_hi;
        s6789_lo = sABCD_lo;
        s6789_hi = sABCD_hi;
        s789A_lo = sBCDE_lo;
        s789A_hi = sBCDE_hi;

        s += 4 * src_stride;
        d += 4 * dst_stride;
        height -= 4;
      } while (height != 0);
      src_ptr += 8;
      dst_ptr += 8;
      w -= 8;
    } while (w != 0);
  }
}
    548 
    549 static inline int16x4_t convolve8_4_y(const uint8x16_t s0, const uint8x16_t s1,
    550                                      const int8x8_t filters) {
    551  int32x4_t sum = vusdotq_lane_s32(vdupq_n_s32(0), s0, filters, 0);
    552  sum = vusdotq_lane_s32(sum, s1, filters, 1);
    553 
    554  // Further narrowing and packing is performed by the caller.
    555  return vmovn_s32(sum);
    556 }
    557 
    558 static inline uint8x8_t convolve8_8_y(const uint8x16_t s0_lo,
    559                                      const uint8x16_t s0_hi,
    560                                      const uint8x16_t s1_lo,
    561                                      const uint8x16_t s1_hi,
    562                                      const int8x8_t filters) {
    563  int32x4_t sum0123 = vusdotq_lane_s32(vdupq_n_s32(0), s0_lo, filters, 0);
    564  sum0123 = vusdotq_lane_s32(sum0123, s1_lo, filters, 1);
    565 
    566  int32x4_t sum4567 = vusdotq_lane_s32(vdupq_n_s32(0), s0_hi, filters, 0);
    567  sum4567 = vusdotq_lane_s32(sum4567, s1_hi, filters, 1);
    568 
    569  // Narrow and re-pack.
    570  int16x8_t sum = vcombine_s16(vmovn_s32(sum0123), vmovn_s32(sum4567));
    571  // We halved the filter values so -1 from right shift.
    572  return vqrshrun_n_s16(sum, FILTER_BITS - 1);
    573 }
    574 
    575 static inline void convolve_y_sr_8tap_neon_i8mm(const uint8_t *src_ptr,
    576                                                int src_stride,
    577                                                uint8_t *dst_ptr,
    578                                                int dst_stride, int w, int h,
    579                                                const int16_t *y_filter_ptr) {
    580  // Filter values are even, so halve to reduce intermediate precision reqs.
    581  const int8x8_t filter = vshrn_n_s16(vld1q_s16(y_filter_ptr), 1);
    582 
    583  const uint8x16x3_t merge_block_tbl = vld1q_u8_x3(kDotProdMergeBlockTbl);
    584 
    585  if (w == 4) {
    586    uint8x8_t s0, s1, s2, s3, s4, s5, s6;
    587    load_u8_8x7(src_ptr, src_stride, &s0, &s1, &s2, &s3, &s4, &s5, &s6);
    588    src_ptr += 7 * src_stride;
    589 
    590    // This operation combines a conventional transpose and the sample permute
    591    // (see horizontal case) required before computing the dot product.
    592    uint8x16_t s0123, s1234, s2345, s3456;
    593    transpose_concat_elems_u8_4x4(s0, s1, s2, s3, &s0123);
    594    transpose_concat_elems_u8_4x4(s1, s2, s3, s4, &s1234);
    595    transpose_concat_elems_u8_4x4(s2, s3, s4, s5, &s2345);
    596    transpose_concat_elems_u8_4x4(s3, s4, s5, s6, &s3456);
    597 
    598    do {
    599      uint8x8_t s7, s8, s9, sA;
    600      load_u8_8x4(src_ptr, src_stride, &s7, &s8, &s9, &sA);
    601 
    602      uint8x16_t s4567, s5678, s6789, s789A;
    603      transpose_concat_elems_u8_4x4(s7, s8, s9, sA, &s789A);
    604 
    605      // Merge new data into block from previous iteration.
    606      uint8x16x2_t samples_LUT = { { s3456, s789A } };
    607      s4567 = vqtbl2q_u8(samples_LUT, merge_block_tbl.val[0]);
    608      s5678 = vqtbl2q_u8(samples_LUT, merge_block_tbl.val[1]);
    609      s6789 = vqtbl2q_u8(samples_LUT, merge_block_tbl.val[2]);
    610 
    611      int16x4_t d0 = convolve8_4_y(s0123, s4567, filter);
    612      int16x4_t d1 = convolve8_4_y(s1234, s5678, filter);
    613      int16x4_t d2 = convolve8_4_y(s2345, s6789, filter);
    614      int16x4_t d3 = convolve8_4_y(s3456, s789A, filter);
    615      // We halved the filter values so -1 from right shift.
    616      uint8x8_t d01 = vqrshrun_n_s16(vcombine_s16(d0, d1), FILTER_BITS - 1);
    617      uint8x8_t d23 = vqrshrun_n_s16(vcombine_s16(d2, d3), FILTER_BITS - 1);
    618 
    619      store_u8x4_strided_x2(dst_ptr + 0 * dst_stride, dst_stride, d01);
    620      store_u8x4_strided_x2(dst_ptr + 2 * dst_stride, dst_stride, d23);
    621 
    622      // Prepare block for next iteration - re-using as much as possible.
    623      // Shuffle everything up four rows.
    624      s0123 = s4567;
    625      s1234 = s5678;
    626      s2345 = s6789;
    627      s3456 = s789A;
    628 
    629      src_ptr += 4 * src_stride;
    630      dst_ptr += 4 * dst_stride;
    631      h -= 4;
    632    } while (h != 0);
    633  } else {
    634    do {
    635      int height = h;
    636      const uint8_t *s = src_ptr;
    637      uint8_t *d = dst_ptr;
    638 
    639      uint8x8_t s0, s1, s2, s3, s4, s5, s6;
    640      load_u8_8x7(s, src_stride, &s0, &s1, &s2, &s3, &s4, &s5, &s6);
    641      s += 7 * src_stride;
    642 
    643      // This operation combines a conventional transpose and the sample
    644      // permute (see horizontal case) required before computing the dot
    645      // product.
    646      uint8x16_t s0123_lo, s0123_hi, s1234_lo, s1234_hi, s2345_lo, s2345_hi,
    647          s3456_lo, s3456_hi;
    648      transpose_concat_elems_u8_8x4(s0, s1, s2, s3, &s0123_lo, &s0123_hi);
    649      transpose_concat_elems_u8_8x4(s1, s2, s3, s4, &s1234_lo, &s1234_hi);
    650      transpose_concat_elems_u8_8x4(s2, s3, s4, s5, &s2345_lo, &s2345_hi);
    651      transpose_concat_elems_u8_8x4(s3, s4, s5, s6, &s3456_lo, &s3456_hi);
    652 
    653      do {
    654        uint8x8_t s7, s8, s9, sA;
    655        load_u8_8x4(s, src_stride, &s7, &s8, &s9, &sA);
    656 
    657        uint8x16_t s4567_lo, s4567_hi, s5678_lo, s5678_hi, s6789_lo, s6789_hi,
    658            s789A_lo, s789A_hi;
    659        transpose_concat_elems_u8_8x4(s7, s8, s9, sA, &s789A_lo, &s789A_hi);
    660 
    661        // Merge new data into block from previous iteration.
    662        uint8x16x2_t samples_LUT_lo = { { s3456_lo, s789A_lo } };
    663        s4567_lo = vqtbl2q_u8(samples_LUT_lo, merge_block_tbl.val[0]);
    664        s5678_lo = vqtbl2q_u8(samples_LUT_lo, merge_block_tbl.val[1]);
    665        s6789_lo = vqtbl2q_u8(samples_LUT_lo, merge_block_tbl.val[2]);
    666 
    667        uint8x16x2_t samples_LUT_hi = { { s3456_hi, s789A_hi } };
    668        s4567_hi = vqtbl2q_u8(samples_LUT_hi, merge_block_tbl.val[0]);
    669        s5678_hi = vqtbl2q_u8(samples_LUT_hi, merge_block_tbl.val[1]);
    670        s6789_hi = vqtbl2q_u8(samples_LUT_hi, merge_block_tbl.val[2]);
    671 
    672        uint8x8_t d0 =
    673            convolve8_8_y(s0123_lo, s0123_hi, s4567_lo, s4567_hi, filter);
    674        uint8x8_t d1 =
    675            convolve8_8_y(s1234_lo, s1234_hi, s5678_lo, s5678_hi, filter);
    676        uint8x8_t d2 =
    677            convolve8_8_y(s2345_lo, s2345_hi, s6789_lo, s6789_hi, filter);
    678        uint8x8_t d3 =
    679            convolve8_8_y(s3456_lo, s3456_hi, s789A_lo, s789A_hi, filter);
    680 
    681        store_u8_8x4(d, dst_stride, d0, d1, d2, d3);
    682 
    683        // Prepare block for next iteration - re-using as much as possible.
    684        // Shuffle everything up four rows.
    685        s0123_lo = s4567_lo;
    686        s0123_hi = s4567_hi;
    687        s1234_lo = s5678_lo;
    688        s1234_hi = s5678_hi;
    689        s2345_lo = s6789_lo;
    690        s2345_hi = s6789_hi;
    691        s3456_lo = s789A_lo;
    692        s3456_hi = s789A_hi;
    693 
    694        s += 4 * src_stride;
    695        d += 4 * dst_stride;
    696        height -= 4;
    697      } while (height != 0);
    698      src_ptr += 8;
    699      dst_ptr += 8;
    700      w -= 8;
    701    } while (w != 0);
    702  }
    703 }
    704 
    705 static inline int16x4_t convolve4_4_y(const uint8x16_t s0,
    706                                      const int8x8_t filters) {
    707  int32x4_t sum = vusdotq_lane_s32(vdupq_n_s32(0), s0, filters, 0);
    708 
    709  // Further narrowing and packing is performed by the caller.
    710  return vmovn_s32(sum);
    711 }
    712 
    713 static inline uint8x8_t convolve4_8_y(const uint8x16_t s0, const uint8x16_t s1,
    714                                      const int8x8_t filters) {
    715  int32x4_t sum0123 = vusdotq_lane_s32(vdupq_n_s32(0), s0, filters, 0);
    716  int32x4_t sum4567 = vusdotq_lane_s32(vdupq_n_s32(0), s1, filters, 0);
    717 
    718  // Narrow and re-pack.
    719  int16x8_t sum = vcombine_s16(vmovn_s32(sum0123), vmovn_s32(sum4567));
    720  // We halved the filter values so -1 from right shift.
    721  return vqrshrun_n_s16(sum, FILTER_BITS - 1);
    722 }
    723 
    724 static inline void convolve_y_sr_4tap_neon_i8mm(const uint8_t *src_ptr,
    725                                                int src_stride,
    726                                                uint8_t *dst_ptr,
    727                                                int dst_stride, int w, int h,
    728                                                const int16_t *y_filter_ptr) {
    729  // Filter values are even, so halve to reduce intermediate precision reqs.
    730  const int16x8_t filter_s16 =
    731      vcombine_s16(vld1_s16(y_filter_ptr + 2), vdup_n_s16(0));
    732  const int8x8_t filter = vshrn_n_s16(filter_s16, 1);
    733  const uint8x16x3_t merge_block_tbl = vld1q_u8_x3(kDotProdMergeBlockTbl);
    734  uint8x16x2_t samples_LUT;
    735 
    736  if (w == 4) {
    737    uint8x8_t s0, s1, s2, s3;
    738    load_u8_8x4(src_ptr, src_stride, &s0, &s1, &s2, &s3);
    739    src_ptr += 4 * src_stride;
    740 
    741    // This operation combines a conventional transpose and the sample permute
    742    // required before computing the dot product.
    743    uint8x16_t s0123;
    744    transpose_concat_elems_u8_4x4(s0, s1, s2, s3, &s0123);
    745 
    746    do {
    747      uint8x8_t s4, s5, s6, s7;
    748      load_u8_8x4(src_ptr, src_stride, &s4, &s5, &s6, &s7);
    749 
    750      uint8x16_t s4567;
    751      transpose_concat_elems_u8_4x4(s4, s5, s6, s7, &s4567);
    752 
    753      // Merge new data into block from previous iteration.
    754      samples_LUT.val[0] = s0123;
    755      samples_LUT.val[1] = s4567;
    756      uint8x16_t s1234 = vqtbl2q_u8(samples_LUT, merge_block_tbl.val[0]);
    757      uint8x16_t s2345 = vqtbl2q_u8(samples_LUT, merge_block_tbl.val[1]);
    758      uint8x16_t s3456 = vqtbl2q_u8(samples_LUT, merge_block_tbl.val[2]);
    759 
    760      int16x4_t d0 = convolve4_4_y(s0123, filter);
    761      int16x4_t d1 = convolve4_4_y(s1234, filter);
    762      int16x4_t d2 = convolve4_4_y(s2345, filter);
    763      int16x4_t d3 = convolve4_4_y(s3456, filter);
    764      // We halved the filter values so -1 from right shift.
    765      uint8x8_t d01 = vqrshrun_n_s16(vcombine_s16(d0, d1), FILTER_BITS - 1);
    766      uint8x8_t d23 = vqrshrun_n_s16(vcombine_s16(d2, d3), FILTER_BITS - 1);
    767 
    768      store_u8x4_strided_x2(dst_ptr + 0 * dst_stride, dst_stride, d01);
    769      store_u8x4_strided_x2(dst_ptr + 2 * dst_stride, dst_stride, d23);
    770 
    771      // Prepare block for next iteration - re-using as much as possible.
    772      // Shuffle everything up four rows.
    773      s0123 = s4567;
    774 
    775      src_ptr += 4 * src_stride;
    776      dst_ptr += 4 * dst_stride;
    777      h -= 4;
    778    } while (h != 0);
    779  } else {
    780    do {
    781      int height = h;
    782      const uint8_t *s = src_ptr;
    783      uint8_t *d = dst_ptr;
    784 
    785      uint8x8_t s0, s1, s2, s3;
    786      load_u8_8x4(s, src_stride, &s0, &s1, &s2, &s3);
    787      s += 4 * src_stride;
    788 
    789      // This operation combines a conventional transpose and the sample permute
    790      // required before computing the dot product.
    791      uint8x16_t s0123_lo, s0123_hi;
    792      transpose_concat_elems_u8_8x4(s0, s1, s2, s3, &s0123_lo, &s0123_hi);
    793 
    794      do {
    795        uint8x8_t s4, s5, s6, s7;
    796        load_u8_8x4(s, src_stride, &s4, &s5, &s6, &s7);
    797 
    798        uint8x16_t s4567_lo, s4567_hi;
    799        transpose_concat_elems_u8_8x4(s4, s5, s6, s7, &s4567_lo, &s4567_hi);
    800 
    801        // Merge new data into block from previous iteration.
    802        samples_LUT.val[0] = s0123_lo;
    803        samples_LUT.val[1] = s4567_lo;
    804        uint8x16_t s1234_lo = vqtbl2q_u8(samples_LUT, merge_block_tbl.val[0]);
    805        uint8x16_t s2345_lo = vqtbl2q_u8(samples_LUT, merge_block_tbl.val[1]);
    806        uint8x16_t s3456_lo = vqtbl2q_u8(samples_LUT, merge_block_tbl.val[2]);
    807 
    808        samples_LUT.val[0] = s0123_hi;
    809        samples_LUT.val[1] = s4567_hi;
    810        uint8x16_t s1234_hi = vqtbl2q_u8(samples_LUT, merge_block_tbl.val[0]);
    811        uint8x16_t s2345_hi = vqtbl2q_u8(samples_LUT, merge_block_tbl.val[1]);
    812        uint8x16_t s3456_hi = vqtbl2q_u8(samples_LUT, merge_block_tbl.val[2]);
    813 
    814        uint8x8_t d0 = convolve4_8_y(s0123_lo, s0123_hi, filter);
    815        uint8x8_t d1 = convolve4_8_y(s1234_lo, s1234_hi, filter);
    816        uint8x8_t d2 = convolve4_8_y(s2345_lo, s2345_hi, filter);
    817        uint8x8_t d3 = convolve4_8_y(s3456_lo, s3456_hi, filter);
    818 
    819        store_u8_8x4(d, dst_stride, d0, d1, d2, d3);
    820 
    821        // Prepare block for next iteration - re-using as much as possible.
    822        // Shuffle everything up four rows.
    823        s0123_lo = s4567_lo;
    824        s0123_hi = s4567_hi;
    825 
    826        s += 4 * src_stride;
    827        d += 4 * dst_stride;
    828        height -= 4;
    829      } while (height != 0);
    830      src_ptr += 8;
    831      dst_ptr += 8;
    832      w -= 8;
    833    } while (w != 0);
    834  }
    835 }
    836 
    837 void av1_convolve_y_sr_neon_i8mm(const uint8_t *src, int src_stride,
    838                                 uint8_t *dst, int dst_stride, int w, int h,
    839                                 const InterpFilterParams *filter_params_y,
    840                                 const int subpel_y_qn) {
    841  if (w == 2 || h == 2) {
    842    av1_convolve_y_sr_c(src, src_stride, dst, dst_stride, w, h, filter_params_y,
    843                        subpel_y_qn);
    844    return;
    845  }
    846 
    847  const int y_filter_taps = get_filter_tap(filter_params_y, subpel_y_qn);
    848  const int16_t *y_filter_ptr = av1_get_interp_filter_subpel_kernel(
    849      filter_params_y, subpel_y_qn & SUBPEL_MASK);
    850 
    851  if (y_filter_taps <= 4) {
    852    convolve_y_sr_4tap_neon_i8mm(src - src_stride, src_stride, dst, dst_stride,
    853                                 w, h, y_filter_ptr);
    854  } else if (y_filter_taps == 12) {
    855    convolve_y_sr_12tap_neon_i8mm(src - 5 * src_stride, src_stride, dst,
    856                                  dst_stride, w, h, y_filter_ptr);
    857  } else {
    858    // 6-tap or 8-tap.
    859    convolve_y_sr_8tap_neon_i8mm(src - 3 * src_stride, src_stride, dst,
    860                                 dst_stride, w, h, y_filter_ptr);
    861  }
    862 }
    863 
    864 static inline int16x8_t convolve8_8_2d_h(uint8x16_t samples,
    865                                         const int8x8_t filters,
    866                                         const uint8x16x3_t permute_tbl,
    867                                         const int32x4_t horiz_const) {
    868  // Permute samples ready for dot product.
    869  // { 0,  1,  2,  3,  1,  2,  3,  4,  2,  3,  4,  5,  3,  4,  5,  6 }
    870  // { 4,  5,  6,  7,  5,  6,  7,  8,  6,  7,  8,  9,  7,  8,  9, 10 }
    871  // { 8,  9, 10, 11,  9, 10, 11, 12, 10, 11, 12, 13, 11, 12, 13, 14 }
    872  uint8x16_t perm_samples[3] = { vqtbl1q_u8(samples, permute_tbl.val[0]),
    873                                 vqtbl1q_u8(samples, permute_tbl.val[1]),
    874                                 vqtbl1q_u8(samples, permute_tbl.val[2]) };
    875 
    876  int32x4_t sum0123 =
    877      vusdotq_lane_s32(horiz_const, perm_samples[0], filters, 0);
    878  sum0123 = vusdotq_lane_s32(sum0123, perm_samples[1], filters, 1);
    879 
    880  int32x4_t sum4567 =
    881      vusdotq_lane_s32(horiz_const, perm_samples[1], filters, 0);
    882  sum4567 = vusdotq_lane_s32(sum4567, perm_samples[2], filters, 1);
    883 
    884  // Narrow and re-pack.
    885  // We halved the convolution filter values so -1 from the right shift.
    886  return vcombine_s16(vshrn_n_s32(sum0123, ROUND0_BITS - 1),
    887                      vshrn_n_s32(sum4567, ROUND0_BITS - 1));
    888 }
    889 
// Horizontal (first) pass of the 2D 8-tap convolution. Filters im_h rows of
// the source into the 16-bit intermediate buffer im_block, which the vertical
// pass then consumes. Rows are processed four at a time in the main loop and
// one at a time in the tail.
static inline void convolve_2d_sr_horiz_8tap_neon_i8mm(
    const uint8_t *src, int src_stride, int16_t *im_block, int im_stride, int w,
    int im_h, const int16_t *x_filter_ptr) {
  // Filter values are even, so halve to reduce intermediate precision reqs.
  const int8x8_t x_filter = vshrn_n_s16(vld1q_s16(x_filter_ptr), 1);

  const int bd = 8;
  // This shim of 1 << ((ROUND0_BITS - 1) - 1) enables us to use non-rounding
  // shifts - which are generally faster than rounding shifts on modern CPUs.
  // The outermost -1 is needed because we halved the filter values.
  const int32x4_t horiz_const = vdupq_n_s32((1 << (bd + FILTER_BITS - 2)) +
                                            (1 << ((ROUND0_BITS - 1) - 1)));

  const uint8_t *src_ptr = src;
  int16_t *dst_ptr = im_block;
  int dst_stride = im_stride;
  int height = im_h;

  const uint8x16x3_t permute_tbl = vld1q_u8_x3(kDotProdPermuteTbl);
  // Main loop: filter four rows per iteration, in 8-pixel column strips,
  // while more than four rows remain.
  do {
    const uint8_t *s = src_ptr;
    int16_t *d = dst_ptr;
    int width = w;

    do {
      uint8x16_t s0, s1, s2, s3;
      load_u8_16x4(s, src_stride, &s0, &s1, &s2, &s3);

      int16x8_t d0 = convolve8_8_2d_h(s0, x_filter, permute_tbl, horiz_const);
      int16x8_t d1 = convolve8_8_2d_h(s1, x_filter, permute_tbl, horiz_const);
      int16x8_t d2 = convolve8_8_2d_h(s2, x_filter, permute_tbl, horiz_const);
      int16x8_t d3 = convolve8_8_2d_h(s3, x_filter, permute_tbl, horiz_const);

      store_s16_8x4(d, dst_stride, d0, d1, d2, d3);

      s += 8;
      d += 8;
      width -= 8;
    } while (width != 0);
    src_ptr += 4 * src_stride;
    dst_ptr += 4 * dst_stride;
    height -= 4;
  } while (height > 4);

  // Tail: process the remaining rows (at most four, at least one) one at a
  // time, since im_h is in general not a multiple of four.
  do {
    const uint8_t *s = src_ptr;
    int16_t *d = dst_ptr;
    int width = w;

    do {
      uint8x16_t s0 = vld1q_u8(s);
      int16x8_t d0 = convolve8_8_2d_h(s0, x_filter, permute_tbl, horiz_const);
      vst1q_s16(d, d0);

      s += 8;
      d += 8;
      width -= 8;
    } while (width != 0);
    src_ptr += src_stride;
    dst_ptr += dst_stride;
  } while (--height != 0);
}
    952 
    953 static inline int16x4_t convolve4_4_2d_h(const uint8x16_t samples,
    954                                         const int8x8_t filters,
    955                                         const uint8x16_t permute_tbl,
    956                                         const int32x4_t horiz_const) {
    957  // Permute samples ready for dot product.
    958  // { 0,  1,  2,  3,  1,  2,  3,  4,  2,  3,  4,  5,  3,  4,  5,  6 }
    959  uint8x16_t perm_samples = vqtbl1q_u8(samples, permute_tbl);
    960 
    961  int32x4_t sum = vusdotq_lane_s32(horiz_const, perm_samples, filters, 0);
    962 
    963  // We halved the convolution filter values so -1 from the right shift.
    964  return vshrn_n_s32(sum, ROUND0_BITS - 1);
    965 }
    966 
    967 static inline int16x8_t convolve4_8_2d_h(const uint8x16_t samples,
    968                                         const int8x8_t filters,
    969                                         const uint8x16x2_t permute_tbl,
    970                                         const int32x4_t horiz_const) {
    971  // Permute samples ready for dot product.
    972  // { 0,  1,  2,  3,  1,  2,  3,  4,  2,  3,  4,  5,  3,  4,  5,  6 }
    973  // { 4,  5,  6,  7,  5,  6,  7,  8,  6,  7,  8,  9,  7,  8,  9, 10 }
    974  uint8x16_t perm_samples[2] = { vqtbl1q_u8(samples, permute_tbl.val[0]),
    975                                 vqtbl1q_u8(samples, permute_tbl.val[1]) };
    976 
    977  int32x4_t sum0123 =
    978      vusdotq_lane_s32(horiz_const, perm_samples[0], filters, 0);
    979  int32x4_t sum4567 =
    980      vusdotq_lane_s32(horiz_const, perm_samples[1], filters, 0);
    981 
    982  // Narrow and re-pack.
    983  // We halved the filter values so -1 from right shift.
    984  return vcombine_s16(vshrn_n_s32(sum0123, ROUND0_BITS - 1),
    985                      vshrn_n_s32(sum4567, ROUND0_BITS - 1));
    986 }
    987 
// Horizontal (first) pass of the 2D 4-tap convolution. Filters `height` rows
// of the source into the 16-bit intermediate buffer `dst`. Each branch runs a
// 4-rows-at-a-time main loop followed by a one-row-at-a-time tail, since
// height (h + taps - 1) is in general not a multiple of four.
static inline void convolve_2d_sr_horiz_4tap_neon_i8mm(
    const uint8_t *src, int src_stride, int16_t *dst, int dst_stride, int width,
    int height, const int16_t *filter_x) {
  const int bd = 8;
  // Only taps 2..5 of the 8-tap kernel are used (4-tap filter).
  const int16x4_t x_filter = vld1_s16(filter_x + 2);
  // All 4-tap and bilinear filter values are even, so halve them to reduce
  // intermediate precision requirements.
  const int8x8_t filter = vshrn_n_s16(vcombine_s16(x_filter, vdup_n_s16(0)), 1);

  // Adding a shim of 1 << (ROUND0_BITS - 1) enables us to use non-rounding
  // shifts - which are generally faster than rounding shifts on modern CPUs.
  // Halve the total because we halved the filter values.
  const int32x4_t horiz_const = vdupq_n_s32(
      (((1 << (bd + FILTER_BITS - 1)) + (1 << (ROUND0_BITS - 1))) / 2));

  if (width == 4) {
    // 4-wide blocks: one permute table, four outputs per row.
    const uint8x16_t perm_tbl = vld1q_u8(kDotProdPermuteTbl);
    do {
      uint8x16_t s0, s1, s2, s3;
      load_u8_16x4(src, src_stride, &s0, &s1, &s2, &s3);

      int16x4_t d0 = convolve4_4_2d_h(s0, filter, perm_tbl, horiz_const);
      int16x4_t d1 = convolve4_4_2d_h(s1, filter, perm_tbl, horiz_const);
      int16x4_t d2 = convolve4_4_2d_h(s2, filter, perm_tbl, horiz_const);
      int16x4_t d3 = convolve4_4_2d_h(s3, filter, perm_tbl, horiz_const);

      store_s16_4x4(dst, dst_stride, d0, d1, d2, d3);

      src += 4 * src_stride;
      dst += 4 * dst_stride;
      height -= 4;
    } while (height > 4);

    // Tail: remaining rows (at most four) one at a time.
    do {
      uint8x16_t s0 = vld1q_u8(src);
      int16x4_t d0 = convolve4_4_2d_h(s0, filter, perm_tbl, horiz_const);
      vst1_s16(dst, d0);

      src += src_stride;
      dst += dst_stride;
    } while (--height != 0);
  } else {
    // Wider blocks: processed as 8-pixel column strips.
    const uint8x16x2_t perm_tbl = vld1q_u8_x2(kDotProdPermuteTbl);
    do {
      int w = width;
      const uint8_t *s = src;
      int16_t *d = dst;

      do {
        uint8x16_t s0, s1, s2, s3;
        load_u8_16x4(s, src_stride, &s0, &s1, &s2, &s3);

        int16x8_t d0 = convolve4_8_2d_h(s0, filter, perm_tbl, horiz_const);
        int16x8_t d1 = convolve4_8_2d_h(s1, filter, perm_tbl, horiz_const);
        int16x8_t d2 = convolve4_8_2d_h(s2, filter, perm_tbl, horiz_const);
        int16x8_t d3 = convolve4_8_2d_h(s3, filter, perm_tbl, horiz_const);

        store_s16_8x4(d, dst_stride, d0, d1, d2, d3);

        s += 8;
        d += 8;
        w -= 8;
      } while (w != 0);
      src += 4 * src_stride;
      dst += 4 * dst_stride;
      height -= 4;
    } while (height > 4);

    // Tail: remaining rows (at most four) one at a time.
    do {
      const uint8_t *s = src;
      int16_t *d = dst;
      int w = width;

      do {
        uint8x16_t s0 = vld1q_u8(s);
        int16x8_t d0 = convolve4_8_2d_h(s0, filter, perm_tbl, horiz_const);
        vst1q_s16(d, d0);

        s += 8;
        d += 8;
        w -= 8;
      } while (w != 0);
      src += src_stride;
      dst += dst_stride;
    } while (--height != 0);
  }
}
   1075 
   1076 static inline int16x4_t convolve6_4_2d_h(uint8x16_t samples,
   1077                                         const int8x16_t filter,
   1078                                         const uint8x16_t permute_tbl,
   1079                                         const int32x4_t horiz_const) {
   1080  // Permute samples ready for matrix multiply.
   1081  // { 0,  1,  2,  3,  4,  5,  6,  7,  2,  3,  4,  5,  6,  7,  8,  9 }
   1082  uint8x16_t perm_samples = vqtbl1q_u8(samples, permute_tbl);
   1083 
   1084  // These instructions multiply a 2x8 matrix (samples) by an 8x2 matrix
   1085  // (filter), destructively accumulating into the destination register.
   1086  int32x4_t sum = vusmmlaq_s32(horiz_const, perm_samples, filter);
   1087 
   1088  // We halved the convolution filter values so -1 from the right shift.
   1089  return vshrn_n_s32(sum, ROUND0_BITS - 1);
   1090 }
   1091 
   1092 static inline int16x8_t convolve6_8_2d_h(uint8x16_t samples,
   1093                                         const int8x16_t filter,
   1094                                         const uint8x16x2_t permute_tbl,
   1095                                         const int32x4_t horiz_const) {
   1096  // Permute samples ready for matrix multiply.
   1097  // { 0,  1,  2,  3,  4,  5,  6,  7,  2,  3,  4,  5,  6,  7,  8,  9 }
   1098  // { 4,  5,  6,  7,  8,  9, 10, 11,  6,  7,  8,  9, 10, 11, 12, 13 }
   1099  uint8x16_t perm_samples[2] = { vqtbl1q_u8(samples, permute_tbl.val[0]),
   1100                                 vqtbl1q_u8(samples, permute_tbl.val[1]) };
   1101 
   1102  // These instructions multiply a 2x8 matrix (samples) by an 8x2 matrix
   1103  // (filter), destructively accumulating into the destination register.
   1104  int32x4_t sum0123 = vusmmlaq_s32(horiz_const, perm_samples[0], filter);
   1105  int32x4_t sum4567 = vusmmlaq_s32(horiz_const, perm_samples[1], filter);
   1106 
   1107  // Narrow and re-pack.
   1108  // We halved the convolution filter values so -1 from the right shift.
   1109  return vcombine_s16(vshrn_n_s32(sum0123, ROUND0_BITS - 1),
   1110                      vshrn_n_s32(sum4567, ROUND0_BITS - 1));
   1111 }
   1112 
// Fused 2D convolution with 6-tap filters in both directions. The horizontal
// pass (USMMLA) and the vertical pass are interleaved per 8-pixel column
// strip, keeping a rolling window of five horizontally-filtered rows so only
// four new rows are filtered per iteration.
static inline void convolve_2d_sr_6tap_neon_i8mm(const uint8_t *src,
                                                 int src_stride, uint8_t *dst,
                                                 int dst_stride, int w, int h,
                                                 const int16_t *x_filter_ptr,
                                                 const int16_t *y_filter_ptr) {
  const int16x8_t y_filter = vld1q_s16(y_filter_ptr);
  // Filter values are even, so halve to reduce intermediate precision reqs.
  const int8x8_t x_filter_s8 = vshrn_n_s16(vld1q_s16(x_filter_ptr), 1);
  // Stagger the filter for use with the matrix multiply instructions.
  // { f0, f1, f2, f3, f4, f5,  0,  0,  0, f0, f1, f2, f3, f4, f5,  0 }
  const int8x16_t x_filter =
      vcombine_s8(vext_s8(x_filter_s8, x_filter_s8, 1), x_filter_s8);

  const int bd = 8;
  // This shim of 1 << ((ROUND0_BITS - 1) - 1) enables us to use non-rounding
  // shifts in convolution kernels - which are generally faster than rounding
  // shifts on modern CPUs. The outermost -1 is needed because we halved the
  // filter values.
  const int32x4_t horiz_const = vdupq_n_s32((1 << (bd + FILTER_BITS - 2)) +
                                            (1 << ((ROUND0_BITS - 1) - 1)));
  const int16x8_t vert_const = vdupq_n_s16(1 << (bd - 1));
  const uint8x16x2_t permute_tbl = vld1q_u8_x2(kMatMulPermuteTbl);

  // Iterate over 8-pixel-wide column strips.
  do {
    const uint8_t *s = src;
    uint8_t *d = dst;
    int height = h;

    // Prime the vertical filter with the first five horizontally-filtered
    // rows of this strip.
    uint8x16_t h_s0, h_s1, h_s2, h_s3, h_s4;
    load_u8_16x5(s, src_stride, &h_s0, &h_s1, &h_s2, &h_s3, &h_s4);
    s += 5 * src_stride;

    int16x8_t v_s0 = convolve6_8_2d_h(h_s0, x_filter, permute_tbl, horiz_const);
    int16x8_t v_s1 = convolve6_8_2d_h(h_s1, x_filter, permute_tbl, horiz_const);
    int16x8_t v_s2 = convolve6_8_2d_h(h_s2, x_filter, permute_tbl, horiz_const);
    int16x8_t v_s3 = convolve6_8_2d_h(h_s3, x_filter, permute_tbl, horiz_const);
    int16x8_t v_s4 = convolve6_8_2d_h(h_s4, x_filter, permute_tbl, horiz_const);

    do {
      // Filter four new source rows horizontally, then produce four output
      // rows vertically from the six-row windows.
      uint8x16_t h_s5, h_s6, h_s7, h_s8;
      load_u8_16x4(s, src_stride, &h_s5, &h_s6, &h_s7, &h_s8);

      int16x8_t v_s5 =
          convolve6_8_2d_h(h_s5, x_filter, permute_tbl, horiz_const);
      int16x8_t v_s6 =
          convolve6_8_2d_h(h_s6, x_filter, permute_tbl, horiz_const);
      int16x8_t v_s7 =
          convolve6_8_2d_h(h_s7, x_filter, permute_tbl, horiz_const);
      int16x8_t v_s8 =
          convolve6_8_2d_h(h_s8, x_filter, permute_tbl, horiz_const);

      uint8x8_t d0 = convolve6_8_2d_v(v_s0, v_s1, v_s2, v_s3, v_s4, v_s5,
                                      y_filter, vert_const);
      uint8x8_t d1 = convolve6_8_2d_v(v_s1, v_s2, v_s3, v_s4, v_s5, v_s6,
                                      y_filter, vert_const);
      uint8x8_t d2 = convolve6_8_2d_v(v_s2, v_s3, v_s4, v_s5, v_s6, v_s7,
                                      y_filter, vert_const);
      uint8x8_t d3 = convolve6_8_2d_v(v_s3, v_s4, v_s5, v_s6, v_s7, v_s8,
                                      y_filter, vert_const);

      store_u8_8x4(d, dst_stride, d0, d1, d2, d3);

      // Slide the rolling window of intermediate rows up by four.
      v_s0 = v_s4;
      v_s1 = v_s5;
      v_s2 = v_s6;
      v_s3 = v_s7;
      v_s4 = v_s8;

      s += 4 * src_stride;
      d += 4 * dst_stride;
      height -= 4;
    } while (height != 0);
    src += 8;
    dst += 8;
    w -= 8;
  } while (w != 0);
}
   1190 
   1191 static inline void convolve_2d_sr_6tap_4tap_neon_i8mm(
   1192    const uint8_t *src, int src_stride, uint8_t *dst, int dst_stride, int w,
   1193    int h, const int16_t *x_filter_ptr, const int16_t *y_filter_ptr) {
   1194  const int16x4_t y_filter = vld1_s16(y_filter_ptr + 2);
   1195  // Filter values are even, so halve to reduce intermediate precision reqs.
   1196  const int8x8_t x_filter_s8 = vshrn_n_s16(vld1q_s16(x_filter_ptr), 1);
   1197  // Stagger the filter for use with the matrix multiply instructions.
   1198  // { f0, f1, f2, f3, f4, f5,  0,  0,  0, f0, f1, f2, f3, f4, f5,  0 }
   1199  const int8x16_t x_filter =
   1200      vcombine_s8(vext_s8(x_filter_s8, x_filter_s8, 1), x_filter_s8);
   1201 
   1202  const int bd = 8;
   1203  // Adding a shim of 1 << (ROUND0_BITS - 1) enables us to use non-rounding
   1204  // shifts - which are generally faster than rounding shifts on modern CPUs.
   1205  // Halve the total because we halved the filter values.
   1206  const int32x4_t horiz_const = vdupq_n_s32(
   1207      ((1 << (bd + FILTER_BITS - 1)) + (1 << (ROUND0_BITS - 1))) / 2);
   1208  const int16x8_t vert_const = vdupq_n_s16(1 << (bd - 1));
   1209 
   1210  if (w == 4) {
   1211    const uint8x16_t permute_tbl = vld1q_u8(kMatMulPermuteTbl);
   1212    uint8x16_t h_s0, h_s1, h_s2;
   1213    load_u8_16x3(src, src_stride, &h_s0, &h_s1, &h_s2);
   1214 
   1215    int16x4_t v_s0 = convolve6_4_2d_h(h_s0, x_filter, permute_tbl, horiz_const);
   1216    int16x4_t v_s1 = convolve6_4_2d_h(h_s1, x_filter, permute_tbl, horiz_const);
   1217    int16x4_t v_s2 = convolve6_4_2d_h(h_s2, x_filter, permute_tbl, horiz_const);
   1218 
   1219    src += 3 * src_stride;
   1220 
   1221    do {
   1222      uint8x16_t h_s3, h_s4, h_s5, h_s6;
   1223      load_u8_16x4(src, src_stride, &h_s3, &h_s4, &h_s5, &h_s6);
   1224 
   1225      int16x4_t v_s3 =
   1226          convolve6_4_2d_h(h_s3, x_filter, permute_tbl, horiz_const);
   1227      int16x4_t v_s4 =
   1228          convolve6_4_2d_h(h_s4, x_filter, permute_tbl, horiz_const);
   1229      int16x4_t v_s5 =
   1230          convolve6_4_2d_h(h_s5, x_filter, permute_tbl, horiz_const);
   1231      int16x4_t v_s6 =
   1232          convolve6_4_2d_h(h_s6, x_filter, permute_tbl, horiz_const);
   1233 
   1234      int16x4_t d0 = convolve4_4_2d_v(v_s0, v_s1, v_s2, v_s3, y_filter);
   1235      int16x4_t d1 = convolve4_4_2d_v(v_s1, v_s2, v_s3, v_s4, y_filter);
   1236      int16x4_t d2 = convolve4_4_2d_v(v_s2, v_s3, v_s4, v_s5, y_filter);
   1237      int16x4_t d3 = convolve4_4_2d_v(v_s3, v_s4, v_s5, v_s6, y_filter);
   1238 
   1239      uint8x8_t d01 = vqmovun_s16(vsubq_s16(vcombine_s16(d0, d1), vert_const));
   1240      uint8x8_t d23 = vqmovun_s16(vsubq_s16(vcombine_s16(d2, d3), vert_const));
   1241 
   1242      store_u8x4_strided_x2(dst + 0 * dst_stride, dst_stride, d01);
   1243      store_u8x4_strided_x2(dst + 2 * dst_stride, dst_stride, d23);
   1244 
   1245      v_s0 = v_s4;
   1246      v_s1 = v_s5;
   1247      v_s2 = v_s6;
   1248 
   1249      src += 4 * src_stride;
   1250      dst += 4 * dst_stride;
   1251      h -= 4;
   1252    } while (h != 0);
   1253  } else {
   1254    const uint8x16x2_t permute_tbl = vld1q_u8_x2(kMatMulPermuteTbl);
   1255 
   1256    do {
   1257      int height = h;
   1258      const uint8_t *s = src;
   1259      uint8_t *d = dst;
   1260 
   1261      uint8x16_t h_s0, h_s1, h_s2;
   1262      load_u8_16x3(src, src_stride, &h_s0, &h_s1, &h_s2);
   1263 
   1264      int16x8_t v_s0 =
   1265          convolve6_8_2d_h(h_s0, x_filter, permute_tbl, horiz_const);
   1266      int16x8_t v_s1 =
   1267          convolve6_8_2d_h(h_s1, x_filter, permute_tbl, horiz_const);
   1268      int16x8_t v_s2 =
   1269          convolve6_8_2d_h(h_s2, x_filter, permute_tbl, horiz_const);
   1270 
   1271      s += 3 * src_stride;
   1272 
   1273      do {
   1274        uint8x16_t h_s3, h_s4, h_s5, h_s6;
   1275        load_u8_16x4(s, src_stride, &h_s3, &h_s4, &h_s5, &h_s6);
   1276 
   1277        int16x8_t v_s3 =
   1278            convolve6_8_2d_h(h_s3, x_filter, permute_tbl, horiz_const);
   1279        int16x8_t v_s4 =
   1280            convolve6_8_2d_h(h_s4, x_filter, permute_tbl, horiz_const);
   1281        int16x8_t v_s5 =
   1282            convolve6_8_2d_h(h_s5, x_filter, permute_tbl, horiz_const);
   1283        int16x8_t v_s6 =
   1284            convolve6_8_2d_h(h_s6, x_filter, permute_tbl, horiz_const);
   1285 
   1286        uint8x8_t d0 =
   1287            convolve4_8_2d_v(v_s0, v_s1, v_s2, v_s3, y_filter, vert_const);
   1288        uint8x8_t d1 =
   1289            convolve4_8_2d_v(v_s1, v_s2, v_s3, v_s4, y_filter, vert_const);
   1290        uint8x8_t d2 =
   1291            convolve4_8_2d_v(v_s2, v_s3, v_s4, v_s5, y_filter, vert_const);
   1292        uint8x8_t d3 =
   1293            convolve4_8_2d_v(v_s3, v_s4, v_s5, v_s6, y_filter, vert_const);
   1294 
   1295        store_u8_8x4(d, dst_stride, d0, d1, d2, d3);
   1296 
   1297        v_s0 = v_s4;
   1298        v_s1 = v_s5;
   1299        v_s2 = v_s6;
   1300 
   1301        s += 4 * src_stride;
   1302        d += 4 * dst_stride;
   1303        height -= 4;
   1304      } while (height != 0);
   1305      src += 8;
   1306      dst += 8;
   1307      w -= 8;
   1308    } while (w != 0);
   1309  }
   1310 }
   1311 
   1312 void av1_convolve_2d_sr_neon_i8mm(const uint8_t *src, int src_stride,
   1313                                  uint8_t *dst, int dst_stride, int w, int h,
   1314                                  const InterpFilterParams *filter_params_x,
   1315                                  const InterpFilterParams *filter_params_y,
   1316                                  const int subpel_x_qn, const int subpel_y_qn,
   1317                                  ConvolveParams *conv_params) {
   1318  if (w == 2 || h == 2) {
   1319    av1_convolve_2d_sr_c(src, src_stride, dst, dst_stride, w, h,
   1320                         filter_params_x, filter_params_y, subpel_x_qn,
   1321                         subpel_y_qn, conv_params);
   1322    return;
   1323  }
   1324 
   1325  const int y_filter_taps = get_filter_tap(filter_params_y, subpel_y_qn);
   1326  const int x_filter_taps = get_filter_tap(filter_params_x, subpel_x_qn);
   1327  const int clamped_y_taps = y_filter_taps < 4 ? 4 : y_filter_taps;
   1328  const int im_h = h + clamped_y_taps - 1;
   1329  const int im_stride = MAX_SB_SIZE;
   1330  const int vert_offset = clamped_y_taps / 2 - 1;
   1331  const int horiz_offset = filter_params_x->taps / 2 - 1;
   1332  const uint8_t *src_ptr = src - vert_offset * src_stride - horiz_offset;
   1333 
   1334  const int16_t *x_filter_ptr = av1_get_interp_filter_subpel_kernel(
   1335      filter_params_x, subpel_x_qn & SUBPEL_MASK);
   1336  const int16_t *y_filter_ptr = av1_get_interp_filter_subpel_kernel(
   1337      filter_params_y, subpel_y_qn & SUBPEL_MASK);
   1338 
   1339  if (filter_params_x->taps > 8) {
   1340    DECLARE_ALIGNED(16, int16_t,
   1341                    im_block[(MAX_SB_SIZE + MAX_FILTER_TAP - 1) * MAX_SB_SIZE]);
   1342 
   1343    const int16x8_t y_filter_0_7 = vld1q_s16(y_filter_ptr);
   1344    const int16x4_t y_filter_8_11 = vld1_s16(y_filter_ptr + 8);
   1345 
   1346    convolve_2d_sr_horiz_12tap_neon_i8mm(src_ptr, src_stride, im_block,
   1347                                         im_stride, w, im_h, x_filter_ptr);
   1348 
   1349    convolve_2d_sr_vert_12tap_neon(im_block, im_stride, dst, dst_stride, w, h,
   1350                                   y_filter_0_7, y_filter_8_11);
   1351  } else {
   1352    DECLARE_ALIGNED(16, int16_t,
   1353                    im_block[(MAX_SB_SIZE + SUBPEL_TAPS - 1) * MAX_SB_SIZE]);
   1354 
   1355    if (x_filter_taps == 6 && y_filter_taps == 6) {
   1356      convolve_2d_sr_6tap_neon_i8mm(src_ptr + 1, src_stride, dst, dst_stride, w,
   1357                                    h, x_filter_ptr, y_filter_ptr);
   1358      return;
   1359    }
   1360 
   1361    // Used for both 6, 4 and 4, 4 horiz, vert filter tap combinations.
   1362    if (x_filter_taps <= 6 && y_filter_taps <= 4) {
   1363      convolve_2d_sr_6tap_4tap_neon_i8mm(src_ptr + 1, src_stride, dst,
   1364                                         dst_stride, w, h, x_filter_ptr,
   1365                                         y_filter_ptr);
   1366      return;
   1367    }
   1368 
   1369    if (x_filter_taps <= 4) {
   1370      convolve_2d_sr_horiz_4tap_neon_i8mm(src_ptr + 2, src_stride, im_block,
   1371                                          im_stride, w, im_h, x_filter_ptr);
   1372    } else {
   1373      convolve_2d_sr_horiz_8tap_neon_i8mm(src_ptr, src_stride, im_block,
   1374                                          im_stride, w, im_h, x_filter_ptr);
   1375    }
   1376 
   1377    const int16x8_t y_filter = vld1q_s16(y_filter_ptr);
   1378 
   1379    if (clamped_y_taps <= 4) {
   1380      convolve_2d_sr_vert_4tap_neon(im_block, im_stride, dst, dst_stride, w, h,
   1381                                    y_filter_ptr);
   1382    } else if (clamped_y_taps == 6) {
   1383      convolve_2d_sr_vert_6tap_neon(im_block, im_stride, dst, dst_stride, w, h,
   1384                                    y_filter);
   1385    } else {
   1386      convolve_2d_sr_vert_8tap_neon(im_block, im_stride, dst, dst_stride, w, h,
   1387                                    y_filter);
   1388    }
   1389  }
   1390 }