tor-browser

The Tor Browser
git clone https://git.dasho.dev/tor-browser.git
Log | Files | Refs | README | LICENSE

aom_convolve8_neon_i8mm.c (24525B)


      1 /*
      2 * Copyright (c) 2014 The WebM project authors. All rights reserved.
      3 * Copyright (c) 2023, Alliance for Open Media. All rights reserved.
      4 *
      5 * This source code is subject to the terms of the BSD 2 Clause License and
      6 * the Alliance for Open Media Patent License 1.0. If the BSD 2 Clause License
      7 * was not distributed with this source code in the LICENSE file, you can
      8 * obtain it at www.aomedia.org/license/software. If the Alliance for Open
      9 * Media Patent License 1.0 was not distributed with this source code in the
     10 * PATENTS file, you can obtain it at www.aomedia.org/license/patent.
     11 */
     12 
     13 #include <arm_neon.h>
     14 #include <assert.h>
     15 #include <string.h>
     16 
     17 #include "config/aom_config.h"
     18 
     19 #include "aom/aom_integer.h"
     20 #include "aom_dsp/aom_dsp_common.h"
     21 #include "aom_dsp/aom_filter.h"
     22 #include "aom_dsp/arm/aom_convolve8_neon.h"
     23 #include "aom_dsp/arm/aom_filter.h"
     24 #include "aom_dsp/arm/mem_neon.h"
     25 #include "aom_dsp/arm/transpose_neon.h"
     26 #include "aom_ports/mem.h"
     27 
// Permute indices used by the 6-tap horizontal kernels to arrange source
// pixels into the 2x8 row pairs consumed by the USMMLA matrix multiply:
// rows starting at offsets 0/2 (first 16 entries) and 4/6 (second 16).
DECLARE_ALIGNED(16, static const uint8_t, kMatMul6PermuteTbl[32]) = {
  // clang-format off
  0,  1,  2,  3,  4,  5,  6,  7,  2,  3,  4,  5,  6,  7,  8,  9,
  4,  5,  6,  7,  8,  9, 10, 11,  6,  7,  8,  9, 10, 11, 12, 13
  // clang-format on
};
     34 
// Permute indices used by the 8-tap horizontal kernels. Offsets start at 1
// because tap 0 is applied separately (see convolve8_8_h / the w == 4 path);
// rows start at offsets 1/3 (first 16 entries) and 5/7 (second 16).
DECLARE_ALIGNED(16, static const uint8_t, kMatMul8PermuteTbl[32]) = {
  // clang-format off
  1,  2,  3,  4,  5,  6,  7,  8,  3,  4,  5,  6,  7,  8,  9, 10,
  5,  6,  7,  8,  9, 10, 11, 12,  7,  8,  9, 10, 11, 12, 13, 14
  // clang-format on
};
     41 
// Indices used to stagger filter taps 1..7 into the 8x2 matrix layout
// required by USMMLA. Index 16 selects the zero half appended by
// vcombine_s8(filter_s8, vdup_n_s8(0)) in convolve8_horiz_8tap_neon_i8mm,
// yielding { f1..f7, 0, 0, f1..f7 }.
DECLARE_ALIGNED(16, static const uint8_t, kMatMul8FilterPermuteTbl[16]) = {
  // clang-format off
  1,  2,  3,  4,  5,  6,  7, 16, 16,  1,  2,  3,  4,  5,  6,  7
  // clang-format on
};
     47 
// Indices for vqtbl2q_u8 used by the vertical kernels to slide the window of
// transposed rows: they combine the previous transposed 4x4 block
// (samples_LUT.val[0], indices 0-15) with the newly loaded one
// (samples_LUT.val[1], indices 16-31).
DECLARE_ALIGNED(16, static const uint8_t, kDotProdMergeBlockTbl[48]) = {
  // Shift left and insert new last column in transposed 4x4 block.
  1, 2, 3, 16, 5, 6, 7, 20, 9, 10, 11, 24, 13, 14, 15, 28,
  // Shift left and insert two new columns in transposed 4x4 block.
  2, 3, 16, 17, 6, 7, 20, 21, 10, 11, 24, 25, 14, 15, 28, 29,
  // Shift left and insert three new columns in transposed 4x4 block.
  3, 16, 17, 18, 7, 20, 21, 22, 11, 24, 25, 26, 15, 28, 29, 30
};
     56 
     57 static inline int16x4_t convolve8_4_h(const uint8x16_t samples,
     58                                      const int8x16_t filters,
     59                                      const uint8x16_t permute_tbl) {
     60  // Permute samples ready for matrix multiply.
     61  // { 1,  2,  3,  4,  5,  6,  7,  8,  3,  4,  5,  6,  7,  8,  9, 10 }
     62  uint8x16_t perm_samples = vqtbl1q_u8(samples, permute_tbl);
     63 
     64  // These instructions multiply a 2x8 matrix (samples) by an 8x2 matrix
     65  // (filter), destructively accumulating into the destination register.
     66  int32x4_t sum = vusmmlaq_s32(vdupq_n_s32(0), perm_samples, filters);
     67 
     68  // Tap 0, as well as further narrowing and packing, is applied by the caller.
     69  return vmovn_s32(sum);
     70 }
     71 
     72 static inline uint8x8_t convolve8_8_h(const uint8x16_t samples,
     73                                      const int8x16_t filters,
     74                                      const uint8x8_t f0,
     75                                      const uint8x16x2_t permute_tbl) {
     76  // Permute samples ready for matrix multiply.
     77  // { 1,  2,  3,  4,  5,  6,  7,  8,  3,  4,  5,  6,  7,  8,  9, 10 }
     78  // { 5,  6,  7,  8,  9, 10, 11, 12,  7,  8,  9, 10, 11, 12, 13, 14 }
     79  uint8x16_t perm_samples[2] = { vqtbl1q_u8(samples, permute_tbl.val[0]),
     80                                 vqtbl1q_u8(samples, permute_tbl.val[1]) };
     81 
     82  // These instructions multiply a 2x8 matrix (samples) by an 8x2 matrix
     83  // (filter), destructively accumulating into the destination register.
     84  int32x4_t sum0123 = vusmmlaq_s32(vdupq_n_s32(0), perm_samples[0], filters);
     85  int32x4_t sum4567 = vusmmlaq_s32(vdupq_n_s32(0), perm_samples[1], filters);
     86 
     87  // Narrow and re-pack.
     88  int16x8_t sum = vcombine_s16(vmovn_s32(sum0123), vmovn_s32(sum4567));
     89  // Apply tap 0 and accumulate.
     90  sum = vreinterpretq_s16_u16(
     91      vmlsl_u8(vreinterpretq_u16_s16(sum), vget_low_u8(samples), f0));
     92 
     93  // We halved the filter values so -1 from right shift.
     94  return vqrshrun_n_s16(sum, FILTER_BITS - 1);
     95 }
     96 
// 8-tap horizontal convolution using the i8mm matrix-multiply extension.
// Handles w == 4 or w a multiple of 8; both loops consume four rows per
// iteration, so h must be a multiple of 4.
static inline void convolve8_horiz_8tap_neon_i8mm(
    const uint8_t *src, ptrdiff_t src_stride, uint8_t *dst,
    ptrdiff_t dst_stride, const int16_t *filter_x, int w, int h) {
  // Filter values are even, so halve to reduce intermediate precision reqs.
  const int8x8_t filter_s8 = vshrn_n_s16(vld1q_s16(filter_x), 1);
  // Stagger the filter for use with the matrix multiply instructions.
  // { f1, f2, f3, f4, f5, f6, f7, 0, 0, f1, f2, f3, f4, f5, f6, f7 }
  const uint8x16_t filter_idx = vld1q_u8(kMatMul8FilterPermuteTbl);
  const int8x16_t filter =
      vqtbl1q_s8(vcombine_s8(filter_s8, vdup_n_s8(0)), filter_idx);

  // Since f0 is always negative and samples are unsigned, subtract (unsigned)
  // s0 * -f0 to avoid signed overflow.
  const uint8x8_t f0 = vdup_n_u8(-filter_x[0] >> 1);

  if (w == 4) {
    const uint8x16_t perm_tbl = vld1q_u8(kMatMul8PermuteTbl);

    do {
      uint8x16_t s0, s1, s2, s3;
      load_u8_16x4(src, src_stride, &s0, &s1, &s2, &s3);
      // Re-load the leading 4 samples of rows 0/1 and 2/3 packed pairwise;
      // these are used below to apply tap 0 for two rows at a time.
      uint8x8_t s01 = load_u8_4x2(src + 0 * src_stride, src_stride);
      uint8x8_t s23 = load_u8_4x2(src + 2 * src_stride, src_stride);

      // Taps 1..7 via the matrix multiply helper.
      int16x4_t t0 = convolve8_4_h(s0, filter, perm_tbl);
      int16x4_t t1 = convolve8_4_h(s1, filter, perm_tbl);
      int16x4_t t2 = convolve8_4_h(s2, filter, perm_tbl);
      int16x4_t t3 = convolve8_4_h(s3, filter, perm_tbl);
      // Apply tap 0 and accumulate.
      int16x8_t t01 = vcombine_s16(t0, t1);
      int16x8_t t23 = vcombine_s16(t2, t3);
      t01 =
          vreinterpretq_s16_u16(vmlsl_u8(vreinterpretq_u16_s16(t01), s01, f0));
      t23 =
          vreinterpretq_s16_u16(vmlsl_u8(vreinterpretq_u16_s16(t23), s23, f0));
      // We halved the filter values so -1 from right shift.
      uint8x8_t d01 = vqrshrun_n_s16(t01, FILTER_BITS - 1);
      uint8x8_t d23 = vqrshrun_n_s16(t23, FILTER_BITS - 1);

      store_u8x4_strided_x2(dst + 0 * dst_stride, dst_stride, d01);
      store_u8x4_strided_x2(dst + 2 * dst_stride, dst_stride, d23);

      src += 4 * src_stride;
      dst += 4 * dst_stride;
      h -= 4;
    } while (h > 0);
  } else {
    const uint8x16x2_t perm_tbl = vld1q_u8_x2(kMatMul8PermuteTbl);

    do {
      int width = w;
      const uint8_t *s = src;
      uint8_t *d = dst;
      // Process the row block 8 output pixels at a time.
      do {
        uint8x16_t s0, s1, s2, s3;
        load_u8_16x4(s, src_stride, &s0, &s1, &s2, &s3);

        uint8x8_t d0 = convolve8_8_h(s0, filter, f0, perm_tbl);
        uint8x8_t d1 = convolve8_8_h(s1, filter, f0, perm_tbl);
        uint8x8_t d2 = convolve8_8_h(s2, filter, f0, perm_tbl);
        uint8x8_t d3 = convolve8_8_h(s3, filter, f0, perm_tbl);

        store_u8_8x4(d, dst_stride, d0, d1, d2, d3);

        s += 8;
        d += 8;
        width -= 8;
      } while (width != 0);
      src += 4 * src_stride;
      dst += 4 * dst_stride;
      h -= 4;
    } while (h > 0);
  }
}
    171 
    172 static inline int16x4_t convolve6_4_h(const uint8x16_t samples,
    173                                      const int8x16_t filter,
    174                                      const uint8x16_t permute_tbl) {
    175  // Permute samples ready for matrix multiply.
    176  // { 0,  1,  2,  3,  4,  5,  6,  7,  2,  3,  4,  5,  6,  7,  8,  9 }
    177  uint8x16_t perm_samples = vqtbl1q_u8(samples, permute_tbl);
    178 
    179  // These instructions multiply a 2x8 matrix (samples) by an 8x2 matrix
    180  // (filter), destructively accumulating into the destination register.
    181  int32x4_t sum = vusmmlaq_s32(vdupq_n_s32(0), perm_samples, filter);
    182 
    183  // Further narrowing and packing is performed by the caller.
    184  return vmovn_s32(sum);
    185 }
    186 
    187 static inline uint8x8_t convolve6_8_h(const uint8x16_t samples,
    188                                      const int8x16_t filter,
    189                                      const uint8x16x2_t permute_tbl) {
    190  // Permute samples ready for matrix multiply.
    191  // { 0,  1,  2,  3,  4,  5,  6,  7,  2,  3,  4,  5,  6,  7,  8,  9 }
    192  // { 4,  5,  6,  7,  8,  9, 10, 11,  6,  7,  8,  9, 10, 11, 12, 13 }
    193  uint8x16_t perm_samples[2] = { vqtbl1q_u8(samples, permute_tbl.val[0]),
    194                                 vqtbl1q_u8(samples, permute_tbl.val[1]) };
    195 
    196  // These instructions multiply a 2x8 matrix (samples) by an 8x2 matrix
    197  // (filter), destructively accumulating into the destination register.
    198  int32x4_t sum0123 = vusmmlaq_s32(vdupq_n_s32(0), perm_samples[0], filter);
    199  int32x4_t sum4567 = vusmmlaq_s32(vdupq_n_s32(0), perm_samples[1], filter);
    200 
    201  // Narrow and re-pack.
    202  int16x8_t sum = vcombine_s16(vmovn_s32(sum0123), vmovn_s32(sum4567));
    203  // We halved the filter values so -1 from right shift.
    204  return vqrshrun_n_s16(sum, FILTER_BITS - 1);
    205 }
    206 
// Horizontal convolution for filters with at most 6 non-zero taps, using the
// i8mm matrix-multiply extension. The staggered-filter comment below implies
// elements 0 and 7 of the 8-tap coefficient array are zero; the caller
// advances src by 1 to compensate for skipping tap 0. Both loops consume
// four rows per iteration, so height must be a multiple of 4.
static inline void convolve8_horiz_6tap_neon_i8mm(
    const uint8_t *src, ptrdiff_t src_stride, uint8_t *dst,
    ptrdiff_t dst_stride, const int16_t *filter_x, int width, int height) {
  // Filter values are even, so halve to reduce intermediate precision reqs.
  const int8x8_t x_filter = vshrn_n_s16(vld1q_s16(filter_x), 1);
  // Stagger the filter for use with the matrix multiply instructions.
  // { f0, f1, f2, f3, f4, f5,  0,  0,  0, f0, f1, f2, f3, f4, f5,  0 }
  const int8x16_t filter =
      vcombine_s8(vext_s8(x_filter, x_filter, 1), x_filter);

  if (width == 4) {
    const uint8x16_t perm_tbl = vld1q_u8(kMatMul6PermuteTbl);
    do {
      uint8x16_t s0, s1, s2, s3;
      load_u8_16x4(src, src_stride, &s0, &s1, &s2, &s3);

      int16x4_t t0 = convolve6_4_h(s0, filter, perm_tbl);
      int16x4_t t1 = convolve6_4_h(s1, filter, perm_tbl);
      int16x4_t t2 = convolve6_4_h(s2, filter, perm_tbl);
      int16x4_t t3 = convolve6_4_h(s3, filter, perm_tbl);
      // We halved the filter values so -1 from right shift.
      uint8x8_t d01 = vqrshrun_n_s16(vcombine_s16(t0, t1), FILTER_BITS - 1);
      uint8x8_t d23 = vqrshrun_n_s16(vcombine_s16(t2, t3), FILTER_BITS - 1);

      store_u8x4_strided_x2(dst + 0 * dst_stride, dst_stride, d01);
      store_u8x4_strided_x2(dst + 2 * dst_stride, dst_stride, d23);

      src += 4 * src_stride;
      dst += 4 * dst_stride;
      height -= 4;
    } while (height > 0);
  } else {
    const uint8x16x2_t perm_tbl = vld1q_u8_x2(kMatMul6PermuteTbl);

    do {
      int w = width;
      const uint8_t *s = src;
      uint8_t *d = dst;
      // Process the row block 8 output pixels at a time.
      do {
        uint8x16_t s0, s1, s2, s3;
        load_u8_16x4(s, src_stride, &s0, &s1, &s2, &s3);

        uint8x8_t d0 = convolve6_8_h(s0, filter, perm_tbl);
        uint8x8_t d1 = convolve6_8_h(s1, filter, perm_tbl);
        uint8x8_t d2 = convolve6_8_h(s2, filter, perm_tbl);
        uint8x8_t d3 = convolve6_8_h(s3, filter, perm_tbl);

        store_u8_8x4(d, dst_stride, d0, d1, d2, d3);

        s += 8;
        d += 8;
        w -= 8;
      } while (w != 0);
      src += 4 * src_stride;
      dst += 4 * dst_stride;
      height -= 4;
    } while (height > 0);
  }
}
    266 
    267 void aom_convolve8_horiz_neon_i8mm(const uint8_t *src, ptrdiff_t src_stride,
    268                                   uint8_t *dst, ptrdiff_t dst_stride,
    269                                   const int16_t *filter_x, int x_step_q4,
    270                                   const int16_t *filter_y, int y_step_q4,
    271                                   int w, int h) {
    272  assert((intptr_t)dst % 4 == 0);
    273  assert(dst_stride % 4 == 0);
    274 
    275  (void)x_step_q4;
    276  (void)filter_y;
    277  (void)y_step_q4;
    278 
    279  src -= ((SUBPEL_TAPS / 2) - 1);
    280 
    281  int filter_taps = get_filter_taps_convolve8(filter_x);
    282 
    283  if (filter_taps == 2) {
    284    convolve8_horiz_2tap_neon(src + 3, src_stride, dst, dst_stride, filter_x, w,
    285                              h);
    286  } else if (filter_taps <= 6) {
    287    convolve8_horiz_6tap_neon_i8mm(src + 1, src_stride, dst, dst_stride,
    288                                   filter_x, w, h);
    289  } else {
    290    convolve8_horiz_8tap_neon_i8mm(src, src_stride, dst, dst_stride, filter_x,
    291                                   w, h);
    292  }
    293 }
    294 
    295 static inline int16x4_t convolve8_4_v(const uint8x16_t samples_lo,
    296                                      const uint8x16_t samples_hi,
    297                                      const int8x8_t filters) {
    298  // Sample permutation is performed by the caller.
    299  int32x4_t sum = vusdotq_lane_s32(vdupq_n_s32(0), samples_lo, filters, 0);
    300  sum = vusdotq_lane_s32(sum, samples_hi, filters, 1);
    301 
    302  // Further narrowing and packing is performed by the caller.
    303  return vmovn_s32(sum);
    304 }
    305 
    306 static inline uint8x8_t convolve8_8_v(const uint8x16_t samples0_lo,
    307                                      const uint8x16_t samples0_hi,
    308                                      const uint8x16_t samples1_lo,
    309                                      const uint8x16_t samples1_hi,
    310                                      const int8x8_t filters) {
    311  // Sample permutation is performed by the caller.
    312 
    313  // First 4 output values.
    314  int32x4_t sum0 = vusdotq_lane_s32(vdupq_n_s32(0), samples0_lo, filters, 0);
    315  sum0 = vusdotq_lane_s32(sum0, samples0_hi, filters, 1);
    316  // Second 4 output values.
    317  int32x4_t sum1 = vusdotq_lane_s32(vdupq_n_s32(0), samples1_lo, filters, 0);
    318  sum1 = vusdotq_lane_s32(sum1, samples1_hi, filters, 1);
    319 
    320  // Narrow and re-pack.
    321  int16x8_t sum = vcombine_s16(vmovn_s32(sum0), vmovn_s32(sum1));
    322  // We halved the filter values so -1 from right shift.
    323  return vqrshrun_n_s16(sum, FILTER_BITS - 1);
    324 }
    325 
// 8-tap vertical convolution using i8mm USDOT instructions. Handles w == 4
// or w a multiple of 8; each inner-loop iteration produces four output rows,
// so h must be a multiple of 4. Transposed 4x4 sample blocks are carried
// across iterations and slid forward with table lookups so each new
// iteration only loads four fresh rows.
static inline void convolve8_vert_8tap_neon_i8mm(
    const uint8_t *src, ptrdiff_t src_stride, uint8_t *dst,
    ptrdiff_t dst_stride, const int16_t *filter_y, int w, int h) {
  // Filter values are even, so halve to reduce intermediate precision reqs.
  const int8x8_t filter = vshrn_n_s16(vld1q_s16(filter_y), 1);
  const uint8x16x3_t merge_block_tbl = vld1q_u8_x3(kDotProdMergeBlockTbl);
  // Pairs the previous transposed block with the new one for vqtbl2q_u8.
  uint8x16x2_t samples_LUT;

  if (w == 4) {
    uint8x8_t s0, s1, s2, s3, s4, s5, s6;
    load_u8_8x7(src, src_stride, &s0, &s1, &s2, &s3, &s4, &s5, &s6);
    src += 7 * src_stride;

    // This operation combines a conventional transpose and the sample permute
    // required before computing the dot product.
    uint8x16_t s0123, s1234, s2345, s3456;
    transpose_concat_elems_u8_4x4(s0, s1, s2, s3, &s0123);
    transpose_concat_elems_u8_4x4(s1, s2, s3, s4, &s1234);
    transpose_concat_elems_u8_4x4(s2, s3, s4, s5, &s2345);
    transpose_concat_elems_u8_4x4(s3, s4, s5, s6, &s3456);

    do {
      uint8x8_t s7, s8, s9, s10;
      load_u8_8x4(src, src_stride, &s7, &s8, &s9, &s10);

      uint8x16_t s4567, s5678, s6789, s78910;
      transpose_concat_elems_u8_4x4(s7, s8, s9, s10, &s78910);

      // Merge new data into block from previous iteration.
      samples_LUT.val[0] = s3456;
      samples_LUT.val[1] = s78910;
      s4567 = vqtbl2q_u8(samples_LUT, merge_block_tbl.val[0]);
      s5678 = vqtbl2q_u8(samples_LUT, merge_block_tbl.val[1]);
      s6789 = vqtbl2q_u8(samples_LUT, merge_block_tbl.val[2]);

      int16x4_t d0 = convolve8_4_v(s0123, s4567, filter);
      int16x4_t d1 = convolve8_4_v(s1234, s5678, filter);
      int16x4_t d2 = convolve8_4_v(s2345, s6789, filter);
      int16x4_t d3 = convolve8_4_v(s3456, s78910, filter);
      // We halved the filter values so -1 from right shift.
      uint8x8_t d01 = vqrshrun_n_s16(vcombine_s16(d0, d1), FILTER_BITS - 1);
      uint8x8_t d23 = vqrshrun_n_s16(vcombine_s16(d2, d3), FILTER_BITS - 1);

      store_u8x4_strided_x2(dst + 0 * dst_stride, dst_stride, d01);
      store_u8x4_strided_x2(dst + 2 * dst_stride, dst_stride, d23);

      // Prepare block for next iteration - re-using as much as possible.
      // Shuffle everything up four rows.
      s0123 = s4567;
      s1234 = s5678;
      s2345 = s6789;
      s3456 = s78910;

      src += 4 * src_stride;
      dst += 4 * dst_stride;
      h -= 4;
    } while (h != 0);
  } else {
    // Process the frame in 8-pixel-wide columns, full height each.
    do {
      int height = h;
      const uint8_t *s = src;
      uint8_t *d = dst;

      uint8x8_t s0, s1, s2, s3, s4, s5, s6;
      load_u8_8x7(s, src_stride, &s0, &s1, &s2, &s3, &s4, &s5, &s6);
      s += 7 * src_stride;

      // This operation combines a conventional transpose and the sample permute
      // required before computing the dot product.
      uint8x16_t s0123_lo, s0123_hi, s1234_lo, s1234_hi, s2345_lo, s2345_hi,
          s3456_lo, s3456_hi;
      transpose_concat_elems_u8_8x4(s0, s1, s2, s3, &s0123_lo, &s0123_hi);
      transpose_concat_elems_u8_8x4(s1, s2, s3, s4, &s1234_lo, &s1234_hi);
      transpose_concat_elems_u8_8x4(s2, s3, s4, s5, &s2345_lo, &s2345_hi);
      transpose_concat_elems_u8_8x4(s3, s4, s5, s6, &s3456_lo, &s3456_hi);

      do {
        uint8x8_t s7, s8, s9, s10;
        load_u8_8x4(s, src_stride, &s7, &s8, &s9, &s10);

        uint8x16_t s4567_lo, s4567_hi, s5678_lo, s5678_hi, s6789_lo, s6789_hi,
            s78910_lo, s78910_hi;
        transpose_concat_elems_u8_8x4(s7, s8, s9, s10, &s78910_lo, &s78910_hi);

        // Merge new data into block from previous iteration.
        samples_LUT.val[0] = s3456_lo;
        samples_LUT.val[1] = s78910_lo;
        s4567_lo = vqtbl2q_u8(samples_LUT, merge_block_tbl.val[0]);
        s5678_lo = vqtbl2q_u8(samples_LUT, merge_block_tbl.val[1]);
        s6789_lo = vqtbl2q_u8(samples_LUT, merge_block_tbl.val[2]);

        samples_LUT.val[0] = s3456_hi;
        samples_LUT.val[1] = s78910_hi;
        s4567_hi = vqtbl2q_u8(samples_LUT, merge_block_tbl.val[0]);
        s5678_hi = vqtbl2q_u8(samples_LUT, merge_block_tbl.val[1]);
        s6789_hi = vqtbl2q_u8(samples_LUT, merge_block_tbl.val[2]);

        uint8x8_t d0 =
            convolve8_8_v(s0123_lo, s4567_lo, s0123_hi, s4567_hi, filter);
        uint8x8_t d1 =
            convolve8_8_v(s1234_lo, s5678_lo, s1234_hi, s5678_hi, filter);
        uint8x8_t d2 =
            convolve8_8_v(s2345_lo, s6789_lo, s2345_hi, s6789_hi, filter);
        uint8x8_t d3 =
            convolve8_8_v(s3456_lo, s78910_lo, s3456_hi, s78910_hi, filter);

        store_u8_8x4(d, dst_stride, d0, d1, d2, d3);

        // Prepare block for next iteration - re-using as much as possible.
        // Shuffle everything up four rows.
        s0123_lo = s4567_lo;
        s0123_hi = s4567_hi;
        s1234_lo = s5678_lo;
        s1234_hi = s5678_hi;
        s2345_lo = s6789_lo;
        s2345_hi = s6789_hi;
        s3456_lo = s78910_lo;
        s3456_hi = s78910_hi;

        s += 4 * src_stride;
        d += 4 * dst_stride;
        height -= 4;
      } while (height != 0);
      src += 8;
      dst += 8;
      w -= 8;
    } while (w != 0);
  }
}
    455 
    456 static inline int16x4_t convolve4_4_v(const uint8x16_t samples,
    457                                      const int8x8_t filters) {
    458  // Sample permutation is performed by the caller.
    459  int32x4_t sum = vusdotq_lane_s32(vdupq_n_s32(0), samples, filters, 0);
    460 
    461  // Further narrowing and packing is performed by the caller.
    462  return vmovn_s32(sum);
    463 }
    464 
    465 static inline uint8x8_t convolve4_8_v(const uint8x16_t samples0,
    466                                      const uint8x16_t samples1,
    467                                      const int8x8_t filters) {
    468  // Sample permutation is performed by the caller.
    469 
    470  // First 4 output values.
    471  int32x4_t sum0 = vusdotq_lane_s32(vdupq_n_s32(0), samples0, filters, 0);
    472  // Second 4 output values.
    473  int32x4_t sum1 = vusdotq_lane_s32(vdupq_n_s32(0), samples1, filters, 0);
    474 
    475  // Narrow and re-pack.
    476  int16x8_t sum = vcombine_s16(vmovn_s32(sum0), vmovn_s32(sum1));
    477  // We halved the filter values so -1 from right shift.
    478  return vqrshrun_n_s16(sum, FILTER_BITS - 1);
    479 }
    480 
// 4-tap vertical convolution using i8mm USDOT instructions. Reads the four
// non-zero taps from the middle of the 8-tap array (filter_y + 2); the
// caller advances src by 2 rows to compensate. Handles w == 4 or w a
// multiple of 8; each inner-loop iteration produces four output rows, so h
// must be a multiple of 4.
static inline void convolve8_vert_4tap_neon_i8mm(
    const uint8_t *src, ptrdiff_t src_stride, uint8_t *dst,
    ptrdiff_t dst_stride, const int16_t *filter_y, int w, int h) {
  // Filter values are even, so halve to reduce intermediate precision reqs.
  const int16x8_t filter_s16 =
      vcombine_s16(vld1_s16(filter_y + 2), vdup_n_s16(0));
  const int8x8_t filter = vshrn_n_s16(filter_s16, 1);
  const uint8x16x3_t merge_block_tbl = vld1q_u8_x3(kDotProdMergeBlockTbl);
  // Pairs the previous transposed block with the new one for vqtbl2q_u8.
  uint8x16x2_t samples_LUT;

  if (w == 4) {
    uint8x8_t s0, s1, s2, s3;
    load_u8_8x4(src, src_stride, &s0, &s1, &s2, &s3);
    src += 4 * src_stride;

    // This operation combines a conventional transpose and the sample permute
    // required before computing the dot product.
    uint8x16_t s0123;
    transpose_concat_elems_u8_4x4(s0, s1, s2, s3, &s0123);

    do {
      uint8x8_t s4, s5, s6, s7;
      load_u8_8x4(src, src_stride, &s4, &s5, &s6, &s7);

      uint8x16_t s4567;
      transpose_concat_elems_u8_4x4(s4, s5, s6, s7, &s4567);

      // Merge new data into block from previous iteration.
      samples_LUT.val[0] = s0123;
      samples_LUT.val[1] = s4567;
      uint8x16_t s1234 = vqtbl2q_u8(samples_LUT, merge_block_tbl.val[0]);
      uint8x16_t s2345 = vqtbl2q_u8(samples_LUT, merge_block_tbl.val[1]);
      uint8x16_t s3456 = vqtbl2q_u8(samples_LUT, merge_block_tbl.val[2]);

      int16x4_t d0 = convolve4_4_v(s0123, filter);
      int16x4_t d1 = convolve4_4_v(s1234, filter);
      int16x4_t d2 = convolve4_4_v(s2345, filter);
      int16x4_t d3 = convolve4_4_v(s3456, filter);
      // We halved the filter values so -1 from right shift.
      uint8x8_t d01 = vqrshrun_n_s16(vcombine_s16(d0, d1), FILTER_BITS - 1);
      uint8x8_t d23 = vqrshrun_n_s16(vcombine_s16(d2, d3), FILTER_BITS - 1);

      store_u8x4_strided_x2(dst + 0 * dst_stride, dst_stride, d01);
      store_u8x4_strided_x2(dst + 2 * dst_stride, dst_stride, d23);

      // Prepare block for next iteration - re-using as much as possible.
      // Shuffle everything up four rows.
      s0123 = s4567;

      src += 4 * src_stride;
      dst += 4 * dst_stride;
      h -= 4;
    } while (h != 0);
  } else {
    // Process the frame in 8-pixel-wide columns, full height each.
    do {
      int height = h;
      const uint8_t *s = src;
      uint8_t *d = dst;

      uint8x8_t s0, s1, s2, s3;
      load_u8_8x4(s, src_stride, &s0, &s1, &s2, &s3);
      s += 4 * src_stride;

      // This operation combines a conventional transpose and the sample permute
      // required before computing the dot product.
      uint8x16_t s0123_lo, s0123_hi;
      transpose_concat_elems_u8_8x4(s0, s1, s2, s3, &s0123_lo, &s0123_hi);

      do {
        uint8x8_t s4, s5, s6, s7;
        load_u8_8x4(s, src_stride, &s4, &s5, &s6, &s7);

        uint8x16_t s4567_lo, s4567_hi;
        transpose_concat_elems_u8_8x4(s4, s5, s6, s7, &s4567_lo, &s4567_hi);

        // Merge new data into block from previous iteration.
        samples_LUT.val[0] = s0123_lo;
        samples_LUT.val[1] = s4567_lo;
        uint8x16_t s1234_lo = vqtbl2q_u8(samples_LUT, merge_block_tbl.val[0]);
        uint8x16_t s2345_lo = vqtbl2q_u8(samples_LUT, merge_block_tbl.val[1]);
        uint8x16_t s3456_lo = vqtbl2q_u8(samples_LUT, merge_block_tbl.val[2]);

        samples_LUT.val[0] = s0123_hi;
        samples_LUT.val[1] = s4567_hi;
        uint8x16_t s1234_hi = vqtbl2q_u8(samples_LUT, merge_block_tbl.val[0]);
        uint8x16_t s2345_hi = vqtbl2q_u8(samples_LUT, merge_block_tbl.val[1]);
        uint8x16_t s3456_hi = vqtbl2q_u8(samples_LUT, merge_block_tbl.val[2]);

        uint8x8_t d0 = convolve4_8_v(s0123_lo, s0123_hi, filter);
        uint8x8_t d1 = convolve4_8_v(s1234_lo, s1234_hi, filter);
        uint8x8_t d2 = convolve4_8_v(s2345_lo, s2345_hi, filter);
        uint8x8_t d3 = convolve4_8_v(s3456_lo, s3456_hi, filter);

        store_u8_8x4(d, dst_stride, d0, d1, d2, d3);

        // Prepare block for next iteration - re-using as much as possible.
        // Shuffle everything up four rows.
        s0123_lo = s4567_lo;
        s0123_hi = s4567_hi;

        s += 4 * src_stride;
        d += 4 * dst_stride;
        height -= 4;
      } while (height != 0);
      src += 8;
      dst += 8;
      w -= 8;
    } while (w != 0);
  }
}
    591 
    592 void aom_convolve8_vert_neon_i8mm(const uint8_t *src, ptrdiff_t src_stride,
    593                                  uint8_t *dst, ptrdiff_t dst_stride,
    594                                  const int16_t *filter_x, int x_step_q4,
    595                                  const int16_t *filter_y, int y_step_q4, int w,
    596                                  int h) {
    597  assert((intptr_t)dst % 4 == 0);
    598  assert(dst_stride % 4 == 0);
    599 
    600  (void)filter_x;
    601  (void)x_step_q4;
    602  (void)y_step_q4;
    603 
    604  src -= ((SUBPEL_TAPS / 2) - 1) * src_stride;
    605 
    606  int filter_taps = get_filter_taps_convolve8(filter_y);
    607 
    608  if (filter_taps == 2) {
    609    convolve8_vert_2tap_neon(src + 3 * src_stride, src_stride, dst, dst_stride,
    610                             filter_y, w, h);
    611  } else if (filter_taps == 4) {
    612    convolve8_vert_4tap_neon_i8mm(src + 2 * src_stride, src_stride, dst,
    613                                  dst_stride, filter_y, w, h);
    614  } else {
    615    convolve8_vert_8tap_neon_i8mm(src, src_stride, dst, dst_stride, filter_y, w,
    616                                  h);
    617  }
    618 }