tor-browser

The Tor Browser
git clone https://git.dasho.dev/tor-browser.git
Log | Files | Refs | README | LICENSE

convolve_neon_dotprod.c (68967B)


      1 /*
      2 * Copyright (c) 2023, Alliance for Open Media. All rights reserved.
      3 *
      4 * This source code is subject to the terms of the BSD 2 Clause License and
      5 * the Alliance for Open Media Patent License 1.0. If the BSD 2 Clause License
      6 * was not distributed with this source code in the LICENSE file, you can
      7 * obtain it at www.aomedia.org/license/software. If the Alliance for Open
      8 * Media Patent License 1.0 was not distributed with this source code in the
      9 * PATENTS file, you can obtain it at www.aomedia.org/license/patent.
     10 */
     11 
#include <arm_neon.h>
#include <assert.h>

#include "config/aom_config.h"
#include "config/av1_rtcd.h"

#include "aom_dsp/aom_dsp_common.h"
#include "aom_dsp/arm/mem_neon.h"
#include "aom_dsp/arm/transpose_neon.h"
#include "aom_ports/mem.h"
#include "av1/common/arm/convolve_neon.h"
#include "av1/common/convolve.h"
#include "av1/common/filter.h"
     24 
// TBL permutation indices that expand 16 consecutive bytes into the three
// vectors of overlapping 4-byte windows consumed by the horizontal dot
// product kernels below (see the per-kernel comments for the exact layouts).
DECLARE_ALIGNED(16, static const uint8_t, kDotProdPermuteTbl[48]) = {
  0, 1, 2,  3,  1, 2,  3,  4,  2,  3,  4,  5,  3,  4,  5,  6,
  4, 5, 6,  7,  5, 6,  7,  8,  6,  7,  8,  9,  7,  8,  9,  10,
  8, 9, 10, 11, 9, 10, 11, 12, 10, 11, 12, 13, 11, 12, 13, 14
};
     30 
// TBL indices used with vqtbl2q_s8 in the vertical kernels: indices 0-15
// select from the previous iteration's transposed block, 16-31 from the
// newly loaded block, merging the two without re-transposing.
DECLARE_ALIGNED(16, static const uint8_t, kDotProdMergeBlockTbl[48]) = {
  // Shift left and insert new last column in transposed 4x4 block.
  1, 2, 3, 16, 5, 6, 7, 20, 9, 10, 11, 24, 13, 14, 15, 28,
  // Shift left and insert two new columns in transposed 4x4 block.
  2, 3, 16, 17, 6, 7, 20, 21, 10, 11, 24, 25, 14, 15, 28, 29,
  // Shift left and insert three new columns in transposed 4x4 block.
  3, 16, 17, 18, 7, 20, 21, 22, 11, 24, 25, 26, 15, 28, 29, 30
};
     39 
     40 static inline int16x4_t convolve12_4_x(uint8x16_t samples,
     41                                       const int8x16_t filter,
     42                                       const uint8x16x3_t permute_tbl) {
     43  // Transform sample range to [-128, 127] for 8-bit signed dot product.
     44  int8x16_t samples_128 =
     45      vreinterpretq_s8_u8(vsubq_u8(samples, vdupq_n_u8(128)));
     46 
     47  // Permute samples ready for dot product.
     48  // { 0,  1,  2,  3,  1,  2,  3,  4,  2,  3,  4,  5,  3,  4,  5,  6 }
     49  // { 4,  5,  6,  7,  5,  6,  7,  8,  6,  7,  8,  9,  7,  8,  9, 10 }
     50  // { 8,  9, 10, 11,  9, 10, 11, 12, 10, 11, 12, 13, 11, 12, 13, 14 }
     51  int8x16_t perm_samples[3] = { vqtbl1q_s8(samples_128, permute_tbl.val[0]),
     52                                vqtbl1q_s8(samples_128, permute_tbl.val[1]),
     53                                vqtbl1q_s8(samples_128, permute_tbl.val[2]) };
     54 
     55  // Dot product constants:
     56  // Accumulate into 128 << FILTER_BITS to account for range transform.
     57  // Adding a shim of 1 << (ROUND0_BITS - 1) enables us to use a single rounding
     58  // right shift by FILTER_BITS - instead of a first rounding right shift by
     59  // ROUND0_BITS, followed by second rounding right shift by FILTER_BITS -
     60  // ROUND0_BITS.
     61  int32x4_t acc =
     62      vdupq_n_s32((128 << FILTER_BITS) + (1 << ((ROUND0_BITS - 1))));
     63 
     64  int32x4_t sum = vdotq_laneq_s32(acc, perm_samples[0], filter, 0);
     65  sum = vdotq_laneq_s32(sum, perm_samples[1], filter, 1);
     66  sum = vdotq_laneq_s32(sum, perm_samples[2], filter, 2);
     67 
     68  return vshrn_n_s32(sum, 1);
     69 }
     70 
     71 static inline uint8x8_t convolve12_8_x(uint8x16_t samples[2],
     72                                       const int8x16_t filter,
     73                                       const uint8x16x3_t permute_tbl) {
     74  // Transform sample range to [-128, 127] for 8-bit signed dot product.
     75  int8x16_t samples_128[2] = {
     76    vreinterpretq_s8_u8(vsubq_u8(samples[0], vdupq_n_u8(128))),
     77    vreinterpretq_s8_u8(vsubq_u8(samples[1], vdupq_n_u8(128)))
     78  };
     79 
     80  // Permute samples ready for dot product.
     81  // { 0,  1,  2,  3,  1,  2,  3,  4,  2,  3,  4,  5,  3,  4,  5,  6 }
     82  // { 4,  5,  6,  7,  5,  6,  7,  8,  6,  7,  8,  9,  7,  8,  9, 10 }
     83  // { 8,  9, 10, 11,  9, 10, 11, 12, 10, 11, 12, 13, 11, 12, 13, 14 }
     84  // {12, 13, 14, 15, 13, 14, 15, 16, 14, 15, 16, 17, 15, 16, 17, 18 }
     85  int8x16_t perm_samples[4] = { vqtbl1q_s8(samples_128[0], permute_tbl.val[0]),
     86                                vqtbl1q_s8(samples_128[0], permute_tbl.val[1]),
     87                                vqtbl1q_s8(samples_128[0], permute_tbl.val[2]),
     88                                vqtbl1q_s8(samples_128[1],
     89                                           permute_tbl.val[2]) };
     90 
     91  // Dot product constants:
     92  // Accumulate into 128 << FILTER_BITS to account for range transform.
     93  // Adding a shim of 1 << (ROUND0_BITS - 1) enables us to use a single rounding
     94  // right shift by FILTER_BITS - instead of a first rounding right shift by
     95  // ROUND0_BITS, followed by second rounding right shift by FILTER_BITS -
     96  // ROUND0_BITS.
     97  int32x4_t acc =
     98      vdupq_n_s32((128 << FILTER_BITS) + (1 << ((ROUND0_BITS - 1))));
     99 
    100  int32x4_t sum0123 = vdotq_laneq_s32(acc, perm_samples[0], filter, 0);
    101  sum0123 = vdotq_laneq_s32(sum0123, perm_samples[1], filter, 1);
    102  sum0123 = vdotq_laneq_s32(sum0123, perm_samples[2], filter, 2);
    103 
    104  int32x4_t sum4567 = vdotq_laneq_s32(acc, perm_samples[1], filter, 0);
    105  sum4567 = vdotq_laneq_s32(sum4567, perm_samples[2], filter, 1);
    106  sum4567 = vdotq_laneq_s32(sum4567, perm_samples[3], filter, 2);
    107 
    108  // Narrow and re-pack.
    109  int16x8_t sum_s16 =
    110      vcombine_s16(vshrn_n_s32(sum0123, 1), vshrn_n_s32(sum4567, 1));
    111  return vqrshrun_n_s16(sum_s16, FILTER_BITS - 1);
    112 }
    113 
// 12-tap horizontal single-reference convolution for 8-bit pixels.
// Processes the image in strips of 4 rows: widths <= 4 use the 4-pixel
// kernel, wider blocks loop over 8-pixel columns. Both loops decrement by 4
// rows (and the wide loop by 8 columns) until zero, so h must be a multiple
// of 4 and, for w > 4, w a multiple of 8.
static inline void convolve_x_sr_12tap_neon_dotprod(
    const uint8_t *src, int src_stride, uint8_t *dst, int dst_stride, int w,
    int h, const int16_t *x_filter_ptr) {
  // The no-op filter should never be used here.
  assert(x_filter_ptr[5] != 128);

  // Narrow the 12 16-bit taps to 8 bits, zero-padding taps 12-15 so the
  // whole kernel fits in a single int8x16_t.
  const int16x8_t filter_0_7 = vld1q_s16(x_filter_ptr);
  const int16x4_t filter_8_11 = vld1_s16(x_filter_ptr + 8);
  const int16x8_t filter_8_15 = vcombine_s16(filter_8_11, vdup_n_s16(0));
  const int8x16_t filter =
      vcombine_s8(vmovn_s16(filter_0_7), vmovn_s16(filter_8_15));

  const uint8x16x3_t permute_tbl = vld1q_u8_x3(kDotProdPermuteTbl);

  if (w <= 4) {
    do {
      uint8x16_t s0, s1, s2, s3;
      load_u8_16x4(src, src_stride, &s0, &s1, &s2, &s3);

      int16x4_t d0 = convolve12_4_x(s0, filter, permute_tbl);
      int16x4_t d1 = convolve12_4_x(s1, filter, permute_tbl);
      int16x4_t d2 = convolve12_4_x(s2, filter, permute_tbl);
      int16x4_t d3 = convolve12_4_x(s3, filter, permute_tbl);

      uint8x8_t d01 = vqrshrun_n_s16(vcombine_s16(d0, d1), FILTER_BITS - 1);
      uint8x8_t d23 = vqrshrun_n_s16(vcombine_s16(d2, d3), FILTER_BITS - 1);

      // Each 8-byte result holds two 4-pixel rows; store them strided.
      store_u8x4_strided_x2(dst + 0 * dst_stride, dst_stride, d01);
      store_u8x4_strided_x2(dst + 2 * dst_stride, dst_stride, d23);

      dst += 4 * dst_stride;
      src += 4 * src_stride;
      h -= 4;
    } while (h != 0);
  } else {
    do {
      const uint8_t *s = src;
      uint8_t *d = dst;
      int width = w;

      do {
        // Each 8-wide output needs 19 source pixels; load two overlapping
        // 16-byte vectors per row (offset by 4 bytes).
        uint8x16_t s0[2], s1[2], s2[2], s3[2];
        load_u8_16x4(s, src_stride, &s0[0], &s1[0], &s2[0], &s3[0]);
        load_u8_16x4(s + 4, src_stride, &s0[1], &s1[1], &s2[1], &s3[1]);

        uint8x8_t d0 = convolve12_8_x(s0, filter, permute_tbl);
        uint8x8_t d1 = convolve12_8_x(s1, filter, permute_tbl);
        uint8x8_t d2 = convolve12_8_x(s2, filter, permute_tbl);
        uint8x8_t d3 = convolve12_8_x(s3, filter, permute_tbl);

        store_u8_8x4(d + 0 * dst_stride, dst_stride, d0, d1, d2, d3);

        s += 8;
        d += 8;
        width -= 8;
      } while (width != 0);
      src += 4 * src_stride;
      dst += 4 * dst_stride;
      h -= 4;
    } while (h != 0);
  }
}
    176 
    177 static inline int16x4_t convolve4_4_x(const uint8x16_t samples,
    178                                      const int8x8_t filters,
    179                                      const uint8x16_t permute_tbl) {
    180  // Transform sample range to [-128, 127] for 8-bit signed dot product.
    181  int8x16_t samples_128 =
    182      vreinterpretq_s8_u8(vsubq_u8(samples, vdupq_n_u8(128)));
    183 
    184  // Permute samples ready for dot product.
    185  // { 0,  1,  2,  3,  1,  2,  3,  4,  2,  3,  4,  5,  3,  4,  5,  6 }
    186  int8x16_t perm_samples = vqtbl1q_s8(samples_128, permute_tbl);
    187 
    188  // Dot product constants:
    189  // Accumulate into 128 << FILTER_BITS to account for range transform.
    190  // Adding a shim of 1 << (ROUND0_BITS - 1) enables us to use a single rounding
    191  // right shift by FILTER_BITS - instead of a first rounding right shift by
    192  // ROUND0_BITS, followed by second rounding right shift by FILTER_BITS -
    193  // ROUND0_BITS. Halve the total because we halved the filter values.
    194  int32x4_t acc =
    195      vdupq_n_s32(((128 << FILTER_BITS) + (1 << ((ROUND0_BITS - 1)))) / 2);
    196  int32x4_t sum = vdotq_lane_s32(acc, perm_samples, filters, 0);
    197 
    198  // Further narrowing and packing is performed by the caller.
    199  return vmovn_s32(sum);
    200 }
    201 
    202 static inline uint8x8_t convolve4_8_x(const uint8x16_t samples,
    203                                      const int8x8_t filters,
    204                                      const uint8x16x2_t permute_tbl) {
    205  // Transform sample range to [-128, 127] for 8-bit signed dot product.
    206  int8x16_t samples_128 =
    207      vreinterpretq_s8_u8(vsubq_u8(samples, vdupq_n_u8(128)));
    208 
    209  // Permute samples ready for dot product.
    210  // { 0,  1,  2,  3,  1,  2,  3,  4,  2,  3,  4,  5,  3,  4,  5,  6 }
    211  // { 4,  5,  6,  7,  5,  6,  7,  8,  6,  7,  8,  9,  7,  8,  9, 10 }
    212  int8x16_t perm_samples[2] = { vqtbl1q_s8(samples_128, permute_tbl.val[0]),
    213                                vqtbl1q_s8(samples_128, permute_tbl.val[1]) };
    214 
    215  // Dot product constants:
    216  // Accumulate into 128 << FILTER_BITS to account for range transform.
    217  // Adding a shim of 1 << (ROUND0_BITS - 1) enables us to use a single rounding
    218  // right shift by FILTER_BITS - instead of a first rounding right shift by
    219  // ROUND0_BITS, followed by second rounding right shift by FILTER_BITS -
    220  // ROUND0_BITS. Halve the total because we halved the filter values.
    221  int32x4_t acc =
    222      vdupq_n_s32(((128 << FILTER_BITS) + (1 << ((ROUND0_BITS - 1)))) / 2);
    223 
    224  int32x4_t sum0123 = vdotq_lane_s32(acc, perm_samples[0], filters, 0);
    225  int32x4_t sum4567 = vdotq_lane_s32(acc, perm_samples[1], filters, 0);
    226 
    227  // Narrow and re-pack.
    228  int16x8_t sum = vcombine_s16(vmovn_s32(sum0123), vmovn_s32(sum4567));
    229  // We halved the filter values so -1 from right shift.
    230  return vqrshrun_n_s16(sum, FILTER_BITS - 1);
    231 }
    232 
// 4-tap (and bilinear) horizontal single-reference convolution for 8-bit
// pixels, in strips of 4 rows. The loops decrement by 4 rows / 8 columns
// until zero, so height must be a multiple of 4 and, for width > 4, width a
// multiple of 8.
static inline void convolve_x_sr_4tap_neon_dotprod(
    const uint8_t *src, ptrdiff_t src_stride, uint8_t *dst,
    ptrdiff_t dst_stride, int width, int height, const int16_t *filter_x) {
  // The 4 active taps live in the middle of the 8-tap array, hence the +2.
  const int16x4_t x_filter = vld1_s16(filter_x + 2);
  // All 4-tap and bilinear filter values are even, so halve them to reduce
  // intermediate precision requirements.
  const int8x8_t filter = vshrn_n_s16(vcombine_s16(x_filter, vdup_n_s16(0)), 1);

  if (width == 4) {
    // Only the first 16-byte permutation is needed for 4 output pixels.
    const uint8x16_t permute_tbl = vld1q_u8(kDotProdPermuteTbl);

    do {
      uint8x16_t s0, s1, s2, s3;
      load_u8_16x4(src, src_stride, &s0, &s1, &s2, &s3);

      int16x4_t t0 = convolve4_4_x(s0, filter, permute_tbl);
      int16x4_t t1 = convolve4_4_x(s1, filter, permute_tbl);
      int16x4_t t2 = convolve4_4_x(s2, filter, permute_tbl);
      int16x4_t t3 = convolve4_4_x(s3, filter, permute_tbl);
      // We halved the filter values so -1 from right shift.
      uint8x8_t d01 = vqrshrun_n_s16(vcombine_s16(t0, t1), FILTER_BITS - 1);
      uint8x8_t d23 = vqrshrun_n_s16(vcombine_s16(t2, t3), FILTER_BITS - 1);

      // Each 8-byte result holds two 4-pixel rows; store them strided.
      store_u8x4_strided_x2(dst + 0 * dst_stride, dst_stride, d01);
      store_u8x4_strided_x2(dst + 2 * dst_stride, dst_stride, d23);

      src += 4 * src_stride;
      dst += 4 * dst_stride;
      height -= 4;
    } while (height != 0);
  } else {
    const uint8x16x2_t permute_tbl = vld1q_u8_x2(kDotProdPermuteTbl);

    do {
      const uint8_t *s = src;
      uint8_t *d = dst;
      int w = width;

      do {
        uint8x16_t s0, s1, s2, s3;
        load_u8_16x4(s, src_stride, &s0, &s1, &s2, &s3);

        uint8x8_t d0 = convolve4_8_x(s0, filter, permute_tbl);
        uint8x8_t d1 = convolve4_8_x(s1, filter, permute_tbl);
        uint8x8_t d2 = convolve4_8_x(s2, filter, permute_tbl);
        uint8x8_t d3 = convolve4_8_x(s3, filter, permute_tbl);

        store_u8_8x4(d, dst_stride, d0, d1, d2, d3);

        s += 8;
        d += 8;
        w -= 8;
      } while (w != 0);
      src += 4 * src_stride;
      dst += 4 * dst_stride;
      height -= 4;
    } while (height != 0);
  }
}
    292 
// Horizontal 8-tap convolution of 8 output pixels using signed 8-bit dot
// products. The filter taps were pre-halved by the caller.
static inline uint8x8_t convolve8_8_x(uint8x16_t samples, const int8x8_t filter,
                                     const uint8x16x3_t permute_tbl) {
  // Transform sample range to [-128, 127] for 8-bit signed dot product.
  int8x16_t samples_128 =
      vreinterpretq_s8_u8(vsubq_u8(samples, vdupq_n_u8(128)));

  // Permute samples ready for dot product.
  // { 0,  1,  2,  3,  1,  2,  3,  4,  2,  3,  4,  5,  3,  4,  5,  6 }
  // { 4,  5,  6,  7,  5,  6,  7,  8,  6,  7,  8,  9,  7,  8,  9, 10 }
  // { 8,  9, 10, 11,  9, 10, 11, 12, 10, 11, 12, 13, 11, 12, 13, 14 }
  int8x16_t perm_samples[3] = { vqtbl1q_s8(samples_128, permute_tbl.val[0]),
                                vqtbl1q_s8(samples_128, permute_tbl.val[1]),
                                vqtbl1q_s8(samples_128, permute_tbl.val[2]) };

  // Dot product constants:
  // Accumulate into 128 << FILTER_BITS to account for range transform.
  // Adding a shim of 1 << (ROUND0_BITS - 1) enables us to use a single rounding
  // right shift by FILTER_BITS - instead of a first rounding right shift by
  // ROUND0_BITS, followed by second rounding right shift by FILTER_BITS -
  // ROUND0_BITS. Halve the total because we halved the filter values.
  int32x4_t acc =
      vdupq_n_s32(((128 << FILTER_BITS) + (1 << ((ROUND0_BITS - 1)))) / 2);

  // Pixels 0-3: taps 0-3 against perm[0], taps 4-7 against perm[1].
  int32x4_t sum0123 = vdotq_lane_s32(acc, perm_samples[0], filter, 0);
  sum0123 = vdotq_lane_s32(sum0123, perm_samples[1], filter, 1);

  // Pixels 4-7: same filter lanes against the shifted-by-4 permutations.
  int32x4_t sum4567 = vdotq_lane_s32(acc, perm_samples[1], filter, 0);
  sum4567 = vdotq_lane_s32(sum4567, perm_samples[2], filter, 1);

  // Narrow and re-pack.
  int16x8_t sum_s16 = vcombine_s16(vmovn_s32(sum0123), vmovn_s32(sum4567));
  // We halved the convolution filter values so - 1 from the right shift.
  return vqrshrun_n_s16(sum_s16, FILTER_BITS - 1);
}
    327 
// Entry point for horizontal single-reference convolution (NEON + dotprod).
// Dispatches to the C fallback for 2-wide/2-tall blocks and to specialised
// 12-tap / <=4-tap kernels; the main body below handles the 8-tap case for
// widths that are multiples of 8 and heights that are multiples of 4.
void av1_convolve_x_sr_neon_dotprod(const uint8_t *src, int src_stride,
                                   uint8_t *dst, int dst_stride, int w, int h,
                                   const InterpFilterParams *filter_params_x,
                                   const int subpel_x_qn,
                                   ConvolveParams *conv_params) {
  if (w == 2 || h == 2) {
    av1_convolve_x_sr_c(src, src_stride, dst, dst_stride, w, h, filter_params_x,
                        subpel_x_qn, conv_params);
    return;
  }

  // Centre the source window on the filter.
  const uint8_t horiz_offset = filter_params_x->taps / 2 - 1;
  src -= horiz_offset;

  const int16_t *x_filter_ptr = av1_get_interp_filter_subpel_kernel(
      filter_params_x, subpel_x_qn & SUBPEL_MASK);

  int filter_taps = get_filter_tap(filter_params_x, subpel_x_qn & SUBPEL_MASK);

  if (filter_taps > 8) {
    convolve_x_sr_12tap_neon_dotprod(src, src_stride, dst, dst_stride, w, h,
                                     x_filter_ptr);
    return;
  }

  if (filter_taps <= 4) {
    // +2 re-centres the source for the 4 active taps, which the 4-tap kernel
    // reads from the middle of the 8-tap array (x_filter_ptr + 2).
    convolve_x_sr_4tap_neon_dotprod(src + 2, src_stride, dst, dst_stride, w, h,
                                    x_filter_ptr);
    return;
  }

  const int16x8_t x_filter_s16 = vld1q_s16(x_filter_ptr);

  const uint8x16x3_t permute_tbl = vld1q_u8_x3(kDotProdPermuteTbl);
  // Filter values are even, so halve to reduce intermediate precision reqs.
  const int8x8_t x_filter = vshrn_n_s16(x_filter_s16, 1);

  do {
    int width = w;
    const uint8_t *s = src;
    uint8_t *d = dst;

    do {
      uint8x16_t s0, s1, s2, s3;
      load_u8_16x4(s, src_stride, &s0, &s1, &s2, &s3);

      uint8x8_t d0 = convolve8_8_x(s0, x_filter, permute_tbl);
      uint8x8_t d1 = convolve8_8_x(s1, x_filter, permute_tbl);
      uint8x8_t d2 = convolve8_8_x(s2, x_filter, permute_tbl);
      uint8x8_t d3 = convolve8_8_x(s3, x_filter, permute_tbl);

      store_u8_8x4(d, dst_stride, d0, d1, d2, d3);

      s += 8;
      d += 8;
      width -= 8;
    } while (width != 0);
    src += 4 * src_stride;
    dst += 4 * dst_stride;
    h -= 4;
  } while (h != 0);
}
    390 
    391 static inline int16x4_t convolve12_4_y(const int8x16_t s0, const int8x16_t s1,
    392                                       const int8x16_t s2,
    393                                       const int8x8_t filters_0_7,
    394                                       const int8x8_t filters_4_11) {
    395  // The sample range transform and permutation are performed by the caller.
    396  // Accumulate into 128 << FILTER_BITS to account for range transform.
    397  const int32x4_t acc = vdupq_n_s32(128 << FILTER_BITS);
    398  int32x4_t sum = vdotq_lane_s32(acc, s0, filters_0_7, 0);
    399  sum = vdotq_lane_s32(sum, s1, filters_0_7, 1);
    400  sum = vdotq_lane_s32(sum, s2, filters_4_11, 1);
    401 
    402  // Further narrowing and packing is performed by the caller.
    403  return vshrn_n_s32(sum, 1);
    404 }
    405 
    406 static inline uint8x8_t convolve12_8_y(
    407    const int8x16_t s0_lo, const int8x16_t s0_hi, const int8x16_t s1_lo,
    408    const int8x16_t s1_hi, const int8x16_t s2_lo, const int8x16_t s2_hi,
    409    const int8x8_t filters_0_7, const int8x8_t filters_4_11) {
    410  // The sample range transform and permutation are performed by the caller.
    411  // Accumulate into 128 << FILTER_BITS to account for range transform.
    412  const int32x4_t acc = vdupq_n_s32(128 << FILTER_BITS);
    413 
    414  int32x4_t sum0123 = vdotq_lane_s32(acc, s0_lo, filters_0_7, 0);
    415  sum0123 = vdotq_lane_s32(sum0123, s1_lo, filters_0_7, 1);
    416  sum0123 = vdotq_lane_s32(sum0123, s2_lo, filters_4_11, 1);
    417 
    418  int32x4_t sum4567 = vdotq_lane_s32(acc, s0_hi, filters_0_7, 0);
    419  sum4567 = vdotq_lane_s32(sum4567, s1_hi, filters_0_7, 1);
    420  sum4567 = vdotq_lane_s32(sum4567, s2_hi, filters_4_11, 1);
    421 
    422  // Narrow and re-pack.
    423  int16x8_t sum =
    424      vcombine_s16(vshrn_n_s32(sum0123, 1), vshrn_n_s32(sum4567, 1));
    425  return vqrshrun_n_s16(sum, FILTER_BITS - 1);
    426 }
    427 
// 12-tap vertical single-reference convolution for 8-bit pixels. Rows are
// transposed into column-major 4x4 blocks so the vertical filter can be
// applied with dot products; each iteration produces 4 output rows and
// rotates the transposed blocks up by four rows, so only 4 new source rows
// are loaded per iteration. Loops decrement by 4 rows / 8 columns until
// zero, so h must be a multiple of 4 and, for w != 4, w a multiple of 8.
static inline void convolve_y_sr_12tap_neon_dotprod(
    const uint8_t *src_ptr, int src_stride, uint8_t *dst_ptr, int dst_stride,
    int w, int h, const int16_t *y_filter_ptr) {
  // The no-op filter should never be used here.
  assert(y_filter_ptr[5] != 128);

  // Two overlapping 8-tap vectors cover all 12 taps: taps 0-7 and taps 4-11.
  const int8x8_t filter_0_7 = vmovn_s16(vld1q_s16(y_filter_ptr));
  const int8x8_t filter_4_11 = vmovn_s16(vld1q_s16(y_filter_ptr + 4));

  const uint8x16x3_t merge_block_tbl = vld1q_u8_x3(kDotProdMergeBlockTbl);

  if (w == 4) {
    // Prologue: load the first 11 rows needed before any output can be
    // produced.
    uint8x8_t t0, t1, t2, t3, t4, t5, t6, t7, t8, t9, tA;
    load_u8_8x11(src_ptr, src_stride, &t0, &t1, &t2, &t3, &t4, &t5, &t6, &t7,
                 &t8, &t9, &tA);
    src_ptr += 11 * src_stride;

    // Transform sample range to [-128, 127] for 8-bit signed dot product.
    int8x8_t s0 = vreinterpret_s8_u8(vsub_u8(t0, vdup_n_u8(128)));
    int8x8_t s1 = vreinterpret_s8_u8(vsub_u8(t1, vdup_n_u8(128)));
    int8x8_t s2 = vreinterpret_s8_u8(vsub_u8(t2, vdup_n_u8(128)));
    int8x8_t s3 = vreinterpret_s8_u8(vsub_u8(t3, vdup_n_u8(128)));
    int8x8_t s4 = vreinterpret_s8_u8(vsub_u8(t4, vdup_n_u8(128)));
    int8x8_t s5 = vreinterpret_s8_u8(vsub_u8(t5, vdup_n_u8(128)));
    int8x8_t s6 = vreinterpret_s8_u8(vsub_u8(t6, vdup_n_u8(128)));
    int8x8_t s7 = vreinterpret_s8_u8(vsub_u8(t7, vdup_n_u8(128)));
    int8x8_t s8 = vreinterpret_s8_u8(vsub_u8(t8, vdup_n_u8(128)));
    int8x8_t s9 = vreinterpret_s8_u8(vsub_u8(t9, vdup_n_u8(128)));
    int8x8_t sA = vreinterpret_s8_u8(vsub_u8(tA, vdup_n_u8(128)));

    // Transposed 4-row groups (named by the source rows they contain).
    int8x16_t s0123, s1234, s2345, s3456, s4567, s5678, s6789, s789A;
    transpose_concat_elems_s8_4x4(s0, s1, s2, s3, &s0123);
    transpose_concat_elems_s8_4x4(s1, s2, s3, s4, &s1234);
    transpose_concat_elems_s8_4x4(s2, s3, s4, s5, &s2345);
    transpose_concat_elems_s8_4x4(s3, s4, s5, s6, &s3456);
    transpose_concat_elems_s8_4x4(s4, s5, s6, s7, &s4567);
    transpose_concat_elems_s8_4x4(s5, s6, s7, s8, &s5678);
    transpose_concat_elems_s8_4x4(s6, s7, s8, s9, &s6789);
    transpose_concat_elems_s8_4x4(s7, s8, s9, sA, &s789A);

    do {
      uint8x8_t tB, tC, tD, tE;
      load_u8_8x4(src_ptr, src_stride, &tB, &tC, &tD, &tE);

      int8x8_t sB = vreinterpret_s8_u8(vsub_u8(tB, vdup_n_u8(128)));
      int8x8_t sC = vreinterpret_s8_u8(vsub_u8(tC, vdup_n_u8(128)));
      int8x8_t sD = vreinterpret_s8_u8(vsub_u8(tD, vdup_n_u8(128)));
      int8x8_t sE = vreinterpret_s8_u8(vsub_u8(tE, vdup_n_u8(128)));

      int8x16_t s89AB, s9ABC, sABCD, sBCDE;
      transpose_concat_elems_s8_4x4(sB, sC, sD, sE, &sBCDE);

      // Merge new data into block from previous iteration: the table builds
      // the intermediate row groups from s789A and sBCDE without
      // re-transposing.
      int8x16x2_t samples_LUT = { { s789A, sBCDE } };
      s89AB = vqtbl2q_s8(samples_LUT, merge_block_tbl.val[0]);
      s9ABC = vqtbl2q_s8(samples_LUT, merge_block_tbl.val[1]);
      sABCD = vqtbl2q_s8(samples_LUT, merge_block_tbl.val[2]);

      int16x4_t d0 =
          convolve12_4_y(s0123, s4567, s89AB, filter_0_7, filter_4_11);
      int16x4_t d1 =
          convolve12_4_y(s1234, s5678, s9ABC, filter_0_7, filter_4_11);
      int16x4_t d2 =
          convolve12_4_y(s2345, s6789, sABCD, filter_0_7, filter_4_11);
      int16x4_t d3 =
          convolve12_4_y(s3456, s789A, sBCDE, filter_0_7, filter_4_11);
      uint8x8_t d01 = vqrshrun_n_s16(vcombine_s16(d0, d1), FILTER_BITS - 1);
      uint8x8_t d23 = vqrshrun_n_s16(vcombine_s16(d2, d3), FILTER_BITS - 1);

      store_u8x4_strided_x2(dst_ptr + 0 * dst_stride, dst_stride, d01);
      store_u8x4_strided_x2(dst_ptr + 2 * dst_stride, dst_stride, d23);

      // Prepare block for next iteration - re-using as much as possible.
      // Shuffle everything up four rows.
      s0123 = s4567;
      s1234 = s5678;
      s2345 = s6789;
      s3456 = s789A;
      s4567 = s89AB;
      s5678 = s9ABC;
      s6789 = sABCD;
      s789A = sBCDE;

      src_ptr += 4 * src_stride;
      dst_ptr += 4 * dst_stride;
      h -= 4;
    } while (h != 0);
  } else {
    // Wide path: process the image in 8-pixel-wide columns, each running the
    // full height before advancing.
    do {
      int height = h;
      const uint8_t *s = src_ptr;
      uint8_t *d = dst_ptr;

      uint8x8_t t0, t1, t2, t3, t4, t5, t6, t7, t8, t9, tA;
      load_u8_8x11(s, src_stride, &t0, &t1, &t2, &t3, &t4, &t5, &t6, &t7, &t8,
                   &t9, &tA);
      s += 11 * src_stride;

      // Transform sample range to [-128, 127] for 8-bit signed dot product.
      int8x8_t s0 = vreinterpret_s8_u8(vsub_u8(t0, vdup_n_u8(128)));
      int8x8_t s1 = vreinterpret_s8_u8(vsub_u8(t1, vdup_n_u8(128)));
      int8x8_t s2 = vreinterpret_s8_u8(vsub_u8(t2, vdup_n_u8(128)));
      int8x8_t s3 = vreinterpret_s8_u8(vsub_u8(t3, vdup_n_u8(128)));
      int8x8_t s4 = vreinterpret_s8_u8(vsub_u8(t4, vdup_n_u8(128)));
      int8x8_t s5 = vreinterpret_s8_u8(vsub_u8(t5, vdup_n_u8(128)));
      int8x8_t s6 = vreinterpret_s8_u8(vsub_u8(t6, vdup_n_u8(128)));
      int8x8_t s7 = vreinterpret_s8_u8(vsub_u8(t7, vdup_n_u8(128)));
      int8x8_t s8 = vreinterpret_s8_u8(vsub_u8(t8, vdup_n_u8(128)));
      int8x8_t s9 = vreinterpret_s8_u8(vsub_u8(t9, vdup_n_u8(128)));
      int8x8_t sA = vreinterpret_s8_u8(vsub_u8(tA, vdup_n_u8(128)));

      // This operation combines a conventional transpose and the sample
      // permute (see horizontal case) required before computing the dot
      // product.
      int8x16_t s0123_lo, s0123_hi, s1234_lo, s1234_hi, s2345_lo, s2345_hi,
          s3456_lo, s3456_hi, s4567_lo, s4567_hi, s5678_lo, s5678_hi, s6789_lo,
          s6789_hi, s789A_lo, s789A_hi;
      transpose_concat_elems_s8_8x4(s0, s1, s2, s3, &s0123_lo, &s0123_hi);
      transpose_concat_elems_s8_8x4(s1, s2, s3, s4, &s1234_lo, &s1234_hi);
      transpose_concat_elems_s8_8x4(s2, s3, s4, s5, &s2345_lo, &s2345_hi);
      transpose_concat_elems_s8_8x4(s3, s4, s5, s6, &s3456_lo, &s3456_hi);
      transpose_concat_elems_s8_8x4(s4, s5, s6, s7, &s4567_lo, &s4567_hi);
      transpose_concat_elems_s8_8x4(s5, s6, s7, s8, &s5678_lo, &s5678_hi);
      transpose_concat_elems_s8_8x4(s6, s7, s8, s9, &s6789_lo, &s6789_hi);
      transpose_concat_elems_s8_8x4(s7, s8, s9, sA, &s789A_lo, &s789A_hi);

      do {
        uint8x8_t tB, tC, tD, tE;
        load_u8_8x4(s, src_stride, &tB, &tC, &tD, &tE);

        int8x8_t sB = vreinterpret_s8_u8(vsub_u8(tB, vdup_n_u8(128)));
        int8x8_t sC = vreinterpret_s8_u8(vsub_u8(tC, vdup_n_u8(128)));
        int8x8_t sD = vreinterpret_s8_u8(vsub_u8(tD, vdup_n_u8(128)));
        int8x8_t sE = vreinterpret_s8_u8(vsub_u8(tE, vdup_n_u8(128)));

        int8x16_t s89AB_lo, s89AB_hi, s9ABC_lo, s9ABC_hi, sABCD_lo, sABCD_hi,
            sBCDE_lo, sBCDE_hi;
        transpose_concat_elems_s8_8x4(sB, sC, sD, sE, &sBCDE_lo, &sBCDE_hi);

        // Merge new data into block from previous iteration.
        int8x16x2_t samples_LUT_lo = { { s789A_lo, sBCDE_lo } };
        s89AB_lo = vqtbl2q_s8(samples_LUT_lo, merge_block_tbl.val[0]);
        s9ABC_lo = vqtbl2q_s8(samples_LUT_lo, merge_block_tbl.val[1]);
        sABCD_lo = vqtbl2q_s8(samples_LUT_lo, merge_block_tbl.val[2]);

        int8x16x2_t samples_LUT_hi = { { s789A_hi, sBCDE_hi } };
        s89AB_hi = vqtbl2q_s8(samples_LUT_hi, merge_block_tbl.val[0]);
        s9ABC_hi = vqtbl2q_s8(samples_LUT_hi, merge_block_tbl.val[1]);
        sABCD_hi = vqtbl2q_s8(samples_LUT_hi, merge_block_tbl.val[2]);

        uint8x8_t d0 =
            convolve12_8_y(s0123_lo, s0123_hi, s4567_lo, s4567_hi, s89AB_lo,
                           s89AB_hi, filter_0_7, filter_4_11);
        uint8x8_t d1 =
            convolve12_8_y(s1234_lo, s1234_hi, s5678_lo, s5678_hi, s9ABC_lo,
                           s9ABC_hi, filter_0_7, filter_4_11);
        uint8x8_t d2 =
            convolve12_8_y(s2345_lo, s2345_hi, s6789_lo, s6789_hi, sABCD_lo,
                           sABCD_hi, filter_0_7, filter_4_11);
        uint8x8_t d3 =
            convolve12_8_y(s3456_lo, s3456_hi, s789A_lo, s789A_hi, sBCDE_lo,
                           sBCDE_hi, filter_0_7, filter_4_11);

        store_u8_8x4(d, dst_stride, d0, d1, d2, d3);

        // Prepare block for next iteration - re-using as much as possible.
        // Shuffle everything up four rows.
        s0123_lo = s4567_lo;
        s0123_hi = s4567_hi;
        s1234_lo = s5678_lo;
        s1234_hi = s5678_hi;
        s2345_lo = s6789_lo;
        s2345_hi = s6789_hi;
        s3456_lo = s789A_lo;
        s3456_hi = s789A_hi;
        s4567_lo = s89AB_lo;
        s4567_hi = s89AB_hi;
        s5678_lo = s9ABC_lo;
        s5678_hi = s9ABC_hi;
        s6789_lo = sABCD_lo;
        s6789_hi = sABCD_hi;
        s789A_lo = sBCDE_lo;
        s789A_hi = sBCDE_hi;

        s += 4 * src_stride;
        d += 4 * dst_stride;
        height -= 4;
      } while (height != 0);
      src_ptr += 8;
      dst_ptr += 8;
      w -= 8;
    } while (w != 0);
  }
}
    622 
    623 static inline int16x4_t convolve8_4_y(const int8x16_t s0, const int8x16_t s1,
    624                                      const int8x8_t filters) {
    625  // The sample range transform and permutation are performed by the caller.
    626  // Accumulate into 128 << FILTER_BITS to account for range transform.
    627  // (- 1 since we halved the filters.)
    628  const int32x4_t acc = vdupq_n_s32(128 << (FILTER_BITS - 1));
    629  int32x4_t sum = vdotq_lane_s32(acc, s0, filters, 0);
    630  sum = vdotq_lane_s32(sum, s1, filters, 1);
    631 
    632  // Further narrowing and packing is performed by the caller.
    633  return vmovn_s32(sum);
    634 }
    635 
    636 static inline uint8x8_t convolve8_8_y(const int8x16_t s0_lo,
    637                                      const int8x16_t s0_hi,
    638                                      const int8x16_t s1_lo,
    639                                      const int8x16_t s1_hi,
    640                                      const int8x8_t filters) {
    641  // The sample range transform and permutation are performed by the caller.
    642  // Accumulate into 128 << FILTER_BITS to account for range transform.
    643  // (- 1 since we halved the filters.)
    644  const int32x4_t acc = vdupq_n_s32(128 << (FILTER_BITS - 1));
    645 
    646  int32x4_t sum0123 = vdotq_lane_s32(acc, s0_lo, filters, 0);
    647  sum0123 = vdotq_lane_s32(sum0123, s1_lo, filters, 1);
    648 
    649  int32x4_t sum4567 = vdotq_lane_s32(acc, s0_hi, filters, 0);
    650  sum4567 = vdotq_lane_s32(sum4567, s1_hi, filters, 1);
    651 
    652  // Narrow and re-pack.
    653  int16x8_t sum = vcombine_s16(vmovn_s32(sum0123), vmovn_s32(sum4567));
    654  // We halved the filter values so -1 from right shift.
    655  return vqrshrun_n_s16(sum, FILTER_BITS - 1);
    656 }
    657 
// Vertical (y-only) single-reference convolution for 6-tap and 8-tap
// filters, implemented with the signed 8-bit dot product (SDOT) instruction.
// The caller positions src_ptr 3 rows above the first output row so all 8
// taps can be applied. Both paths consume 4 rows per loop iteration, so h
// must be a multiple of 4.
static inline void convolve_y_sr_8tap_neon_dotprod(
    const uint8_t *src_ptr, int src_stride, uint8_t *dst_ptr, int dst_stride,
    int w, int h, const int16_t *y_filter_ptr) {
  // Filter values are even, so halve to reduce intermediate precision reqs.
  const int8x8_t filter = vshrn_n_s16(vld1q_s16(y_filter_ptr), 1);

  // Table used to merge previously transposed sample blocks with newly
  // loaded rows (see kDotProdMergeBlockTbl).
  const uint8x16x3_t merge_block_tbl = vld1q_u8_x3(kDotProdMergeBlockTbl);

  if (w == 4) {
    // Load the seven rows above the first output row.
    uint8x8_t t0, t1, t2, t3, t4, t5, t6;
    load_u8_8x7(src_ptr, src_stride, &t0, &t1, &t2, &t3, &t4, &t5, &t6);
    src_ptr += 7 * src_stride;

    // Transform sample range to [-128, 127] for 8-bit signed dot product.
    int8x8_t s0 = vreinterpret_s8_u8(vsub_u8(t0, vdup_n_u8(128)));
    int8x8_t s1 = vreinterpret_s8_u8(vsub_u8(t1, vdup_n_u8(128)));
    int8x8_t s2 = vreinterpret_s8_u8(vsub_u8(t2, vdup_n_u8(128)));
    int8x8_t s3 = vreinterpret_s8_u8(vsub_u8(t3, vdup_n_u8(128)));
    int8x8_t s4 = vreinterpret_s8_u8(vsub_u8(t4, vdup_n_u8(128)));
    int8x8_t s5 = vreinterpret_s8_u8(vsub_u8(t5, vdup_n_u8(128)));
    int8x8_t s6 = vreinterpret_s8_u8(vsub_u8(t6, vdup_n_u8(128)));

    // This operation combines a conventional transpose and the sample
    // permute (see horizontal case) required before computing the dot
    // product.
    int8x16_t s0123, s1234, s2345, s3456;
    transpose_concat_elems_s8_4x4(s0, s1, s2, s3, &s0123);
    transpose_concat_elems_s8_4x4(s1, s2, s3, s4, &s1234);
    transpose_concat_elems_s8_4x4(s2, s3, s4, s5, &s2345);
    transpose_concat_elems_s8_4x4(s3, s4, s5, s6, &s3456);

    do {
      // Load and range-transform the next four rows.
      uint8x8_t t7, t8, t9, tA;
      load_u8_8x4(src_ptr, src_stride, &t7, &t8, &t9, &tA);

      int8x8_t s7 = vreinterpret_s8_u8(vsub_u8(t7, vdup_n_u8(128)));
      int8x8_t s8 = vreinterpret_s8_u8(vsub_u8(t8, vdup_n_u8(128)));
      int8x8_t s9 = vreinterpret_s8_u8(vsub_u8(t9, vdup_n_u8(128)));
      int8x8_t sA = vreinterpret_s8_u8(vsub_u8(tA, vdup_n_u8(128)));

      int8x16_t s4567, s5678, s6789, s789A;
      transpose_concat_elems_s8_4x4(s7, s8, s9, sA, &s789A);

      // Merge new data into block from previous iteration. The merge table
      // shifts the transposed columns left and inserts columns taken from
      // s789A, avoiding a full re-transpose of the overlapping rows.
      int8x16x2_t samples_LUT = { { s3456, s789A } };
      s4567 = vqtbl2q_s8(samples_LUT, merge_block_tbl.val[0]);
      s5678 = vqtbl2q_s8(samples_LUT, merge_block_tbl.val[1]);
      s6789 = vqtbl2q_s8(samples_LUT, merge_block_tbl.val[2]);

      int16x4_t d0 = convolve8_4_y(s0123, s4567, filter);
      int16x4_t d1 = convolve8_4_y(s1234, s5678, filter);
      int16x4_t d2 = convolve8_4_y(s2345, s6789, filter);
      int16x4_t d3 = convolve8_4_y(s3456, s789A, filter);
      // We halved the filter values so -1 from right shift.
      uint8x8_t d01 = vqrshrun_n_s16(vcombine_s16(d0, d1), FILTER_BITS - 1);
      uint8x8_t d23 = vqrshrun_n_s16(vcombine_s16(d2, d3), FILTER_BITS - 1);

      store_u8x4_strided_x2(dst_ptr + 0 * dst_stride, dst_stride, d01);
      store_u8x4_strided_x2(dst_ptr + 2 * dst_stride, dst_stride, d23);

      // Prepare block for next iteration - re-using as much as possible.
      // Shuffle everything up four rows.
      s0123 = s4567;
      s1234 = s5678;
      s2345 = s6789;
      s3456 = s789A;

      src_ptr += 4 * src_stride;
      dst_ptr += 4 * dst_stride;
      h -= 4;
    } while (h != 0);
  } else {
    // w >= 8: process the block in 8-pixel-wide columns, 4 rows at a time.
    do {
      int height = h;
      const uint8_t *s = src_ptr;
      uint8_t *d = dst_ptr;

      // Load the seven rows above the first output row of this column.
      uint8x8_t t0, t1, t2, t3, t4, t5, t6;
      load_u8_8x7(s, src_stride, &t0, &t1, &t2, &t3, &t4, &t5, &t6);
      s += 7 * src_stride;

      // Transform sample range to [-128, 127] for 8-bit signed dot product.
      int8x8_t s0 = vreinterpret_s8_u8(vsub_u8(t0, vdup_n_u8(128)));
      int8x8_t s1 = vreinterpret_s8_u8(vsub_u8(t1, vdup_n_u8(128)));
      int8x8_t s2 = vreinterpret_s8_u8(vsub_u8(t2, vdup_n_u8(128)));
      int8x8_t s3 = vreinterpret_s8_u8(vsub_u8(t3, vdup_n_u8(128)));
      int8x8_t s4 = vreinterpret_s8_u8(vsub_u8(t4, vdup_n_u8(128)));
      int8x8_t s5 = vreinterpret_s8_u8(vsub_u8(t5, vdup_n_u8(128)));
      int8x8_t s6 = vreinterpret_s8_u8(vsub_u8(t6, vdup_n_u8(128)));

      // This operation combines a conventional transpose and the sample
      // permute (see horizontal case) required before computing the dot
      // product. The _lo/_hi vectors hold the two halves of each transposed
      // 8-wide block.
      int8x16_t s0123_lo, s0123_hi, s1234_lo, s1234_hi, s2345_lo, s2345_hi,
          s3456_lo, s3456_hi;
      transpose_concat_elems_s8_8x4(s0, s1, s2, s3, &s0123_lo, &s0123_hi);
      transpose_concat_elems_s8_8x4(s1, s2, s3, s4, &s1234_lo, &s1234_hi);
      transpose_concat_elems_s8_8x4(s2, s3, s4, s5, &s2345_lo, &s2345_hi);
      transpose_concat_elems_s8_8x4(s3, s4, s5, s6, &s3456_lo, &s3456_hi);

      do {
        // Load and range-transform the next four rows.
        uint8x8_t t7, t8, t9, tA;
        load_u8_8x4(s, src_stride, &t7, &t8, &t9, &tA);

        int8x8_t s7 = vreinterpret_s8_u8(vsub_u8(t7, vdup_n_u8(128)));
        int8x8_t s8 = vreinterpret_s8_u8(vsub_u8(t8, vdup_n_u8(128)));
        int8x8_t s9 = vreinterpret_s8_u8(vsub_u8(t9, vdup_n_u8(128)));
        int8x8_t sA = vreinterpret_s8_u8(vsub_u8(tA, vdup_n_u8(128)));

        int8x16_t s4567_lo, s4567_hi, s5678_lo, s5678_hi, s6789_lo, s6789_hi,
            s789A_lo, s789A_hi;
        transpose_concat_elems_s8_8x4(s7, s8, s9, sA, &s789A_lo, &s789A_hi);

        // Merge new data into block from previous iteration.
        int8x16x2_t samples_LUT_lo = { { s3456_lo, s789A_lo } };
        s4567_lo = vqtbl2q_s8(samples_LUT_lo, merge_block_tbl.val[0]);
        s5678_lo = vqtbl2q_s8(samples_LUT_lo, merge_block_tbl.val[1]);
        s6789_lo = vqtbl2q_s8(samples_LUT_lo, merge_block_tbl.val[2]);

        int8x16x2_t samples_LUT_hi = { { s3456_hi, s789A_hi } };
        s4567_hi = vqtbl2q_s8(samples_LUT_hi, merge_block_tbl.val[0]);
        s5678_hi = vqtbl2q_s8(samples_LUT_hi, merge_block_tbl.val[1]);
        s6789_hi = vqtbl2q_s8(samples_LUT_hi, merge_block_tbl.val[2]);

        uint8x8_t d0 =
            convolve8_8_y(s0123_lo, s0123_hi, s4567_lo, s4567_hi, filter);
        uint8x8_t d1 =
            convolve8_8_y(s1234_lo, s1234_hi, s5678_lo, s5678_hi, filter);
        uint8x8_t d2 =
            convolve8_8_y(s2345_lo, s2345_hi, s6789_lo, s6789_hi, filter);
        uint8x8_t d3 =
            convolve8_8_y(s3456_lo, s3456_hi, s789A_lo, s789A_hi, filter);

        store_u8_8x4(d, dst_stride, d0, d1, d2, d3);

        // Prepare block for next iteration - re-using as much as possible.
        // Shuffle everything up four rows.
        s0123_lo = s4567_lo;
        s0123_hi = s4567_hi;
        s1234_lo = s5678_lo;
        s1234_hi = s5678_hi;
        s2345_lo = s6789_lo;
        s2345_hi = s6789_hi;
        s3456_lo = s789A_lo;
        s3456_hi = s789A_hi;

        s += 4 * src_stride;
        d += 4 * dst_stride;
        height -= 4;
      } while (height != 0);
      src_ptr += 8;
      dst_ptr += 8;
      w -= 8;
    } while (w != 0);
  }
}
    811 
    812 static inline int16x4_t convolve4_4_y(const int8x16_t s0,
    813                                      const int8x8_t filters) {
    814  // The sample range transform and permutation are performed by the caller.
    815  // Accumulate into 128 << FILTER_BITS to account for range transform.
    816  // (- 1 since we halved the filters.)
    817  const int32x4_t acc = vdupq_n_s32(128 << (FILTER_BITS - 1));
    818 
    819  int32x4_t sum = vdotq_lane_s32(acc, s0, filters, 0);
    820 
    821  // Further narrowing and packing is performed by the caller.
    822  return vmovn_s32(sum);
    823 }
    824 
    825 static inline uint8x8_t convolve4_8_y(const int8x16_t s0, const int8x16_t s1,
    826                                      const int8x8_t filters) {
    827  // The sample range transform and permutation are performed by the caller.
    828  // Accumulate into 128 << FILTER_BITS to account for range transform.
    829  // (- 1 since we halved the filters.)
    830  const int32x4_t acc = vdupq_n_s32(128 << (FILTER_BITS - 1));
    831 
    832  int32x4_t sum0123 = vdotq_lane_s32(acc, s0, filters, 0);
    833  int32x4_t sum4567 = vdotq_lane_s32(acc, s1, filters, 0);
    834 
    835  // Narrow and re-pack.
    836  int16x8_t sum = vcombine_s16(vmovn_s32(sum0123), vmovn_s32(sum4567));
    837  // We halved the filter values so -1 from right shift.
    838  return vqrshrun_n_s16(sum, FILTER_BITS - 1);
    839 }
    840 
    841 static inline void convolve_y_sr_4tap_neon_dotprod(
    842    const uint8_t *src_ptr, int src_stride, uint8_t *dst_ptr, int dst_stride,
    843    int w, int h, const int16_t *y_filter_ptr) {
    844  // Filter values are even, so halve to reduce intermediate precision reqs.
    845  const int16x8_t filter_s16 =
    846      vcombine_s16(vld1_s16(y_filter_ptr + 2), vdup_n_s16(0));
    847  const int8x8_t filter = vshrn_n_s16(filter_s16, 1);
    848  const uint8x16x3_t merge_block_tbl = vld1q_u8_x3(kDotProdMergeBlockTbl);
    849  int8x16x2_t samples_LUT;
    850 
    851  if (w == 4) {
    852    uint8x8_t t0, t1, t2, t3;
    853    load_u8_8x4(src_ptr, src_stride, &t0, &t1, &t2, &t3);
    854    src_ptr += 4 * src_stride;
    855 
    856    // Transform sample range to [-128, 127] for 8-bit signed dot product.
    857    int8x8_t s0 = vreinterpret_s8_u8(vsub_u8(t0, vdup_n_u8(128)));
    858    int8x8_t s1 = vreinterpret_s8_u8(vsub_u8(t1, vdup_n_u8(128)));
    859    int8x8_t s2 = vreinterpret_s8_u8(vsub_u8(t2, vdup_n_u8(128)));
    860    int8x8_t s3 = vreinterpret_s8_u8(vsub_u8(t3, vdup_n_u8(128)));
    861 
    862    // This operation combines a conventional transpose and the sample permute
    863    // required before computing the dot product.
    864    int8x16_t s0123;
    865    transpose_concat_elems_s8_4x4(s0, s1, s2, s3, &s0123);
    866 
    867    do {
    868      uint8x8_t t4, t5, t6, t7;
    869      load_u8_8x4(src_ptr, src_stride, &t4, &t5, &t6, &t7);
    870 
    871      int8x8_t s4 = vreinterpret_s8_u8(vsub_u8(t4, vdup_n_u8(128)));
    872      int8x8_t s5 = vreinterpret_s8_u8(vsub_u8(t5, vdup_n_u8(128)));
    873      int8x8_t s6 = vreinterpret_s8_u8(vsub_u8(t6, vdup_n_u8(128)));
    874      int8x8_t s7 = vreinterpret_s8_u8(vsub_u8(t7, vdup_n_u8(128)));
    875 
    876      int8x16_t s4567;
    877      transpose_concat_elems_s8_4x4(s4, s5, s6, s7, &s4567);
    878 
    879      // Merge new data into block from previous iteration.
    880      samples_LUT.val[0] = s0123;
    881      samples_LUT.val[1] = s4567;
    882      int8x16_t s1234 = vqtbl2q_s8(samples_LUT, merge_block_tbl.val[0]);
    883      int8x16_t s2345 = vqtbl2q_s8(samples_LUT, merge_block_tbl.val[1]);
    884      int8x16_t s3456 = vqtbl2q_s8(samples_LUT, merge_block_tbl.val[2]);
    885 
    886      int16x4_t d0 = convolve4_4_y(s0123, filter);
    887      int16x4_t d1 = convolve4_4_y(s1234, filter);
    888      int16x4_t d2 = convolve4_4_y(s2345, filter);
    889      int16x4_t d3 = convolve4_4_y(s3456, filter);
    890      // We halved the filter values so -1 from right shift.
    891      uint8x8_t d01 = vqrshrun_n_s16(vcombine_s16(d0, d1), FILTER_BITS - 1);
    892      uint8x8_t d23 = vqrshrun_n_s16(vcombine_s16(d2, d3), FILTER_BITS - 1);
    893 
    894      store_u8x4_strided_x2(dst_ptr + 0 * dst_stride, dst_stride, d01);
    895      store_u8x4_strided_x2(dst_ptr + 2 * dst_stride, dst_stride, d23);
    896 
    897      // Prepare block for next iteration - re-using as much as possible.
    898      // Shuffle everything up four rows.
    899      s0123 = s4567;
    900 
    901      src_ptr += 4 * src_stride;
    902      dst_ptr += 4 * dst_stride;
    903      h -= 4;
    904    } while (h != 0);
    905  } else {
    906    do {
    907      int height = h;
    908      const uint8_t *s = src_ptr;
    909      uint8_t *d = dst_ptr;
    910 
    911      uint8x8_t t0, t1, t2, t3;
    912      load_u8_8x4(s, src_stride, &t0, &t1, &t2, &t3);
    913      s += 4 * src_stride;
    914 
    915      // Transform sample range to [-128, 127] for 8-bit signed dot product.
    916      int8x8_t s0 = vreinterpret_s8_u8(vsub_u8(t0, vdup_n_u8(128)));
    917      int8x8_t s1 = vreinterpret_s8_u8(vsub_u8(t1, vdup_n_u8(128)));
    918      int8x8_t s2 = vreinterpret_s8_u8(vsub_u8(t2, vdup_n_u8(128)));
    919      int8x8_t s3 = vreinterpret_s8_u8(vsub_u8(t3, vdup_n_u8(128)));
    920 
    921      // This operation combines a conventional transpose and the sample permute
    922      // required before computing the dot product.
    923      int8x16_t s0123_lo, s0123_hi;
    924      transpose_concat_elems_s8_8x4(s0, s1, s2, s3, &s0123_lo, &s0123_hi);
    925 
    926      do {
    927        uint8x8_t t4, t5, t6, t7;
    928        load_u8_8x4(s, src_stride, &t4, &t5, &t6, &t7);
    929 
    930        int8x8_t s4 = vreinterpret_s8_u8(vsub_u8(t4, vdup_n_u8(128)));
    931        int8x8_t s5 = vreinterpret_s8_u8(vsub_u8(t5, vdup_n_u8(128)));
    932        int8x8_t s6 = vreinterpret_s8_u8(vsub_u8(t6, vdup_n_u8(128)));
    933        int8x8_t s7 = vreinterpret_s8_u8(vsub_u8(t7, vdup_n_u8(128)));
    934 
    935        int8x16_t s4567_lo, s4567_hi;
    936        transpose_concat_elems_s8_8x4(s4, s5, s6, s7, &s4567_lo, &s4567_hi);
    937 
    938        // Merge new data into block from previous iteration.
    939        samples_LUT.val[0] = s0123_lo;
    940        samples_LUT.val[1] = s4567_lo;
    941        int8x16_t s1234_lo = vqtbl2q_s8(samples_LUT, merge_block_tbl.val[0]);
    942        int8x16_t s2345_lo = vqtbl2q_s8(samples_LUT, merge_block_tbl.val[1]);
    943        int8x16_t s3456_lo = vqtbl2q_s8(samples_LUT, merge_block_tbl.val[2]);
    944 
    945        samples_LUT.val[0] = s0123_hi;
    946        samples_LUT.val[1] = s4567_hi;
    947        int8x16_t s1234_hi = vqtbl2q_s8(samples_LUT, merge_block_tbl.val[0]);
    948        int8x16_t s2345_hi = vqtbl2q_s8(samples_LUT, merge_block_tbl.val[1]);
    949        int8x16_t s3456_hi = vqtbl2q_s8(samples_LUT, merge_block_tbl.val[2]);
    950 
    951        uint8x8_t d0 = convolve4_8_y(s0123_lo, s0123_hi, filter);
    952        uint8x8_t d1 = convolve4_8_y(s1234_lo, s1234_hi, filter);
    953        uint8x8_t d2 = convolve4_8_y(s2345_lo, s2345_hi, filter);
    954        uint8x8_t d3 = convolve4_8_y(s3456_lo, s3456_hi, filter);
    955 
    956        store_u8_8x4(d, dst_stride, d0, d1, d2, d3);
    957 
    958        // Prepare block for next iteration - re-using as much as possible.
    959        // Shuffle everything up four rows.
    960        s0123_lo = s4567_lo;
    961        s0123_hi = s4567_hi;
    962 
    963        s += 4 * src_stride;
    964        d += 4 * dst_stride;
    965        height -= 4;
    966      } while (height != 0);
    967      src_ptr += 8;
    968      dst_ptr += 8;
    969      w -= 8;
    970    } while (w != 0);
    971  }
    972 }
    973 
    974 void av1_convolve_y_sr_neon_dotprod(const uint8_t *src, int src_stride,
    975                                    uint8_t *dst, int dst_stride, int w, int h,
    976                                    const InterpFilterParams *filter_params_y,
    977                                    const int subpel_y_qn) {
    978  if (w == 2 || h == 2) {
    979    av1_convolve_y_sr_c(src, src_stride, dst, dst_stride, w, h, filter_params_y,
    980                        subpel_y_qn);
    981    return;
    982  }
    983 
    984  const int y_filter_taps = get_filter_tap(filter_params_y, subpel_y_qn);
    985  const int16_t *y_filter_ptr = av1_get_interp_filter_subpel_kernel(
    986      filter_params_y, subpel_y_qn & SUBPEL_MASK);
    987 
    988  if (y_filter_taps <= 4) {
    989    convolve_y_sr_4tap_neon_dotprod(src - src_stride, src_stride, dst,
    990                                    dst_stride, w, h, y_filter_ptr);
    991  } else if (y_filter_taps == 12) {
    992    convolve_y_sr_12tap_neon_dotprod(src - 5 * src_stride, src_stride, dst,
    993                                     dst_stride, w, h, y_filter_ptr);
    994  } else {
    995    // 6-tap or 8-tap.
    996    convolve_y_sr_8tap_neon_dotprod(src - 3 * src_stride, src_stride, dst,
    997                                    dst_stride, w, h, y_filter_ptr);
    998  }
    999 }
   1000 
   1001 static inline int16x4_t convolve12_4_2d_h(uint8x16_t samples,
   1002                                          const int8x16_t filters,
   1003                                          const int32x4_t horiz_const,
   1004                                          const uint8x16x3_t permute_tbl) {
   1005  // Transform sample range to [-128, 127] for 8-bit signed dot product.
   1006  int8x16_t samples_128 =
   1007      vreinterpretq_s8_u8(vsubq_u8(samples, vdupq_n_u8(128)));
   1008 
   1009  // Permute samples ready for dot product.
   1010  // { 0,  1,  2,  3,  1,  2,  3,  4,  2,  3,  4,  5,  3,  4,  5,  6 }
   1011  // { 4,  5,  6,  7,  5,  6,  7,  8,  6,  7,  8,  9,  7,  8,  9, 10 }
   1012  // { 8,  9, 10, 11,  9, 10, 11, 12, 10, 11, 12, 13, 11, 12, 13, 14 }
   1013  int8x16_t perm_samples[3] = { vqtbl1q_s8(samples_128, permute_tbl.val[0]),
   1014                                vqtbl1q_s8(samples_128, permute_tbl.val[1]),
   1015                                vqtbl1q_s8(samples_128, permute_tbl.val[2]) };
   1016 
   1017  // Accumulate dot product into 'correction' to account for range transform.
   1018  int32x4_t sum = vdotq_laneq_s32(horiz_const, perm_samples[0], filters, 0);
   1019  sum = vdotq_laneq_s32(sum, perm_samples[1], filters, 1);
   1020  sum = vdotq_laneq_s32(sum, perm_samples[2], filters, 2);
   1021 
   1022  // Narrow and re-pack.
   1023  return vshrn_n_s32(sum, ROUND0_BITS);
   1024 }
   1025 
   1026 static inline int16x8_t convolve12_8_2d_h(uint8x16_t samples[2],
   1027                                          const int8x16_t filters,
   1028                                          const int32x4_t correction,
   1029                                          const uint8x16x3_t permute_tbl) {
   1030  // Transform sample range to [-128, 127] for 8-bit signed dot product.
   1031  int8x16_t samples_128[2] = {
   1032    vreinterpretq_s8_u8(vsubq_u8(samples[0], vdupq_n_u8(128))),
   1033    vreinterpretq_s8_u8(vsubq_u8(samples[1], vdupq_n_u8(128)))
   1034  };
   1035 
   1036  // Permute samples ready for dot product.
   1037  // { 0,  1,  2,  3,  1,  2,  3,  4,  2,  3,  4,  5,  3,  4,  5,  6 }
   1038  // { 4,  5,  6,  7,  5,  6,  7,  8,  6,  7,  8,  9,  7,  8,  9, 10 }
   1039  // { 8,  9, 10, 11,  9, 10, 11, 12, 10, 11, 12, 13, 11, 12, 13, 14 }
   1040  // {12, 13, 14, 15, 13, 14, 15, 16, 14, 15, 16, 17, 15, 16, 17, 18 }
   1041  int8x16_t perm_samples[4] = { vqtbl1q_s8(samples_128[0], permute_tbl.val[0]),
   1042                                vqtbl1q_s8(samples_128[0], permute_tbl.val[1]),
   1043                                vqtbl1q_s8(samples_128[0], permute_tbl.val[2]),
   1044                                vqtbl1q_s8(samples_128[1],
   1045                                           permute_tbl.val[2]) };
   1046 
   1047  // Accumulate dot product into 'correction' to account for range transform.
   1048  int32x4_t sum0123 = vdotq_laneq_s32(correction, perm_samples[0], filters, 0);
   1049  sum0123 = vdotq_laneq_s32(sum0123, perm_samples[1], filters, 1);
   1050  sum0123 = vdotq_laneq_s32(sum0123, perm_samples[2], filters, 2);
   1051 
   1052  int32x4_t sum4567 = vdotq_laneq_s32(correction, perm_samples[1], filters, 0);
   1053  sum4567 = vdotq_laneq_s32(sum4567, perm_samples[2], filters, 1);
   1054  sum4567 = vdotq_laneq_s32(sum4567, perm_samples[3], filters, 2);
   1055 
   1056  // Narrow and re-pack.
   1057  return vcombine_s16(vshrn_n_s32(sum0123, ROUND0_BITS),
   1058                      vshrn_n_s32(sum4567, ROUND0_BITS));
   1059 }
   1060 
// Horizontal (first) pass of the 12-tap 2D convolution: filters w x h rows
// of 8-bit source and writes 16-bit intermediate values to dst_ptr for the
// vertical pass to consume.
static inline void convolve_2d_sr_horiz_12tap_neon_dotprod(
    const uint8_t *src_ptr, int src_stride, int16_t *dst_ptr,
    const int dst_stride, int w, int h, const int16x8_t x_filter_0_7,
    const int16x4_t x_filter_8_11) {
  // The no-op filter should never be used here.
  assert(vgetq_lane_s16(x_filter_0_7, 5) != 128);

  const int bd = 8;  // Bit depth.

  // Narrow filter values to 8-bit, padding the 12 taps to 16 lanes with
  // zeroes.
  const int16x8x2_t x_filter_s16 = {
    { x_filter_0_7, vcombine_s16(x_filter_8_11, vdup_n_s16(0)) }
  };
  const int8x16_t x_filter = vcombine_s8(vmovn_s16(x_filter_s16.val[0]),
                                         vmovn_s16(x_filter_s16.val[1]));

  // Adding a shim of 1 << (ROUND0_BITS - 1) enables us to use non-rounding
  // shifts - which are generally faster than rounding shifts on modern CPUs.
  const int32_t horiz_const =
      ((1 << (bd + FILTER_BITS - 1)) + (1 << (ROUND0_BITS - 1)));
  // Dot product constants. The 128 << FILTER_BITS term compensates for the
  // [-128, 127] range transform applied to the samples inside the
  // convolve12_*_2d_h helpers.
  const int32x4_t correction = vdupq_n_s32((128 << FILTER_BITS) + horiz_const);
  const uint8x16x3_t permute_tbl = vld1q_u8_x3(kDotProdPermuteTbl);

  if (w <= 4) {
    // Filter rows four at a time while more than four remain.
    do {
      uint8x16_t s0, s1, s2, s3;
      load_u8_16x4(src_ptr, src_stride, &s0, &s1, &s2, &s3);

      int16x4_t d0 = convolve12_4_2d_h(s0, x_filter, correction, permute_tbl);
      int16x4_t d1 = convolve12_4_2d_h(s1, x_filter, correction, permute_tbl);
      int16x4_t d2 = convolve12_4_2d_h(s2, x_filter, correction, permute_tbl);
      int16x4_t d3 = convolve12_4_2d_h(s3, x_filter, correction, permute_tbl);

      store_s16_4x4(dst_ptr, dst_stride, d0, d1, d2, d3);

      src_ptr += 4 * src_stride;
      dst_ptr += 4 * dst_stride;
      h -= 4;
    } while (h > 4);

    // Filter the remaining 1-4 rows one at a time.
    do {
      uint8x16_t s0 = vld1q_u8(src_ptr);
      int16x4_t d0 = convolve12_4_2d_h(s0, x_filter, correction, permute_tbl);
      vst1_s16(dst_ptr, d0);

      src_ptr += src_stride;
      dst_ptr += dst_stride;
    } while (--h != 0);

  } else {
    // Filter rows four at a time, 8 output pixels per inner iteration, while
    // more than four rows remain.
    do {
      const uint8_t *s = src_ptr;
      int16_t *d = dst_ptr;
      int width = w;

      do {
        // Each row needs two overlapping 16-byte loads to cover the 12-tap
        // footprint of 8 output pixels.
        uint8x16_t s0[2], s1[2], s2[2], s3[2];
        load_u8_16x4(s, src_stride, &s0[0], &s1[0], &s2[0], &s3[0]);
        load_u8_16x4(s + 4, src_stride, &s0[1], &s1[1], &s2[1], &s3[1]);

        int16x8_t d0 = convolve12_8_2d_h(s0, x_filter, correction, permute_tbl);
        int16x8_t d1 = convolve12_8_2d_h(s1, x_filter, correction, permute_tbl);
        int16x8_t d2 = convolve12_8_2d_h(s2, x_filter, correction, permute_tbl);
        int16x8_t d3 = convolve12_8_2d_h(s3, x_filter, correction, permute_tbl);

        store_s16_8x4(d, dst_stride, d0, d1, d2, d3);

        s += 8;
        d += 8;
        width -= 8;
      } while (width != 0);
      src_ptr += 4 * src_stride;
      dst_ptr += 4 * dst_stride;
      h -= 4;
    } while (h > 4);

    // Filter the remaining 1-4 rows one at a time.
    do {
      const uint8_t *s = src_ptr;
      int16_t *d = dst_ptr;
      int width = w;

      do {
        uint8x16_t s0[2];
        s0[0] = vld1q_u8(s);
        s0[1] = vld1q_u8(s + 4);
        int16x8_t d0 = convolve12_8_2d_h(s0, x_filter, correction, permute_tbl);
        vst1q_s16(d, d0);

        s += 8;
        d += 8;
        width -= 8;
      } while (width != 0);
      src_ptr += src_stride;
      dst_ptr += dst_stride;
    } while (--h != 0);
  }
}
   1159 
   1160 static inline int16x4_t convolve4_4_2d_h(const uint8x16_t samples,
   1161                                         const int8x8_t filters,
   1162                                         const uint8x16_t permute_tbl,
   1163                                         const int32x4_t correction) {
   1164  // Transform sample range to [-128, 127] for 8-bit signed dot product.
   1165  int8x16_t samples_128 =
   1166      vreinterpretq_s8_u8(vsubq_u8(samples, vdupq_n_u8(128)));
   1167 
   1168  // Permute samples ready for dot product.
   1169  // { 0,  1,  2,  3,  1,  2,  3,  4,  2,  3,  4,  5,  3,  4,  5,  6 }
   1170  int8x16_t perm_samples = vqtbl1q_s8(samples_128, permute_tbl);
   1171 
   1172  // Accumulate into 'correction' to account for range transform.
   1173  int32x4_t sum = vdotq_lane_s32(correction, perm_samples, filters, 0);
   1174 
   1175  // We halved the convolution filter values so -1 from the right shift.
   1176  return vshrn_n_s32(sum, ROUND0_BITS - 1);
   1177 }
   1178 
   1179 static inline int16x8_t convolve4_8_2d_h(const uint8x16_t samples,
   1180                                         const int8x8_t filters,
   1181                                         const uint8x16x2_t permute_tbl,
   1182                                         const int32x4_t correction) {
   1183  // Transform sample range to [-128, 127] for 8-bit signed dot product.
   1184  int8x16_t samples_128 =
   1185      vreinterpretq_s8_u8(vsubq_u8(samples, vdupq_n_u8(128)));
   1186 
   1187  // Permute samples ready for dot product.
   1188  // { 0,  1,  2,  3,  1,  2,  3,  4,  2,  3,  4,  5,  3,  4,  5,  6 }
   1189  // { 4,  5,  6,  7,  5,  6,  7,  8,  6,  7,  8,  9,  7,  8,  9, 10 }
   1190  int8x16_t perm_samples[2] = { vqtbl1q_s8(samples_128, permute_tbl.val[0]),
   1191                                vqtbl1q_s8(samples_128, permute_tbl.val[1]) };
   1192 
   1193  // Accumulate into 'correction' to account for range transform.
   1194  int32x4_t sum0123 = vdotq_lane_s32(correction, perm_samples[0], filters, 0);
   1195  int32x4_t sum4567 = vdotq_lane_s32(correction, perm_samples[1], filters, 0);
   1196 
   1197  // Narrow and re-pack.
   1198  // We halved the filter values so -1 from right shift.
   1199  return vcombine_s16(vshrn_n_s32(sum0123, ROUND0_BITS - 1),
   1200                      vshrn_n_s32(sum4567, ROUND0_BITS - 1));
   1201 }
   1202 
// Horizontal (first) pass of the 4-tap / bilinear 2D convolution: filters
// w x h rows of 8-bit source and writes 16-bit intermediate values to dst
// for the vertical pass to consume.
static inline void convolve_2d_sr_horiz_4tap_neon_dotprod(
    const uint8_t *src, ptrdiff_t src_stride, int16_t *dst,
    ptrdiff_t dst_stride, int w, int h, const int16_t *filter_x) {
  const int bd = 8;  // Bit depth.
  // The 4 live taps sit in the middle of the 8-tap kernel array.
  const int16x4_t x_filter = vld1_s16(filter_x + 2);
  // All 4-tap and bilinear filter values are even, so halve them to reduce
  // intermediate precision requirements.
  const int8x8_t filter = vshrn_n_s16(vcombine_s16(x_filter, vdup_n_s16(0)), 1);

  // Adding a shim of 1 << (ROUND0_BITS - 1) enables us to use non-rounding
  // shifts - which are generally faster than rounding shifts on modern CPUs.
  const int32_t horiz_const =
      ((1 << (bd + FILTER_BITS - 1)) + (1 << (ROUND0_BITS - 1)));
  // Accumulate into 128 << FILTER_BITS to account for range transform.
  // Halve the total because we halved the filter values.
  const int32x4_t correction =
      vdupq_n_s32(((128 << FILTER_BITS) + horiz_const) / 2);

  if (w == 4) {
    const uint8x16_t permute_tbl = vld1q_u8(kDotProdPermuteTbl);

    // Filter rows four at a time while more than four remain.
    do {
      uint8x16_t s0, s1, s2, s3;
      load_u8_16x4(src, src_stride, &s0, &s1, &s2, &s3);

      int16x4_t d0 = convolve4_4_2d_h(s0, filter, permute_tbl, correction);
      int16x4_t d1 = convolve4_4_2d_h(s1, filter, permute_tbl, correction);
      int16x4_t d2 = convolve4_4_2d_h(s2, filter, permute_tbl, correction);
      int16x4_t d3 = convolve4_4_2d_h(s3, filter, permute_tbl, correction);

      store_s16_4x4(dst, dst_stride, d0, d1, d2, d3);

      src += 4 * src_stride;
      dst += 4 * dst_stride;
      h -= 4;
    } while (h > 4);

    // Filter the remaining 1-4 rows one at a time.
    do {
      uint8x16_t s0 = vld1q_u8(src);
      int16x4_t d0 = convolve4_4_2d_h(s0, filter, permute_tbl, correction);
      vst1_s16(dst, d0);

      src += src_stride;
      dst += dst_stride;
    } while (--h != 0);
  } else {
    const uint8x16x2_t permute_tbl = vld1q_u8_x2(kDotProdPermuteTbl);
    // Filter rows four at a time, 8 output pixels per inner iteration, while
    // more than four rows remain.
    do {
      const uint8_t *s = src;
      int16_t *d = dst;
      int width = w;

      do {
        uint8x16_t s0, s1, s2, s3;
        load_u8_16x4(s, src_stride, &s0, &s1, &s2, &s3);

        int16x8_t d0 = convolve4_8_2d_h(s0, filter, permute_tbl, correction);
        int16x8_t d1 = convolve4_8_2d_h(s1, filter, permute_tbl, correction);
        int16x8_t d2 = convolve4_8_2d_h(s2, filter, permute_tbl, correction);
        int16x8_t d3 = convolve4_8_2d_h(s3, filter, permute_tbl, correction);

        store_s16_8x4(d, dst_stride, d0, d1, d2, d3);

        s += 8;
        d += 8;
        width -= 8;
      } while (width != 0);
      src += 4 * src_stride;
      dst += 4 * dst_stride;
      h -= 4;
    } while (h > 4);

    // Filter the remaining 1-4 rows one at a time.
    do {
      const uint8_t *s = src;
      int16_t *d = dst;
      int width = w;

      do {
        uint8x16_t s0 = vld1q_u8(s);
        int16x8_t d0 = convolve4_8_2d_h(s0, filter, permute_tbl, correction);
        vst1q_s16(d, d0);

        s += 8;
        d += 8;
        width -= 8;
      } while (width != 0);
      src += src_stride;
      dst += dst_stride;
    } while (--h != 0);
  }
}
   1294 
   1295 static inline int16x8_t convolve8_8_2d_h(uint8x16_t samples,
   1296                                         const int8x8_t filters,
   1297                                         const int32x4_t correction,
   1298                                         const uint8x16x3_t permute_tbl) {
   1299  // Transform sample range to [-128, 127] for 8-bit signed dot product.
   1300  int8x16_t samples_128 =
   1301      vreinterpretq_s8_u8(vsubq_u8(samples, vdupq_n_u8(128)));
   1302 
   1303  // Permute samples ready for dot product.
   1304  // { 0,  1,  2,  3,  1,  2,  3,  4,  2,  3,  4,  5,  3,  4,  5,  6 }
   1305  // { 4,  5,  6,  7,  5,  6,  7,  8,  6,  7,  8,  9,  7,  8,  9, 10 }
   1306  // { 8,  9, 10, 11,  9, 10, 11, 12, 10, 11, 12, 13, 11, 12, 13, 14 }
   1307  int8x16_t perm_samples[3] = { vqtbl1q_s8(samples_128, permute_tbl.val[0]),
   1308                                vqtbl1q_s8(samples_128, permute_tbl.val[1]),
   1309                                vqtbl1q_s8(samples_128, permute_tbl.val[2]) };
   1310 
   1311  // Accumulate dot product into 'correction' to account for range transform.
   1312  int32x4_t sum0123 = vdotq_lane_s32(correction, perm_samples[0], filters, 0);
   1313  sum0123 = vdotq_lane_s32(sum0123, perm_samples[1], filters, 1);
   1314 
   1315  int32x4_t sum4567 = vdotq_lane_s32(correction, perm_samples[1], filters, 0);
   1316  sum4567 = vdotq_lane_s32(sum4567, perm_samples[2], filters, 1);
   1317 
   1318  // Narrow and re-pack.
   1319  // We halved the convolution filter values so -1 from the right shift.
   1320  return vcombine_s16(vshrn_n_s32(sum0123, ROUND0_BITS - 1),
   1321                      vshrn_n_s32(sum4567, ROUND0_BITS - 1));
   1322 }
   1323 
// Horizontal pass of the two-pass 2D convolution: applies an 8-tap filter to
// the source block and writes 16-bit intermediate results (still scaled by
// the first-pass precision) into im_block for the later vertical pass.
// The inner loops only terminate when w is a multiple of 8; im_h rows are
// produced in total (4 at a time, then one at a time for the last <= 4).
static inline void convolve_2d_sr_horiz_8tap_neon_dotprod(
    const uint8_t *src, int src_stride, int16_t *im_block, int im_stride, int w,
    int im_h, const int16_t *x_filter_ptr) {
  const int16x8_t x_filter_s16 = vld1q_s16(x_filter_ptr);
  // Filter values are even, so halve to reduce intermediate precision reqs.
  const int8x8_t x_filter = vshrn_n_s16(x_filter_s16, 1);

  const int bd = 8;
  // Adding a shim of 1 << (ROUND0_BITS - 1) enables us to use non-rounding
  // shifts - which are generally faster than rounding shifts on modern CPUs.
  const int32_t horiz_const =
      ((1 << (bd + FILTER_BITS - 1)) + (1 << (ROUND0_BITS - 1)));
  // Halve the total because we halved the filter values.
  // The 128 << FILTER_BITS term undoes the [-128, 127] range transform done
  // inside convolve8_8_2d_h.
  const int32x4_t correction =
      vdupq_n_s32(((128 << FILTER_BITS) + horiz_const) / 2);

  const uint8_t *src_ptr = src;
  int16_t *dst_ptr = im_block;
  int dst_stride = im_stride;
  int height = im_h;

  const uint8x16x3_t permute_tbl = vld1q_u8_x3(kDotProdPermuteTbl);
  // Main loop: process 4 rows x 8 columns per iteration while more than 4
  // rows remain, so the tail loop below always has at least one row.
  do {
    const uint8_t *s = src_ptr;
    int16_t *d = dst_ptr;
    int width = w;

    do {
      uint8x16_t s0, s1, s2, s3;
      load_u8_16x4(s, src_stride, &s0, &s1, &s2, &s3);

      int16x8_t d0 = convolve8_8_2d_h(s0, x_filter, correction, permute_tbl);
      int16x8_t d1 = convolve8_8_2d_h(s1, x_filter, correction, permute_tbl);
      int16x8_t d2 = convolve8_8_2d_h(s2, x_filter, correction, permute_tbl);
      int16x8_t d3 = convolve8_8_2d_h(s3, x_filter, correction, permute_tbl);

      store_s16_8x4(d, dst_stride, d0, d1, d2, d3);

      s += 8;
      d += 8;
      width -= 8;
    } while (width != 0);
    src_ptr += 4 * src_stride;
    dst_ptr += 4 * dst_stride;
    height -= 4;
  } while (height > 4);

  // Tail loop: the remaining 1-4 rows, one row at a time.
  do {
    const uint8_t *s = src_ptr;
    int16_t *d = dst_ptr;
    int width = w;

    do {
      uint8x16_t s0 = vld1q_u8(s);
      int16x8_t d0 = convolve8_8_2d_h(s0, x_filter, correction, permute_tbl);
      vst1q_s16(d, d0);

      s += 8;
      d += 8;
      width -= 8;
    } while (width != 0);
    src_ptr += src_stride;
    dst_ptr += dst_stride;
  } while (--height != 0);
}
   1389 
// Fused single-pass 2D convolution: horizontal filtering via the 8-tap SDOT
// kernel feeding a 6-tap vertical filter, processed in 8-pixel-wide column
// strips. A 6-row window of horizontal results is kept in registers so no
// intermediate buffer is needed. The loops only terminate when w is a
// multiple of 8 and h a multiple of 4.
static inline void convolve_2d_sr_6tap_neon_dotprod(
    const uint8_t *src, int src_stride, uint8_t *dst, int dst_stride, int w,
    int h, const int16_t *x_filter_ptr, const int16_t *y_filter_ptr) {
  const int16x8_t y_filter = vld1q_s16(y_filter_ptr);
  // Filter values are even, so halve to reduce intermediate precision reqs.
  const int8x8_t x_filter = vshrn_n_s16(vld1q_s16(x_filter_ptr), 1);

  const int bd = 8;
  // Adding a shim of 1 << (ROUND0_BITS - 1) enables us to use non-rounding
  // shifts - which are generally faster than rounding shifts on modern CPUs.
  const int32_t horiz_const =
      ((1 << (bd + FILTER_BITS - 1)) + (1 << (ROUND0_BITS - 1)));
  // Accumulate into 128 << FILTER_BITS to account for range transform.
  // Halve the total because we halved the filter values.
  const int32x4_t correction =
      vdupq_n_s32(((128 << FILTER_BITS) + horiz_const) / 2);
  const int16x8_t vert_const = vdupq_n_s16(1 << (bd - 1));
  const uint8x16x3_t permute_tbl = vld1q_u8_x3(kDotProdPermuteTbl);

  do {
    const uint8_t *s = src;
    uint8_t *d = dst;
    int height = h;

    // Prime the vertical window: horizontally filter the first 5 rows.
    uint8x16_t h_s0, h_s1, h_s2, h_s3, h_s4;
    load_u8_16x5(s, src_stride, &h_s0, &h_s1, &h_s2, &h_s3, &h_s4);
    s += 5 * src_stride;

    int16x8_t v_s0 = convolve8_8_2d_h(h_s0, x_filter, correction, permute_tbl);
    int16x8_t v_s1 = convolve8_8_2d_h(h_s1, x_filter, correction, permute_tbl);
    int16x8_t v_s2 = convolve8_8_2d_h(h_s2, x_filter, correction, permute_tbl);
    int16x8_t v_s3 = convolve8_8_2d_h(h_s3, x_filter, correction, permute_tbl);
    int16x8_t v_s4 = convolve8_8_2d_h(h_s4, x_filter, correction, permute_tbl);

    do {
      // Horizontally filter 4 new rows, then produce 4 output rows
      // vertically from successive 6-row windows.
      uint8x16_t h_s5, h_s6, h_s7, h_s8;
      load_u8_16x4(s, src_stride, &h_s5, &h_s6, &h_s7, &h_s8);

      int16x8_t v_s5 =
          convolve8_8_2d_h(h_s5, x_filter, correction, permute_tbl);
      int16x8_t v_s6 =
          convolve8_8_2d_h(h_s6, x_filter, correction, permute_tbl);
      int16x8_t v_s7 =
          convolve8_8_2d_h(h_s7, x_filter, correction, permute_tbl);
      int16x8_t v_s8 =
          convolve8_8_2d_h(h_s8, x_filter, correction, permute_tbl);

      uint8x8_t d0 = convolve6_8_2d_v(v_s0, v_s1, v_s2, v_s3, v_s4, v_s5,
                                      y_filter, vert_const);
      uint8x8_t d1 = convolve6_8_2d_v(v_s1, v_s2, v_s3, v_s4, v_s5, v_s6,
                                      y_filter, vert_const);
      uint8x8_t d2 = convolve6_8_2d_v(v_s2, v_s3, v_s4, v_s5, v_s6, v_s7,
                                      y_filter, vert_const);
      uint8x8_t d3 = convolve6_8_2d_v(v_s3, v_s4, v_s5, v_s6, v_s7, v_s8,
                                      y_filter, vert_const);

      store_u8_8x4(d, dst_stride, d0, d1, d2, d3);

      // Slide the vertical source window down by four rows.
      v_s0 = v_s4;
      v_s1 = v_s5;
      v_s2 = v_s6;
      v_s3 = v_s7;
      v_s4 = v_s8;

      s += 4 * src_stride;
      d += 4 * dst_stride;
      height -= 4;
    } while (height != 0);
    // Advance to the next 8-pixel-wide column strip.
    src += 8;
    dst += 8;
    w -= 8;
  } while (w != 0);
}
   1463 
   1464 static inline void convolve_2d_sr_4tap_neon_dotprod(
   1465    const uint8_t *src, int src_stride, uint8_t *dst, int dst_stride, int w,
   1466    int h, const int16_t *x_filter_ptr, const int16_t *y_filter_ptr) {
   1467  const int bd = 8;
   1468  const int16x8_t vert_const = vdupq_n_s16(1 << (bd - 1));
   1469 
   1470  const int16x4_t y_filter = vld1_s16(y_filter_ptr + 2);
   1471  const int16x4_t x_filter_s16 = vld1_s16(x_filter_ptr + 2);
   1472  // All 4-tap and bilinear filter values are even, so halve them to reduce
   1473  // intermediate precision requirements.
   1474  const int8x8_t x_filter =
   1475      vshrn_n_s16(vcombine_s16(x_filter_s16, vdup_n_s16(0)), 1);
   1476 
   1477  // Adding a shim of 1 << (ROUND0_BITS - 1) enables us to use non-rounding
   1478  // shifts - which are generally faster than rounding shifts on modern CPUs.
   1479  const int32_t horiz_const =
   1480      ((1 << (bd + FILTER_BITS - 1)) + (1 << (ROUND0_BITS - 1)));
   1481  // Accumulate into 128 << FILTER_BITS to account for range transform.
   1482  // Halve the total because we halved the filter values.
   1483  const int32x4_t correction =
   1484      vdupq_n_s32(((128 << FILTER_BITS) + horiz_const) / 2);
   1485 
   1486  if (w == 4) {
   1487    const uint8x16_t permute_tbl = vld1q_u8(kDotProdPermuteTbl);
   1488 
   1489    uint8x16_t h_s0, h_s1, h_s2;
   1490    load_u8_16x3(src, src_stride, &h_s0, &h_s1, &h_s2);
   1491 
   1492    int16x4_t v_s0 = convolve4_4_2d_h(h_s0, x_filter, permute_tbl, correction);
   1493    int16x4_t v_s1 = convolve4_4_2d_h(h_s1, x_filter, permute_tbl, correction);
   1494    int16x4_t v_s2 = convolve4_4_2d_h(h_s2, x_filter, permute_tbl, correction);
   1495 
   1496    src += 3 * src_stride;
   1497 
   1498    do {
   1499      uint8x16_t h_s3, h_s4, h_s5, h_s6;
   1500      load_u8_16x4(src, src_stride, &h_s3, &h_s4, &h_s5, &h_s6);
   1501 
   1502      int16x4_t v_s3 =
   1503          convolve4_4_2d_h(h_s3, x_filter, permute_tbl, correction);
   1504      int16x4_t v_s4 =
   1505          convolve4_4_2d_h(h_s4, x_filter, permute_tbl, correction);
   1506      int16x4_t v_s5 =
   1507          convolve4_4_2d_h(h_s5, x_filter, permute_tbl, correction);
   1508      int16x4_t v_s6 =
   1509          convolve4_4_2d_h(h_s6, x_filter, permute_tbl, correction);
   1510 
   1511      int16x4_t d0 = convolve4_4_2d_v(v_s0, v_s1, v_s2, v_s3, y_filter);
   1512      int16x4_t d1 = convolve4_4_2d_v(v_s1, v_s2, v_s3, v_s4, y_filter);
   1513      int16x4_t d2 = convolve4_4_2d_v(v_s2, v_s3, v_s4, v_s5, y_filter);
   1514      int16x4_t d3 = convolve4_4_2d_v(v_s3, v_s4, v_s5, v_s6, y_filter);
   1515 
   1516      uint8x8_t d01 = vqmovun_s16(vsubq_s16(vcombine_s16(d0, d1), vert_const));
   1517      uint8x8_t d23 = vqmovun_s16(vsubq_s16(vcombine_s16(d2, d3), vert_const));
   1518 
   1519      store_u8x4_strided_x2(dst + 0 * dst_stride, dst_stride, d01);
   1520      store_u8x4_strided_x2(dst + 2 * dst_stride, dst_stride, d23);
   1521 
   1522      v_s0 = v_s4;
   1523      v_s1 = v_s5;
   1524      v_s2 = v_s6;
   1525 
   1526      src += 4 * src_stride;
   1527      dst += 4 * dst_stride;
   1528      h -= 4;
   1529    } while (h != 0);
   1530  } else {
   1531    const uint8x16x2_t permute_tbl = vld1q_u8_x2(kDotProdPermuteTbl);
   1532 
   1533    do {
   1534      int height = h;
   1535      const uint8_t *s = src;
   1536      uint8_t *d = dst;
   1537 
   1538      uint8x16_t h_s0, h_s1, h_s2;
   1539      load_u8_16x3(src, src_stride, &h_s0, &h_s1, &h_s2);
   1540 
   1541      int16x8_t v_s0 =
   1542          convolve4_8_2d_h(h_s0, x_filter, permute_tbl, correction);
   1543      int16x8_t v_s1 =
   1544          convolve4_8_2d_h(h_s1, x_filter, permute_tbl, correction);
   1545      int16x8_t v_s2 =
   1546          convolve4_8_2d_h(h_s2, x_filter, permute_tbl, correction);
   1547 
   1548      s += 3 * src_stride;
   1549 
   1550      do {
   1551        uint8x16_t h_s3, h_s4, h_s5, h_s6;
   1552        load_u8_16x4(s, src_stride, &h_s3, &h_s4, &h_s5, &h_s6);
   1553 
   1554        int16x8_t v_s3 =
   1555            convolve4_8_2d_h(h_s3, x_filter, permute_tbl, correction);
   1556        int16x8_t v_s4 =
   1557            convolve4_8_2d_h(h_s4, x_filter, permute_tbl, correction);
   1558        int16x8_t v_s5 =
   1559            convolve4_8_2d_h(h_s5, x_filter, permute_tbl, correction);
   1560        int16x8_t v_s6 =
   1561            convolve4_8_2d_h(h_s6, x_filter, permute_tbl, correction);
   1562 
   1563        uint8x8_t d0 =
   1564            convolve4_8_2d_v(v_s0, v_s1, v_s2, v_s3, y_filter, vert_const);
   1565        uint8x8_t d1 =
   1566            convolve4_8_2d_v(v_s1, v_s2, v_s3, v_s4, y_filter, vert_const);
   1567        uint8x8_t d2 =
   1568            convolve4_8_2d_v(v_s2, v_s3, v_s4, v_s5, y_filter, vert_const);
   1569        uint8x8_t d3 =
   1570            convolve4_8_2d_v(v_s3, v_s4, v_s5, v_s6, y_filter, vert_const);
   1571 
   1572        store_u8_8x4(d, dst_stride, d0, d1, d2, d3);
   1573 
   1574        v_s0 = v_s4;
   1575        v_s1 = v_s5;
   1576        v_s2 = v_s6;
   1577 
   1578        s += 4 * src_stride;
   1579        d += 4 * dst_stride;
   1580        height -= 4;
   1581      } while (height != 0);
   1582      src += 8;
   1583      dst += 8;
   1584      w -= 8;
   1585    } while (w != 0);
   1586  }
   1587 }
   1588 
   1589 void av1_convolve_2d_sr_neon_dotprod(const uint8_t *src, int src_stride,
   1590                                     uint8_t *dst, int dst_stride, int w, int h,
   1591                                     const InterpFilterParams *filter_params_x,
   1592                                     const InterpFilterParams *filter_params_y,
   1593                                     const int subpel_x_qn,
   1594                                     const int subpel_y_qn,
   1595                                     ConvolveParams *conv_params) {
   1596  if (w == 2 || h == 2) {
   1597    av1_convolve_2d_sr_c(src, src_stride, dst, dst_stride, w, h,
   1598                         filter_params_x, filter_params_y, subpel_x_qn,
   1599                         subpel_y_qn, conv_params);
   1600    return;
   1601  }
   1602 
   1603  const int y_filter_taps = get_filter_tap(filter_params_y, subpel_y_qn);
   1604  const int x_filter_taps = get_filter_tap(filter_params_x, subpel_x_qn);
   1605  const int clamped_y_taps = y_filter_taps < 4 ? 4 : y_filter_taps;
   1606  const int im_h = h + clamped_y_taps - 1;
   1607  const int im_stride = MAX_SB_SIZE;
   1608  const int vert_offset = clamped_y_taps / 2 - 1;
   1609  const int horiz_offset = filter_params_x->taps / 2 - 1;
   1610  const uint8_t *src_ptr = src - vert_offset * src_stride - horiz_offset;
   1611 
   1612  const int16_t *x_filter_ptr = av1_get_interp_filter_subpel_kernel(
   1613      filter_params_x, subpel_x_qn & SUBPEL_MASK);
   1614  const int16_t *y_filter_ptr = av1_get_interp_filter_subpel_kernel(
   1615      filter_params_y, subpel_y_qn & SUBPEL_MASK);
   1616 
   1617  if (filter_params_x->taps > 8) {
   1618    DECLARE_ALIGNED(16, int16_t,
   1619                    im_block[(MAX_SB_SIZE + MAX_FILTER_TAP - 1) * MAX_SB_SIZE]);
   1620 
   1621    const int16x8_t x_filter_0_7 = vld1q_s16(x_filter_ptr);
   1622    const int16x4_t x_filter_8_11 = vld1_s16(x_filter_ptr + 8);
   1623    const int16x8_t y_filter_0_7 = vld1q_s16(y_filter_ptr);
   1624    const int16x4_t y_filter_8_11 = vld1_s16(y_filter_ptr + 8);
   1625 
   1626    convolve_2d_sr_horiz_12tap_neon_dotprod(src_ptr, src_stride, im_block,
   1627                                            im_stride, w, im_h, x_filter_0_7,
   1628                                            x_filter_8_11);
   1629 
   1630    convolve_2d_sr_vert_12tap_neon(im_block, im_stride, dst, dst_stride, w, h,
   1631                                   y_filter_0_7, y_filter_8_11);
   1632  } else {
   1633    if (x_filter_taps >= 6 && y_filter_taps == 6) {
   1634      convolve_2d_sr_6tap_neon_dotprod(src_ptr, src_stride, dst, dst_stride, w,
   1635                                       h, x_filter_ptr, y_filter_ptr);
   1636      return;
   1637    }
   1638 
   1639    if (x_filter_taps <= 4 && y_filter_taps <= 4) {
   1640      convolve_2d_sr_4tap_neon_dotprod(src_ptr + 2, src_stride, dst, dst_stride,
   1641                                       w, h, x_filter_ptr, y_filter_ptr);
   1642      return;
   1643    }
   1644 
   1645    DECLARE_ALIGNED(16, int16_t,
   1646                    im_block[(MAX_SB_SIZE + SUBPEL_TAPS - 1) * MAX_SB_SIZE]);
   1647 
   1648    if (x_filter_taps <= 4) {
   1649      convolve_2d_sr_horiz_4tap_neon_dotprod(src_ptr + 2, src_stride, im_block,
   1650                                             im_stride, w, im_h, x_filter_ptr);
   1651    } else {
   1652      convolve_2d_sr_horiz_8tap_neon_dotprod(src_ptr, src_stride, im_block,
   1653                                             im_stride, w, im_h, x_filter_ptr);
   1654    }
   1655 
   1656    const int16x8_t y_filter = vld1q_s16(y_filter_ptr);
   1657 
   1658    if (clamped_y_taps <= 4) {
   1659      convolve_2d_sr_vert_4tap_neon(im_block, im_stride, dst, dst_stride, w, h,
   1660                                    y_filter_ptr);
   1661    } else if (clamped_y_taps == 6) {
   1662      convolve_2d_sr_vert_6tap_neon(im_block, im_stride, dst, dst_stride, w, h,
   1663                                    y_filter);
   1664    } else {
   1665      convolve_2d_sr_vert_8tap_neon(im_block, im_stride, dst, dst_stride, w, h,
   1666                                    y_filter);
   1667    }
   1668  }
   1669 }