tor-browser

The Tor Browser
git clone https://git.dasho.dev/tor-browser.git

convolve_neon.c (71254B)


/*
 * Copyright (c) 2018, Alliance for Open Media. All rights reserved.
 *
 * This source code is subject to the terms of the BSD 2 Clause License and
 * the Alliance for Open Media Patent License 1.0. If the BSD 2 Clause License
 * was not distributed with this source code in the LICENSE file, you can
 * obtain it at www.aomedia.org/license/software. If the Alliance for Open
 * Media Patent License 1.0 was not distributed with this source code in the
 * PATENTS file, you can obtain it at www.aomedia.org/license/patent.
 */

#include <assert.h>
#include <arm_neon.h>

#include "config/aom_config.h"
#include "config/av1_rtcd.h"

#include "aom_dsp/aom_dsp_common.h"
#include "aom_dsp/arm/mem_neon.h"
#include "aom_dsp/arm/transpose_neon.h"
#include "aom_ports/mem.h"
#include "av1/common/convolve.h"
#include "av1/common/filter.h"
#include "av1/common/arm/convolve_neon.h"

static inline int16x4_t convolve12_4_x(const int16x4_t s0, const int16x4_t s1,
                                       const int16x4_t s2, const int16x4_t s3,
                                       const int16x4_t s4, const int16x4_t s5,
                                       const int16x4_t s6, const int16x4_t s7,
                                       const int16x4_t s8, const int16x4_t s9,
                                       const int16x4_t s10, const int16x4_t s11,
                                       const int16x8_t x_filter_0_7,
                                       const int16x4_t x_filter_8_11,
                                       const int32x4_t horiz_const) {
  const int16x4_t x_filter_0_3 = vget_low_s16(x_filter_0_7);
  const int16x4_t x_filter_4_7 = vget_high_s16(x_filter_0_7);

  int32x4_t sum = horiz_const;
  sum = vmlal_lane_s16(sum, s0, x_filter_0_3, 0);
  sum = vmlal_lane_s16(sum, s1, x_filter_0_3, 1);
  sum = vmlal_lane_s16(sum, s2, x_filter_0_3, 2);
  sum = vmlal_lane_s16(sum, s3, x_filter_0_3, 3);
  sum = vmlal_lane_s16(sum, s4, x_filter_4_7, 0);
  sum = vmlal_lane_s16(sum, s5, x_filter_4_7, 1);
  sum = vmlal_lane_s16(sum, s6, x_filter_4_7, 2);
  sum = vmlal_lane_s16(sum, s7, x_filter_4_7, 3);
  sum = vmlal_lane_s16(sum, s8, x_filter_8_11, 0);
  sum = vmlal_lane_s16(sum, s9, x_filter_8_11, 1);
  sum = vmlal_lane_s16(sum, s10, x_filter_8_11, 2);
  sum = vmlal_lane_s16(sum, s11, x_filter_8_11, 3);

  return vqrshrn_n_s32(sum, FILTER_BITS);
}
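// A scalar sketch of what each lane above computes (illustrative pseudo-code
// only; s[0..11] are the 12 input pixels for one output position and
// saturate_s16() is shorthand for clamping to the int16_t range):
//   int32_t sum = horiz_const;
//   for (int k = 0; k < 12; ++k) sum += s[k] * x_filter[k];
//   result = saturate_s16(ROUND_POWER_OF_TWO(sum, FILTER_BITS));
// vqrshrn_n_s32 performs the rounding shift and the saturating narrow in a
// single instruction.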

static inline void convolve_x_sr_12tap_neon(const uint8_t *src_ptr,
                                            int src_stride, uint8_t *dst_ptr,
                                            const int dst_stride, int w, int h,
                                            const int16_t *x_filter_ptr) {
  const int16x8_t x_filter_0_7 = vld1q_s16(x_filter_ptr);
  const int16x4_t x_filter_8_11 = vld1_s16(x_filter_ptr + 8);

  // A shim of 1 << (ROUND0_BITS - 1) enables us to use a single rounding right
  // shift by FILTER_BITS - instead of a first rounding right shift by
  // ROUND0_BITS, followed by a second rounding right shift by FILTER_BITS -
  // ROUND0_BITS.
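  // (The two rounding shifts collapse into one because, with R = ROUND0_BITS
  // and F = FILTER_BITS,
  //   (((x + 2^(R-1)) >> R) + 2^(F-R-1)) >> (F-R)
  //     == (x + 2^(R-1) + 2^(F-1)) >> F
  // i.e. a single rounding shift by F applied to x plus the 2^(R-1) shim.)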
  const int32x4_t horiz_const = vdupq_n_s32(1 << (ROUND0_BITS - 1));

#if AOM_ARCH_AARCH64
  do {
    const uint8_t *s = src_ptr;
    uint8_t *d = dst_ptr;
    int width = w;
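
    // Load 4 rows and transpose them so that each vector holds a column of
    // pixels; the filter taps can then be applied with lane-wise
    // multiply-accumulates, and the 4x4 block of results is transposed back
    // before storing.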
    uint8x8_t t0, t1, t2, t3;
    load_u8_8x4(s, src_stride, &t0, &t1, &t2, &t3);
    transpose_elems_inplace_u8_8x4(&t0, &t1, &t2, &t3);

    int16x4_t s0 = vget_low_s16(vreinterpretq_s16_u16(vmovl_u8(t0)));
    int16x4_t s1 = vget_low_s16(vreinterpretq_s16_u16(vmovl_u8(t1)));
    int16x4_t s2 = vget_low_s16(vreinterpretq_s16_u16(vmovl_u8(t2)));
    int16x4_t s3 = vget_low_s16(vreinterpretq_s16_u16(vmovl_u8(t3)));
    int16x4_t s4 = vget_high_s16(vreinterpretq_s16_u16(vmovl_u8(t0)));
    int16x4_t s5 = vget_high_s16(vreinterpretq_s16_u16(vmovl_u8(t1)));
    int16x4_t s6 = vget_high_s16(vreinterpretq_s16_u16(vmovl_u8(t2)));
    int16x4_t s7 = vget_high_s16(vreinterpretq_s16_u16(vmovl_u8(t3)));

    load_u8_8x4(s + 8, src_stride, &t0, &t1, &t2, &t3);
    transpose_elems_inplace_u8_8x4(&t0, &t1, &t2, &t3);

    int16x4_t s8 = vget_low_s16(vreinterpretq_s16_u16(vmovl_u8(t0)));
    int16x4_t s9 = vget_low_s16(vreinterpretq_s16_u16(vmovl_u8(t1)));
    int16x4_t s10 = vget_low_s16(vreinterpretq_s16_u16(vmovl_u8(t2)));

    s += 11;

    do {
      load_u8_8x4(s, src_stride, &t0, &t1, &t2, &t3);
      transpose_elems_inplace_u8_8x4(&t0, &t1, &t2, &t3);

      int16x4_t s11 = vget_low_s16(vreinterpretq_s16_u16(vmovl_u8(t0)));
      int16x4_t s12 = vget_low_s16(vreinterpretq_s16_u16(vmovl_u8(t1)));
      int16x4_t s13 = vget_low_s16(vreinterpretq_s16_u16(vmovl_u8(t2)));
      int16x4_t s14 = vget_low_s16(vreinterpretq_s16_u16(vmovl_u8(t3)));

      int16x4_t d0 =
          convolve12_4_x(s0, s1, s2, s3, s4, s5, s6, s7, s8, s9, s10, s11,
                         x_filter_0_7, x_filter_8_11, horiz_const);
      int16x4_t d1 =
          convolve12_4_x(s1, s2, s3, s4, s5, s6, s7, s8, s9, s10, s11, s12,
                         x_filter_0_7, x_filter_8_11, horiz_const);
      int16x4_t d2 =
          convolve12_4_x(s2, s3, s4, s5, s6, s7, s8, s9, s10, s11, s12, s13,
                         x_filter_0_7, x_filter_8_11, horiz_const);
      int16x4_t d3 =
          convolve12_4_x(s3, s4, s5, s6, s7, s8, s9, s10, s11, s12, s13, s14,
                         x_filter_0_7, x_filter_8_11, horiz_const);

      transpose_elems_inplace_s16_4x4(&d0, &d1, &d2, &d3);

      uint8x8_t d01 = vqmovun_s16(vcombine_s16(d0, d1));
      uint8x8_t d23 = vqmovun_s16(vcombine_s16(d2, d3));

      store_u8x4_strided_x2(d, dst_stride, d01);
      store_u8x4_strided_x2(d + 2 * dst_stride, dst_stride, d23);

      s0 = s4;
      s1 = s5;
      s2 = s6;
      s3 = s7;
      s4 = s8;
      s5 = s9;
      s6 = s10;
      s7 = s11;
      s8 = s12;
      s9 = s13;
      s10 = s14;
      s += 4;
      d += 4;
      width -= 4;
    } while (width != 0);
    src_ptr += 4 * src_stride;
    dst_ptr += 4 * dst_stride;
    h -= 4;
  } while (h != 0);

#else   // !AOM_ARCH_AARCH64
  do {
    const uint8_t *s = src_ptr;
    uint8_t *d = dst_ptr;
    int width = w;

    do {
      uint8x16_t t0 = vld1q_u8(s);
      int16x8_t tt0 = vreinterpretq_s16_u16(vmovl_u8(vget_low_u8(t0)));
      int16x8_t tt8 = vreinterpretq_s16_u16(vmovl_u8(vget_high_u8(t0)));

      int16x4_t s0 = vget_low_s16(tt0);
      int16x4_t s4 = vget_high_s16(tt0);
      int16x4_t s8 = vget_low_s16(tt8);
      int16x4_t s12 = vget_high_s16(tt8);

      int16x4_t s1 = vext_s16(s0, s4, 1);    //  a1  a2  a3  a4
      int16x4_t s2 = vext_s16(s0, s4, 2);    //  a2  a3  a4  a5
      int16x4_t s3 = vext_s16(s0, s4, 3);    //  a3  a4  a5  a6
      int16x4_t s5 = vext_s16(s4, s8, 1);    //  a5  a6  a7  a8
      int16x4_t s6 = vext_s16(s4, s8, 2);    //  a6  a7  a8  a9
      int16x4_t s7 = vext_s16(s4, s8, 3);    //  a7  a8  a9 a10
      int16x4_t s9 = vext_s16(s8, s12, 1);   //  a9 a10 a11 a12
      int16x4_t s10 = vext_s16(s8, s12, 2);  // a10 a11 a12 a13
      int16x4_t s11 = vext_s16(s8, s12, 3);  // a11 a12 a13 a14

      int16x4_t d0 =
          convolve12_4_x(s0, s1, s2, s3, s4, s5, s6, s7, s8, s9, s10, s11,
                         x_filter_0_7, x_filter_8_11, horiz_const);

      uint8x8_t dd0 = vqmovun_s16(vcombine_s16(d0, vdup_n_s16(0)));

      store_u8_4x1(d, dd0);

      s += 4;
      d += 4;
      width -= 4;
    } while (width != 0);
    src_ptr += src_stride;
    dst_ptr += dst_stride;
  } while (--h != 0);
#endif  // AOM_ARCH_AARCH64
}

static inline uint8x8_t convolve4_8_x(const int16x8_t s0, const int16x8_t s1,
                                      const int16x8_t s2, const int16x8_t s3,
                                      const int16x4_t filter,
                                      int16x8_t horiz_const) {
  int16x8_t sum = horiz_const;
  sum = vmlaq_lane_s16(sum, s0, filter, 0);
  sum = vmlaq_lane_s16(sum, s1, filter, 1);
  sum = vmlaq_lane_s16(sum, s2, filter, 2);
  sum = vmlaq_lane_s16(sum, s3, filter, 3);
  // We halved the filter values so -1 from right shift.
  return vqrshrun_n_s16(sum, FILTER_BITS - 1);
}

static inline void convolve_x_sr_4tap_neon(const uint8_t *src_ptr,
                                           int src_stride, uint8_t *dst_ptr,
                                           const int dst_stride, int w, int h,
                                           const int16_t *x_filter_ptr) {
  // All filter values are even, halve to reduce intermediate precision
  // requirements.
  const int16x4_t filter = vshr_n_s16(vld1_s16(x_filter_ptr + 2), 1);
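  // Halving the (even) coefficients also halves the accumulator magnitude,
  // which is what lets convolve4_8_x accumulate in 16-bit elements rather
  // than widening to 32 bits; the final right shift is reduced by 1 to
  // compensate.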

  // This shim of 1 << ((ROUND0_BITS - 1) - 1) enables us to use a single
  // rounding right shift by FILTER_BITS - instead of a first rounding right
  // shift by ROUND0_BITS, followed by a second rounding right shift by
  // FILTER_BITS - ROUND0_BITS.
  // The outermost -1 is needed because we will halve the filter values.
  const int16x8_t horiz_const = vdupq_n_s16(1 << ((ROUND0_BITS - 1) - 1));

  if (w == 4) {
    do {
      uint8x8_t t01[4];
      t01[0] = load_unaligned_u8(src_ptr + 0, src_stride);
      t01[1] = load_unaligned_u8(src_ptr + 1, src_stride);
      t01[2] = load_unaligned_u8(src_ptr + 2, src_stride);
      t01[3] = load_unaligned_u8(src_ptr + 3, src_stride);

      int16x8_t s01[4];
      s01[0] = vreinterpretq_s16_u16(vmovl_u8(t01[0]));
      s01[1] = vreinterpretq_s16_u16(vmovl_u8(t01[1]));
      s01[2] = vreinterpretq_s16_u16(vmovl_u8(t01[2]));
      s01[3] = vreinterpretq_s16_u16(vmovl_u8(t01[3]));

      uint8x8_t d01 =
          convolve4_8_x(s01[0], s01[1], s01[2], s01[3], filter, horiz_const);

      store_u8x4_strided_x2(dst_ptr + 0 * dst_stride, dst_stride, d01);

      src_ptr += 2 * src_stride;
      dst_ptr += 2 * dst_stride;
      h -= 2;
    } while (h != 0);
  } else {
    do {
      int width = w;
      const uint8_t *s = src_ptr;
      uint8_t *d = dst_ptr;

      do {
        uint8x8_t t0[4], t1[4];
        load_u8_8x4(s + 0 * src_stride, 1, &t0[0], &t0[1], &t0[2], &t0[3]);
        load_u8_8x4(s + 1 * src_stride, 1, &t1[0], &t1[1], &t1[2], &t1[3]);

        int16x8_t s0[4], s1[4];
        s0[0] = vreinterpretq_s16_u16(vmovl_u8(t0[0]));
        s0[1] = vreinterpretq_s16_u16(vmovl_u8(t0[1]));
        s0[2] = vreinterpretq_s16_u16(vmovl_u8(t0[2]));
        s0[3] = vreinterpretq_s16_u16(vmovl_u8(t0[3]));

        s1[0] = vreinterpretq_s16_u16(vmovl_u8(t1[0]));
        s1[1] = vreinterpretq_s16_u16(vmovl_u8(t1[1]));
        s1[2] = vreinterpretq_s16_u16(vmovl_u8(t1[2]));
        s1[3] = vreinterpretq_s16_u16(vmovl_u8(t1[3]));

        uint8x8_t d0 =
            convolve4_8_x(s0[0], s0[1], s0[2], s0[3], filter, horiz_const);
        uint8x8_t d1 =
            convolve4_8_x(s1[0], s1[1], s1[2], s1[3], filter, horiz_const);

        store_u8_8x2(d, dst_stride, d0, d1);

        s += 8;
        d += 8;
        width -= 8;
      } while (width != 0);
      src_ptr += 2 * src_stride;
      dst_ptr += 2 * dst_stride;
      h -= 2;
    } while (h != 0);
  }
}

static inline uint8x8_t convolve8_8_x(const int16x8_t s0, const int16x8_t s1,
                                      const int16x8_t s2, const int16x8_t s3,
                                      const int16x8_t s4, const int16x8_t s5,
                                      const int16x8_t s6, const int16x8_t s7,
                                      const int16x8_t filter,
                                      const int16x8_t horiz_const) {
  const int16x4_t filter_lo = vget_low_s16(filter);
  const int16x4_t filter_hi = vget_high_s16(filter);

  int16x8_t sum = horiz_const;
  sum = vmlaq_lane_s16(sum, s0, filter_lo, 0);
  sum = vmlaq_lane_s16(sum, s1, filter_lo, 1);
  sum = vmlaq_lane_s16(sum, s2, filter_lo, 2);
  sum = vmlaq_lane_s16(sum, s3, filter_lo, 3);
  sum = vmlaq_lane_s16(sum, s4, filter_hi, 0);
  sum = vmlaq_lane_s16(sum, s5, filter_hi, 1);
  sum = vmlaq_lane_s16(sum, s6, filter_hi, 2);
  sum = vmlaq_lane_s16(sum, s7, filter_hi, 3);

  // We halved the convolution filter values so -1 from the right shift.
  return vqrshrun_n_s16(sum, FILTER_BITS - 1);
}

void av1_convolve_x_sr_neon(const uint8_t *src, int src_stride, uint8_t *dst,
                            int dst_stride, int w, int h,
                            const InterpFilterParams *filter_params_x,
                            const int subpel_x_qn,
                            ConvolveParams *conv_params) {
  if (w == 2 || h == 2) {
    av1_convolve_x_sr_c(src, src_stride, dst, dst_stride, w, h, filter_params_x,
                        subpel_x_qn, conv_params);
    return;
  }

  const uint8_t horiz_offset = filter_params_x->taps / 2 - 1;
  src -= horiz_offset;

  const int16_t *x_filter_ptr = av1_get_interp_filter_subpel_kernel(
      filter_params_x, subpel_x_qn & SUBPEL_MASK);

  int filter_taps = get_filter_tap(filter_params_x, subpel_x_qn & SUBPEL_MASK);

  if (filter_taps > 8) {
    convolve_x_sr_12tap_neon(src, src_stride, dst, dst_stride, w, h,
                             x_filter_ptr);
    return;
  }
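
  // The four active taps of a 4-tap subpel kernel sit in the middle of the
  // 8-tap array (indices 2-5, loaded via x_filter_ptr + 2 below), and src was
  // offset above for a full-width kernel, so advance src by 2 to re-centre it
  // on those taps.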
  if (filter_taps <= 4) {
    convolve_x_sr_4tap_neon(src + 2, src_stride, dst, dst_stride, w, h,
                            x_filter_ptr);
    return;
  }

  // This shim of 1 << ((ROUND0_BITS - 1) - 1) enables us to use a single
  // rounding right shift by FILTER_BITS - instead of a first rounding right
  // shift by ROUND0_BITS, followed by a second rounding right shift by
  // FILTER_BITS - ROUND0_BITS.
  // The outermost -1 is needed because we will halve the filter values.
  const int16x8_t horiz_const = vdupq_n_s16(1 << ((ROUND0_BITS - 1) - 1));

  // Filter values are even so halve to reduce precision requirements.
  const int16x8_t x_filter = vshrq_n_s16(vld1q_s16(x_filter_ptr), 1);

#if AOM_ARCH_AARCH64
  while (h >= 8) {
    uint8x8_t t0, t1, t2, t3, t4, t5, t6, t7;
    load_u8_8x8(src, src_stride, &t0, &t1, &t2, &t3, &t4, &t5, &t6, &t7);

    transpose_elems_inplace_u8_8x8(&t0, &t1, &t2, &t3, &t4, &t5, &t6, &t7);
    int16x8_t s0 = vreinterpretq_s16_u16(vmovl_u8(t0));
    int16x8_t s1 = vreinterpretq_s16_u16(vmovl_u8(t1));
    int16x8_t s2 = vreinterpretq_s16_u16(vmovl_u8(t2));
    int16x8_t s3 = vreinterpretq_s16_u16(vmovl_u8(t3));
    int16x8_t s4 = vreinterpretq_s16_u16(vmovl_u8(t4));
    int16x8_t s5 = vreinterpretq_s16_u16(vmovl_u8(t5));
    int16x8_t s6 = vreinterpretq_s16_u16(vmovl_u8(t6));

    int width = w;
    const uint8_t *s = src + 7;
    uint8_t *d = dst;

    __builtin_prefetch(d + 0 * dst_stride);
    __builtin_prefetch(d + 1 * dst_stride);
    __builtin_prefetch(d + 2 * dst_stride);
    __builtin_prefetch(d + 3 * dst_stride);
    __builtin_prefetch(d + 4 * dst_stride);
    __builtin_prefetch(d + 5 * dst_stride);
    __builtin_prefetch(d + 6 * dst_stride);
    __builtin_prefetch(d + 7 * dst_stride);

    do {
      uint8x8_t t8, t9, t10, t11, t12, t13, t14;
      load_u8_8x8(s, src_stride, &t7, &t8, &t9, &t10, &t11, &t12, &t13, &t14);

      transpose_elems_inplace_u8_8x8(&t7, &t8, &t9, &t10, &t11, &t12, &t13,
                                     &t14);
      int16x8_t s7 = vreinterpretq_s16_u16(vmovl_u8(t7));
      int16x8_t s8 = vreinterpretq_s16_u16(vmovl_u8(t8));
      int16x8_t s9 = vreinterpretq_s16_u16(vmovl_u8(t9));
      int16x8_t s10 = vreinterpretq_s16_u16(vmovl_u8(t10));
      int16x8_t s11 = vreinterpretq_s16_u16(vmovl_u8(t11));
      int16x8_t s12 = vreinterpretq_s16_u16(vmovl_u8(t12));
      int16x8_t s13 = vreinterpretq_s16_u16(vmovl_u8(t13));
      int16x8_t s14 = vreinterpretq_s16_u16(vmovl_u8(t14));

      uint8x8_t d0 =
          convolve8_8_x(s0, s1, s2, s3, s4, s5, s6, s7, x_filter, horiz_const);
      uint8x8_t d1 =
          convolve8_8_x(s1, s2, s3, s4, s5, s6, s7, s8, x_filter, horiz_const);
      uint8x8_t d2 =
          convolve8_8_x(s2, s3, s4, s5, s6, s7, s8, s9, x_filter, horiz_const);
      uint8x8_t d3 =
          convolve8_8_x(s3, s4, s5, s6, s7, s8, s9, s10, x_filter, horiz_const);
      uint8x8_t d4 = convolve8_8_x(s4, s5, s6, s7, s8, s9, s10, s11, x_filter,
                                   horiz_const);
      uint8x8_t d5 = convolve8_8_x(s5, s6, s7, s8, s9, s10, s11, s12, x_filter,
                                   horiz_const);
      uint8x8_t d6 = convolve8_8_x(s6, s7, s8, s9, s10, s11, s12, s13, x_filter,
                                   horiz_const);
      uint8x8_t d7 = convolve8_8_x(s7, s8, s9, s10, s11, s12, s13, s14,
                                   x_filter, horiz_const);

      transpose_elems_inplace_u8_8x8(&d0, &d1, &d2, &d3, &d4, &d5, &d6, &d7);

      store_u8_8x8(d, dst_stride, d0, d1, d2, d3, d4, d5, d6, d7);

      s0 = s8;
      s1 = s9;
      s2 = s10;
      s3 = s11;
      s4 = s12;
      s5 = s13;
      s6 = s14;
      s += 8;
      d += 8;
      width -= 8;
    } while (width != 0);
    src += 8 * src_stride;
    dst += 8 * dst_stride;
    h -= 8;
  }
#endif  // AOM_ARCH_AARCH64
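
  // Process the remaining rows (or every row on 32-bit Arm, where the block
  // loop above is compiled out) one at a time, sliding an 8-pixel window
  // across each row with vextq_s16 instead of transposing.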
  while (h-- != 0) {
    uint8x8_t t0 = vld1_u8(src);  // a0 a1 a2 a3 a4 a5 a6 a7
    int16x8_t s0 = vreinterpretq_s16_u16(vmovl_u8(t0));

    int width = w;
    const uint8_t *s = src + 8;
    uint8_t *d = dst;

    __builtin_prefetch(d);

    do {
      uint8x8_t t8 = vld1_u8(s);  // a8 a9 a10 a11 a12 a13 a14 a15
      int16x8_t s8 = vreinterpretq_s16_u16(vmovl_u8(t8));

      int16x8_t s1 = vextq_s16(s0, s8, 1);  // a1 a2 a3 a4 a5 a6 a7 a8
      int16x8_t s2 = vextq_s16(s0, s8, 2);  // a2 a3 a4 a5 a6 a7 a8 a9
      int16x8_t s3 = vextq_s16(s0, s8, 3);  // a3 a4 a5 a6 a7 a8 a9 a10
      int16x8_t s4 = vextq_s16(s0, s8, 4);  // a4 a5 a6 a7 a8 a9 a10 a11
      int16x8_t s5 = vextq_s16(s0, s8, 5);  // a5 a6 a7 a8 a9 a10 a11 a12
      int16x8_t s6 = vextq_s16(s0, s8, 6);  // a6 a7 a8 a9 a10 a11 a12 a13
      int16x8_t s7 = vextq_s16(s0, s8, 7);  // a7 a8 a9 a10 a11 a12 a13 a14

      uint8x8_t d0 =
          convolve8_8_x(s0, s1, s2, s3, s4, s5, s6, s7, x_filter, horiz_const);

      vst1_u8(d, d0);

      s0 = s8;
      s += 8;
      d += 8;
      width -= 8;
    } while (width != 0);
    src += src_stride;
    dst += dst_stride;
  }
}

static inline uint8x8_t convolve4_8_y(const int16x8_t s0, const int16x8_t s1,
                                      const int16x8_t s2, const int16x8_t s3,
                                      const int16x4_t filter) {
  int16x8_t sum = vmulq_lane_s16(s0, filter, 0);
  sum = vmlaq_lane_s16(sum, s1, filter, 1);
  sum = vmlaq_lane_s16(sum, s2, filter, 2);
  sum = vmlaq_lane_s16(sum, s3, filter, 3);

  // We halved the filter values so -1 from right shift.
  return vqrshrun_n_s16(sum, FILTER_BITS - 1);
}

static inline void convolve_y_sr_4tap_neon(const uint8_t *src,
                                           const int src_stride, uint8_t *dst,
                                           const int dst_stride, int w, int h,
                                           const int16_t *filter_y) {
  // All filter values are even, halve to reduce intermediate precision
  // requirements.
  const int16x4_t filter = vshr_n_s16(vld1_s16(filter_y + 2), 1);

  if (w == 4) {
    uint8x8_t t01 = load_unaligned_u8(src + 0 * src_stride, src_stride);
    uint8x8_t t12 = load_unaligned_u8(src + 1 * src_stride, src_stride);

    int16x8_t s01 = vreinterpretq_s16_u16(vmovl_u8(t01));
    int16x8_t s12 = vreinterpretq_s16_u16(vmovl_u8(t12));

    src += 2 * src_stride;

    do {
      uint8x8_t t23 = load_unaligned_u8(src + 0 * src_stride, src_stride);
      uint8x8_t t34 = load_unaligned_u8(src + 1 * src_stride, src_stride);
      uint8x8_t t45 = load_unaligned_u8(src + 2 * src_stride, src_stride);
      uint8x8_t t56 = load_unaligned_u8(src + 3 * src_stride, src_stride);

      int16x8_t s23 = vreinterpretq_s16_u16(vmovl_u8(t23));
      int16x8_t s34 = vreinterpretq_s16_u16(vmovl_u8(t34));
      int16x8_t s45 = vreinterpretq_s16_u16(vmovl_u8(t45));
      int16x8_t s56 = vreinterpretq_s16_u16(vmovl_u8(t56));

      uint8x8_t d01 = convolve4_8_y(s01, s12, s23, s34, filter);
      uint8x8_t d23 = convolve4_8_y(s23, s34, s45, s56, filter);

      store_u8x4_strided_x2(dst + 0 * dst_stride, dst_stride, d01);
      store_u8x4_strided_x2(dst + 2 * dst_stride, dst_stride, d23);

      s01 = s45;
      s12 = s56;

      src += 4 * src_stride;
      dst += 4 * dst_stride;
      h -= 4;
    } while (h != 0);
  } else {
    do {
      uint8x8_t t0, t1, t2;
      load_u8_8x3(src, src_stride, &t0, &t1, &t2);

      int16x8_t s0 = vreinterpretq_s16_u16(vmovl_u8(t0));
      int16x8_t s1 = vreinterpretq_s16_u16(vmovl_u8(t1));
      int16x8_t s2 = vreinterpretq_s16_u16(vmovl_u8(t2));

      int height = h;
      const uint8_t *s = src + 3 * src_stride;
      uint8_t *d = dst;

      do {
        uint8x8_t t3;
        load_u8_8x4(s, src_stride, &t0, &t1, &t2, &t3);

        int16x8_t s3 = vreinterpretq_s16_u16(vmovl_u8(t0));
        int16x8_t s4 = vreinterpretq_s16_u16(vmovl_u8(t1));
        int16x8_t s5 = vreinterpretq_s16_u16(vmovl_u8(t2));
        int16x8_t s6 = vreinterpretq_s16_u16(vmovl_u8(t3));

        uint8x8_t d0 = convolve4_8_y(s0, s1, s2, s3, filter);
        uint8x8_t d1 = convolve4_8_y(s1, s2, s3, s4, filter);
        uint8x8_t d2 = convolve4_8_y(s2, s3, s4, s5, filter);
        uint8x8_t d3 = convolve4_8_y(s3, s4, s5, s6, filter);

        store_u8_8x4(d, dst_stride, d0, d1, d2, d3);

        s0 = s4;
        s1 = s5;
        s2 = s6;

        s += 4 * src_stride;
        d += 4 * dst_stride;
        height -= 4;
      } while (height != 0);
      src += 8;
      dst += 8;
      w -= 8;
    } while (w != 0);
  }
}

static inline int16x4_t convolve6_4_y(const int16x4_t s0, const int16x4_t s1,
                                      const int16x4_t s2, const int16x4_t s3,
                                      const int16x4_t s4, const int16x4_t s5,
                                      const int16x8_t y_filter_0_7) {
  const int16x4_t y_filter_0_3 = vget_low_s16(y_filter_0_7);
  const int16x4_t y_filter_4_7 = vget_high_s16(y_filter_0_7);

  // Filter values at indices 0 and 7 are 0.
  int16x4_t sum = vmul_lane_s16(s0, y_filter_0_3, 1);
  sum = vmla_lane_s16(sum, s1, y_filter_0_3, 2);
  sum = vmla_lane_s16(sum, s2, y_filter_0_3, 3);
  sum = vmla_lane_s16(sum, s3, y_filter_4_7, 0);
  sum = vmla_lane_s16(sum, s4, y_filter_4_7, 1);
  sum = vmla_lane_s16(sum, s5, y_filter_4_7, 2);

  return sum;
}

static inline uint8x8_t convolve6_8_y(const int16x8_t s0, const int16x8_t s1,
                                      const int16x8_t s2, const int16x8_t s3,
                                      const int16x8_t s4, const int16x8_t s5,
                                      const int16x8_t y_filters) {
  const int16x4_t y_filter_lo = vget_low_s16(y_filters);
  const int16x4_t y_filter_hi = vget_high_s16(y_filters);

  // Filter values at indices 0 and 7 are 0.
  int16x8_t sum = vmulq_lane_s16(s0, y_filter_lo, 1);
  sum = vmlaq_lane_s16(sum, s1, y_filter_lo, 2);
  sum = vmlaq_lane_s16(sum, s2, y_filter_lo, 3);
  sum = vmlaq_lane_s16(sum, s3, y_filter_hi, 0);
  sum = vmlaq_lane_s16(sum, s4, y_filter_hi, 1);
  sum = vmlaq_lane_s16(sum, s5, y_filter_hi, 2);
  // We halved the convolution filter values so -1 from the right shift.
  return vqrshrun_n_s16(sum, FILTER_BITS - 1);
}

static inline void convolve_y_sr_6tap_neon(const uint8_t *src_ptr,
                                           int src_stride, uint8_t *dst_ptr,
                                           const int dst_stride, int w, int h,
                                           const int16x8_t y_filter) {
  if (w <= 4) {
    uint8x8_t t0 = load_unaligned_u8_4x1(src_ptr + 0 * src_stride);
    uint8x8_t t1 = load_unaligned_u8_4x1(src_ptr + 1 * src_stride);
    uint8x8_t t2 = load_unaligned_u8_4x1(src_ptr + 2 * src_stride);
    uint8x8_t t3 = load_unaligned_u8_4x1(src_ptr + 3 * src_stride);
    uint8x8_t t4 = load_unaligned_u8_4x1(src_ptr + 4 * src_stride);

    int16x4_t s0 = vget_low_s16(vreinterpretq_s16_u16(vmovl_u8(t0)));
    int16x4_t s1 = vget_low_s16(vreinterpretq_s16_u16(vmovl_u8(t1)));
    int16x4_t s2 = vget_low_s16(vreinterpretq_s16_u16(vmovl_u8(t2)));
    int16x4_t s3 = vget_low_s16(vreinterpretq_s16_u16(vmovl_u8(t3)));
    int16x4_t s4 = vget_low_s16(vreinterpretq_s16_u16(vmovl_u8(t4)));

    src_ptr += 5 * src_stride;

    do {
#if AOM_ARCH_AARCH64
      uint8x8_t t5 = load_unaligned_u8_4x1(src_ptr + 0 * src_stride);
      uint8x8_t t6 = load_unaligned_u8_4x1(src_ptr + 1 * src_stride);
      uint8x8_t t7 = load_unaligned_u8_4x1(src_ptr + 2 * src_stride);
      uint8x8_t t8 = load_unaligned_u8_4x1(src_ptr + 3 * src_stride);

      int16x4_t s5 = vget_low_s16(vreinterpretq_s16_u16(vmovl_u8(t5)));
      int16x4_t s6 = vget_low_s16(vreinterpretq_s16_u16(vmovl_u8(t6)));
      int16x4_t s7 = vget_low_s16(vreinterpretq_s16_u16(vmovl_u8(t7)));
      int16x4_t s8 = vget_low_s16(vreinterpretq_s16_u16(vmovl_u8(t8)));

      int16x4_t d0 = convolve6_4_y(s0, s1, s2, s3, s4, s5, y_filter);
      int16x4_t d1 = convolve6_4_y(s1, s2, s3, s4, s5, s6, y_filter);
      int16x4_t d2 = convolve6_4_y(s2, s3, s4, s5, s6, s7, y_filter);
      int16x4_t d3 = convolve6_4_y(s3, s4, s5, s6, s7, s8, y_filter);

      // We halved the convolution filter values so -1 from the right shift.
      uint8x8_t d01 = vqrshrun_n_s16(vcombine_s16(d0, d1), FILTER_BITS - 1);
      uint8x8_t d23 = vqrshrun_n_s16(vcombine_s16(d2, d3), FILTER_BITS - 1);

      store_u8x4_strided_x2(dst_ptr, dst_stride, d01);
      store_u8x4_strided_x2(dst_ptr + 2 * dst_stride, dst_stride, d23);

      s0 = s4;
      s1 = s5;
      s2 = s6;
      s3 = s7;
      s4 = s8;
      src_ptr += 4 * src_stride;
      dst_ptr += 4 * dst_stride;
      h -= 4;
#else   // !AOM_ARCH_AARCH64
      uint8x8_t t5 = load_unaligned_u8_4x1(src_ptr);
      int16x4_t s5 = vget_low_s16(vreinterpretq_s16_u16(vmovl_u8(t5)));

      int16x4_t d0 = convolve6_4_y(s0, s1, s2, s3, s4, s5, y_filter);
      // We halved the convolution filter values so -1 from the right shift.
      uint8x8_t d01 =
          vqrshrun_n_s16(vcombine_s16(d0, vdup_n_s16(0)), FILTER_BITS - 1);

      store_u8_4x1(dst_ptr, d01);

      s0 = s1;
      s1 = s2;
      s2 = s3;
      s3 = s4;
      s4 = s5;
      src_ptr += src_stride;
      dst_ptr += dst_stride;
      h--;
#endif  // AOM_ARCH_AARCH64
    } while (h != 0);

  } else {
    do {
      const uint8_t *s = src_ptr;
      uint8_t *d = dst_ptr;
      int height = h;

      uint8x8_t t0, t1, t2, t3, t4;
      load_u8_8x5(s, src_stride, &t0, &t1, &t2, &t3, &t4);

      int16x8_t s0 = vreinterpretq_s16_u16(vmovl_u8(t0));
      int16x8_t s1 = vreinterpretq_s16_u16(vmovl_u8(t1));
      int16x8_t s2 = vreinterpretq_s16_u16(vmovl_u8(t2));
      int16x8_t s3 = vreinterpretq_s16_u16(vmovl_u8(t3));
      int16x8_t s4 = vreinterpretq_s16_u16(vmovl_u8(t4));

      s += 5 * src_stride;

      do {
#if AOM_ARCH_AARCH64
        uint8x8_t t5, t6, t7, t8;
        load_u8_8x4(s, src_stride, &t5, &t6, &t7, &t8);

        int16x8_t s5 = vreinterpretq_s16_u16(vmovl_u8(t5));
        int16x8_t s6 = vreinterpretq_s16_u16(vmovl_u8(t6));
        int16x8_t s7 = vreinterpretq_s16_u16(vmovl_u8(t7));
        int16x8_t s8 = vreinterpretq_s16_u16(vmovl_u8(t8));

        uint8x8_t d0 = convolve6_8_y(s0, s1, s2, s3, s4, s5, y_filter);
        uint8x8_t d1 = convolve6_8_y(s1, s2, s3, s4, s5, s6, y_filter);
        uint8x8_t d2 = convolve6_8_y(s2, s3, s4, s5, s6, s7, y_filter);
        uint8x8_t d3 = convolve6_8_y(s3, s4, s5, s6, s7, s8, y_filter);

        store_u8_8x4(d, dst_stride, d0, d1, d2, d3);

        s0 = s4;
        s1 = s5;
        s2 = s6;
        s3 = s7;
        s4 = s8;
        s += 4 * src_stride;
        d += 4 * dst_stride;
        height -= 4;
#else   // !AOM_ARCH_AARCH64
        int16x8_t s5 = vreinterpretq_s16_u16(vmovl_u8(vld1_u8(s)));

        uint8x8_t d0 = convolve6_8_y(s0, s1, s2, s3, s4, s5, y_filter);

        vst1_u8(d, d0);

        s0 = s1;
        s1 = s2;
        s2 = s3;
        s3 = s4;
        s4 = s5;
        s += src_stride;
        d += dst_stride;
        height--;
#endif  // AOM_ARCH_AARCH64
      } while (height != 0);
      src_ptr += 8;
      dst_ptr += 8;
      w -= 8;
    } while (w != 0);
  }
}

static inline int16x4_t convolve8_4_y(const int16x4_t s0, const int16x4_t s1,
                                      const int16x4_t s2, const int16x4_t s3,
                                      const int16x4_t s4, const int16x4_t s5,
                                      const int16x4_t s6, const int16x4_t s7,
                                      const int16x8_t filter) {
  const int16x4_t filter_lo = vget_low_s16(filter);
  const int16x4_t filter_hi = vget_high_s16(filter);

  int16x4_t sum = vmul_lane_s16(s0, filter_lo, 0);
  sum = vmla_lane_s16(sum, s1, filter_lo, 1);
  sum = vmla_lane_s16(sum, s2, filter_lo, 2);
  sum = vmla_lane_s16(sum, s3, filter_lo, 3);
  sum = vmla_lane_s16(sum, s4, filter_hi, 0);
  sum = vmla_lane_s16(sum, s5, filter_hi, 1);
  sum = vmla_lane_s16(sum, s6, filter_hi, 2);
  sum = vmla_lane_s16(sum, s7, filter_hi, 3);

  return sum;
}

static inline uint8x8_t convolve8_8_y(const int16x8_t s0, const int16x8_t s1,
                                      const int16x8_t s2, const int16x8_t s3,
                                      const int16x8_t s4, const int16x8_t s5,
                                      const int16x8_t s6, const int16x8_t s7,
                                      const int16x8_t filter) {
  const int16x4_t filter_lo = vget_low_s16(filter);
  const int16x4_t filter_hi = vget_high_s16(filter);

  int16x8_t sum = vmulq_lane_s16(s0, filter_lo, 0);
  sum = vmlaq_lane_s16(sum, s1, filter_lo, 1);
  sum = vmlaq_lane_s16(sum, s2, filter_lo, 2);
  sum = vmlaq_lane_s16(sum, s3, filter_lo, 3);
  sum = vmlaq_lane_s16(sum, s4, filter_hi, 0);
  sum = vmlaq_lane_s16(sum, s5, filter_hi, 1);
  sum = vmlaq_lane_s16(sum, s6, filter_hi, 2);
  sum = vmlaq_lane_s16(sum, s7, filter_hi, 3);

  // We halved the convolution filter values so -1 from the right shift.
  return vqrshrun_n_s16(sum, FILTER_BITS - 1);
}

static inline void convolve_y_sr_8tap_neon(const uint8_t *src_ptr,
                                           int src_stride, uint8_t *dst_ptr,
                                           const int dst_stride, int w, int h,
                                           const int16x8_t y_filter) {
  if (w <= 4) {
    uint8x8_t t0 = load_unaligned_u8_4x1(src_ptr + 0 * src_stride);
    uint8x8_t t1 = load_unaligned_u8_4x1(src_ptr + 1 * src_stride);
    uint8x8_t t2 = load_unaligned_u8_4x1(src_ptr + 2 * src_stride);
    uint8x8_t t3 = load_unaligned_u8_4x1(src_ptr + 3 * src_stride);
    uint8x8_t t4 = load_unaligned_u8_4x1(src_ptr + 4 * src_stride);
    uint8x8_t t5 = load_unaligned_u8_4x1(src_ptr + 5 * src_stride);
    uint8x8_t t6 = load_unaligned_u8_4x1(src_ptr + 6 * src_stride);

    int16x4_t s0 = vreinterpret_s16_u16(vget_low_u16(vmovl_u8(t0)));
    int16x4_t s1 = vreinterpret_s16_u16(vget_low_u16(vmovl_u8(t1)));
    int16x4_t s2 = vreinterpret_s16_u16(vget_low_u16(vmovl_u8(t2)));
    int16x4_t s3 = vreinterpret_s16_u16(vget_low_u16(vmovl_u8(t3)));
    int16x4_t s4 = vreinterpret_s16_u16(vget_low_u16(vmovl_u8(t4)));
    int16x4_t s5 = vreinterpret_s16_u16(vget_low_u16(vmovl_u8(t5)));
    int16x4_t s6 = vreinterpret_s16_u16(vget_low_u16(vmovl_u8(t6)));

    src_ptr += 7 * src_stride;

    do {
#if AOM_ARCH_AARCH64
      uint8x8_t t7 = load_unaligned_u8_4x1(src_ptr + 0 * src_stride);
      uint8x8_t t8 = load_unaligned_u8_4x1(src_ptr + 1 * src_stride);
      uint8x8_t t9 = load_unaligned_u8_4x1(src_ptr + 2 * src_stride);
      uint8x8_t t10 = load_unaligned_u8_4x1(src_ptr + 3 * src_stride);

      int16x4_t s7 = vreinterpret_s16_u16(vget_low_u16(vmovl_u8(t7)));
      int16x4_t s8 = vreinterpret_s16_u16(vget_low_u16(vmovl_u8(t8)));
      int16x4_t s9 = vreinterpret_s16_u16(vget_low_u16(vmovl_u8(t9)));
      int16x4_t s10 = vreinterpret_s16_u16(vget_low_u16(vmovl_u8(t10)));

      int16x4_t d0 = convolve8_4_y(s0, s1, s2, s3, s4, s5, s6, s7, y_filter);
      int16x4_t d1 = convolve8_4_y(s1, s2, s3, s4, s5, s6, s7, s8, y_filter);
      int16x4_t d2 = convolve8_4_y(s2, s3, s4, s5, s6, s7, s8, s9, y_filter);
      int16x4_t d3 = convolve8_4_y(s3, s4, s5, s6, s7, s8, s9, s10, y_filter);

      // We halved the convolution filter values so -1 from the right shift.
      uint8x8_t d01 = vqrshrun_n_s16(vcombine_s16(d0, d1), FILTER_BITS - 1);
      uint8x8_t d23 = vqrshrun_n_s16(vcombine_s16(d2, d3), FILTER_BITS - 1);

      store_u8x4_strided_x2(dst_ptr, dst_stride, d01);
      store_u8x4_strided_x2(dst_ptr + 2 * dst_stride, dst_stride, d23);

      s0 = s4;
      s1 = s5;
      s2 = s6;
      s3 = s7;
      s4 = s8;
      s5 = s9;
      s6 = s10;
      src_ptr += 4 * src_stride;
      dst_ptr += 4 * dst_stride;
      h -= 4;
#else   // !AOM_ARCH_AARCH64
      uint8x8_t t7 = load_unaligned_u8_4x1(src_ptr);
      int16x4_t s7 = vreinterpret_s16_u16(vget_low_u16(vmovl_u8(t7)));

      int16x4_t d0 = convolve8_4_y(s0, s1, s2, s3, s4, s5, s6, s7, y_filter);
      // We halved the convolution filter values so -1 from the right shift.
      uint8x8_t d01 =
          vqrshrun_n_s16(vcombine_s16(d0, vdup_n_s16(0)), FILTER_BITS - 1);

      store_u8_4x1(dst_ptr, d01);

      s0 = s1;
      s1 = s2;
      s2 = s3;
      s3 = s4;
      s4 = s5;
      s5 = s6;
      s6 = s7;
      src_ptr += src_stride;
      dst_ptr += dst_stride;
      h--;
#endif  // AOM_ARCH_AARCH64
    } while (h != 0);
  } else {
    do {
      const uint8_t *s = src_ptr;
      uint8_t *d = dst_ptr;
      int height = h;

      uint8x8_t t0, t1, t2, t3, t4, t5, t6;
      load_u8_8x7(s, src_stride, &t0, &t1, &t2, &t3, &t4, &t5, &t6);

      int16x8_t s0 = vreinterpretq_s16_u16(vmovl_u8(t0));
      int16x8_t s1 = vreinterpretq_s16_u16(vmovl_u8(t1));
      int16x8_t s2 = vreinterpretq_s16_u16(vmovl_u8(t2));
      int16x8_t s3 = vreinterpretq_s16_u16(vmovl_u8(t3));
      int16x8_t s4 = vreinterpretq_s16_u16(vmovl_u8(t4));
      int16x8_t s5 = vreinterpretq_s16_u16(vmovl_u8(t5));
      int16x8_t s6 = vreinterpretq_s16_u16(vmovl_u8(t6));

      s += 7 * src_stride;

      do {
#if AOM_ARCH_AARCH64
        uint8x8_t t7, t8, t9, t10;
        load_u8_8x4(s, src_stride, &t7, &t8, &t9, &t10);

        int16x8_t s7 = vreinterpretq_s16_u16(vmovl_u8(t7));
        int16x8_t s8 = vreinterpretq_s16_u16(vmovl_u8(t8));
        int16x8_t s9 = vreinterpretq_s16_u16(vmovl_u8(t9));
        int16x8_t s10 = vreinterpretq_s16_u16(vmovl_u8(t10));

        uint8x8_t d0 = convolve8_8_y(s0, s1, s2, s3, s4, s5, s6, s7, y_filter);
        uint8x8_t d1 = convolve8_8_y(s1, s2, s3, s4, s5, s6, s7, s8, y_filter);
        uint8x8_t d2 = convolve8_8_y(s2, s3, s4, s5, s6, s7, s8, s9, y_filter);
        uint8x8_t d3 = convolve8_8_y(s3, s4, s5, s6, s7, s8, s9, s10, y_filter);

        store_u8_8x4(d, dst_stride, d0, d1, d2, d3);

        s0 = s4;
        s1 = s5;
        s2 = s6;
        s3 = s7;
        s4 = s8;
        s5 = s9;
        s6 = s10;
        s += 4 * src_stride;
        d += 4 * dst_stride;
        height -= 4;
#else   // !AOM_ARCH_AARCH64
        int16x8_t s7 = vreinterpretq_s16_u16(vmovl_u8(vld1_u8(s)));

        uint8x8_t d0 = convolve8_8_y(s0, s1, s2, s3, s4, s5, s6, s7, y_filter);

        vst1_u8(d, d0);

        s0 = s1;
        s1 = s2;
        s2 = s3;
        s3 = s4;
        s4 = s5;
        s5 = s6;
        s6 = s7;
        s += src_stride;
        d += dst_stride;
        height--;
#endif  // AOM_ARCH_AARCH64
      } while (height != 0);
      src_ptr += 8;
      dst_ptr += 8;
      w -= 8;
    } while (w != 0);
  }
}

static inline int16x4_t convolve12_4_y(const int16x4_t s0, const int16x4_t s1,
                                       const int16x4_t s2, const int16x4_t s3,
                                       const int16x4_t s4, const int16x4_t s5,
                                       const int16x4_t s6, const int16x4_t s7,
                                       const int16x4_t s8, const int16x4_t s9,
                                       const int16x4_t s10, const int16x4_t s11,
                                       const int16x8_t y_filter_0_7,
                                       const int16x4_t y_filter_8_11) {
  const int16x4_t y_filter_0_3 = vget_low_s16(y_filter_0_7);
  const int16x4_t y_filter_4_7 = vget_high_s16(y_filter_0_7);
  int16x4_t sum;

  sum = vmul_lane_s16(s0, y_filter_0_3, 0);
  sum = vmla_lane_s16(sum, s1, y_filter_0_3, 1);
  sum = vmla_lane_s16(sum, s2, y_filter_0_3, 2);
  sum = vmla_lane_s16(sum, s3, y_filter_0_3, 3);
  sum = vmla_lane_s16(sum, s4, y_filter_4_7, 0);
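
  // Taps 5 and 6 carry the largest filter coefficients, so they are applied
  // last, below, with saturating arithmetic.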
  sum = vmla_lane_s16(sum, s7, y_filter_4_7, 3);
  sum = vmla_lane_s16(sum, s8, y_filter_8_11, 0);
  sum = vmla_lane_s16(sum, s9, y_filter_8_11, 1);
  sum = vmla_lane_s16(sum, s10, y_filter_8_11, 2);
  sum = vmla_lane_s16(sum, s11, y_filter_8_11, 3);

  // Saturating addition is required for the largest filter taps to avoid
  // overflow (while staying in 16-bit elements).
  sum = vqadd_s16(sum, vmul_lane_s16(s5, y_filter_4_7, 1));
  sum = vqadd_s16(sum, vmul_lane_s16(s6, y_filter_4_7, 2));

  return sum;
}

static inline uint8x8_t convolve12_8_y(const int16x8_t s0, const int16x8_t s1,
                                       const int16x8_t s2, const int16x8_t s3,
                                       const int16x8_t s4, const int16x8_t s5,
                                       const int16x8_t s6, const int16x8_t s7,
                                       const int16x8_t s8, const int16x8_t s9,
                                       const int16x8_t s10, const int16x8_t s11,
                                       const int16x8_t y_filter_0_7,
                                       const int16x4_t y_filter_8_11) {
  const int16x4_t y_filter_0_3 = vget_low_s16(y_filter_0_7);
  const int16x4_t y_filter_4_7 = vget_high_s16(y_filter_0_7);
  int16x8_t sum;

  sum = vmulq_lane_s16(s0, y_filter_0_3, 0);
  sum = vmlaq_lane_s16(sum, s1, y_filter_0_3, 1);
  sum = vmlaq_lane_s16(sum, s2, y_filter_0_3, 2);
  sum = vmlaq_lane_s16(sum, s3, y_filter_0_3, 3);
  sum = vmlaq_lane_s16(sum, s4, y_filter_4_7, 0);

  sum = vmlaq_lane_s16(sum, s7, y_filter_4_7, 3);
  sum = vmlaq_lane_s16(sum, s8, y_filter_8_11, 0);
  sum = vmlaq_lane_s16(sum, s9, y_filter_8_11, 1);
  sum = vmlaq_lane_s16(sum, s10, y_filter_8_11, 2);
  sum = vmlaq_lane_s16(sum, s11, y_filter_8_11, 3);

  // Saturating addition is required for the largest filter taps to avoid
  // overflow (while staying in 16-bit elements).
  sum = vqaddq_s16(sum, vmulq_lane_s16(s5, y_filter_4_7, 1));
  sum = vqaddq_s16(sum, vmulq_lane_s16(s6, y_filter_4_7, 2));

  return vqrshrun_n_s16(sum, FILTER_BITS);
}

static inline void convolve_y_sr_12tap_neon(const uint8_t *src_ptr,
                                            int src_stride, uint8_t *dst_ptr,
                                            int dst_stride, int w, int h,
                                            const int16_t *y_filter_ptr) {
  const int16x8_t y_filter_0_7 = vld1q_s16(y_filter_ptr);
  const int16x4_t y_filter_8_11 = vld1_s16(y_filter_ptr + 8);

  if (w <= 4) {
    uint8x8_t t0, t1, t2, t3, t4, t5, t6, t7, t8, t9, t10;
    load_u8_8x11(src_ptr, src_stride, &t0, &t1, &t2, &t3, &t4, &t5, &t6, &t7,
                 &t8, &t9, &t10);
    int16x4_t s0 = vget_low_s16(vreinterpretq_s16_u16(vmovl_u8(t0)));
    int16x4_t s1 = vget_low_s16(vreinterpretq_s16_u16(vmovl_u8(t1)));
    int16x4_t s2 = vget_low_s16(vreinterpretq_s16_u16(vmovl_u8(t2)));
    int16x4_t s3 = vget_low_s16(vreinterpretq_s16_u16(vmovl_u8(t3)));
    int16x4_t s4 = vget_low_s16(vreinterpretq_s16_u16(vmovl_u8(t4)));
    int16x4_t s5 = vget_low_s16(vreinterpretq_s16_u16(vmovl_u8(t5)));
    int16x4_t s6 = vget_low_s16(vreinterpretq_s16_u16(vmovl_u8(t6)));
    int16x4_t s7 = vget_low_s16(vreinterpretq_s16_u16(vmovl_u8(t7)));
    int16x4_t s8 = vget_low_s16(vreinterpretq_s16_u16(vmovl_u8(t8)));
    int16x4_t s9 = vget_low_s16(vreinterpretq_s16_u16(vmovl_u8(t9)));
    int16x4_t s10 = vget_low_s16(vreinterpretq_s16_u16(vmovl_u8(t10)));

    src_ptr += 11 * src_stride;

    do {
      uint8x8_t t11, t12, t13, t14;
      load_u8_8x4(src_ptr, src_stride, &t11, &t12, &t13, &t14);

      int16x4_t s11 = vget_low_s16(vreinterpretq_s16_u16(vmovl_u8(t11)));
      int16x4_t s12 = vget_low_s16(vreinterpretq_s16_u16(vmovl_u8(t12)));
      int16x4_t s13 = vget_low_s16(vreinterpretq_s16_u16(vmovl_u8(t13)));
      int16x4_t s14 = vget_low_s16(vreinterpretq_s16_u16(vmovl_u8(t14)));

      int16x4_t d0 = convolve12_4_y(s0, s1, s2, s3, s4, s5, s6, s7, s8, s9, s10,
                                    s11, y_filter_0_7, y_filter_8_11);
      int16x4_t d1 = convolve12_4_y(s1, s2, s3, s4, s5, s6, s7, s8, s9, s10,
                                    s11, s12, y_filter_0_7, y_filter_8_11);
      int16x4_t d2 = convolve12_4_y(s2, s3, s4, s5, s6, s7, s8, s9, s10, s11,
                                    s12, s13, y_filter_0_7, y_filter_8_11);
      int16x4_t d3 = convolve12_4_y(s3, s4, s5, s6, s7, s8, s9, s10, s11, s12,
                                    s13, s14, y_filter_0_7, y_filter_8_11);

      uint8x8_t d01 = vqrshrun_n_s16(vcombine_s16(d0, d1), FILTER_BITS);
      uint8x8_t d23 = vqrshrun_n_s16(vcombine_s16(d2, d3), FILTER_BITS);

      store_u8x4_strided_x2(dst_ptr, dst_stride, d01);
      store_u8x4_strided_x2(dst_ptr + 2 * dst_stride, dst_stride, d23);

      s0 = s4;
      s1 = s5;
      s2 = s6;
      s3 = s7;
      s4 = s8;
      s5 = s9;
      s6 = s10;
      s7 = s11;
      s8 = s12;
      s9 = s13;
      s10 = s14;
      src_ptr += 4 * src_stride;
      dst_ptr += 4 * dst_stride;
      h -= 4;
    } while (h != 0);

  } else {
    do {
      const uint8_t *s = src_ptr;
      uint8_t *d = dst_ptr;
      int height = h;

      uint8x8_t t0, t1, t2, t3, t4, t5, t6, t7, t8, t9, t10;
      load_u8_8x11(s, src_stride, &t0, &t1, &t2, &t3, &t4, &t5, &t6, &t7, &t8,
                   &t9, &t10);
      int16x8_t s0 = vreinterpretq_s16_u16(vmovl_u8(t0));
      int16x8_t s1 = vreinterpretq_s16_u16(vmovl_u8(t1));
      int16x8_t s2 = vreinterpretq_s16_u16(vmovl_u8(t2));
      int16x8_t s3 = vreinterpretq_s16_u16(vmovl_u8(t3));
      int16x8_t s4 = vreinterpretq_s16_u16(vmovl_u8(t4));
      int16x8_t s5 = vreinterpretq_s16_u16(vmovl_u8(t5));
      int16x8_t s6 = vreinterpretq_s16_u16(vmovl_u8(t6));
      int16x8_t s7 = vreinterpretq_s16_u16(vmovl_u8(t7));
      int16x8_t s8 = vreinterpretq_s16_u16(vmovl_u8(t8));
      int16x8_t s9 = vreinterpretq_s16_u16(vmovl_u8(t9));
      int16x8_t s10 = vreinterpretq_s16_u16(vmovl_u8(t10));

      s += 11 * src_stride;

      do {
        uint8x8_t t11, t12, t13, t14;
        load_u8_8x4(s, src_stride, &t11, &t12, &t13, &t14);

        int16x8_t s11 = vreinterpretq_s16_u16(vmovl_u8(t11));
        int16x8_t s12 = vreinterpretq_s16_u16(vmovl_u8(t12));
        int16x8_t s13 = vreinterpretq_s16_u16(vmovl_u8(t13));
        int16x8_t s14 = vreinterpretq_s16_u16(vmovl_u8(t14));

        uint8x8_t d0 = convolve12_8_y(s0, s1, s2, s3, s4, s5, s6, s7, s8, s9,
                                      s10, s11, y_filter_0_7, y_filter_8_11);
        uint8x8_t d1 = convolve12_8_y(s1, s2, s3, s4, s5, s6, s7, s8, s9, s10,
                                      s11, s12, y_filter_0_7, y_filter_8_11);
        uint8x8_t d2 = convolve12_8_y(s2, s3, s4, s5, s6, s7, s8, s9, s10, s11,
                                      s12, s13, y_filter_0_7, y_filter_8_11);
        uint8x8_t d3 = convolve12_8_y(s3, s4, s5, s6, s7, s8, s9, s10, s11, s12,
                                      s13, s14, y_filter_0_7, y_filter_8_11);

        store_u8_8x4(d, dst_stride, d0, d1, d2, d3);

        s0 = s4;
        s1 = s5;
        s2 = s6;
        s3 = s7;
        s4 = s8;
        s5 = s9;
        s6 = s10;
        s7 = s11;
        s8 = s12;
        s9 = s13;
        s10 = s14;
        s += 4 * src_stride;
        d += 4 * dst_stride;
        height -= 4;
      } while (height != 0);
      src_ptr += 8;
      dst_ptr += 8;
      w -= 8;
    } while (w != 0);
  }
}

void av1_convolve_y_sr_neon(const uint8_t *src, int src_stride, uint8_t *dst,
                            int dst_stride, int w, int h,
                            const InterpFilterParams *filter_params_y,
                            const int subpel_y_qn) {
  if (w == 2 || h == 2) {
    av1_convolve_y_sr_c(src, src_stride, dst, dst_stride, w, h, filter_params_y,
                        subpel_y_qn);
    return;
  }

  const int y_filter_taps = get_filter_tap(filter_params_y, subpel_y_qn);
  const int clamped_y_taps = y_filter_taps < 4 ? 4 : y_filter_taps;
  const int vert_offset = clamped_y_taps / 2 - 1;

  src -= vert_offset * src_stride;

  const int16_t *y_filter_ptr = av1_get_interp_filter_subpel_kernel(
      filter_params_y, subpel_y_qn & SUBPEL_MASK);

  if (y_filter_taps > 8) {
    convolve_y_sr_12tap_neon(src, src_stride, dst, dst_stride, w, h,
                             y_filter_ptr);
    return;
  }

  // Filter values are even so halve to reduce precision requirements.
  const int16x8_t y_filter = vshrq_n_s16(vld1q_s16(y_filter_ptr), 1);
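
  // Note that the 4-tap path below re-loads and halves the raw coefficients
  // itself, so it takes y_filter_ptr rather than the pre-halved y_filter; the
  // 12-tap path above likewise works from the unhalved coefficients.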
  if (y_filter_taps <= 4) {
    convolve_y_sr_4tap_neon(src, src_stride, dst, dst_stride, w, h,
                            y_filter_ptr);
  } else if (y_filter_taps == 6) {
    convolve_y_sr_6tap_neon(src, src_stride, dst, dst_stride, w, h, y_filter);
  } else {
    convolve_y_sr_8tap_neon(src, src_stride, dst, dst_stride, w, h, y_filter);
  }
}

static inline int16x4_t convolve12_4_2d_h(
    const int16x4_t s0, const int16x4_t s1, const int16x4_t s2,
    const int16x4_t s3, const int16x4_t s4, const int16x4_t s5,
    const int16x4_t s6, const int16x4_t s7, const int16x4_t s8,
    const int16x4_t s9, const int16x4_t s10, const int16x4_t s11,
    const int16x8_t x_filter_0_7, const int16x4_t x_filter_8_11,
    const int32x4_t horiz_const) {
  const int16x4_t x_filter_0_3 = vget_low_s16(x_filter_0_7);
  const int16x4_t x_filter_4_7 = vget_high_s16(x_filter_0_7);

  int32x4_t sum = horiz_const;
  sum = vmlal_lane_s16(sum, s0, x_filter_0_3, 0);
  sum = vmlal_lane_s16(sum, s1, x_filter_0_3, 1);
  sum = vmlal_lane_s16(sum, s2, x_filter_0_3, 2);
  sum = vmlal_lane_s16(sum, s3, x_filter_0_3, 3);
  sum = vmlal_lane_s16(sum, s4, x_filter_4_7, 0);
  sum = vmlal_lane_s16(sum, s5, x_filter_4_7, 1);
  sum = vmlal_lane_s16(sum, s6, x_filter_4_7, 2);
  sum = vmlal_lane_s16(sum, s7, x_filter_4_7, 3);
  sum = vmlal_lane_s16(sum, s8, x_filter_8_11, 0);
  sum = vmlal_lane_s16(sum, s9, x_filter_8_11, 1);
  sum = vmlal_lane_s16(sum, s10, x_filter_8_11, 2);
  sum = vmlal_lane_s16(sum, s11, x_filter_8_11, 3);

  return vshrn_n_s32(sum, ROUND0_BITS);
}

static inline void convolve_2d_sr_horiz_12tap_neon(
    const uint8_t *src_ptr, int src_stride, int16_t *dst_ptr,
    const int dst_stride, int w, int h, const int16x8_t x_filter_0_7,
    const int16x4_t x_filter_8_11) {
  const int bd = 8;
  // A shim of 1 << (ROUND0_BITS - 1) enables us to use non-rounding shifts -
  // which are generally faster than rounding shifts on modern CPUs.
  const int32x4_t horiz_const =
      vdupq_n_s32((1 << (bd + FILTER_BITS - 1)) + (1 << (ROUND0_BITS - 1)));
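  // The (1 << (bd + FILTER_BITS - 1)) component of the shim biases the
  // intermediate values so that they remain non-negative; the vertical pass
  // of the 2D filter is then expected to subtract a matching offset.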
   1196 
   1197 #if AOM_ARCH_AARCH64
   1198  do {
   1199    const uint8_t *s = src_ptr;
   1200    int16_t *d = dst_ptr;
   1201    int width = w;
   1202 
   1203    uint8x8_t t0, t1, t2, t3;
   1204    load_u8_8x4(s, src_stride, &t0, &t1, &t2, &t3);
   1205    transpose_elems_inplace_u8_8x4(&t0, &t1, &t2, &t3);
   1206 
   1207    int16x4_t s0 = vget_low_s16(vreinterpretq_s16_u16(vmovl_u8(t0)));
   1208    int16x4_t s1 = vget_low_s16(vreinterpretq_s16_u16(vmovl_u8(t1)));
   1209    int16x4_t s2 = vget_low_s16(vreinterpretq_s16_u16(vmovl_u8(t2)));
   1210    int16x4_t s3 = vget_low_s16(vreinterpretq_s16_u16(vmovl_u8(t3)));
   1211    int16x4_t s4 = vget_high_s16(vreinterpretq_s16_u16(vmovl_u8(t0)));
   1212    int16x4_t s5 = vget_high_s16(vreinterpretq_s16_u16(vmovl_u8(t1)));
   1213    int16x4_t s6 = vget_high_s16(vreinterpretq_s16_u16(vmovl_u8(t2)));
   1214    int16x4_t s7 = vget_high_s16(vreinterpretq_s16_u16(vmovl_u8(t3)));
   1215 
   1216    load_u8_8x4(s + 8, src_stride, &t0, &t1, &t2, &t3);
   1217    transpose_elems_inplace_u8_8x4(&t0, &t1, &t2, &t3);
   1218 
   1219    int16x4_t s8 = vget_low_s16(vreinterpretq_s16_u16(vmovl_u8(t0)));
   1220    int16x4_t s9 = vget_low_s16(vreinterpretq_s16_u16(vmovl_u8(t1)));
   1221    int16x4_t s10 = vget_low_s16(vreinterpretq_s16_u16(vmovl_u8(t2)));
   1222 
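           // A 12-tap filter needs samples s0..s11 for its first output;
           // s0..s10 were loaded above, so step the source pointer past them.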
   1223    s += 11;
   1224 
   1225    do {
   1226      load_u8_8x4(s, src_stride, &t0, &t1, &t2, &t3);
   1227      transpose_elems_inplace_u8_8x4(&t0, &t1, &t2, &t3);
   1228 
   1229      int16x4_t s11 = vget_low_s16(vreinterpretq_s16_u16(vmovl_u8(t0)));
   1230      int16x4_t s12 = vget_low_s16(vreinterpretq_s16_u16(vmovl_u8(t1)));
   1231      int16x4_t s13 = vget_low_s16(vreinterpretq_s16_u16(vmovl_u8(t2)));
   1232      int16x4_t s14 = vget_low_s16(vreinterpretq_s16_u16(vmovl_u8(t3)));
   1233 
   1234      int16x4_t d0 =
   1235          convolve12_4_2d_h(s0, s1, s2, s3, s4, s5, s6, s7, s8, s9, s10, s11,
   1236                            x_filter_0_7, x_filter_8_11, horiz_const);
   1237      int16x4_t d1 =
   1238          convolve12_4_2d_h(s1, s2, s3, s4, s5, s6, s7, s8, s9, s10, s11, s12,
   1239                            x_filter_0_7, x_filter_8_11, horiz_const);
   1240      int16x4_t d2 =
   1241          convolve12_4_2d_h(s2, s3, s4, s5, s6, s7, s8, s9, s10, s11, s12, s13,
   1242                            x_filter_0_7, x_filter_8_11, horiz_const);
   1243      int16x4_t d3 =
   1244          convolve12_4_2d_h(s3, s4, s5, s6, s7, s8, s9, s10, s11, s12, s13, s14,
   1245                            x_filter_0_7, x_filter_8_11, horiz_const);
   1246 
   1247      transpose_elems_inplace_s16_4x4(&d0, &d1, &d2, &d3);
   1248      store_s16_4x4(d, dst_stride, d0, d1, d2, d3);
   1249 
   1250      s0 = s4;
   1251      s1 = s5;
   1252      s2 = s6;
   1253      s3 = s7;
   1254      s4 = s8;
   1255      s5 = s9;
   1256      s6 = s10;
   1257      s7 = s11;
   1258      s8 = s12;
   1259      s9 = s13;
   1260      s10 = s14;
   1261      s += 4;
   1262      d += 4;
   1263      width -= 4;
   1264    } while (width != 0);
   1265    src_ptr += 4 * src_stride;
   1266    dst_ptr += 4 * dst_stride;
   1267    h -= 4;
   1268  } while (h > 4);
   1269 #endif  // AOM_ARCH_AARCH64
   1270 
   1271  do {
   1272    const uint8_t *s = src_ptr;
   1273    int16_t *d = dst_ptr;
   1274    int width = w;
   1275 
   1276    do {
   1277      uint8x16_t t0 = vld1q_u8(s);
   1278      int16x8_t tt0 = vreinterpretq_s16_u16(vmovl_u8(vget_low_u8(t0)));
   1279      int16x8_t tt1 = vreinterpretq_s16_u16(vmovl_u8(vget_high_u8(t0)));
   1280 
   1281      int16x4_t s0 = vget_low_s16(tt0);
   1282      int16x4_t s4 = vget_high_s16(tt0);
   1283      int16x4_t s8 = vget_low_s16(tt1);
   1284      int16x4_t s12 = vget_high_s16(tt1);
   1285 
   1286      int16x4_t s1 = vext_s16(s0, s4, 1);    //  a1  a2  a3  a4
   1287      int16x4_t s2 = vext_s16(s0, s4, 2);    //  a2  a3  a4  a5
   1288      int16x4_t s3 = vext_s16(s0, s4, 3);    //  a3  a4  a5  a6
   1289      int16x4_t s5 = vext_s16(s4, s8, 1);    //  a5  a6  a7  a8
   1290      int16x4_t s6 = vext_s16(s4, s8, 2);    //  a6  a7  a8  a9
   1291      int16x4_t s7 = vext_s16(s4, s8, 3);    //  a7  a8  a9 a10
   1292      int16x4_t s9 = vext_s16(s8, s12, 1);   //  a9 a10 a11 a12
   1293      int16x4_t s10 = vext_s16(s8, s12, 2);  // a10 a11 a12 a13
   1294      int16x4_t s11 = vext_s16(s8, s12, 3);  // a11 a12 a13 a14
   1295 
   1296      int16x4_t d0 =
   1297          convolve12_4_2d_h(s0, s1, s2, s3, s4, s5, s6, s7, s8, s9, s10, s11,
   1298                            x_filter_0_7, x_filter_8_11, horiz_const);
   1299      vst1_s16(d, d0);
   1300 
   1301      s += 4;
   1302      d += 4;
   1303      width -= 4;
   1304    } while (width != 0);
   1305    src_ptr += src_stride;
   1306    dst_ptr += dst_stride;
   1307  } while (--h != 0);
   1308 }
   1309 
   1310 static inline int16x8_t convolve4_8_2d_h(const int16x8_t s0, const int16x8_t s1,
   1311                                         const int16x8_t s2, const int16x8_t s3,
   1312                                         const int16x4_t filter,
   1313                                         const int16x8_t horiz_const) {
   1314  int16x8_t sum = vmlaq_lane_s16(horiz_const, s0, filter, 0);
   1315  sum = vmlaq_lane_s16(sum, s1, filter, 1);
   1316  sum = vmlaq_lane_s16(sum, s2, filter, 2);
   1317  sum = vmlaq_lane_s16(sum, s3, filter, 3);
   1318  // We halved the filter values, so subtract 1 from the right shift.
   1319  return vshrq_n_s16(sum, ROUND0_BITS - 1);
   1320 }
   1321 
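        // The halved-filter trick in numbers: every kernel here sums to
        // 1 << FILTER_BITS = 128 and has only even taps, so halving loses no
        // precision. The halved kernel sums to 64, sums of products fit
        // comfortably in int16, and the final right shift is reduced by 1 to
        // compensate.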
   1322 static inline void convolve_2d_sr_horiz_4tap_neon(
   1323    const uint8_t *src, ptrdiff_t src_stride, int16_t *dst,
   1324    ptrdiff_t dst_stride, int w, int h, const int16_t *filter_x) {
   1325  const int bd = 8;
   1326  // All filter values are even, so halve them to reduce intermediate
   1327  // precision requirements.
   1328  const int16x4_t filter = vshr_n_s16(vld1_s16(filter_x + 2), 1);
   1329 
   1330  // A shim of 1 << ((ROUND0_BITS - 1) - 1) enables us to use non-rounding
   1331  // shifts - which are generally faster than rounding shifts on modern CPUs.
   1332  // (The extra -1 is needed because we halved the filter values.)
   1333  const int16x8_t horiz_const = vdupq_n_s16((1 << (bd + FILTER_BITS - 2)) +
   1334                                            (1 << ((ROUND0_BITS - 1) - 1)));
   1335 
   1336  if (w == 4) {
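           // h is the intermediate height (output height + taps - 1), which
           // is odd here, so the final 2-row iteration writes one row beyond
           // it; the caller sizes im_block with enough slack for this.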
   1337    do {
   1338      uint8x8_t t01[4];
   1339      t01[0] = load_unaligned_u8(src + 0, (int)src_stride);
   1340      t01[1] = load_unaligned_u8(src + 1, (int)src_stride);
   1341      t01[2] = load_unaligned_u8(src + 2, (int)src_stride);
   1342      t01[3] = load_unaligned_u8(src + 3, (int)src_stride);
   1343 
   1344      int16x8_t s01[4];
   1345      s01[0] = vreinterpretq_s16_u16(vmovl_u8(t01[0]));
   1346      s01[1] = vreinterpretq_s16_u16(vmovl_u8(t01[1]));
   1347      s01[2] = vreinterpretq_s16_u16(vmovl_u8(t01[2]));
   1348      s01[3] = vreinterpretq_s16_u16(vmovl_u8(t01[3]));
   1349 
   1350      int16x8_t d01 =
   1351          convolve4_8_2d_h(s01[0], s01[1], s01[2], s01[3], filter, horiz_const);
   1352 
   1353      store_s16x4_strided_x2(dst, (int)dst_stride, d01);
   1354 
   1355      src += 2 * src_stride;
   1356      dst += 2 * dst_stride;
   1357      h -= 2;
   1358    } while (h > 0);
   1359  } else {
   1360    do {
   1361      int width = w;
   1362      const uint8_t *s = src;
   1363      int16_t *d = dst;
   1364 
   1365      do {
   1366        uint8x8_t t0[4], t1[4];
   1367        load_u8_8x4(s + 0 * src_stride, 1, &t0[0], &t0[1], &t0[2], &t0[3]);
   1368        load_u8_8x4(s + 1 * src_stride, 1, &t1[0], &t1[1], &t1[2], &t1[3]);
   1369 
   1370        int16x8_t s0[4];
   1371        s0[0] = vreinterpretq_s16_u16(vmovl_u8(t0[0]));
   1372        s0[1] = vreinterpretq_s16_u16(vmovl_u8(t0[1]));
   1373        s0[2] = vreinterpretq_s16_u16(vmovl_u8(t0[2]));
   1374        s0[3] = vreinterpretq_s16_u16(vmovl_u8(t0[3]));
   1375 
   1376        int16x8_t s1[4];
   1377        s1[0] = vreinterpretq_s16_u16(vmovl_u8(t1[0]));
   1378        s1[1] = vreinterpretq_s16_u16(vmovl_u8(t1[1]));
   1379        s1[2] = vreinterpretq_s16_u16(vmovl_u8(t1[2]));
   1380        s1[3] = vreinterpretq_s16_u16(vmovl_u8(t1[3]));
   1381 
   1382        int16x8_t d0 =
   1383            convolve4_8_2d_h(s0[0], s0[1], s0[2], s0[3], filter, horiz_const);
   1384        int16x8_t d1 =
   1385            convolve4_8_2d_h(s1[0], s1[1], s1[2], s1[3], filter, horiz_const);
   1386 
   1387        store_s16_8x2(d, dst_stride, d0, d1);
   1388 
   1389        s += 8;
   1390        d += 8;
   1391        width -= 8;
   1392      } while (width != 0);
   1393      src += 2 * src_stride;
   1394      dst += 2 * dst_stride;
   1395      h -= 2;
   1396    } while (h > 2);
   1397 
   1398    do {
   1399      const uint8_t *s = src;
   1400      int16_t *d = dst;
   1401      int width = w;
   1402 
   1403      do {
   1404        uint8x8_t t0[4];
   1405        load_u8_8x4(s, 1, &t0[0], &t0[1], &t0[2], &t0[3]);
   1406 
   1407        int16x8_t s0[4];
   1408        s0[0] = vreinterpretq_s16_u16(vmovl_u8(t0[0]));
   1409        s0[1] = vreinterpretq_s16_u16(vmovl_u8(t0[1]));
   1410        s0[2] = vreinterpretq_s16_u16(vmovl_u8(t0[2]));
   1411        s0[3] = vreinterpretq_s16_u16(vmovl_u8(t0[3]));
   1412 
   1413        int16x8_t d0 =
   1414            convolve4_8_2d_h(s0[0], s0[1], s0[2], s0[3], filter, horiz_const);
   1415 
   1416        vst1q_s16(d, d0);
   1417 
   1418        s += 8;
   1419        d += 8;
   1420        width -= 8;
   1421      } while (width != 0);
   1422      src += src_stride;
   1423      dst += dst_stride;
   1424    } while (--h != 0);
   1425  }
   1426 }
   1427 
   1428 static inline int16x8_t convolve8_8_2d_h(const int16x8_t s0, const int16x8_t s1,
   1429                                         const int16x8_t s2, const int16x8_t s3,
   1430                                         const int16x8_t s4, const int16x8_t s5,
   1431                                         const int16x8_t s6, const int16x8_t s7,
   1432                                         const int16x8_t filter,
   1433                                         const int16x8_t horiz_const) {
   1434  const int16x4_t filter_lo = vget_low_s16(filter);
   1435  const int16x4_t filter_hi = vget_high_s16(filter);
   1436 
   1437  int16x8_t sum = horiz_const;
   1438  sum = vmlaq_lane_s16(sum, s0, filter_lo, 0);
   1439  sum = vmlaq_lane_s16(sum, s1, filter_lo, 1);
   1440  sum = vmlaq_lane_s16(sum, s2, filter_lo, 2);
   1441  sum = vmlaq_lane_s16(sum, s3, filter_lo, 3);
   1442  sum = vmlaq_lane_s16(sum, s4, filter_hi, 0);
   1443  sum = vmlaq_lane_s16(sum, s5, filter_hi, 1);
   1444  sum = vmlaq_lane_s16(sum, s6, filter_hi, 2);
   1445  sum = vmlaq_lane_s16(sum, s7, filter_hi, 3);
   1446 
   1447  // We halved the convolution filter values, so subtract 1 from the right shift.
   1448  return vshrq_n_s16(sum, ROUND0_BITS - 1);
   1449 }
   1450 
   1451 static inline void convolve_2d_sr_horiz_8tap_neon(
   1452    const uint8_t *src, int src_stride, int16_t *im_block, int im_stride, int w,
   1453    int im_h, const int16_t *x_filter_ptr) {
   1454  const int bd = 8;
   1455 
   1456  const uint8_t *src_ptr = src;
   1457  int16_t *dst_ptr = im_block;
   1458  int dst_stride = im_stride;
   1459  int height = im_h;
   1460 
   1461  // A shim of 1 << ((ROUND0_BITS - 1) - 1) enables us to use non-rounding
   1462  // shifts - which are generally faster than rounding shifts on modern CPUs.
   1463  // (The extra -1 is needed because we halved the filter values.)
   1464  const int16x8_t horiz_const = vdupq_n_s16((1 << (bd + FILTER_BITS - 2)) +
   1465                                            (1 << ((ROUND0_BITS - 1) - 1)));
   1466  // Filter values are even, so halve to reduce intermediate precision requirements.
   1467  const int16x8_t x_filter = vshrq_n_s16(vld1q_s16(x_filter_ptr), 1);
   1468 
   1469 #if AOM_ARCH_AARCH64
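         // On AArch64, work on 8x8 tiles: load eight rows, transpose so each
         // vector holds one input column, run the eight multiply-accumulates
         // of the filter, then transpose the results back before storing.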
   1470  while (height > 8) {
   1471    const uint8_t *s = src_ptr;
   1472    int16_t *d = dst_ptr;
   1473    int width = w;
   1474 
   1475    uint8x8_t t0, t1, t2, t3, t4, t5, t6, t7;
   1476    load_u8_8x8(s, src_stride, &t0, &t1, &t2, &t3, &t4, &t5, &t6, &t7);
   1477    transpose_elems_inplace_u8_8x8(&t0, &t1, &t2, &t3, &t4, &t5, &t6, &t7);
   1478 
   1479    int16x8_t s0 = vreinterpretq_s16_u16(vmovl_u8(t0));
   1480    int16x8_t s1 = vreinterpretq_s16_u16(vmovl_u8(t1));
   1481    int16x8_t s2 = vreinterpretq_s16_u16(vmovl_u8(t2));
   1482    int16x8_t s3 = vreinterpretq_s16_u16(vmovl_u8(t3));
   1483    int16x8_t s4 = vreinterpretq_s16_u16(vmovl_u8(t4));
   1484    int16x8_t s5 = vreinterpretq_s16_u16(vmovl_u8(t5));
   1485    int16x8_t s6 = vreinterpretq_s16_u16(vmovl_u8(t6));
   1486 
   1487    s += 7;
   1488 
   1489    do {
   1490      load_u8_8x8(s, src_stride, &t0, &t1, &t2, &t3, &t4, &t5, &t6, &t7);
   1491 
   1492      transpose_elems_inplace_u8_8x8(&t0, &t1, &t2, &t3, &t4, &t5, &t6, &t7);
   1493 
   1494      int16x8_t s7 = vreinterpretq_s16_u16(vmovl_u8(t0));
   1495      int16x8_t s8 = vreinterpretq_s16_u16(vmovl_u8(t1));
   1496      int16x8_t s9 = vreinterpretq_s16_u16(vmovl_u8(t2));
   1497      int16x8_t s10 = vreinterpretq_s16_u16(vmovl_u8(t3));
   1498      int16x8_t s11 = vreinterpretq_s16_u16(vmovl_u8(t4));
   1499      int16x8_t s12 = vreinterpretq_s16_u16(vmovl_u8(t5));
   1500      int16x8_t s13 = vreinterpretq_s16_u16(vmovl_u8(t6));
   1501      int16x8_t s14 = vreinterpretq_s16_u16(vmovl_u8(t7));
   1502 
   1503      int16x8_t d0 = convolve8_8_2d_h(s0, s1, s2, s3, s4, s5, s6, s7, x_filter,
   1504                                      horiz_const);
   1505      int16x8_t d1 = convolve8_8_2d_h(s1, s2, s3, s4, s5, s6, s7, s8, x_filter,
   1506                                      horiz_const);
   1507      int16x8_t d2 = convolve8_8_2d_h(s2, s3, s4, s5, s6, s7, s8, s9, x_filter,
   1508                                      horiz_const);
   1509      int16x8_t d3 = convolve8_8_2d_h(s3, s4, s5, s6, s7, s8, s9, s10, x_filter,
   1510                                      horiz_const);
   1511      int16x8_t d4 = convolve8_8_2d_h(s4, s5, s6, s7, s8, s9, s10, s11,
   1512                                      x_filter, horiz_const);
   1513      int16x8_t d5 = convolve8_8_2d_h(s5, s6, s7, s8, s9, s10, s11, s12,
   1514                                      x_filter, horiz_const);
   1515      int16x8_t d6 = convolve8_8_2d_h(s6, s7, s8, s9, s10, s11, s12, s13,
   1516                                      x_filter, horiz_const);
   1517      int16x8_t d7 = convolve8_8_2d_h(s7, s8, s9, s10, s11, s12, s13, s14,
   1518                                      x_filter, horiz_const);
   1519 
   1520      transpose_elems_inplace_s16_8x8(&d0, &d1, &d2, &d3, &d4, &d5, &d6, &d7);
   1521 
   1522      store_s16_8x8(d, dst_stride, d0, d1, d2, d3, d4, d5, d6, d7);
   1523 
   1524      s0 = s8;
   1525      s1 = s9;
   1526      s2 = s10;
   1527      s3 = s11;
   1528      s4 = s12;
   1529      s5 = s13;
   1530      s6 = s14;
   1531      s += 8;
   1532      d += 8;
   1533      width -= 8;
   1534    } while (width != 0);
   1535    src_ptr += 8 * src_stride;
   1536    dst_ptr += 8 * dst_stride;
   1537    height -= 8;
   1538  }
   1539 #endif  // AOM_ARCH_AARCH64
   1540 
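         // Handle the remaining rows (and everything on 32-bit Arm) one row
         // at a time, sliding an 8-wide window along the row with vextq_s16
         // to form the eight shifted inputs of each 8-tap dot product.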
   1541  do {
   1542    const uint8_t *s = src_ptr;
   1543    int16_t *d = dst_ptr;
   1544    int width = w;
   1545 
   1546    uint8x8_t t0 = vld1_u8(s);  // a0 a1 a2 a3 a4 a5 a6 a7
   1547    int16x8_t s0 = vreinterpretq_s16_u16(vmovl_u8(t0));
   1548 
   1549    do {
   1550      uint8x8_t t1 = vld1_u8(s + 8);  // a8 a9 a10 a11 a12 a13 a14 a15
   1551      int16x8_t s8 = vreinterpretq_s16_u16(vmovl_u8(t1));
   1552 
   1553      int16x8_t s1 = vextq_s16(s0, s8, 1);  // a1 a2 a3 a4 a5 a6 a7 a8
   1554      int16x8_t s2 = vextq_s16(s0, s8, 2);  // a2 a3 a4 a5 a6 a7 a8 a9
   1555      int16x8_t s3 = vextq_s16(s0, s8, 3);  // a3 a4 a5 a6 a7 a8 a9 a10
   1556      int16x8_t s4 = vextq_s16(s0, s8, 4);  // a4 a5 a6 a7 a8 a9 a10 a11
   1557      int16x8_t s5 = vextq_s16(s0, s8, 5);  // a5 a6 a7 a8 a9 a10 a11 a12
   1558      int16x8_t s6 = vextq_s16(s0, s8, 6);  // a6 a7 a8 a9 a10 a11 a12 a13
   1559      int16x8_t s7 = vextq_s16(s0, s8, 7);  // a7 a8 a9 a10 a11 a12 a13 a14
   1560 
   1561      int16x8_t d0 = convolve8_8_2d_h(s0, s1, s2, s3, s4, s5, s6, s7, x_filter,
   1562                                      horiz_const);
   1563 
   1564      vst1q_s16(d, d0);
   1565 
   1566      s0 = s8;
   1567      s += 8;
   1568      d += 8;
   1569      width -= 8;
   1570    } while (width != 0);
   1571    src_ptr += src_stride;
   1572    dst_ptr += dst_stride;
   1573  } while (--height != 0);
   1574 }
   1575 
   1576 void av1_convolve_2d_sr_neon(const uint8_t *src, int src_stride, uint8_t *dst,
   1577                             int dst_stride, int w, int h,
   1578                             const InterpFilterParams *filter_params_x,
   1579                             const InterpFilterParams *filter_params_y,
   1580                             const int subpel_x_qn, const int subpel_y_qn,
   1581                             ConvolveParams *conv_params) {
   1582  if (w == 2 || h == 2) {
   1583    av1_convolve_2d_sr_c(src, src_stride, dst, dst_stride, w, h,
   1584                         filter_params_x, filter_params_y, subpel_x_qn,
   1585                         subpel_y_qn, conv_params);
   1586    return;
   1587  }
   1588 
   1589  const int y_filter_taps = get_filter_tap(filter_params_y, subpel_y_qn);
   1590  const int x_filter_taps = get_filter_tap(filter_params_x, subpel_x_qn);
   1591  const int clamped_y_taps = y_filter_taps < 4 ? 4 : y_filter_taps;
   1592  const int im_h = h + clamped_y_taps - 1;
   1593  const int im_stride = MAX_SB_SIZE;
   1594  const int vert_offset = clamped_y_taps / 2 - 1;
   1595  const int horiz_offset = filter_params_x->taps / 2 - 1;
   1596  const uint8_t *src_ptr = src - vert_offset * src_stride - horiz_offset;
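         // For example, with an 8-tap vertical filter clamped_y_taps = 8, so
         // vert_offset = 3: the horizontal pass starts three rows above the
         // output block and produces im_h = h + 7 intermediate rows.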
   1597 
   1598  const int16_t *x_filter_ptr = av1_get_interp_filter_subpel_kernel(
   1599      filter_params_x, subpel_x_qn & SUBPEL_MASK);
   1600  const int16_t *y_filter_ptr = av1_get_interp_filter_subpel_kernel(
   1601      filter_params_y, subpel_y_qn & SUBPEL_MASK);
   1602 
   1603  if (filter_params_x->taps > 8) {
   1604    DECLARE_ALIGNED(16, int16_t,
   1605                    im_block[(MAX_SB_SIZE + MAX_FILTER_TAP - 1) * MAX_SB_SIZE]);
   1606 
   1607    const int16x8_t x_filter_0_7 = vld1q_s16(x_filter_ptr);
   1608    const int16x4_t x_filter_8_11 = vld1_s16(x_filter_ptr + 8);
   1609    const int16x8_t y_filter_0_7 = vld1q_s16(y_filter_ptr);
   1610    const int16x4_t y_filter_8_11 = vld1_s16(y_filter_ptr + 8);
   1611 
   1612    convolve_2d_sr_horiz_12tap_neon(src_ptr, src_stride, im_block, im_stride, w,
   1613                                    im_h, x_filter_0_7, x_filter_8_11);
   1614 
   1615    convolve_2d_sr_vert_12tap_neon(im_block, im_stride, dst, dst_stride, w, h,
   1616                                   y_filter_0_7, y_filter_8_11);
   1617  } else {
   1618    DECLARE_ALIGNED(16, int16_t,
   1619                    im_block[(MAX_SB_SIZE + SUBPEL_TAPS - 1) * MAX_SB_SIZE]);
   1620 
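           // horiz_offset was computed from filter_params_x->taps (nominally
           // 8), giving an offset of 3; a 4-tap kernel sits in the middle of
           // the 8-tap array and only needs an offset of 1, hence the +2
           // source adjustment below.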
   1621    if (x_filter_taps <= 4) {
   1622      convolve_2d_sr_horiz_4tap_neon(src_ptr + 2, src_stride, im_block,
   1623                                     im_stride, w, im_h, x_filter_ptr);
   1624    } else {
   1625      convolve_2d_sr_horiz_8tap_neon(src_ptr, src_stride, im_block, im_stride,
   1626                                     w, im_h, x_filter_ptr);
   1627    }
   1628 
   1629    const int16x8_t y_filter = vld1q_s16(y_filter_ptr);
   1630 
   1631    if (clamped_y_taps <= 4) {
   1632      convolve_2d_sr_vert_4tap_neon(im_block, im_stride, dst, dst_stride, w, h,
   1633                                    y_filter_ptr);
   1634    } else if (clamped_y_taps == 6) {
   1635      convolve_2d_sr_vert_6tap_neon(im_block, im_stride, dst, dst_stride, w, h,
   1636                                    y_filter);
   1637    } else {
   1638      convolve_2d_sr_vert_8tap_neon(im_block, im_stride, dst, dst_stride, w, h,
   1639                                    y_filter);
   1640    }
   1641  }
   1642 }
   1643 
   1644 void av1_convolve_x_sr_intrabc_neon(const uint8_t *src, int src_stride,
   1645                                    uint8_t *dst, int dst_stride, int w, int h,
   1646                                    const InterpFilterParams *filter_params_x,
   1647                                    const int subpel_x_qn,
   1648                                    ConvolveParams *conv_params) {
   1649  assert(subpel_x_qn == 8);
   1650  assert(filter_params_x->taps == 2);
   1651  assert((conv_params->round_0 + conv_params->round_1) == 2 * FILTER_BITS);
   1652  (void)filter_params_x;
   1653  (void)subpel_x_qn;
   1654  (void)conv_params;
   1655 
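         // At subpel_x_qn == 8 the 2-tap bilinear kernel is { 64, 64 }, so
         // the convolution reduces to a rounded average:
         //   (64 * a + 64 * b + (1 << (FILTER_BITS - 1))) >> FILTER_BITS
         //     == (a + b + 1) >> 1
         // which is exactly what vrhadd_u8 computes per lane.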
   1656  if (w <= 4) {
   1657    do {
   1658      uint8x8_t s0_0 = vld1_u8(src);
   1659      uint8x8_t s0_1 = vld1_u8(src + 1);
   1660      uint8x8_t s1_0 = vld1_u8(src + src_stride);
   1661      uint8x8_t s1_1 = vld1_u8(src + src_stride + 1);
   1662 
   1663      uint8x8_t d0 = vrhadd_u8(s0_0, s0_1);
   1664      uint8x8_t d1 = vrhadd_u8(s1_0, s1_1);
   1665 
   1666      if (w == 2) {
   1667        store_u8_2x1(dst + 0 * dst_stride, d0);
   1668        store_u8_2x1(dst + 1 * dst_stride, d1);
   1669      } else {
   1670        store_u8_4x1(dst + 0 * dst_stride, d0);
   1671        store_u8_4x1(dst + 1 * dst_stride, d1);
   1672      }
   1673 
   1674      src += 2 * src_stride;
   1675      dst += 2 * dst_stride;
   1676      h -= 2;
   1677    } while (h != 0);
   1678  } else if (w == 8) {
   1679    do {
   1680      uint8x8_t s0_0 = vld1_u8(src);
   1681      uint8x8_t s0_1 = vld1_u8(src + 1);
   1682      uint8x8_t s1_0 = vld1_u8(src + src_stride);
   1683      uint8x8_t s1_1 = vld1_u8(src + src_stride + 1);
   1684 
   1685      uint8x8_t d0 = vrhadd_u8(s0_0, s0_1);
   1686      uint8x8_t d1 = vrhadd_u8(s1_0, s1_1);
   1687 
   1688      vst1_u8(dst, d0);
   1689      vst1_u8(dst + dst_stride, d1);
   1690 
   1691      src += 2 * src_stride;
   1692      dst += 2 * dst_stride;
   1693      h -= 2;
   1694    } while (h != 0);
   1695  } else {
   1696    do {
   1697      const uint8_t *src_ptr = src;
   1698      uint8_t *dst_ptr = dst;
   1699      int width = w;
   1700 
   1701      do {
   1702        uint8x16_t s0 = vld1q_u8(src_ptr);
   1703        uint8x16_t s1 = vld1q_u8(src_ptr + 1);
   1704 
   1705        uint8x16_t d0 = vrhaddq_u8(s0, s1);
   1706 
   1707        vst1q_u8(dst_ptr, d0);
   1708 
   1709        src_ptr += 16;
   1710        dst_ptr += 16;
   1711        width -= 16;
   1712      } while (width != 0);
   1713      src += src_stride;
   1714      dst += dst_stride;
   1715    } while (--h != 0);
   1716  }
   1717 }
   1718 
   1719 void av1_convolve_y_sr_intrabc_neon(const uint8_t *src, int src_stride,
   1720                                    uint8_t *dst, int dst_stride, int w, int h,
   1721                                    const InterpFilterParams *filter_params_y,
   1722                                    const int subpel_y_qn) {
   1723  assert(subpel_y_qn == 8);
   1724  assert(filter_params_y->taps == 2);
   1725  (void)filter_params_y;
   1726  (void)subpel_y_qn;
   1727 
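         // Same rounded-average reduction as the horizontal case, applied to
         // vertically adjacent rows; each loaded row is shared between two
         // output rows.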
   1728  if (w <= 4) {
   1729    do {
   1730      uint8x8_t s0 = load_unaligned_u8_4x1(src);
   1731      uint8x8_t s1 = load_unaligned_u8_4x1(src + src_stride);
   1732      uint8x8_t s2 = load_unaligned_u8_4x1(src + 2 * src_stride);
   1733 
   1734      uint8x8_t d0 = vrhadd_u8(s0, s1);
   1735      uint8x8_t d1 = vrhadd_u8(s1, s2);
   1736 
   1737      if (w == 2) {
   1738        store_u8_2x1(dst + 0 * dst_stride, d0);
   1739        store_u8_2x1(dst + 1 * dst_stride, d1);
   1740      } else {
   1741        store_u8_4x1(dst + 0 * dst_stride, d0);
   1742        store_u8_4x1(dst + 1 * dst_stride, d1);
   1743      }
   1744 
   1745      src += 2 * src_stride;
   1746      dst += 2 * dst_stride;
   1747      h -= 2;
   1748    } while (h != 0);
   1749  } else if (w == 8) {
   1750    do {
   1751      uint8x8_t s0 = vld1_u8(src);
   1752      uint8x8_t s1 = vld1_u8(src + src_stride);
   1753      uint8x8_t s2 = vld1_u8(src + 2 * src_stride);
   1754 
   1755      uint8x8_t d0 = vrhadd_u8(s0, s1);
   1756      uint8x8_t d1 = vrhadd_u8(s1, s2);
   1757 
   1758      vst1_u8(dst, d0);
   1759      vst1_u8(dst + dst_stride, d1);
   1760 
   1761      src += 2 * src_stride;
   1762      dst += 2 * dst_stride;
   1763      h -= 2;
   1764    } while (h != 0);
   1765  } else {
   1766    do {
   1767      const uint8_t *src_ptr = src;
   1768      uint8_t *dst_ptr = dst;
   1769      int height = h;
   1770 
   1771      do {
   1772        uint8x16_t s0 = vld1q_u8(src_ptr);
   1773        uint8x16_t s1 = vld1q_u8(src_ptr + src_stride);
   1774 
   1775        uint8x16_t d0 = vrhaddq_u8(s0, s1);
   1776 
   1777        vst1q_u8(dst_ptr, d0);
   1778 
   1779        src_ptr += src_stride;
   1780        dst_ptr += dst_stride;
   1781      } while (--height != 0);
   1782      src += 16;
   1783      dst += 16;
   1784      w -= 16;
   1785    } while (w != 0);
   1786  }
   1787 }
   1788 
   1789 void av1_convolve_2d_sr_intrabc_neon(const uint8_t *src, int src_stride,
   1790                                     uint8_t *dst, int dst_stride, int w, int h,
   1791                                     const InterpFilterParams *filter_params_x,
   1792                                     const InterpFilterParams *filter_params_y,
   1793                                     const int subpel_x_qn,
   1794                                     const int subpel_y_qn,
   1795                                     ConvolveParams *conv_params) {
   1796  assert(subpel_x_qn == 8);
   1797  assert(subpel_y_qn == 8);
   1798  assert(filter_params_x->taps == 2 && filter_params_y->taps == 2);
   1799  assert((conv_params->round_0 + conv_params->round_1) == 2 * FILTER_BITS);
   1800  (void)filter_params_x;
   1801  (void)subpel_x_qn;
   1802  (void)filter_params_y;
   1803  (void)subpel_y_qn;
   1804  (void)conv_params;
   1805 
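         // Illustrative scalar model of the two-stage computation:
         //   im[y][x]  = s[y][x] + s[y][x + 1];               // <= 510, fits uint16_t
         //   out[y][x] = (im[y][x] + im[y + 1][x] + 2) >> 2;  // single rounding
         // Deferring all rounding to the final shift avoids rounding twice.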
   1806  uint16_t im_block[(MAX_SB_SIZE + MAX_FILTER_TAP - 1) * MAX_SB_SIZE];
   1807  int im_h = h + 1;
   1808  int im_stride = w;
   1809  assert(w <= MAX_SB_SIZE && h <= MAX_SB_SIZE);
   1810 
   1811  uint16_t *im = im_block;
   1812 
   1813  // Horizontal filter.
   1814  if (w <= 4) {
   1815    do {
   1816      uint8x8_t s0 = vld1_u8(src);
   1817      uint8x8_t s1 = vld1_u8(src + 1);
   1818 
   1819      uint16x4_t sum = vget_low_u16(vaddl_u8(s0, s1));
   1820 
   1821      // Safe to store the whole vector; the im buffer is big enough.
   1822      vst1_u16(im, sum);
   1823 
   1824      src += src_stride;
   1825      im += im_stride;
   1826    } while (--im_h != 0);
   1827  } else {
   1828    do {
   1829      const uint8_t *src_ptr = src;
   1830      uint16_t *im_ptr = im;
   1831      int width = w;
   1832 
   1833      do {
   1834        uint8x8_t s0 = vld1_u8(src_ptr);
   1835        uint8x8_t s1 = vld1_u8(src_ptr + 1);
   1836 
   1837        uint16x8_t sum = vaddl_u8(s0, s1);
   1838 
   1839        vst1q_u16(im_ptr, sum);
   1840 
   1841        src_ptr += 8;
   1842        im_ptr += 8;
   1843        width -= 8;
   1844      } while (width != 0);
   1845      src += src_stride;
   1846      im += im_stride;
   1847    } while (--im_h != 0);
   1848  }
   1849 
   1850  im = im_block;
   1851 
   1852  // Vertical filter.
   1853  if (w <= 4) {
   1854    do {
   1855      uint16x4_t s0 = vld1_u16(im);
   1856      uint16x4_t s1 = vld1_u16(im + im_stride);
   1857      uint16x4_t s2 = vld1_u16(im + 2 * im_stride);
   1858 
   1859      uint16x4_t sum0 = vadd_u16(s0, s1);
   1860      uint16x4_t sum1 = vadd_u16(s1, s2);
   1861 
   1862      uint8x8_t d0 = vqrshrn_n_u16(vcombine_u16(sum0, vdup_n_u16(0)), 2);
   1863      uint8x8_t d1 = vqrshrn_n_u16(vcombine_u16(sum1, vdup_n_u16(0)), 2);
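             // Only four lanes of each sum are meaningful; padding with a
             // zero upper half lets us reuse the 8-lane rounding narrow.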
   1864 
   1865      if (w == 2) {
   1866        store_u8_2x1(dst + 0 * dst_stride, d0);
   1867        store_u8_2x1(dst + 1 * dst_stride, d1);
   1868      } else {
   1869        store_u8_4x1(dst + 0 * dst_stride, d0);
   1870        store_u8_4x1(dst + 1 * dst_stride, d1);
   1871      }
   1872 
   1873      im += 2 * im_stride;
   1874      dst += 2 * dst_stride;
   1875      h -= 2;
   1876    } while (h != 0);
   1877  } else {
   1878    do {
   1879      uint16_t *im_ptr = im;
   1880      uint8_t *dst_ptr = dst;
   1881      int height = h;
   1882 
   1883      do {
   1884        uint16x8_t s0 = vld1q_u16(im_ptr);
   1885        uint16x8_t s1 = vld1q_u16(im_ptr + im_stride);
   1886 
   1887        uint16x8_t sum = vaddq_u16(s0, s1);
   1888        uint8x8_t d0 = vqrshrn_n_u16(sum, 2);
   1889 
   1890        vst1_u8(dst_ptr, d0);
   1891 
   1892        im_ptr += im_stride;
   1893        dst_ptr += dst_stride;
   1894      } while (--height != 0);
   1895      im += 8;
   1896      dst += 8;
   1897      w -= 8;
   1898    } while (w != 0);
   1899  }
   1900 }