tor-browser

The Tor Browser
git clone https://git.dasho.dev/tor-browser.git

compound_convolve_neon.c (105069B)


      1 /*
      2 * Copyright (c) 2018, Alliance for Open Media. All rights reserved.
      3 *
      4 * This source code is subject to the terms of the BSD 2 Clause License and
      5 * the Alliance for Open Media Patent License 1.0. If the BSD 2 Clause License
      6 * was not distributed with this source code in the LICENSE file, you can
      7 * obtain it at www.aomedia.org/license/software. If the Alliance for Open
      8 * Media Patent License 1.0 was not distributed with this source code in the
      9 * PATENTS file, you can obtain it at www.aomedia.org/license/patent.
     10 */
     11 
     12 #include <arm_neon.h>
     13 #include <assert.h>
     14 
     15 #include "aom_dsp/arm/mem_neon.h"
     16 #include "aom_dsp/arm/transpose_neon.h"
     17 #include "av1/common/arm/compound_convolve_neon.h"
     18 #include "config/aom_config.h"
     19 #include "config/av1_rtcd.h"
     20 
     21 static inline int16x4_t convolve4_4_2d_h(const int16x4_t s0, const int16x4_t s1,
     22                                         const int16x4_t s2, const int16x4_t s3,
     23                                         const int16x4_t x_filter,
     24                                         const int16x4_t horiz_const) {
     25  int16x4_t sum = horiz_const;
     26  sum = vmla_lane_s16(sum, s0, x_filter, 0);
     27  sum = vmla_lane_s16(sum, s1, x_filter, 1);
     28  sum = vmla_lane_s16(sum, s2, x_filter, 2);
     29  sum = vmla_lane_s16(sum, s3, x_filter, 3);
     30 
     31  // We halved the convolution filter values so -1 from the right shift.
     32  return vshr_n_s16(sum, ROUND0_BITS - 1);
     33 }
     34 
     35 static inline int16x8_t convolve8_8_2d_h(const int16x8_t s0, const int16x8_t s1,
     36                                         const int16x8_t s2, const int16x8_t s3,
     37                                         const int16x8_t s4, const int16x8_t s5,
     38                                         const int16x8_t s6, const int16x8_t s7,
     39                                         const int16x8_t x_filter,
     40                                         const int16x8_t horiz_const) {
     41  const int16x4_t x_filter_0_3 = vget_low_s16(x_filter);
     42  const int16x4_t x_filter_4_7 = vget_high_s16(x_filter);
     43 
     44  int16x8_t sum = horiz_const;
     45  sum = vmlaq_lane_s16(sum, s0, x_filter_0_3, 0);
     46  sum = vmlaq_lane_s16(sum, s1, x_filter_0_3, 1);
     47  sum = vmlaq_lane_s16(sum, s2, x_filter_0_3, 2);
     48  sum = vmlaq_lane_s16(sum, s3, x_filter_0_3, 3);
     49  sum = vmlaq_lane_s16(sum, s4, x_filter_4_7, 0);
     50  sum = vmlaq_lane_s16(sum, s5, x_filter_4_7, 1);
     51  sum = vmlaq_lane_s16(sum, s6, x_filter_4_7, 2);
     52  sum = vmlaq_lane_s16(sum, s7, x_filter_4_7, 3);
     53 
     54  // We halved the convolution filter values so -1 from the right shift.
     55  return vshrq_n_s16(sum, ROUND0_BITS - 1);
     56 }
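        // A worked example of the halved-filter trick, assuming the AV1
        // constants used on this path (bd = 8, FILTER_BITS = 7,
        // ROUND0_BITS = 3): the 8-tap kernels sum to 1 << FILTER_BITS = 128
        // with every tap even, so halving them keeps the worst-case
        // accumulation plus the horiz_const of (1 << 13) + (1 << 1) = 8194
        // inside int16_t. The vshr(q)_n_s16 by ROUND0_BITS - 1 = 2 then
        // equals a rounding shift by ROUND0_BITS of the unhalved sum, since
        // the 1 << 1 term of horiz_const supplies the rounding constant.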
     57 
     58 static inline void dist_wtd_convolve_2d_horiz_neon(
     59    const uint8_t *src, int src_stride, int16_t *im_block, const int im_stride,
     60    const int16_t *x_filter_ptr, const int im_h, int w) {
     61  const int bd = 8;
     62 
     63  const uint8_t *src_ptr = src;
     64  int16_t *dst_ptr = im_block;
     65  int dst_stride = im_stride;
     66  int height = im_h;
     67 
     68  if (w == 4) {
     69    // A shim of 1 << ((ROUND0_BITS - 1) - 1) enables us to use non-rounding
     70    // shifts - which are generally faster than rounding shifts on modern CPUs.
     71    // (The extra -1 is needed because we halved the filter values.)
     72    const int16x4_t horiz_const = vdup_n_s16((1 << (bd + FILTER_BITS - 2)) +
     73                                             (1 << ((ROUND0_BITS - 1) - 1)));
     74    // 4-tap filters are used for blocks having width <= 4.
     75    // Filter values are even, so halve to reduce intermediate precision reqs.
     76    const int16x4_t x_filter = vshr_n_s16(vld1_s16(x_filter_ptr + 2), 1);
     77 
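          // The 4-tap kernel occupies entries 2..5 of the 8-tap filter array
          // (hence x_filter_ptr + 2 above), while the caller centered src for
          // an 8-tap window, so step the source forward by 2 to re-center it.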
     78    src_ptr += 2;
     79 
     80    do {
     81      uint8x8_t t0 = vld1_u8(src_ptr);  // a0 a1 a2 a3 a4 a5 a6 a7
     82      int16x4_t s0 = vget_low_s16(vreinterpretq_s16_u16(vmovl_u8(t0)));
     83      int16x4_t s4 = vget_high_s16(vreinterpretq_s16_u16(vmovl_u8(t0)));
     84 
     85      __builtin_prefetch(dst_ptr);
     86 
     87      int16x4_t s1 = vext_s16(s0, s4, 1);  // a1 a2 a3 a4
     88      int16x4_t s2 = vext_s16(s0, s4, 2);  // a2 a3 a4 a5
     89      int16x4_t s3 = vext_s16(s0, s4, 3);  // a3 a4 a5 a6
     90 
     91      int16x4_t d0 = convolve4_4_2d_h(s0, s1, s2, s3, x_filter, horiz_const);
     92 
     93      vst1_s16(dst_ptr, d0);
     94 
     95      src_ptr += src_stride;
     96      dst_ptr += dst_stride;
     97    } while (--height != 0);
     98  } else {
     99    // A shim of 1 << ((ROUND0_BITS - 1) - 1) enables us to use non-rounding
    100    // shifts - which are generally faster than rounding shifts on modern CPUs.
    101    // (The extra -1 is needed because we halved the filter values.)
    102    const int16x8_t horiz_const = vdupq_n_s16((1 << (bd + FILTER_BITS - 2)) +
    103                                              (1 << ((ROUND0_BITS - 1) - 1)));
    104    // Filter values are even, so halve to reduce intermediate precision reqs.
    105    const int16x8_t x_filter = vshrq_n_s16(vld1q_s16(x_filter_ptr), 1);
    106 
    107 #if AOM_ARCH_AARCH64
    108    do {
    109      const uint8_t *s;
    110      int16_t *d = dst_ptr;
    111      int width = w;
    112 
    113      __builtin_prefetch(src_ptr + 0 * src_stride);
    114      __builtin_prefetch(src_ptr + 1 * src_stride);
    115      __builtin_prefetch(src_ptr + 2 * src_stride);
    116      __builtin_prefetch(src_ptr + 3 * src_stride);
    117      __builtin_prefetch(src_ptr + 4 * src_stride);
    118      __builtin_prefetch(src_ptr + 5 * src_stride);
    119      __builtin_prefetch(src_ptr + 6 * src_stride);
    120      __builtin_prefetch(src_ptr + 7 * src_stride);
    121 
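            // Load an 8x8 tile and transpose it so that register s_n holds
            // input column n for all 8 rows; the horizontal filter then
            // becomes lane-wise multiply-accumulates over sliding windows of
            // s0..s14, and the results are transposed back before storing.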
    122      uint8x8_t t0, t1, t2, t3, t4, t5, t6, t7;
    123      load_u8_8x8(src_ptr, src_stride, &t0, &t1, &t2, &t3, &t4, &t5, &t6, &t7);
    124      transpose_elems_inplace_u8_8x8(&t0, &t1, &t2, &t3, &t4, &t5, &t6, &t7);
    125 
    126      int16x8_t s0 = vreinterpretq_s16_u16(vmovl_u8(t0));
    127      int16x8_t s1 = vreinterpretq_s16_u16(vmovl_u8(t1));
    128      int16x8_t s2 = vreinterpretq_s16_u16(vmovl_u8(t2));
    129      int16x8_t s3 = vreinterpretq_s16_u16(vmovl_u8(t3));
    130      int16x8_t s4 = vreinterpretq_s16_u16(vmovl_u8(t4));
    131      int16x8_t s5 = vreinterpretq_s16_u16(vmovl_u8(t5));
    132      int16x8_t s6 = vreinterpretq_s16_u16(vmovl_u8(t6));
    133 
    134      s = src_ptr + 7;
    135 
    136      __builtin_prefetch(dst_ptr + 0 * dst_stride);
    137      __builtin_prefetch(dst_ptr + 1 * dst_stride);
    138      __builtin_prefetch(dst_ptr + 2 * dst_stride);
    139      __builtin_prefetch(dst_ptr + 3 * dst_stride);
    140      __builtin_prefetch(dst_ptr + 4 * dst_stride);
    141      __builtin_prefetch(dst_ptr + 5 * dst_stride);
    142      __builtin_prefetch(dst_ptr + 6 * dst_stride);
    143      __builtin_prefetch(dst_ptr + 7 * dst_stride);
    144 
    145      do {
    146        load_u8_8x8(s, src_stride, &t0, &t1, &t2, &t3, &t4, &t5, &t6, &t7);
    147        transpose_elems_inplace_u8_8x8(&t0, &t1, &t2, &t3, &t4, &t5, &t6, &t7);
    148 
    149        int16x8_t s7 = vreinterpretq_s16_u16(vmovl_u8(t0));
    150        int16x8_t s8 = vreinterpretq_s16_u16(vmovl_u8(t1));
    151        int16x8_t s9 = vreinterpretq_s16_u16(vmovl_u8(t2));
    152        int16x8_t s10 = vreinterpretq_s16_u16(vmovl_u8(t3));
    153        int16x8_t s11 = vreinterpretq_s16_u16(vmovl_u8(t4));
    154        int16x8_t s12 = vreinterpretq_s16_u16(vmovl_u8(t5));
    155        int16x8_t s13 = vreinterpretq_s16_u16(vmovl_u8(t6));
    156        int16x8_t s14 = vreinterpretq_s16_u16(vmovl_u8(t7));
    157 
    158        int16x8_t d0 = convolve8_8_2d_h(s0, s1, s2, s3, s4, s5, s6, s7,
    159                                        x_filter, horiz_const);
    160        int16x8_t d1 = convolve8_8_2d_h(s1, s2, s3, s4, s5, s6, s7, s8,
    161                                        x_filter, horiz_const);
    162        int16x8_t d2 = convolve8_8_2d_h(s2, s3, s4, s5, s6, s7, s8, s9,
    163                                        x_filter, horiz_const);
    164        int16x8_t d3 = convolve8_8_2d_h(s3, s4, s5, s6, s7, s8, s9, s10,
    165                                        x_filter, horiz_const);
    166        int16x8_t d4 = convolve8_8_2d_h(s4, s5, s6, s7, s8, s9, s10, s11,
    167                                        x_filter, horiz_const);
    168        int16x8_t d5 = convolve8_8_2d_h(s5, s6, s7, s8, s9, s10, s11, s12,
    169                                        x_filter, horiz_const);
    170        int16x8_t d6 = convolve8_8_2d_h(s6, s7, s8, s9, s10, s11, s12, s13,
    171                                        x_filter, horiz_const);
    172        int16x8_t d7 = convolve8_8_2d_h(s7, s8, s9, s10, s11, s12, s13, s14,
    173                                        x_filter, horiz_const);
    174 
    175        transpose_elems_inplace_s16_8x8(&d0, &d1, &d2, &d3, &d4, &d5, &d6, &d7);
    176        store_s16_8x8(d, dst_stride, d0, d1, d2, d3, d4, d5, d6, d7);
    177 
    178        s0 = s8;
    179        s1 = s9;
    180        s2 = s10;
    181        s3 = s11;
    182        s4 = s12;
    183        s5 = s13;
    184        s6 = s14;
    185        s += 8;
    186        d += 8;
    187        width -= 8;
    188      } while (width > 0);
    189      src_ptr += 8 * src_stride;
    190      dst_ptr += 8 * dst_stride;
    191      height -= 8;
    192    } while (height > 8);
    193 #endif  // AOM_ARCH_AARCH64
    194 
    195    do {
    196      const uint8_t *s;
    197      int16_t *d = dst_ptr;
    198      int width = w;
    199 
    200      uint8x8_t t0 = vld1_u8(src_ptr);
    201      int16x8_t s0 =
    202          vreinterpretq_s16_u16(vmovl_u8(t0));  // a0 a1 a2 a3 a4 a5 a6 a7
    203 
    204      s = src_ptr + 8;
    205      __builtin_prefetch(dst_ptr);
    206 
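            // Sliding-window remainder loop: keep the last 8 pixels in s0,
            // load the next 8 into s8, and form the shifted windows s1..s7
            // with vextq_s16, emitting 8 adjacent outputs per iteration.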
    207      do {
    208        t0 = vld1_u8(s);  // a8 a9 a10 a11 a12 a13 a14 a15
    209        int16x8_t s8 = vreinterpretq_s16_u16(vmovl_u8(t0));
    210 
    211        int16x8_t s1 = vextq_s16(s0, s8, 1);  // a1 a2 a3 a4 a5 a6 a7 a8
    212        int16x8_t s2 = vextq_s16(s0, s8, 2);  // a2 a3 a4 a5 a6 a7 a8 a9
    213        int16x8_t s3 = vextq_s16(s0, s8, 3);  // a3 a4 a5 a6 a7 a8 a9 a10
    214        int16x8_t s4 = vextq_s16(s0, s8, 4);  // a4 a5 a6 a7 a8 a9 a10 a11
    215        int16x8_t s5 = vextq_s16(s0, s8, 5);  // a5 a6 a7 a8 a9 a10 a11 a12
    216        int16x8_t s6 = vextq_s16(s0, s8, 6);  // a6 a7 a8 a9 a10 a11 a12 a13
    217        int16x8_t s7 = vextq_s16(s0, s8, 7);  // a7 a8 a9 a10 a11 a12 a13 a14
    218 
    219        int16x8_t d0 = convolve8_8_2d_h(s0, s1, s2, s3, s4, s5, s6, s7,
    220                                        x_filter, horiz_const);
    221        vst1q_s16(d, d0);
    222 
    223        s0 = s8;
    224        s += 8;
    225        d += 8;
    226        width -= 8;
    227      } while (width > 0);
    228      src_ptr += src_stride;
    229      dst_ptr += dst_stride;
    230    } while (--height != 0);
    231  }
    232 }
    233 
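        // The compound 2D convolution runs in two passes: the horizontal
        // filter above writes im_block at intermediate (ROUND0) precision,
        // then one of the 6-/8-tap vertical kernels consumes it, either
        // writing conv_params->dst directly or averaging (optionally
        // distance-weighted) with its existing contents.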
    234 void av1_dist_wtd_convolve_2d_neon(const uint8_t *src, int src_stride,
    235                                   uint8_t *dst8, int dst8_stride, int w, int h,
    236                                   const InterpFilterParams *filter_params_x,
    237                                   const InterpFilterParams *filter_params_y,
    238                                   const int subpel_x_qn, const int subpel_y_qn,
    239                                   ConvolveParams *conv_params) {
    240  assert(w % 4 == 0);
    241  assert(h % 4 == 0);
    242 
    243  DECLARE_ALIGNED(16, int16_t,
    244                  im_block[(MAX_SB_SIZE + SUBPEL_TAPS - 1) * MAX_SB_SIZE]);
    245 
    246  const int y_filter_taps = get_filter_tap(filter_params_y, subpel_y_qn);
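         // Kernels shorter than 6 taps store zeros in the outer entries of the
         // 8-tap array, so they can be run through the 6-tap vertical path.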
    247  const int clamped_y_taps = y_filter_taps < 6 ? 6 : y_filter_taps;
    248 
    249  const int im_h = h + clamped_y_taps - 1;
    250  const int im_stride = MAX_SB_SIZE;
    251  const int vert_offset = clamped_y_taps / 2 - 1;
    252  const int horiz_offset = filter_params_x->taps / 2 - 1;
    253  const uint8_t *src_ptr = src - vert_offset * src_stride - horiz_offset;
    254  const int16_t *x_filter_ptr = av1_get_interp_filter_subpel_kernel(
    255      filter_params_x, subpel_x_qn & SUBPEL_MASK);
    256  const int16_t *y_filter_ptr = av1_get_interp_filter_subpel_kernel(
    257      filter_params_y, subpel_y_qn & SUBPEL_MASK);
    258 
    259  const int16x8_t y_filter = vld1q_s16(y_filter_ptr);
    260 
    261  dist_wtd_convolve_2d_horiz_neon(src_ptr, src_stride, im_block, im_stride,
    262                                  x_filter_ptr, im_h, w);
    263 
    264  if (clamped_y_taps == 6) {
    265    if (conv_params->do_average) {
    266      if (UNLIKELY(conv_params->use_dist_wtd_comp_avg)) {
    267        dist_wtd_convolve_2d_vert_6tap_dist_wtd_avg_neon(
    268            im_block, im_stride, dst8, dst8_stride, conv_params, y_filter, h,
    269            w);
    270      } else {
    271        dist_wtd_convolve_2d_vert_6tap_avg_neon(im_block, im_stride, dst8,
    272                                                dst8_stride, conv_params,
    273                                                y_filter, h, w);
    274      }
    275    } else {
    276      dist_wtd_convolve_2d_vert_6tap_neon(im_block, im_stride, conv_params,
    277                                          y_filter, h, w);
    278    }
    279  } else {
    280    if (conv_params->do_average) {
    281      if (UNLIKELY(conv_params->use_dist_wtd_comp_avg)) {
    282        dist_wtd_convolve_2d_vert_8tap_dist_wtd_avg_neon(
    283            im_block, im_stride, dst8, dst8_stride, conv_params, y_filter, h,
    284            w);
    285      } else {
    286        dist_wtd_convolve_2d_vert_8tap_avg_neon(im_block, im_stride, dst8,
    287                                                dst8_stride, conv_params,
    288                                                y_filter, h, w);
    289      }
    290    } else {
    291      dist_wtd_convolve_2d_vert_8tap_neon(im_block, im_stride, conv_params,
    292                                          y_filter, h, w);
    293    }
    294  }
    295 }
    296 
    297 static inline void dist_wtd_convolve_2d_copy_dist_wtd_avg_neon(
    298    const uint8_t *src, int src_stride, uint8_t *dst8, int dst8_stride, int w,
    299    int h, ConvolveParams *conv_params) {
    300  assert(w % 4 == 0);
    301  assert(h % 4 == 0);
    302 
    303  const int bd = 8;
    304  const int offset_bits = bd + 2 * FILTER_BITS - ROUND0_BITS;
    305  const uint16_t round_offset = (1 << (offset_bits - COMPOUND_ROUND1_BITS)) +
    306                                (1 << (offset_bits - COMPOUND_ROUND1_BITS - 1));
    307  const uint16x8_t round_offset_vec = vdupq_n_u16(round_offset);
    308  const uint8x8_t shift_by_bits = vdup_n_u8(1 << (FILTER_BITS - ROUND0_BITS));
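         // vmlal_u8 with shift_by_bits computes round_offset + (src <<
         // (FILTER_BITS - ROUND0_BITS)), the identity "filter" of the copy
         // path, as a single widening multiply-accumulate (a shift by 4 for
         // FILTER_BITS = 7, ROUND0_BITS = 3).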
    309 
    310  const uint16_t fwd_offset = conv_params->fwd_offset;
    311  const uint16_t bck_offset = conv_params->bck_offset;
    312 
    313  CONV_BUF_TYPE *dst = conv_params->dst;
    314  const int dst_stride = conv_params->dst_stride;
    315  int height = h;
    316 
    317  if (w == 4) {
    318    do {
    319      uint8x8_t s0, s1, s2, s3;
    320      load_u8_8x4(src, src_stride, &s0, &s1, &s2, &s3);
    321 
    322      uint16x4_t d0 =
    323          vget_low_u16(vmlal_u8(round_offset_vec, s0, shift_by_bits));
    324      uint16x4_t d1 =
    325          vget_low_u16(vmlal_u8(round_offset_vec, s1, shift_by_bits));
    326      uint16x4_t d2 =
    327          vget_low_u16(vmlal_u8(round_offset_vec, s2, shift_by_bits));
    328      uint16x4_t d3 =
    329          vget_low_u16(vmlal_u8(round_offset_vec, s3, shift_by_bits));
    330 
    331      uint16x4_t dd0, dd1, dd2, dd3;
    332      load_u16_4x4(dst, dst_stride, &dd0, &dd1, &dd2, &dd3);
    333 
    334      uint8x8_t d01, d23;
    335      compute_dist_wtd_avg_4x4(
    336          dd0, dd1, dd2, dd3, d0, d1, d2, d3, fwd_offset, bck_offset,
    337          vreinterpretq_s16_u16(round_offset_vec), &d01, &d23);
    338 
    339      store_u8x4_strided_x2(dst8 + 0 * dst8_stride, dst8_stride, d01);
    340      store_u8x4_strided_x2(dst8 + 2 * dst8_stride, dst8_stride, d23);
    341 
    342      src += 4 * src_stride;
    343      dst += 4 * dst_stride;
    344      dst8 += 4 * dst8_stride;
    345      height -= 4;
    346    } while (height != 0);
    347  } else {
    348    do {
    349      const uint8_t *s = src;
    350      CONV_BUF_TYPE *d = dst;
    351      uint8_t *d_u8 = dst8;
    352      int width = w;
    353 
    354      do {
    355        uint8x8_t s0, s1, s2, s3;
    356        load_u8_8x4(s, src_stride, &s0, &s1, &s2, &s3);
    357 
    358        uint16x8_t d0 = vmlal_u8(round_offset_vec, s0, shift_by_bits);
    359        uint16x8_t d1 = vmlal_u8(round_offset_vec, s1, shift_by_bits);
    360        uint16x8_t d2 = vmlal_u8(round_offset_vec, s2, shift_by_bits);
    361        uint16x8_t d3 = vmlal_u8(round_offset_vec, s3, shift_by_bits);
    362 
    363        uint16x8_t dd0, dd1, dd2, dd3;
    364        load_u16_8x4(d, dst_stride, &dd0, &dd1, &dd2, &dd3);
    365 
    366        uint8x8_t d0_u8, d1_u8, d2_u8, d3_u8;
    367        compute_dist_wtd_avg_8x4(dd0, dd1, dd2, dd3, d0, d1, d2, d3, fwd_offset,
    368                                 bck_offset,
    369                                 vreinterpretq_s16_u16(round_offset_vec),
    370                                 &d0_u8, &d1_u8, &d2_u8, &d3_u8);
    371 
    372        store_u8_8x4(d_u8, dst8_stride, d0_u8, d1_u8, d2_u8, d3_u8);
    373 
    374        s += 8;
    375        d += 8;
    376        d_u8 += 8;
    377        width -= 8;
    378      } while (width != 0);
    379      src += 4 * src_stride;
    380      dst += 4 * dst_stride;
    381      dst8 += 4 * dst8_stride;
    382      height -= 4;
    383    } while (height != 0);
    384  }
    385 }
    386 
    387 static inline void dist_wtd_convolve_2d_copy_avg_neon(
    388    const uint8_t *src, int src_stride, uint8_t *dst8, int dst8_stride, int w,
    389    int h, ConvolveParams *conv_params) {
    390  assert(w % 4 == 0);
    391  assert(h % 4 == 0);
    392 
    393  const int bd = 8;
    394  const int offset_bits = bd + 2 * FILTER_BITS - ROUND0_BITS;
    395  const uint16_t round_offset = (1 << (offset_bits - COMPOUND_ROUND1_BITS)) +
    396                                (1 << (offset_bits - COMPOUND_ROUND1_BITS - 1));
    397  const uint16x8_t round_offset_vec = vdupq_n_u16(round_offset);
    398  const uint8x8_t shift_by_bits = vdup_n_u8(1 << (FILTER_BITS - ROUND0_BITS));
    399 
    400  CONV_BUF_TYPE *dst = conv_params->dst;
    401  const int dst_stride = conv_params->dst_stride;
    402  int height = h;
    403 
    404  if (w == 4) {
    405    do {
    406      uint8x8_t s0, s1, s2, s3;
    407      load_u8_8x4(src, src_stride, &s0, &s1, &s2, &s3);
    408 
    409      uint16x4_t d0 =
    410          vget_low_u16(vmlal_u8(round_offset_vec, s0, shift_by_bits));
    411      uint16x4_t d1 =
    412          vget_low_u16(vmlal_u8(round_offset_vec, s1, shift_by_bits));
    413      uint16x4_t d2 =
    414          vget_low_u16(vmlal_u8(round_offset_vec, s2, shift_by_bits));
    415      uint16x4_t d3 =
    416          vget_low_u16(vmlal_u8(round_offset_vec, s3, shift_by_bits));
    417 
    418      uint16x4_t dd0, dd1, dd2, dd3;
    419      load_u16_4x4(dst, dst_stride, &dd0, &dd1, &dd2, &dd3);
    420 
    421      uint8x8_t d01, d23;
    422      compute_basic_avg_4x4(dd0, dd1, dd2, dd3, d0, d1, d2, d3,
    423                            vreinterpretq_s16_u16(round_offset_vec), &d01,
    424                            &d23);
    425 
    426      store_u8x4_strided_x2(dst8 + 0 * dst8_stride, dst8_stride, d01);
    427      store_u8x4_strided_x2(dst8 + 2 * dst8_stride, dst8_stride, d23);
    428 
    429      src += 4 * src_stride;
    430      dst += 4 * dst_stride;
    431      dst8 += 4 * dst8_stride;
    432      height -= 4;
    433    } while (height != 0);
    434  } else {
    435    do {
    436      const uint8_t *s = src;
    437      CONV_BUF_TYPE *d = dst;
    438      uint8_t *d_u8 = dst8;
    439      int width = w;
    440 
    441      do {
    442        uint8x8_t s0, s1, s2, s3;
    443        load_u8_8x4(s, src_stride, &s0, &s1, &s2, &s3);
    444 
    445        uint16x8_t d0 = vmlal_u8(round_offset_vec, s0, shift_by_bits);
    446        uint16x8_t d1 = vmlal_u8(round_offset_vec, s1, shift_by_bits);
    447        uint16x8_t d2 = vmlal_u8(round_offset_vec, s2, shift_by_bits);
    448        uint16x8_t d3 = vmlal_u8(round_offset_vec, s3, shift_by_bits);
    449 
    450        uint16x8_t dd0, dd1, dd2, dd3;
    451        load_u16_8x4(d, dst_stride, &dd0, &dd1, &dd2, &dd3);
    452 
    453        uint8x8_t d0_u8, d1_u8, d2_u8, d3_u8;
    454        compute_basic_avg_8x4(dd0, dd1, dd2, dd3, d0, d1, d2, d3,
    455                              vreinterpretq_s16_u16(round_offset_vec), &d0_u8,
    456                              &d1_u8, &d2_u8, &d3_u8);
    457 
    458        store_u8_8x4(d_u8, dst8_stride, d0_u8, d1_u8, d2_u8, d3_u8);
    459 
    460        s += 8;
    461        d += 8;
    462        d_u8 += 8;
    463        width -= 8;
    464      } while (width != 0);
    465      src += 4 * src_stride;
    466      dst += 4 * dst_stride;
    467      dst8 += 4 * dst8_stride;
    468      height -= 4;
    469    } while (height != 0);
    470  }
    471 }
    472 
    473 static inline void dist_wtd_convolve_2d_copy_neon(const uint8_t *src,
    474                                                  int src_stride, int w, int h,
    475                                                  ConvolveParams *conv_params) {
    476  assert(w % 4 == 0);
    477  assert(h % 4 == 0);
    478 
    479  const int bd = 8;
    480  const int offset_bits = bd + 2 * FILTER_BITS - ROUND0_BITS;
    481  const uint16_t round_offset = (1 << (offset_bits - COMPOUND_ROUND1_BITS)) +
    482                                (1 << (offset_bits - COMPOUND_ROUND1_BITS - 1));
    483  const uint16x8_t round_offset_vec = vdupq_n_u16(round_offset);
    484  const uint8x8_t shift_by_bits = vdup_n_u8(1 << (FILTER_BITS - ROUND0_BITS));
    485 
    486  CONV_BUF_TYPE *dst = conv_params->dst;
    487  const int dst_stride = conv_params->dst_stride;
    488  int height = h;
    489 
    490  if (w == 4) {
    491    do {
    492      uint8x8_t s0, s1, s2, s3;
    493      load_u8_8x4(src, src_stride, &s0, &s1, &s2, &s3);
    494 
    495      uint16x4_t d0 =
    496          vget_low_u16(vmlal_u8(round_offset_vec, s0, shift_by_bits));
    497      uint16x4_t d1 =
    498          vget_low_u16(vmlal_u8(round_offset_vec, s1, shift_by_bits));
    499      uint16x4_t d2 =
    500          vget_low_u16(vmlal_u8(round_offset_vec, s2, shift_by_bits));
    501      uint16x4_t d3 =
    502          vget_low_u16(vmlal_u8(round_offset_vec, s3, shift_by_bits));
    503 
    504      store_u16_4x4(dst, dst_stride, d0, d1, d2, d3);
    505 
    506      src += 4 * src_stride;
    507      dst += 4 * dst_stride;
    508      height -= 4;
    509    } while (height != 0);
    510  } else {
    511    do {
    512      const uint8_t *s = src;
    513      CONV_BUF_TYPE *d = dst;
    514      int width = w;
    515 
    516      do {
    517        uint8x8_t s0, s1, s2, s3;
    518        load_u8_8x4(s, src_stride, &s0, &s1, &s2, &s3);
    519 
    520        uint16x8_t d0 = vmlal_u8(round_offset_vec, s0, shift_by_bits);
    521        uint16x8_t d1 = vmlal_u8(round_offset_vec, s1, shift_by_bits);
    522        uint16x8_t d2 = vmlal_u8(round_offset_vec, s2, shift_by_bits);
    523        uint16x8_t d3 = vmlal_u8(round_offset_vec, s3, shift_by_bits);
    524 
    525        store_u16_8x4(d, dst_stride, d0, d1, d2, d3);
    526 
    527        s += 8;
    528        d += 8;
    529        width -= 8;
    530      } while (width != 0);
    531      src += 4 * src_stride;
    532      dst += 4 * dst_stride;
    533      height -= 4;
    534    } while (height != 0);
    535  }
    536 }
    537 
    538 void av1_dist_wtd_convolve_2d_copy_neon(const uint8_t *src, int src_stride,
    539                                        uint8_t *dst8, int dst8_stride, int w,
    540                                        int h, ConvolveParams *conv_params) {
    541  if (conv_params->do_average) {
    542    if (UNLIKELY(conv_params->use_dist_wtd_comp_avg)) {
    543      dist_wtd_convolve_2d_copy_dist_wtd_avg_neon(
    544          src, src_stride, dst8, dst8_stride, w, h, conv_params);
    545    } else {
    546      dist_wtd_convolve_2d_copy_avg_neon(src, src_stride, dst8, dst8_stride, w,
    547                                         h, conv_params);
    548    }
    549  } else {
    550    dist_wtd_convolve_2d_copy_neon(src, src_stride, w, h, conv_params);
    551  }
    552 }
    553 
    554 static inline uint16x4_t convolve4_4_x(const int16x4_t s0, const int16x4_t s1,
    555                                       const int16x4_t s2, const int16x4_t s3,
    556                                       const int16x4_t x_filter,
    557                                       const int16x4_t round_offset) {
    558  int16x4_t sum = vmul_lane_s16(s0, x_filter, 0);
    559  sum = vmla_lane_s16(sum, s1, x_filter, 1);
    560  sum = vmla_lane_s16(sum, s2, x_filter, 2);
    561  sum = vmla_lane_s16(sum, s3, x_filter, 3);
    562 
    563  // We halved the convolution filter values so -1 from the right shift.
    564  int16x4_t res = vrsra_n_s16(round_offset, sum, ROUND0_BITS - 1);
    565  return vreinterpret_u16_s16(res);
    566 }
    567 
    568 static inline uint16x8_t convolve8_8_x(const int16x8_t s0, const int16x8_t s1,
    569                                       const int16x8_t s2, const int16x8_t s3,
    570                                       const int16x8_t s4, const int16x8_t s5,
    571                                       const int16x8_t s6, const int16x8_t s7,
    572                                       const int16x8_t x_filter,
    573                                       const int16x8_t round_offset) {
    574  const int16x4_t x_filter_0_3 = vget_low_s16(x_filter);
    575  const int16x4_t x_filter_4_7 = vget_high_s16(x_filter);
    576 
    577  int16x8_t sum = vmulq_lane_s16(s0, x_filter_0_3, 0);
    578  sum = vmlaq_lane_s16(sum, s1, x_filter_0_3, 1);
    579  sum = vmlaq_lane_s16(sum, s2, x_filter_0_3, 2);
    580  sum = vmlaq_lane_s16(sum, s3, x_filter_0_3, 3);
    581  sum = vmlaq_lane_s16(sum, s4, x_filter_4_7, 0);
    582  sum = vmlaq_lane_s16(sum, s5, x_filter_4_7, 1);
    583  sum = vmlaq_lane_s16(sum, s6, x_filter_4_7, 2);
    584  sum = vmlaq_lane_s16(sum, s7, x_filter_4_7, 3);
    585 
    586  // We halved the convolution filter values so -1 from the right shift.
    587  int16x8_t res = vrsraq_n_s16(round_offset, sum, ROUND0_BITS - 1);
    588  return vreinterpretq_u16_s16(res);
    589 }
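        // Unlike the 2d_h helpers, the x-only helpers fold rounding and the
        // compound offset into one vrsra(q)_n_s16, which computes per lane
        // round_offset + ((sum + (1 << (ROUND0_BITS - 2))) >> (ROUND0_BITS -
        // 1)): a rounding shift by 2 accumulated onto the offset.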
    590 
    591 static inline void dist_wtd_convolve_x_dist_wtd_avg_neon(
    592    const uint8_t *src, int src_stride, uint8_t *dst8, int dst8_stride, int w,
    593    int h, const InterpFilterParams *filter_params_x, const int subpel_x_qn,
    594    ConvolveParams *conv_params) {
    595  assert(w % 4 == 0);
    596  assert(h % 4 == 0);
    597 
    598  const int bd = 8;
    599  const int offset_bits = bd + 2 * FILTER_BITS - ROUND0_BITS;
    600  const int16_t round_offset = (1 << (offset_bits - COMPOUND_ROUND1_BITS)) +
    601                               (1 << (offset_bits - COMPOUND_ROUND1_BITS - 1));
    602  const int16x8_t round_offset_vec = vdupq_n_s16(round_offset);
    603 
    604  const uint16_t fwd_offset = conv_params->fwd_offset;
    605  const uint16_t bck_offset = conv_params->bck_offset;
    606 
    607  // Horizontal filter.
    608  const int16_t *x_filter_ptr = av1_get_interp_filter_subpel_kernel(
    609      filter_params_x, subpel_x_qn & SUBPEL_MASK);
    610 
    611  const int horiz_offset = filter_params_x->taps / 2 - 1;
    612  const uint8_t *src_ptr = src - horiz_offset;
    613  CONV_BUF_TYPE *dst_ptr = conv_params->dst;
    614  uint8_t *dst8_ptr = dst8;
    615  int dst_stride = conv_params->dst_stride;
    616  int height = h;
    617 
    618  if (w == 4) {
    619    // 4-tap filters are used for blocks having width <= 4.
    620    // Filter values are even, so halve to reduce intermediate precision reqs.
    621    const int16x4_t x_filter = vshr_n_s16(vld1_s16(x_filter_ptr + 2), 1);
    622 
    623    src_ptr += 2;
    624 
    625    do {
    626      uint8x8_t t0 = vld1_u8(src_ptr);  // a0 a1 a2 a3 a4 a5 a6 a7
    627      int16x4_t s0 = vget_low_s16(vreinterpretq_s16_u16(vmovl_u8(t0)));
    628      int16x4_t s4 = vget_high_s16(vreinterpretq_s16_u16(vmovl_u8(t0)));
    629 
    630      __builtin_prefetch(dst_ptr);
    631      __builtin_prefetch(dst8_ptr);
    632 
    633      int16x4_t s1 = vext_s16(s0, s4, 1);  // a1 a2 a3 a4
    634      int16x4_t s2 = vext_s16(s0, s4, 2);  // a2 a3 a4 a5
    635      int16x4_t s3 = vext_s16(s0, s4, 3);  // a3 a4 a5 a6
    636 
    637      uint16x4_t d0 = convolve4_4_x(s0, s1, s2, s3, x_filter,
    638                                    vget_low_s16(round_offset_vec));
    639 
    640      uint16x4_t dd0 = vld1_u16(dst_ptr);
    641 
    642      uint8x8_t d01;
    643      compute_dist_wtd_avg_4x1(dd0, d0, fwd_offset, bck_offset,
    644                               vget_low_s16(round_offset_vec), &d01);
    645 
    646      store_u8_4x1(dst8_ptr, d01);
    647 
    648      src_ptr += src_stride;
    649      dst_ptr += dst_stride;
    650      dst8_ptr += dst8_stride;
    651    } while (--height != 0);
    652  } else {
    653    // Filter values are even, so halve to reduce intermediate precision reqs.
    654    const int16x8_t x_filter = vshrq_n_s16(vld1q_s16(x_filter_ptr), 1);
    655 
    656 #if AOM_ARCH_AARCH64
    657    while (height >= 8) {
    658      const uint8_t *s = src_ptr;
    659      CONV_BUF_TYPE *d = dst_ptr;
    660      uint8_t *d_u8 = dst8_ptr;
    661      int width = w;
    662 
    663      uint8x8_t t0, t1, t2, t3, t4, t5, t6, t7;
    664      load_u8_8x8(s, src_stride, &t0, &t1, &t2, &t3, &t4, &t5, &t6, &t7);
    665      transpose_elems_inplace_u8_8x8(&t0, &t1, &t2, &t3, &t4, &t5, &t6, &t7);
    666 
    667      int16x8_t s0 = vreinterpretq_s16_u16(vmovl_u8(t0));
    668      int16x8_t s1 = vreinterpretq_s16_u16(vmovl_u8(t1));
    669      int16x8_t s2 = vreinterpretq_s16_u16(vmovl_u8(t2));
    670      int16x8_t s3 = vreinterpretq_s16_u16(vmovl_u8(t3));
    671      int16x8_t s4 = vreinterpretq_s16_u16(vmovl_u8(t4));
    672      int16x8_t s5 = vreinterpretq_s16_u16(vmovl_u8(t5));
    673      int16x8_t s6 = vreinterpretq_s16_u16(vmovl_u8(t6));
    674 
    675      __builtin_prefetch(d + 0 * dst_stride);
    676      __builtin_prefetch(d + 1 * dst_stride);
    677      __builtin_prefetch(d + 2 * dst_stride);
    678      __builtin_prefetch(d + 3 * dst_stride);
    679      __builtin_prefetch(d + 4 * dst_stride);
    680      __builtin_prefetch(d + 5 * dst_stride);
    681      __builtin_prefetch(d + 6 * dst_stride);
    682      __builtin_prefetch(d + 7 * dst_stride);
    683 
    684      s += 7;
    685 
    686      do {
    687        load_u8_8x8(s, src_stride, &t0, &t1, &t2, &t3, &t4, &t5, &t6, &t7);
    688        transpose_elems_inplace_u8_8x8(&t0, &t1, &t2, &t3, &t4, &t5, &t6, &t7);
    689 
    690        int16x8_t s7 = vreinterpretq_s16_u16(vmovl_u8(t0));
    691        int16x8_t s8 = vreinterpretq_s16_u16(vmovl_u8(t1));
    692        int16x8_t s9 = vreinterpretq_s16_u16(vmovl_u8(t2));
    693        int16x8_t s10 = vreinterpretq_s16_u16(vmovl_u8(t3));
    694        int16x8_t s11 = vreinterpretq_s16_u16(vmovl_u8(t4));
    695        int16x8_t s12 = vreinterpretq_s16_u16(vmovl_u8(t5));
    696        int16x8_t s13 = vreinterpretq_s16_u16(vmovl_u8(t6));
    697        int16x8_t s14 = vreinterpretq_s16_u16(vmovl_u8(t7));
    698 
    699        uint16x8_t d0 = convolve8_8_x(s0, s1, s2, s3, s4, s5, s6, s7, x_filter,
    700                                      round_offset_vec);
    701        uint16x8_t d1 = convolve8_8_x(s1, s2, s3, s4, s5, s6, s7, s8, x_filter,
    702                                      round_offset_vec);
    703        uint16x8_t d2 = convolve8_8_x(s2, s3, s4, s5, s6, s7, s8, s9, x_filter,
    704                                      round_offset_vec);
    705        uint16x8_t d3 = convolve8_8_x(s3, s4, s5, s6, s7, s8, s9, s10, x_filter,
    706                                      round_offset_vec);
    707        uint16x8_t d4 = convolve8_8_x(s4, s5, s6, s7, s8, s9, s10, s11,
    708                                      x_filter, round_offset_vec);
    709        uint16x8_t d5 = convolve8_8_x(s5, s6, s7, s8, s9, s10, s11, s12,
    710                                      x_filter, round_offset_vec);
    711        uint16x8_t d6 = convolve8_8_x(s6, s7, s8, s9, s10, s11, s12, s13,
    712                                      x_filter, round_offset_vec);
    713        uint16x8_t d7 = convolve8_8_x(s7, s8, s9, s10, s11, s12, s13, s14,
    714                                      x_filter, round_offset_vec);
    715 
    716        transpose_elems_inplace_u16_8x8(&d0, &d1, &d2, &d3, &d4, &d5, &d6, &d7);
    717 
    718        uint16x8_t dd0, dd1, dd2, dd3;
    719        load_u16_8x4(d, dst_stride, &dd0, &dd1, &dd2, &dd3);
    720 
    721        uint8x8_t d0_u8, d1_u8, d2_u8, d3_u8;
    722        compute_dist_wtd_avg_8x4(dd0, dd1, dd2, dd3, d0, d1, d2, d3, fwd_offset,
    723                                 bck_offset, round_offset_vec, &d0_u8, &d1_u8,
    724                                 &d2_u8, &d3_u8);
    725 
    726        store_u8_8x4(d_u8, dst8_stride, d0_u8, d1_u8, d2_u8, d3_u8);
    727 
    728        uint16x8_t dd4, dd5, dd6, dd7;
    729        load_u16_8x4(d + 4 * dst_stride, dst_stride, &dd4, &dd5, &dd6, &dd7);
    730 
    731        uint8x8_t d4_u8, d5_u8, d6_u8, d7_u8;
    732        compute_dist_wtd_avg_8x4(dd4, dd5, dd6, dd7, d4, d5, d6, d7, fwd_offset,
    733                                 bck_offset, round_offset_vec, &d4_u8, &d5_u8,
    734                                 &d6_u8, &d7_u8);
    735 
    736        store_u8_8x4(d_u8 + 4 * dst8_stride, dst8_stride, d4_u8, d5_u8, d6_u8,
    737                     d7_u8);
    738 
    739        s0 = s8;
    740        s1 = s9;
    741        s2 = s10;
    742        s3 = s11;
    743        s4 = s12;
    744        s5 = s13;
    745        s6 = s14;
    746        s += 8;
    747        d += 8;
    748        d_u8 += 8;
    749        width -= 8;
    750      } while (width != 0);
    751      src_ptr += 8 * src_stride;
    752      dst_ptr += 8 * dst_stride;
    753      dst8_ptr += 8 * dst8_stride;
    754      height -= 8;
    755    }
    756 #endif  // AOM_ARCH_AARCH64
    757 
    758    while (height > 0) {
    759      const uint8_t *s = src_ptr;
    760      CONV_BUF_TYPE *d = dst_ptr;
    761      uint8_t *d_u8 = dst8_ptr;
    762      int width = w;
    763 
    764      uint8x8_t t0 = vld1_u8(s);  // a0 a1 a2 a3 a4 a5 a6 a7
    765      int16x8_t s0 = vreinterpretq_s16_u16(vmovl_u8(t0));
    766 
    767      __builtin_prefetch(d);
    768 
    769      s += 8;
    770 
    771      do {
    772        t0 = vld1_u8(s);  // a8 a9 a10 a11 a12 a13 a14 a15
    773        int16x8_t s8 = vreinterpretq_s16_u16(vmovl_u8(t0));
    774 
    775        int16x8_t s1 = vextq_s16(s0, s8, 1);  // a1 a2 a3 a4 a5 a6 a7 a8
    776        int16x8_t s2 = vextq_s16(s0, s8, 2);  // a2 a3 a4 a5 a6 a7 a8 a9
    777        int16x8_t s3 = vextq_s16(s0, s8, 3);  // a3 a4 a5 a6 a7 a8 a9 a10
    778        int16x8_t s4 = vextq_s16(s0, s8, 4);  // a4 a5 a6 a7 a8 a9 a10 a11
    779        int16x8_t s5 = vextq_s16(s0, s8, 5);  // a5 a6 a7 a8 a9 a10 a11 a12
    780        int16x8_t s6 = vextq_s16(s0, s8, 6);  // a6 a7 a8 a9 a10 a11 a12 a13
    781        int16x8_t s7 = vextq_s16(s0, s8, 7);  // a7 a8 a9 a10 a11 a12 a13 a14
    782 
    783        uint16x8_t d0 = convolve8_8_x(s0, s1, s2, s3, s4, s5, s6, s7, x_filter,
    784                                      round_offset_vec);
    785 
    786        uint16x8_t dd0 = vld1q_u16(d);
    787 
    788        uint8x8_t d0_u8;
    789        compute_dist_wtd_avg_8x1(dd0, d0, fwd_offset, bck_offset,
    790                                 round_offset_vec, &d0_u8);
    791 
    792        vst1_u8(d_u8, d0_u8);
    793 
    794        s0 = s8;
    795        s += 8;
    796        d += 8;
    797        d_u8 += 8;
    798        width -= 8;
    799      } while (width != 0);
    800      src_ptr += src_stride;
    801      dst_ptr += dst_stride;
    802      dst8_ptr += dst8_stride;
    803      height--;
    804    }
    805  }
    806 }
    807 
    808 static inline void dist_wtd_convolve_x_avg_neon(
    809    const uint8_t *src, int src_stride, uint8_t *dst8, int dst8_stride, int w,
    810    int h, const InterpFilterParams *filter_params_x, const int subpel_x_qn,
    811    ConvolveParams *conv_params) {
    812  assert(w % 4 == 0);
    813  assert(h % 4 == 0);
    814 
    815  const int bd = 8;
    816  const int offset_bits = bd + 2 * FILTER_BITS - ROUND0_BITS;
    817  const int16_t round_offset = (1 << (offset_bits - COMPOUND_ROUND1_BITS)) +
    818                               (1 << (offset_bits - COMPOUND_ROUND1_BITS - 1));
    819  const int16x8_t round_offset_vec = vdupq_n_s16(round_offset);
    820 
    821  // Horizontal filter.
    822  const int16_t *x_filter_ptr = av1_get_interp_filter_subpel_kernel(
    823      filter_params_x, subpel_x_qn & SUBPEL_MASK);
    824 
    825  const int horiz_offset = filter_params_x->taps / 2 - 1;
    826  const uint8_t *src_ptr = src - horiz_offset;
    827  CONV_BUF_TYPE *dst_ptr = conv_params->dst;
    828  uint8_t *dst8_ptr = dst8;
    829  int dst_stride = conv_params->dst_stride;
    830  int height = h;
    831 
    832  if (w == 4) {
    833    // 4-tap filters are used for blocks having width <= 4.
    834    // Filter values are even, so halve to reduce intermediate precision reqs.
    835    const int16x4_t x_filter = vshr_n_s16(vld1_s16(x_filter_ptr + 2), 1);
    836 
    837    src_ptr += 2;
    838 
    839    do {
    840      uint8x8_t t0 = vld1_u8(src_ptr);  // a0 a1 a2 a3 a4 a5 a6 a7
    841      int16x4_t s0 = vget_low_s16(vreinterpretq_s16_u16(vmovl_u8(t0)));
    842      int16x4_t s4 = vget_high_s16(vreinterpretq_s16_u16(vmovl_u8(t0)));
    843 
    844      __builtin_prefetch(dst_ptr);
    845      __builtin_prefetch(dst8_ptr);
    846 
    847      int16x4_t s1 = vext_s16(s0, s4, 1);  // a1 a2 a3 a4
    848      int16x4_t s2 = vext_s16(s0, s4, 2);  // a2 a3 a4 a5
    849      int16x4_t s3 = vext_s16(s0, s4, 3);  // a3 a4 a5 a6
    850 
    851      uint16x4_t d0 = convolve4_4_x(s0, s1, s2, s3, x_filter,
    852                                    vget_low_s16(round_offset_vec));
    853 
    854      uint16x4_t dd0 = vld1_u16(dst_ptr);
    855 
    856      uint8x8_t d01;
    857      compute_basic_avg_4x1(dd0, d0, vget_low_s16(round_offset_vec), &d01);
    858 
    859      store_u8_4x1(dst8_ptr, d01);
    860 
    861      src_ptr += src_stride;
    862      dst_ptr += dst_stride;
    863      dst8_ptr += dst8_stride;
    864    } while (--height != 0);
    865  } else {
    866    // Filter values are even, so halve to reduce intermediate precision reqs.
    867    const int16x8_t x_filter = vshrq_n_s16(vld1q_s16(x_filter_ptr), 1);
    868 
    869 #if AOM_ARCH_AARCH64
    870    while (height >= 8) {
    871      const uint8_t *s = src_ptr;
    872      CONV_BUF_TYPE *d = dst_ptr;
    873      uint8_t *d_u8 = dst8_ptr;
    874      int width = w;
    875 
    876      uint8x8_t t0, t1, t2, t3, t4, t5, t6, t7;
    877      load_u8_8x8(s, src_stride, &t0, &t1, &t2, &t3, &t4, &t5, &t6, &t7);
    878      transpose_elems_inplace_u8_8x8(&t0, &t1, &t2, &t3, &t4, &t5, &t6, &t7);
    879 
    880      int16x8_t s0 = vreinterpretq_s16_u16(vmovl_u8(t0));
    881      int16x8_t s1 = vreinterpretq_s16_u16(vmovl_u8(t1));
    882      int16x8_t s2 = vreinterpretq_s16_u16(vmovl_u8(t2));
    883      int16x8_t s3 = vreinterpretq_s16_u16(vmovl_u8(t3));
    884      int16x8_t s4 = vreinterpretq_s16_u16(vmovl_u8(t4));
    885      int16x8_t s5 = vreinterpretq_s16_u16(vmovl_u8(t5));
    886      int16x8_t s6 = vreinterpretq_s16_u16(vmovl_u8(t6));
    887 
    888      __builtin_prefetch(d + 0 * dst_stride);
    889      __builtin_prefetch(d + 1 * dst_stride);
    890      __builtin_prefetch(d + 2 * dst_stride);
    891      __builtin_prefetch(d + 3 * dst_stride);
    892      __builtin_prefetch(d + 4 * dst_stride);
    893      __builtin_prefetch(d + 5 * dst_stride);
    894      __builtin_prefetch(d + 6 * dst_stride);
    895      __builtin_prefetch(d + 7 * dst_stride);
    896 
    897      s += 7;
    898 
    899      do {
    900        load_u8_8x8(s, src_stride, &t0, &t1, &t2, &t3, &t4, &t5, &t6, &t7);
    901        transpose_elems_inplace_u8_8x8(&t0, &t1, &t2, &t3, &t4, &t5, &t6, &t7);
    902 
    903        int16x8_t s7 = vreinterpretq_s16_u16(vmovl_u8(t0));
    904        int16x8_t s8 = vreinterpretq_s16_u16(vmovl_u8(t1));
    905        int16x8_t s9 = vreinterpretq_s16_u16(vmovl_u8(t2));
    906        int16x8_t s10 = vreinterpretq_s16_u16(vmovl_u8(t3));
    907        int16x8_t s11 = vreinterpretq_s16_u16(vmovl_u8(t4));
    908        int16x8_t s12 = vreinterpretq_s16_u16(vmovl_u8(t5));
    909        int16x8_t s13 = vreinterpretq_s16_u16(vmovl_u8(t6));
    910        int16x8_t s14 = vreinterpretq_s16_u16(vmovl_u8(t7));
    911 
    912        uint16x8_t d0 = convolve8_8_x(s0, s1, s2, s3, s4, s5, s6, s7, x_filter,
    913                                      round_offset_vec);
    914        uint16x8_t d1 = convolve8_8_x(s1, s2, s3, s4, s5, s6, s7, s8, x_filter,
    915                                      round_offset_vec);
    916        uint16x8_t d2 = convolve8_8_x(s2, s3, s4, s5, s6, s7, s8, s9, x_filter,
    917                                      round_offset_vec);
    918        uint16x8_t d3 = convolve8_8_x(s3, s4, s5, s6, s7, s8, s9, s10, x_filter,
    919                                      round_offset_vec);
    920        uint16x8_t d4 = convolve8_8_x(s4, s5, s6, s7, s8, s9, s10, s11,
    921                                      x_filter, round_offset_vec);
    922        uint16x8_t d5 = convolve8_8_x(s5, s6, s7, s8, s9, s10, s11, s12,
    923                                      x_filter, round_offset_vec);
    924        uint16x8_t d6 = convolve8_8_x(s6, s7, s8, s9, s10, s11, s12, s13,
    925                                      x_filter, round_offset_vec);
    926        uint16x8_t d7 = convolve8_8_x(s7, s8, s9, s10, s11, s12, s13, s14,
    927                                      x_filter, round_offset_vec);
    928 
    929        transpose_elems_inplace_u16_8x8(&d0, &d1, &d2, &d3, &d4, &d5, &d6, &d7);
    930 
    931        uint16x8_t dd0, dd1, dd2, dd3;
    932        load_u16_8x4(d, dst_stride, &dd0, &dd1, &dd2, &dd3);
    933 
    934        uint8x8_t d0_u8, d1_u8, d2_u8, d3_u8;
    935        compute_basic_avg_8x4(dd0, dd1, dd2, dd3, d0, d1, d2, d3,
    936                              round_offset_vec, &d0_u8, &d1_u8, &d2_u8, &d3_u8);
    937 
    938        store_u8_8x4(d_u8, dst8_stride, d0_u8, d1_u8, d2_u8, d3_u8);
    939 
    940        uint16x8_t dd4, dd5, dd6, dd7;
    941        load_u16_8x4(d + 4 * dst_stride, dst_stride, &dd4, &dd5, &dd6, &dd7);
    942 
    943        uint8x8_t d4_u8, d5_u8, d6_u8, d7_u8;
    944        compute_basic_avg_8x4(dd4, dd5, dd6, dd7, d4, d5, d6, d7,
    945                              round_offset_vec, &d4_u8, &d5_u8, &d6_u8, &d7_u8);
    946 
    947        store_u8_8x4(d_u8 + 4 * dst8_stride, dst8_stride, d4_u8, d5_u8, d6_u8,
    948                     d7_u8);
    949 
    950        s0 = s8;
    951        s1 = s9;
    952        s2 = s10;
    953        s3 = s11;
    954        s4 = s12;
    955        s5 = s13;
    956        s6 = s14;
    957        s += 8;
    958        d += 8;
    959        d_u8 += 8;
    960        width -= 8;
    961      } while (width != 0);
    962      src_ptr += 8 * src_stride;
    963      dst_ptr += 8 * dst_stride;
    964      dst8_ptr += 8 * dst8_stride;
    965      height -= 8;
    966    }
    967 #endif  // AOM_ARCH_AARCH64
    968 
    969    while (height > 0) {
    970      const uint8_t *s = src_ptr;
    971      CONV_BUF_TYPE *d = dst_ptr;
    972      uint8_t *d_u8 = dst8_ptr;
    973      int width = w;
    974 
    975      uint8x8_t t0 = vld1_u8(s);  // a0 a1 a2 a3 a4 a5 a6 a7
    976      int16x8_t s0 = vreinterpretq_s16_u16(vmovl_u8(t0));
    977 
    978      __builtin_prefetch(d);
    979 
    980      s += 8;
    981 
    982      do {
    983        t0 = vld1_u8(s);  // a8 a9 a10 a11 a12 a13 a14 a15
    984        int16x8_t s8 = vreinterpretq_s16_u16(vmovl_u8(t0));
    985 
    986        int16x8_t s1 = vextq_s16(s0, s8, 1);  // a1 a2 a3 a4 a5 a6 a7 a8
    987        int16x8_t s2 = vextq_s16(s0, s8, 2);  // a2 a3 a4 a5 a6 a7 a8 a9
    988        int16x8_t s3 = vextq_s16(s0, s8, 3);  // a3 a4 a5 a6 a7 a8 a9 a10
    989        int16x8_t s4 = vextq_s16(s0, s8, 4);  // a4 a5 a6 a7 a8 a9 a10 a11
    990        int16x8_t s5 = vextq_s16(s0, s8, 5);  // a5 a6 a7 a8 a9 a10 a11 a12
    991        int16x8_t s6 = vextq_s16(s0, s8, 6);  // a6 a7 a8 a9 a10 a11 a12 a13
    992        int16x8_t s7 = vextq_s16(s0, s8, 7);  // a7 a8 a9 a10 a11 a12 a13 a14
    993 
    994        uint16x8_t d0 = convolve8_8_x(s0, s1, s2, s3, s4, s5, s6, s7, x_filter,
    995                                      round_offset_vec);
    996 
    997        uint16x8_t dd0 = vld1q_u16(d);
    998 
    999        uint8x8_t d0_u8;
   1000        compute_basic_avg_8x1(dd0, d0, round_offset_vec, &d0_u8);
   1001 
   1002        vst1_u8(d_u8, d0_u8);
   1003 
   1004        s0 = s8;
   1005        s += 8;
   1006        d += 8;
   1007        d_u8 += 8;
   1008        width -= 8;
   1009      } while (width != 0);
   1010      src_ptr += src_stride;
   1011      dst_ptr += dst_stride;
   1012      dst8_ptr += dst8_stride;
   1013      height--;
   1014    }
   1015  }
   1016 }
   1017 
   1018 static inline void dist_wtd_convolve_x_neon(
   1019    const uint8_t *src, int src_stride, int w, int h,
   1020    const InterpFilterParams *filter_params_x, const int subpel_x_qn,
   1021    ConvolveParams *conv_params) {
   1022  assert(w % 4 == 0);
   1023  assert(h % 4 == 0);
   1024 
   1025  const int bd = 8;
   1026  const int offset_bits = bd + 2 * FILTER_BITS - ROUND0_BITS;
   1027  const int16_t round_offset = (1 << (offset_bits - COMPOUND_ROUND1_BITS)) +
   1028                               (1 << (offset_bits - COMPOUND_ROUND1_BITS - 1));
   1029  const int16x8_t round_offset_vec = vdupq_n_s16(round_offset);
   1030 
   1031  // Horizontal filter.
   1032  const int16_t *x_filter_ptr = av1_get_interp_filter_subpel_kernel(
   1033      filter_params_x, subpel_x_qn & SUBPEL_MASK);
   1034 
   1035  const int horiz_offset = filter_params_x->taps / 2 - 1;
   1036  const uint8_t *src_ptr = src - horiz_offset;
   1037  CONV_BUF_TYPE *dst_ptr = conv_params->dst;
   1038  int dst_stride = conv_params->dst_stride;
   1039  int height = h;
   1040 
   1041  if (w == 4) {
   1042    // 4-tap filters are used for blocks having width <= 4.
   1043    // Filter values are even, so halve to reduce intermediate precision reqs.
   1044    const int16x4_t x_filter = vshr_n_s16(vld1_s16(x_filter_ptr + 2), 1);
   1045 
   1046    src_ptr += 2;
   1047 
   1048    do {
   1049      uint8x8_t t0 = vld1_u8(src_ptr);  // a0 a1 a2 a3 a4 a5 a6 a7
   1050      int16x4_t s0 = vget_low_s16(vreinterpretq_s16_u16(vmovl_u8(t0)));
   1051      int16x4_t s4 = vget_high_s16(vreinterpretq_s16_u16(vmovl_u8(t0)));
   1052 
   1053      __builtin_prefetch(dst_ptr);
   1054 
   1055      int16x4_t s1 = vext_s16(s0, s4, 1);  // a1 a2 a3 a4
   1056      int16x4_t s2 = vext_s16(s0, s4, 2);  // a2 a3 a4 a5
   1057      int16x4_t s3 = vext_s16(s0, s4, 3);  // a3 a4 a5 a6
   1058 
   1059      uint16x4_t d0 = convolve4_4_x(s0, s1, s2, s3, x_filter,
   1060                                    vget_low_s16(round_offset_vec));
   1061 
   1062      vst1_u16(dst_ptr, d0);
   1063 
   1064      src_ptr += src_stride;
   1065      dst_ptr += dst_stride;
   1066    } while (--height != 0);
   1067  } else {
   1068    // Filter values are even, so halve to reduce intermediate precision reqs.
   1069    const int16x8_t x_filter = vshrq_n_s16(vld1q_s16(x_filter_ptr), 1);
   1070 
   1071 #if AOM_ARCH_AARCH64
   1072    while (height >= 8) {
   1073      const uint8_t *s = src_ptr;
   1074      CONV_BUF_TYPE *d = dst_ptr;
   1075      int width = w;
   1076 
   1077      uint8x8_t t0, t1, t2, t3, t4, t5, t6, t7;
   1078      load_u8_8x8(s, src_stride, &t0, &t1, &t2, &t3, &t4, &t5, &t6, &t7);
   1079      transpose_elems_inplace_u8_8x8(&t0, &t1, &t2, &t3, &t4, &t5, &t6, &t7);
   1080 
   1081      int16x8_t s0 = vreinterpretq_s16_u16(vmovl_u8(t0));
   1082      int16x8_t s1 = vreinterpretq_s16_u16(vmovl_u8(t1));
   1083      int16x8_t s2 = vreinterpretq_s16_u16(vmovl_u8(t2));
   1084      int16x8_t s3 = vreinterpretq_s16_u16(vmovl_u8(t3));
   1085      int16x8_t s4 = vreinterpretq_s16_u16(vmovl_u8(t4));
   1086      int16x8_t s5 = vreinterpretq_s16_u16(vmovl_u8(t5));
   1087      int16x8_t s6 = vreinterpretq_s16_u16(vmovl_u8(t6));
   1088 
   1089      __builtin_prefetch(d + 0 * dst_stride);
   1090      __builtin_prefetch(d + 1 * dst_stride);
   1091      __builtin_prefetch(d + 2 * dst_stride);
   1092      __builtin_prefetch(d + 3 * dst_stride);
   1093      __builtin_prefetch(d + 4 * dst_stride);
   1094      __builtin_prefetch(d + 5 * dst_stride);
   1095      __builtin_prefetch(d + 6 * dst_stride);
   1096      __builtin_prefetch(d + 7 * dst_stride);
   1097 
   1098      s += 7;
   1099 
   1100      do {
   1101        load_u8_8x8(s, src_stride, &t0, &t1, &t2, &t3, &t4, &t5, &t6, &t7);
   1102        transpose_elems_inplace_u8_8x8(&t0, &t1, &t2, &t3, &t4, &t5, &t6, &t7);
   1103 
   1104        int16x8_t s7 = vreinterpretq_s16_u16(vmovl_u8(t0));
   1105        int16x8_t s8 = vreinterpretq_s16_u16(vmovl_u8(t1));
   1106        int16x8_t s9 = vreinterpretq_s16_u16(vmovl_u8(t2));
   1107        int16x8_t s10 = vreinterpretq_s16_u16(vmovl_u8(t3));
   1108        int16x8_t s11 = vreinterpretq_s16_u16(vmovl_u8(t4));
   1109        int16x8_t s12 = vreinterpretq_s16_u16(vmovl_u8(t5));
   1110        int16x8_t s13 = vreinterpretq_s16_u16(vmovl_u8(t6));
   1111        int16x8_t s14 = vreinterpretq_s16_u16(vmovl_u8(t7));
   1112 
   1113        uint16x8_t d0 = convolve8_8_x(s0, s1, s2, s3, s4, s5, s6, s7, x_filter,
   1114                                      round_offset_vec);
   1115        uint16x8_t d1 = convolve8_8_x(s1, s2, s3, s4, s5, s6, s7, s8, x_filter,
   1116                                      round_offset_vec);
   1117        uint16x8_t d2 = convolve8_8_x(s2, s3, s4, s5, s6, s7, s8, s9, x_filter,
   1118                                      round_offset_vec);
   1119        uint16x8_t d3 = convolve8_8_x(s3, s4, s5, s6, s7, s8, s9, s10, x_filter,
   1120                                      round_offset_vec);
   1121        uint16x8_t d4 = convolve8_8_x(s4, s5, s6, s7, s8, s9, s10, s11,
   1122                                      x_filter, round_offset_vec);
   1123        uint16x8_t d5 = convolve8_8_x(s5, s6, s7, s8, s9, s10, s11, s12,
   1124                                      x_filter, round_offset_vec);
   1125        uint16x8_t d6 = convolve8_8_x(s6, s7, s8, s9, s10, s11, s12, s13,
   1126                                      x_filter, round_offset_vec);
   1127        uint16x8_t d7 = convolve8_8_x(s7, s8, s9, s10, s11, s12, s13, s14,
   1128                                      x_filter, round_offset_vec);
   1129 
   1130        transpose_elems_inplace_u16_8x8(&d0, &d1, &d2, &d3, &d4, &d5, &d6, &d7);
   1131 
   1132        store_u16_8x8(d, dst_stride, d0, d1, d2, d3, d4, d5, d6, d7);
   1133 
   1134        s0 = s8;
   1135        s1 = s9;
   1136        s2 = s10;
   1137        s3 = s11;
   1138        s4 = s12;
   1139        s5 = s13;
   1140        s6 = s14;
   1141        s += 8;
   1142        d += 8;
   1143        width -= 8;
   1144      } while (width != 0);
   1145      src_ptr += 8 * src_stride;
   1146      dst_ptr += 8 * dst_stride;
   1147      height -= 8;
   1148    }
   1149 #endif  // AOM_ARCH_AARCH64
   1150 
   1151    while (height > 0) {
   1152      const uint8_t *s = src_ptr;
   1153      CONV_BUF_TYPE *d = dst_ptr;
   1154      int width = w;
   1155 
   1156      uint8x8_t t0 = vld1_u8(s);  // a0 a1 a2 a3 a4 a5 a6 a7
   1157      int16x8_t s0 = vreinterpretq_s16_u16(vmovl_u8(t0));
   1158 
   1159      __builtin_prefetch(d);
   1160 
   1161      s = src_ptr + 8;
   1162 
   1163      do {
   1164        t0 = vld1_u8(s);  // a8 a9 a10 a11 a12 a13 a14 a15
   1165        int16x8_t s8 = vreinterpretq_s16_u16(vmovl_u8(t0));
   1166 
   1167        int16x8_t s1 = vextq_s16(s0, s8, 1);  // a1 a2 a3 a4 a5 a6 a7 a8
   1168        int16x8_t s2 = vextq_s16(s0, s8, 2);  // a2 a3 a4 a5 a6 a7 a8 a9
   1169        int16x8_t s3 = vextq_s16(s0, s8, 3);  // a3 a4 a5 a6 a7 a8 a9 a10
   1170        int16x8_t s4 = vextq_s16(s0, s8, 4);  // a4 a5 a6 a7 a8 a9 a10 a11
   1171        int16x8_t s5 = vextq_s16(s0, s8, 5);  // a5 a6 a7 a8 a9 a10 a11 a12
   1172        int16x8_t s6 = vextq_s16(s0, s8, 6);  // a6 a7 a8 a9 a10 a11 a12 a13
   1173        int16x8_t s7 = vextq_s16(s0, s8, 7);  // a7 a8 a9 a10 a11 a12 a13 a14
   1174 
   1175        uint16x8_t d0 = convolve8_8_x(s0, s1, s2, s3, s4, s5, s6, s7, x_filter,
   1176                                      round_offset_vec);
   1177 
   1178        vst1q_u16(d, d0);
   1179 
   1180        s0 = s8;
   1181        s += 8;
   1182        d += 8;
   1183        width -= 8;
   1184      } while (width != 0);
   1185      src_ptr += src_stride;
   1186      dst_ptr += dst_stride;
   1187      height--;
   1188    }
   1189  }
   1190 }
   1191 
   1192 void av1_dist_wtd_convolve_x_neon(const uint8_t *src, int src_stride,
   1193                                  uint8_t *dst8, int dst8_stride, int w, int h,
   1194                                  const InterpFilterParams *filter_params_x,
   1195                                  const int subpel_x_qn,
   1196                                  ConvolveParams *conv_params) {
   1197  if (conv_params->do_average) {
   1198    if (UNLIKELY(conv_params->use_dist_wtd_comp_avg)) {
   1199      dist_wtd_convolve_x_dist_wtd_avg_neon(src, src_stride, dst8, dst8_stride,
   1200                                            w, h, filter_params_x, subpel_x_qn,
   1201                                            conv_params);
   1202    } else {
   1203      dist_wtd_convolve_x_avg_neon(src, src_stride, dst8, dst8_stride, w, h,
   1204                                   filter_params_x, subpel_x_qn, conv_params);
   1205    }
   1206  } else {
   1207    dist_wtd_convolve_x_neon(src, src_stride, w, h, filter_params_x,
   1208                             subpel_x_qn, conv_params);
   1209  }
   1210 }
   1211 
   1212 static inline uint16x4_t convolve6_4_y(const int16x4_t s0, const int16x4_t s1,
   1213                                       const int16x4_t s2, const int16x4_t s3,
   1214                                       const int16x4_t s4, const int16x4_t s5,
   1215                                       const int16x8_t y_filter,
   1216                                       const int16x4_t round_offset) {
   1217  const int16x4_t y_filter_0_3 = vget_low_s16(y_filter);
   1218  const int16x4_t y_filter_4_7 = vget_high_s16(y_filter);
   1219 
    1220  // Filter values at indices 0 and 7 are 0: 6-tap kernels sit in 8-tap arrays with zero end taps, so those multiplies are skipped.
   1221  int16x4_t sum = vmul_lane_s16(s0, y_filter_0_3, 1);
   1222  sum = vmla_lane_s16(sum, s1, y_filter_0_3, 2);
   1223  sum = vmla_lane_s16(sum, s2, y_filter_0_3, 3);
   1224  sum = vmla_lane_s16(sum, s3, y_filter_4_7, 0);
   1225  sum = vmla_lane_s16(sum, s4, y_filter_4_7, 1);
   1226  sum = vmla_lane_s16(sum, s5, y_filter_4_7, 2);
   1227 
   1228  // We halved the convolution filter values so -1 from the right shift.
   1229  int16x4_t res = vrsra_n_s16(round_offset, sum, ROUND0_BITS - 1);
   1230  return vreinterpret_u16_s16(res);
   1231 }
   1232 
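// Eight-column variant of convolve6_4_y.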
   1233 static inline uint16x8_t convolve6_8_y(const int16x8_t s0, const int16x8_t s1,
   1234                                       const int16x8_t s2, const int16x8_t s3,
   1235                                       const int16x8_t s4, const int16x8_t s5,
   1236                                       const int16x8_t y_filter,
   1237                                       const int16x8_t round_offset) {
   1238  const int16x4_t y_filter_0_3 = vget_low_s16(y_filter);
   1239  const int16x4_t y_filter_4_7 = vget_high_s16(y_filter);
   1240 
   1241  // Filter values at indices 0 and 7 are 0.
   1242  int16x8_t sum = vmulq_lane_s16(s0, y_filter_0_3, 1);
   1243  sum = vmlaq_lane_s16(sum, s1, y_filter_0_3, 2);
   1244  sum = vmlaq_lane_s16(sum, s2, y_filter_0_3, 3);
   1245  sum = vmlaq_lane_s16(sum, s3, y_filter_4_7, 0);
   1246  sum = vmlaq_lane_s16(sum, s4, y_filter_4_7, 1);
   1247  sum = vmlaq_lane_s16(sum, s5, y_filter_4_7, 2);
   1248 
   1249  // We halved the convolution filter values so -1 from the right shift.
   1250  int16x8_t res = vrsraq_n_s16(round_offset, sum, ROUND0_BITS - 1);
   1251  return vreinterpretq_u16_s16(res);
   1252 }
   1253 
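// Vertical 6-tap convolution fused with distance-weighted compound
// averaging: each convolved row is blended with the corresponding row of
// the CONV_BUF_TYPE buffer using fwd_offset/bck_offset and written out as
// 8-bit pixels.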
   1254 static inline void dist_wtd_convolve_y_6tap_dist_wtd_avg_neon(
   1255    const uint8_t *src_ptr, int src_stride, uint8_t *dst8_ptr,
   1256    const int dst8_stride, int w, int h, const int16x8_t y_filter,
   1257    ConvolveParams *conv_params) {
   1258  const int bd = 8;
   1259  const int offset_bits = bd + 2 * FILTER_BITS - ROUND0_BITS;
   1260  const int16_t round_offset = (1 << (offset_bits - COMPOUND_ROUND1_BITS)) +
   1261                               (1 << (offset_bits - COMPOUND_ROUND1_BITS - 1));
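 // For reference, assuming the usual 8-bit configuration (FILTER_BITS == 7,
 // ROUND0_BITS == 3, COMPOUND_ROUND1_BITS == 7): offset_bits == 19 and
 // round_offset == (1 << 12) + (1 << 11) == 6144.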
   1262  const int16x8_t round_offset_vec = vdupq_n_s16(round_offset);
   1263 
   1264  const uint16_t fwd_offset = conv_params->fwd_offset;
   1265  const uint16_t bck_offset = conv_params->bck_offset;
   1266 
   1267  CONV_BUF_TYPE *dst_ptr = conv_params->dst;
   1268  const int dst_stride = conv_params->dst_stride;
   1269  int width = w;
   1270 
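 // Blocks with width or height of 4 are processed in 4-wide column strips;
 // all other sizes take the 8-wide path in the else branch.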
   1271  if (w == 4 || h == 4) {
   1272    do {
   1273      const uint8_t *s = src_ptr;
   1274      CONV_BUF_TYPE *d = dst_ptr;
   1275      uint8_t *d_u8 = dst8_ptr;
   1276      int height = h;
   1277 
   1278      uint8x8_t t0 = load_unaligned_u8_4x1(s + 0 * src_stride);
   1279      uint8x8_t t1 = load_unaligned_u8_4x1(s + 1 * src_stride);
   1280      uint8x8_t t2 = load_unaligned_u8_4x1(s + 2 * src_stride);
   1281      uint8x8_t t3 = load_unaligned_u8_4x1(s + 3 * src_stride);
   1282      uint8x8_t t4 = load_unaligned_u8_4x1(s + 4 * src_stride);
   1283 
   1284      int16x4_t s0 = vreinterpret_s16_u16(vget_low_u16(vmovl_u8(t0)));
   1285      int16x4_t s1 = vreinterpret_s16_u16(vget_low_u16(vmovl_u8(t1)));
   1286      int16x4_t s2 = vreinterpret_s16_u16(vget_low_u16(vmovl_u8(t2)));
   1287      int16x4_t s3 = vreinterpret_s16_u16(vget_low_u16(vmovl_u8(t3)));
   1288      int16x4_t s4 = vreinterpret_s16_u16(vget_low_u16(vmovl_u8(t4)));
   1289 
   1290      s += 5 * src_stride;
   1291 
   1292      do {
   1293 #if AOM_ARCH_AARCH64
   1294        t0 = load_unaligned_u8_4x1(s + 0 * src_stride);
   1295        t1 = load_unaligned_u8_4x1(s + 1 * src_stride);
   1296        t2 = load_unaligned_u8_4x1(s + 2 * src_stride);
   1297        t3 = load_unaligned_u8_4x1(s + 3 * src_stride);
   1298 
   1299        int16x4_t s5 = vreinterpret_s16_u16(vget_low_u16(vmovl_u8(t0)));
   1300        int16x4_t s6 = vreinterpret_s16_u16(vget_low_u16(vmovl_u8(t1)));
   1301        int16x4_t s7 = vreinterpret_s16_u16(vget_low_u16(vmovl_u8(t2)));
   1302        int16x4_t s8 = vreinterpret_s16_u16(vget_low_u16(vmovl_u8(t3)));
   1303 
   1304        uint16x4_t d0 = convolve6_4_y(s0, s1, s2, s3, s4, s5, y_filter,
   1305                                      vget_low_s16(round_offset_vec));
   1306        uint16x4_t d1 = convolve6_4_y(s1, s2, s3, s4, s5, s6, y_filter,
   1307                                      vget_low_s16(round_offset_vec));
   1308        uint16x4_t d2 = convolve6_4_y(s2, s3, s4, s5, s6, s7, y_filter,
   1309                                      vget_low_s16(round_offset_vec));
   1310        uint16x4_t d3 = convolve6_4_y(s3, s4, s5, s6, s7, s8, y_filter,
   1311                                      vget_low_s16(round_offset_vec));
   1312 
   1313        uint16x4_t dd0, dd1, dd2, dd3;
   1314        load_u16_4x4(d, dst_stride, &dd0, &dd1, &dd2, &dd3);
   1315 
   1316        uint8x8_t d01, d23;
   1317        compute_dist_wtd_avg_4x4(dd0, dd1, dd2, dd3, d0, d1, d2, d3, fwd_offset,
   1318                                 bck_offset, round_offset_vec, &d01, &d23);
   1319 
   1320        store_u8x4_strided_x2(d_u8 + 0 * dst8_stride, dst8_stride, d01);
   1321        store_u8x4_strided_x2(d_u8 + 2 * dst8_stride, dst8_stride, d23);
   1322 
   1323        s0 = s4;
   1324        s1 = s5;
   1325        s2 = s6;
   1326        s3 = s7;
   1327        s4 = s8;
   1328        s += 4 * src_stride;
   1329        d += 4 * dst_stride;
   1330        d_u8 += 4 * dst8_stride;
   1331        height -= 4;
   1332 #else   // !AOM_ARCH_AARCH64
   1333        t0 = load_unaligned_u8_4x1(s);
   1334        int16x4_t s5 = vreinterpret_s16_u16(vget_low_u16(vmovl_u8(t0)));
   1335 
   1336        uint16x4_t d0 = convolve6_4_y(s0, s1, s2, s3, s4, s5, y_filter,
   1337                                      vget_low_s16(round_offset_vec));
   1338 
   1339        uint16x4_t dd0 = vld1_u16(d);
   1340 
   1341        uint8x8_t d01;
   1342        compute_dist_wtd_avg_4x1(dd0, d0, fwd_offset, bck_offset,
   1343                                 vget_low_s16(round_offset_vec), &d01);
   1344 
   1345        store_u8_4x1(d_u8, d01);
   1346 
   1347        s0 = s1;
   1348        s1 = s2;
   1349        s2 = s3;
   1350        s3 = s4;
   1351        s4 = s5;
   1352        s += src_stride;
   1353        d += dst_stride;
   1354        d_u8 += dst8_stride;
   1355        height--;
   1356 #endif  // AOM_ARCH_AARCH64
   1357      } while (height != 0);
   1358      src_ptr += 4;
   1359      dst_ptr += 4;
   1360      dst8_ptr += 4;
   1361      width -= 4;
   1362    } while (width != 0);
   1363  } else {
   1364    do {
   1365      const uint8_t *s = src_ptr + (5 * src_stride);
   1366      CONV_BUF_TYPE *d = dst_ptr;
   1367      uint8_t *d_u8 = dst8_ptr;
   1368      int height = h;
   1369 
   1370      uint8x8_t t0, t1, t2, t3, t4;
   1371      load_u8_8x5(src_ptr, src_stride, &t0, &t1, &t2, &t3, &t4);
   1372 
   1373      int16x8_t s0 = vreinterpretq_s16_u16(vmovl_u8(t0));
   1374      int16x8_t s1 = vreinterpretq_s16_u16(vmovl_u8(t1));
   1375      int16x8_t s2 = vreinterpretq_s16_u16(vmovl_u8(t2));
   1376      int16x8_t s3 = vreinterpretq_s16_u16(vmovl_u8(t3));
   1377      int16x8_t s4 = vreinterpretq_s16_u16(vmovl_u8(t4));
   1378 
   1379      do {
   1380 #if AOM_ARCH_AARCH64
   1381        uint8x8_t t5, t6, t7;
   1382        load_u8_8x8(s, src_stride, &t0, &t1, &t2, &t3, &t4, &t5, &t6, &t7);
   1383 
   1384        int16x8_t s5 = vreinterpretq_s16_u16(vmovl_u8(t0));
   1385        int16x8_t s6 = vreinterpretq_s16_u16(vmovl_u8(t1));
   1386        int16x8_t s7 = vreinterpretq_s16_u16(vmovl_u8(t2));
   1387        int16x8_t s8 = vreinterpretq_s16_u16(vmovl_u8(t3));
   1388        int16x8_t s9 = vreinterpretq_s16_u16(vmovl_u8(t4));
   1389        int16x8_t s10 = vreinterpretq_s16_u16(vmovl_u8(t5));
   1390        int16x8_t s11 = vreinterpretq_s16_u16(vmovl_u8(t6));
   1391        int16x8_t s12 = vreinterpretq_s16_u16(vmovl_u8(t7));
   1392 
   1393        uint16x8_t d0 =
   1394            convolve6_8_y(s0, s1, s2, s3, s4, s5, y_filter, round_offset_vec);
   1395        uint16x8_t d1 =
   1396            convolve6_8_y(s1, s2, s3, s4, s5, s6, y_filter, round_offset_vec);
   1397        uint16x8_t d2 =
   1398            convolve6_8_y(s2, s3, s4, s5, s6, s7, y_filter, round_offset_vec);
   1399        uint16x8_t d3 =
   1400            convolve6_8_y(s3, s4, s5, s6, s7, s8, y_filter, round_offset_vec);
   1401        uint16x8_t d4 =
   1402            convolve6_8_y(s4, s5, s6, s7, s8, s9, y_filter, round_offset_vec);
   1403        uint16x8_t d5 =
   1404            convolve6_8_y(s5, s6, s7, s8, s9, s10, y_filter, round_offset_vec);
   1405        uint16x8_t d6 =
   1406            convolve6_8_y(s6, s7, s8, s9, s10, s11, y_filter, round_offset_vec);
   1407        uint16x8_t d7 = convolve6_8_y(s7, s8, s9, s10, s11, s12, y_filter,
   1408                                      round_offset_vec);
   1409 
   1410        uint16x8_t dd0, dd1, dd2, dd3;
   1411        load_u16_8x4(d, dst_stride, &dd0, &dd1, &dd2, &dd3);
   1412 
   1413        uint8x8_t d0_u8, d1_u8, d2_u8, d3_u8;
   1414        compute_dist_wtd_avg_8x4(dd0, dd1, dd2, dd3, d0, d1, d2, d3, fwd_offset,
   1415                                 bck_offset, round_offset_vec, &d0_u8, &d1_u8,
   1416                                 &d2_u8, &d3_u8);
   1417 
   1418        store_u8_8x4(d_u8, dst8_stride, d0_u8, d1_u8, d2_u8, d3_u8);
   1419        d_u8 += 4 * dst8_stride;
   1420 
   1421        uint16x8_t dd4, dd5, dd6, dd7;
   1422        load_u16_8x4(d + 4 * dst_stride, dst_stride, &dd4, &dd5, &dd6, &dd7);
   1423 
   1424        uint8x8_t d4_u8, d5_u8, d6_u8, d7_u8;
   1425        compute_dist_wtd_avg_8x4(dd4, dd5, dd6, dd7, d4, d5, d6, d7, fwd_offset,
   1426                                 bck_offset, round_offset_vec, &d4_u8, &d5_u8,
   1427                                 &d6_u8, &d7_u8);
   1428 
   1429        store_u8_8x4(d_u8, dst8_stride, d4_u8, d5_u8, d6_u8, d7_u8);
   1430        d_u8 += 4 * dst8_stride;
   1431 
   1432        s0 = s8;
   1433        s1 = s9;
   1434        s2 = s10;
   1435        s3 = s11;
   1436        s4 = s12;
   1437        s += 8 * src_stride;
   1438        d += 8 * dst_stride;
   1439        height -= 8;
   1440 #else   // !AOM_ARCH_AARCH64
   1441        int16x8_t s5 = vreinterpretq_s16_u16(vmovl_u8(vld1_u8(s)));
   1442 
   1443        uint16x8_t d0 =
   1444            convolve6_8_y(s0, s1, s2, s3, s4, s5, y_filter, round_offset_vec);
   1445 
   1446        s0 = s1;
   1447        s1 = s2;
   1448        s2 = s3;
   1449        s3 = s4;
   1450        s4 = s5;
   1451 
   1452        uint16x8_t dd0 = vld1q_u16(d);
   1453 
   1454        uint8x8_t d0_u8;
   1455        compute_dist_wtd_avg_8x1(dd0, d0, fwd_offset, bck_offset,
   1456                                 round_offset_vec, &d0_u8);
   1457 
   1458        vst1_u8(d_u8, d0_u8);
   1459        d_u8 += dst8_stride;
   1460 
   1461        s += src_stride;
   1462        d += dst_stride;
   1463        height--;
   1464 #endif  // AOM_ARCH_AARCH64
   1465      } while (height != 0);
   1466      src_ptr += 8;
   1467      dst_ptr += 8;
   1468      dst8_ptr += 8;
   1469      width -= 8;
   1470    } while (width != 0);
   1471  }
   1472 }
   1473 
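// As above, but the convolved rows are combined with the intermediate
// buffer using the basic (equal-weight) compound average.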
   1474 static inline void dist_wtd_convolve_y_6tap_avg_neon(
   1475    const uint8_t *src_ptr, int src_stride, uint8_t *dst8_ptr,
   1476    const int dst8_stride, int w, int h, const int16x8_t y_filter,
   1477    ConvolveParams *conv_params) {
   1478  const int bd = 8;
   1479  const int offset_bits = bd + 2 * FILTER_BITS - ROUND0_BITS;
   1480  const int16_t round_offset = (1 << (offset_bits - COMPOUND_ROUND1_BITS)) +
   1481                               (1 << (offset_bits - COMPOUND_ROUND1_BITS - 1));
   1482  const int16x8_t round_offset_vec = vdupq_n_s16(round_offset);
   1483 
   1484  CONV_BUF_TYPE *dst_ptr = conv_params->dst;
   1485  const int dst_stride = conv_params->dst_stride;
   1486  int width = w;
   1487 
   1488  if (w == 4 || h == 4) {
   1489    do {
   1490      const uint8_t *s = src_ptr;
   1491      CONV_BUF_TYPE *d = dst_ptr;
   1492      uint8_t *d_u8 = dst8_ptr;
   1493      int height = h;
   1494 
   1495      uint8x8_t t0 = load_unaligned_u8_4x1(s + 0 * src_stride);
   1496      uint8x8_t t1 = load_unaligned_u8_4x1(s + 1 * src_stride);
   1497      uint8x8_t t2 = load_unaligned_u8_4x1(s + 2 * src_stride);
   1498      uint8x8_t t3 = load_unaligned_u8_4x1(s + 3 * src_stride);
   1499      uint8x8_t t4 = load_unaligned_u8_4x1(s + 4 * src_stride);
   1500 
   1501      int16x4_t s0 = vreinterpret_s16_u16(vget_low_u16(vmovl_u8(t0)));
   1502      int16x4_t s1 = vreinterpret_s16_u16(vget_low_u16(vmovl_u8(t1)));
   1503      int16x4_t s2 = vreinterpret_s16_u16(vget_low_u16(vmovl_u8(t2)));
   1504      int16x4_t s3 = vreinterpret_s16_u16(vget_low_u16(vmovl_u8(t3)));
   1505      int16x4_t s4 = vreinterpret_s16_u16(vget_low_u16(vmovl_u8(t4)));
   1506 
   1507      s += 5 * src_stride;
   1508 
   1509      do {
   1510 #if AOM_ARCH_AARCH64
   1511        t0 = load_unaligned_u8_4x1(s + 0 * src_stride);
   1512        t1 = load_unaligned_u8_4x1(s + 1 * src_stride);
   1513        t2 = load_unaligned_u8_4x1(s + 2 * src_stride);
   1514        t3 = load_unaligned_u8_4x1(s + 3 * src_stride);
   1515 
   1516        int16x4_t s5 = vreinterpret_s16_u16(vget_low_u16(vmovl_u8(t0)));
   1517        int16x4_t s6 = vreinterpret_s16_u16(vget_low_u16(vmovl_u8(t1)));
   1518        int16x4_t s7 = vreinterpret_s16_u16(vget_low_u16(vmovl_u8(t2)));
   1519        int16x4_t s8 = vreinterpret_s16_u16(vget_low_u16(vmovl_u8(t3)));
   1520 
   1521        uint16x4_t d0 = convolve6_4_y(s0, s1, s2, s3, s4, s5, y_filter,
   1522                                      vget_low_s16(round_offset_vec));
   1523        uint16x4_t d1 = convolve6_4_y(s1, s2, s3, s4, s5, s6, y_filter,
   1524                                      vget_low_s16(round_offset_vec));
   1525        uint16x4_t d2 = convolve6_4_y(s2, s3, s4, s5, s6, s7, y_filter,
   1526                                      vget_low_s16(round_offset_vec));
   1527        uint16x4_t d3 = convolve6_4_y(s3, s4, s5, s6, s7, s8, y_filter,
   1528                                      vget_low_s16(round_offset_vec));
   1529 
   1530        uint16x4_t dd0, dd1, dd2, dd3;
   1531        load_u16_4x4(d, dst_stride, &dd0, &dd1, &dd2, &dd3);
   1532 
   1533        uint8x8_t d01, d23;
   1534        compute_basic_avg_4x4(dd0, dd1, dd2, dd3, d0, d1, d2, d3,
   1535                              round_offset_vec, &d01, &d23);
   1536 
   1537        store_u8x4_strided_x2(d_u8 + 0 * dst8_stride, dst8_stride, d01);
   1538        store_u8x4_strided_x2(d_u8 + 2 * dst8_stride, dst8_stride, d23);
   1539 
   1540        s0 = s4;
   1541        s1 = s5;
   1542        s2 = s6;
   1543        s3 = s7;
   1544        s4 = s8;
   1545        s += 4 * src_stride;
   1546        d += 4 * dst_stride;
   1547        d_u8 += 4 * dst8_stride;
   1548        height -= 4;
   1549 #else   // !AOM_ARCH_AARCH64
   1550        t0 = load_unaligned_u8_4x1(s);
   1551        int16x4_t s5 = vreinterpret_s16_u16(vget_low_u16(vmovl_u8(t0)));
   1552 
   1553        uint16x4_t d0 = convolve6_4_y(s0, s1, s2, s3, s4, s5, y_filter,
   1554                                      vget_low_s16(round_offset_vec));
   1555 
   1556        uint16x4_t dd0 = vld1_u16(d);
   1557 
   1558        uint8x8_t d01;
   1559        compute_basic_avg_4x1(dd0, d0, vget_low_s16(round_offset_vec), &d01);
   1560 
   1561        store_u8_4x1(d_u8, d01);
   1562 
   1563        s0 = s1;
   1564        s1 = s2;
   1565        s2 = s3;
   1566        s3 = s4;
   1567        s4 = s5;
   1568        s += src_stride;
   1569        d += dst_stride;
   1570        d_u8 += dst8_stride;
   1571        height--;
   1572 #endif  // AOM_ARCH_AARCH64
   1573      } while (height != 0);
   1574      src_ptr += 4;
   1575      dst_ptr += 4;
   1576      dst8_ptr += 4;
   1577      width -= 4;
   1578    } while (width != 0);
   1579  } else {
   1580    do {
   1581      const uint8_t *s = src_ptr + (5 * src_stride);
   1582      CONV_BUF_TYPE *d = dst_ptr;
   1583      uint8_t *d_u8 = dst8_ptr;
   1584      int height = h;
   1585 
   1586      uint8x8_t t0, t1, t2, t3, t4;
   1587      load_u8_8x5(src_ptr, src_stride, &t0, &t1, &t2, &t3, &t4);
   1588 
   1589      int16x8_t s0 = vreinterpretq_s16_u16(vmovl_u8(t0));
   1590      int16x8_t s1 = vreinterpretq_s16_u16(vmovl_u8(t1));
   1591      int16x8_t s2 = vreinterpretq_s16_u16(vmovl_u8(t2));
   1592      int16x8_t s3 = vreinterpretq_s16_u16(vmovl_u8(t3));
   1593      int16x8_t s4 = vreinterpretq_s16_u16(vmovl_u8(t4));
   1594 
   1595      do {
   1596 #if AOM_ARCH_AARCH64
   1597        uint8x8_t t5, t6, t7;
   1598        load_u8_8x8(s, src_stride, &t0, &t1, &t2, &t3, &t4, &t5, &t6, &t7);
   1599 
   1600        int16x8_t s5 = vreinterpretq_s16_u16(vmovl_u8(t0));
   1601        int16x8_t s6 = vreinterpretq_s16_u16(vmovl_u8(t1));
   1602        int16x8_t s7 = vreinterpretq_s16_u16(vmovl_u8(t2));
   1603        int16x8_t s8 = vreinterpretq_s16_u16(vmovl_u8(t3));
   1604        int16x8_t s9 = vreinterpretq_s16_u16(vmovl_u8(t4));
   1605        int16x8_t s10 = vreinterpretq_s16_u16(vmovl_u8(t5));
   1606        int16x8_t s11 = vreinterpretq_s16_u16(vmovl_u8(t6));
   1607        int16x8_t s12 = vreinterpretq_s16_u16(vmovl_u8(t7));
   1608 
   1609        uint16x8_t d0 =
   1610            convolve6_8_y(s0, s1, s2, s3, s4, s5, y_filter, round_offset_vec);
   1611        uint16x8_t d1 =
   1612            convolve6_8_y(s1, s2, s3, s4, s5, s6, y_filter, round_offset_vec);
   1613        uint16x8_t d2 =
   1614            convolve6_8_y(s2, s3, s4, s5, s6, s7, y_filter, round_offset_vec);
   1615        uint16x8_t d3 =
   1616            convolve6_8_y(s3, s4, s5, s6, s7, s8, y_filter, round_offset_vec);
   1617        uint16x8_t d4 =
   1618            convolve6_8_y(s4, s5, s6, s7, s8, s9, y_filter, round_offset_vec);
   1619        uint16x8_t d5 =
   1620            convolve6_8_y(s5, s6, s7, s8, s9, s10, y_filter, round_offset_vec);
   1621        uint16x8_t d6 =
   1622            convolve6_8_y(s6, s7, s8, s9, s10, s11, y_filter, round_offset_vec);
   1623        uint16x8_t d7 = convolve6_8_y(s7, s8, s9, s10, s11, s12, y_filter,
   1624                                      round_offset_vec);
   1625 
   1626        uint16x8_t dd0, dd1, dd2, dd3;
   1627        load_u16_8x4(d, dst_stride, &dd0, &dd1, &dd2, &dd3);
   1628 
   1629        uint8x8_t d0_u8, d1_u8, d2_u8, d3_u8;
   1630        compute_basic_avg_8x4(dd0, dd1, dd2, dd3, d0, d1, d2, d3,
   1631                              round_offset_vec, &d0_u8, &d1_u8, &d2_u8, &d3_u8);
   1632 
   1633        store_u8_8x4(d_u8, dst8_stride, d0_u8, d1_u8, d2_u8, d3_u8);
   1634        d_u8 += 4 * dst8_stride;
   1635 
   1636        uint16x8_t dd4, dd5, dd6, dd7;
   1637        load_u16_8x4(d + 4 * dst_stride, dst_stride, &dd4, &dd5, &dd6, &dd7);
   1638 
   1639        uint8x8_t d4_u8, d5_u8, d6_u8, d7_u8;
   1640        compute_basic_avg_8x4(dd4, dd5, dd6, dd7, d4, d5, d6, d7,
   1641                              round_offset_vec, &d4_u8, &d5_u8, &d6_u8, &d7_u8);
   1642 
   1643        store_u8_8x4(d_u8, dst8_stride, d4_u8, d5_u8, d6_u8, d7_u8);
   1644        d_u8 += 4 * dst8_stride;
   1645 
   1646        s0 = s8;
   1647        s1 = s9;
   1648        s2 = s10;
   1649        s3 = s11;
   1650        s4 = s12;
   1651        s += 8 * src_stride;
   1652        d += 8 * dst_stride;
   1653        height -= 8;
   1654 #else   // !AOM_ARCH_AARCH64
   1655        int16x8_t s5 = vreinterpretq_s16_u16(vmovl_u8(vld1_u8(s)));
   1656 
   1657        uint16x8_t d0 =
   1658            convolve6_8_y(s0, s1, s2, s3, s4, s5, y_filter, round_offset_vec);
   1659 
   1660        s0 = s1;
   1661        s1 = s2;
   1662        s2 = s3;
   1663        s3 = s4;
   1664        s4 = s5;
   1665 
   1666        uint16x8_t dd0 = vld1q_u16(d);
   1667 
   1668        uint8x8_t d0_u8;
   1669        compute_basic_avg_8x1(dd0, d0, round_offset_vec, &d0_u8);
   1670 
   1671        vst1_u8(d_u8, d0_u8);
   1672        d_u8 += dst8_stride;
   1673 
   1674        s += src_stride;
   1675        d += dst_stride;
   1676        height--;
   1677 #endif  // AOM_ARCH_AARCH64
   1678      } while (height != 0);
   1679      src_ptr += 8;
   1680      dst_ptr += 8;
   1681      dst8_ptr += 8;
   1682      width -= 8;
   1683    } while (width != 0);
   1684  }
   1685 }
   1686 
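// First pass only: vertical 6-tap convolution whose results are stored
// straight to the CONV_BUF_TYPE buffer for a later averaging pass; no
// 8-bit output is produced here.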
   1687 static inline void dist_wtd_convolve_y_6tap_neon(const uint8_t *src_ptr,
   1688                                                 int src_stride, int w, int h,
   1689                                                 const int16x8_t y_filter,
   1690                                                 ConvolveParams *conv_params) {
   1691  const int bd = 8;
   1692  const int offset_bits = bd + 2 * FILTER_BITS - ROUND0_BITS;
   1693  const int16_t round_offset = (1 << (offset_bits - COMPOUND_ROUND1_BITS)) +
   1694                               (1 << (offset_bits - COMPOUND_ROUND1_BITS - 1));
   1695  const int16x8_t round_offset_vec = vdupq_n_s16(round_offset);
   1696 
   1697  CONV_BUF_TYPE *dst_ptr = conv_params->dst;
   1698  const int dst_stride = conv_params->dst_stride;
   1699  int width = w;
   1700 
   1701  if (w == 4 || h == 4) {
   1702    do {
   1703      const uint8_t *s = src_ptr;
   1704      CONV_BUF_TYPE *d = dst_ptr;
   1705      int height = h;
   1706 
   1707      uint8x8_t t0 = load_unaligned_u8_4x1(s + 0 * src_stride);
   1708      uint8x8_t t1 = load_unaligned_u8_4x1(s + 1 * src_stride);
   1709      uint8x8_t t2 = load_unaligned_u8_4x1(s + 2 * src_stride);
   1710      uint8x8_t t3 = load_unaligned_u8_4x1(s + 3 * src_stride);
   1711      uint8x8_t t4 = load_unaligned_u8_4x1(s + 4 * src_stride);
   1712 
   1713      int16x4_t s0 = vreinterpret_s16_u16(vget_low_u16(vmovl_u8(t0)));
   1714      int16x4_t s1 = vreinterpret_s16_u16(vget_low_u16(vmovl_u8(t1)));
   1715      int16x4_t s2 = vreinterpret_s16_u16(vget_low_u16(vmovl_u8(t2)));
   1716      int16x4_t s3 = vreinterpret_s16_u16(vget_low_u16(vmovl_u8(t3)));
   1717      int16x4_t s4 = vreinterpret_s16_u16(vget_low_u16(vmovl_u8(t4)));
   1718 
   1719      s += 5 * src_stride;
   1720 
   1721      do {
   1722 #if AOM_ARCH_AARCH64
   1723        t0 = load_unaligned_u8_4x1(s + 0 * src_stride);
   1724        t1 = load_unaligned_u8_4x1(s + 1 * src_stride);
   1725        t2 = load_unaligned_u8_4x1(s + 2 * src_stride);
   1726        t3 = load_unaligned_u8_4x1(s + 3 * src_stride);
   1727 
   1728        int16x4_t s5 = vreinterpret_s16_u16(vget_low_u16(vmovl_u8(t0)));
   1729        int16x4_t s6 = vreinterpret_s16_u16(vget_low_u16(vmovl_u8(t1)));
   1730        int16x4_t s7 = vreinterpret_s16_u16(vget_low_u16(vmovl_u8(t2)));
   1731        int16x4_t s8 = vreinterpret_s16_u16(vget_low_u16(vmovl_u8(t3)));
   1732 
   1733        uint16x4_t d0 = convolve6_4_y(s0, s1, s2, s3, s4, s5, y_filter,
   1734                                      vget_low_s16(round_offset_vec));
   1735        uint16x4_t d1 = convolve6_4_y(s1, s2, s3, s4, s5, s6, y_filter,
   1736                                      vget_low_s16(round_offset_vec));
   1737        uint16x4_t d2 = convolve6_4_y(s2, s3, s4, s5, s6, s7, y_filter,
   1738                                      vget_low_s16(round_offset_vec));
   1739        uint16x4_t d3 = convolve6_4_y(s3, s4, s5, s6, s7, s8, y_filter,
   1740                                      vget_low_s16(round_offset_vec));
   1741 
   1742        store_u16_4x4(d, dst_stride, d0, d1, d2, d3);
   1743 
   1744        s0 = s4;
   1745        s1 = s5;
   1746        s2 = s6;
   1747        s3 = s7;
   1748        s4 = s8;
   1749        s += 4 * src_stride;
   1750        d += 4 * dst_stride;
   1751        height -= 4;
   1752 #else   // !AOM_ARCH_AARCH64
   1753        t0 = load_unaligned_u8_4x1(s);
   1754        int16x4_t s5 = vreinterpret_s16_u16(vget_low_u16(vmovl_u8(t0)));
   1755 
   1756        uint16x4_t d0 = convolve6_4_y(s0, s1, s2, s3, s4, s5, y_filter,
   1757                                      vget_low_s16(round_offset_vec));
   1758 
   1759        vst1_u16(d, d0);
   1760 
   1761        s0 = s1;
   1762        s1 = s2;
   1763        s2 = s3;
   1764        s3 = s4;
   1765        s4 = s5;
   1766        s += src_stride;
   1767        d += dst_stride;
   1768        height--;
   1769 #endif  // AOM_ARCH_AARCH64
   1770      } while (height != 0);
   1771      src_ptr += 4;
   1772      dst_ptr += 4;
   1773      width -= 4;
   1774    } while (width != 0);
   1775  } else {
   1776    do {
   1777      const uint8_t *s = src_ptr + (5 * src_stride);
   1778      CONV_BUF_TYPE *d = dst_ptr;
   1779      int height = h;
   1780 
   1781      uint8x8_t t0, t1, t2, t3, t4;
   1782      load_u8_8x5(src_ptr, src_stride, &t0, &t1, &t2, &t3, &t4);
   1783 
   1784      int16x8_t s0 = vreinterpretq_s16_u16(vmovl_u8(t0));
   1785      int16x8_t s1 = vreinterpretq_s16_u16(vmovl_u8(t1));
   1786      int16x8_t s2 = vreinterpretq_s16_u16(vmovl_u8(t2));
   1787      int16x8_t s3 = vreinterpretq_s16_u16(vmovl_u8(t3));
   1788      int16x8_t s4 = vreinterpretq_s16_u16(vmovl_u8(t4));
   1789 
   1790      do {
   1791 #if AOM_ARCH_AARCH64
   1792        uint8x8_t t5, t6, t7;
   1793        load_u8_8x8(s, src_stride, &t0, &t1, &t2, &t3, &t4, &t5, &t6, &t7);
   1794 
   1795        int16x8_t s5 = vreinterpretq_s16_u16(vmovl_u8(t0));
   1796        int16x8_t s6 = vreinterpretq_s16_u16(vmovl_u8(t1));
   1797        int16x8_t s7 = vreinterpretq_s16_u16(vmovl_u8(t2));
   1798        int16x8_t s8 = vreinterpretq_s16_u16(vmovl_u8(t3));
   1799        int16x8_t s9 = vreinterpretq_s16_u16(vmovl_u8(t4));
   1800        int16x8_t s10 = vreinterpretq_s16_u16(vmovl_u8(t5));
   1801        int16x8_t s11 = vreinterpretq_s16_u16(vmovl_u8(t6));
   1802        int16x8_t s12 = vreinterpretq_s16_u16(vmovl_u8(t7));
   1803 
   1804        uint16x8_t d0 =
   1805            convolve6_8_y(s0, s1, s2, s3, s4, s5, y_filter, round_offset_vec);
   1806        uint16x8_t d1 =
   1807            convolve6_8_y(s1, s2, s3, s4, s5, s6, y_filter, round_offset_vec);
   1808        uint16x8_t d2 =
   1809            convolve6_8_y(s2, s3, s4, s5, s6, s7, y_filter, round_offset_vec);
   1810        uint16x8_t d3 =
   1811            convolve6_8_y(s3, s4, s5, s6, s7, s8, y_filter, round_offset_vec);
   1812        uint16x8_t d4 =
   1813            convolve6_8_y(s4, s5, s6, s7, s8, s9, y_filter, round_offset_vec);
   1814        uint16x8_t d5 =
   1815            convolve6_8_y(s5, s6, s7, s8, s9, s10, y_filter, round_offset_vec);
   1816        uint16x8_t d6 =
   1817            convolve6_8_y(s6, s7, s8, s9, s10, s11, y_filter, round_offset_vec);
   1818        uint16x8_t d7 = convolve6_8_y(s7, s8, s9, s10, s11, s12, y_filter,
   1819                                      round_offset_vec);
   1820 
   1821        store_u16_8x8(d, dst_stride, d0, d1, d2, d3, d4, d5, d6, d7);
   1822 
   1823        s0 = s8;
   1824        s1 = s9;
   1825        s2 = s10;
   1826        s3 = s11;
   1827        s4 = s12;
   1828        s += 8 * src_stride;
   1829        d += 8 * dst_stride;
   1830        height -= 8;
   1831 #else   // !AOM_ARCH_AARCH64
   1832        int16x8_t s5 = vreinterpretq_s16_u16(vmovl_u8(vld1_u8(s)));
   1833 
   1834        uint16x8_t d0 =
   1835            convolve6_8_y(s0, s1, s2, s3, s4, s5, y_filter, round_offset_vec);
   1836 
   1837        s0 = s1;
   1838        s1 = s2;
   1839        s2 = s3;
   1840        s3 = s4;
   1841        s4 = s5;
   1842 
   1843        vst1q_u16(d, d0);
   1844 
   1845        s += src_stride;
   1846        d += dst_stride;
   1847        height--;
   1848 #endif  // AOM_ARCH_AARCH64
   1849      } while (height != 0);
   1850      src_ptr += 8;
   1851      dst_ptr += 8;
   1852      width -= 8;
   1853    } while (width != 0);
   1854  }
   1855 }
   1856 
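// Vertical 8-tap convolution of four columns; all eight filter taps are
// applied before the rounding shift-and-accumulate onto round_offset.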
   1857 static inline uint16x4_t convolve8_4_y(const int16x4_t s0, const int16x4_t s1,
   1858                                       const int16x4_t s2, const int16x4_t s3,
   1859                                       const int16x4_t s4, const int16x4_t s5,
   1860                                       const int16x4_t s6, const int16x4_t s7,
   1861                                       const int16x8_t y_filter,
   1862                                       const int16x4_t round_offset) {
   1863  const int16x4_t y_filter_0_3 = vget_low_s16(y_filter);
   1864  const int16x4_t y_filter_4_7 = vget_high_s16(y_filter);
   1865 
   1866  int16x4_t sum = vmul_lane_s16(s0, y_filter_0_3, 0);
   1867  sum = vmla_lane_s16(sum, s1, y_filter_0_3, 1);
   1868  sum = vmla_lane_s16(sum, s2, y_filter_0_3, 2);
   1869  sum = vmla_lane_s16(sum, s3, y_filter_0_3, 3);
   1870  sum = vmla_lane_s16(sum, s4, y_filter_4_7, 0);
   1871  sum = vmla_lane_s16(sum, s5, y_filter_4_7, 1);
   1872  sum = vmla_lane_s16(sum, s6, y_filter_4_7, 2);
   1873  sum = vmla_lane_s16(sum, s7, y_filter_4_7, 3);
   1874 
   1875  // We halved the convolution filter values so -1 from the right shift.
   1876  int16x4_t res = vrsra_n_s16(round_offset, sum, ROUND0_BITS - 1);
   1877  return vreinterpret_u16_s16(res);
   1878 }
   1879 
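// Eight-column variant of convolve8_4_y.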
   1880 static inline uint16x8_t convolve8_8_y(const int16x8_t s0, const int16x8_t s1,
   1881                                       const int16x8_t s2, const int16x8_t s3,
   1882                                       const int16x8_t s4, const int16x8_t s5,
   1883                                       const int16x8_t s6, const int16x8_t s7,
   1884                                       const int16x8_t y_filter,
   1885                                       const int16x8_t round_offset) {
   1886  const int16x4_t y_filter_0_3 = vget_low_s16(y_filter);
   1887  const int16x4_t y_filter_4_7 = vget_high_s16(y_filter);
   1888 
   1889  int16x8_t sum = vmulq_lane_s16(s0, y_filter_0_3, 0);
   1890  sum = vmlaq_lane_s16(sum, s1, y_filter_0_3, 1);
   1891  sum = vmlaq_lane_s16(sum, s2, y_filter_0_3, 2);
   1892  sum = vmlaq_lane_s16(sum, s3, y_filter_0_3, 3);
   1893  sum = vmlaq_lane_s16(sum, s4, y_filter_4_7, 0);
   1894  sum = vmlaq_lane_s16(sum, s5, y_filter_4_7, 1);
   1895  sum = vmlaq_lane_s16(sum, s6, y_filter_4_7, 2);
   1896  sum = vmlaq_lane_s16(sum, s7, y_filter_4_7, 3);
   1897 
   1898  // We halved the convolution filter values so -1 from the right shift.
   1899  int16x8_t res = vrsraq_n_s16(round_offset, sum, ROUND0_BITS - 1);
   1900  return vreinterpretq_u16_s16(res);
   1901 }
   1902 
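// 8-tap counterpart of dist_wtd_convolve_y_6tap_dist_wtd_avg_neon: seven
// rows of context (s0..s6) stay live across iterations. On AArch64 the
// inner loops produce four (4-wide) or eight (8-wide) rows per iteration;
// other targets fall back to one row at a time.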
   1903 static inline void dist_wtd_convolve_y_8tap_dist_wtd_avg_neon(
   1904    const uint8_t *src_ptr, int src_stride, uint8_t *dst8_ptr,
   1905    const int dst8_stride, int w, int h, const int16x8_t y_filter,
   1906    ConvolveParams *conv_params) {
   1907  const int bd = 8;
   1908  const int offset_bits = bd + 2 * FILTER_BITS - ROUND0_BITS;
   1909  const int16_t round_offset = (1 << (offset_bits - COMPOUND_ROUND1_BITS)) +
   1910                               (1 << (offset_bits - COMPOUND_ROUND1_BITS - 1));
   1911  const int16x8_t round_offset_vec = vdupq_n_s16(round_offset);
   1912 
   1913  const uint16_t fwd_offset = conv_params->fwd_offset;
   1914  const uint16_t bck_offset = conv_params->bck_offset;
   1915 
   1916  CONV_BUF_TYPE *dst_ptr = conv_params->dst;
   1917  const int dst_stride = conv_params->dst_stride;
   1918  int width = w;
   1919 
   1920  if (w == 4 || h == 4) {
   1921    do {
   1922      const uint8_t *s = src_ptr;
   1923      CONV_BUF_TYPE *d = dst_ptr;
   1924      uint8_t *d_u8 = dst8_ptr;
   1925      int height = h;
   1926 
   1927      __builtin_prefetch(s + 0 * src_stride);
   1928      __builtin_prefetch(s + 1 * src_stride);
   1929      __builtin_prefetch(s + 2 * src_stride);
   1930      __builtin_prefetch(s + 3 * src_stride);
   1931 
   1932      uint8x8_t t0 = load_unaligned_u8_4x1(s + 0 * src_stride);
   1933      uint8x8_t t1 = load_unaligned_u8_4x1(s + 1 * src_stride);
   1934      uint8x8_t t2 = load_unaligned_u8_4x1(s + 2 * src_stride);
   1935      uint8x8_t t3 = load_unaligned_u8_4x1(s + 3 * src_stride);
   1936      uint8x8_t t4 = load_unaligned_u8_4x1(s + 4 * src_stride);
   1937      uint8x8_t t5 = load_unaligned_u8_4x1(s + 5 * src_stride);
   1938      uint8x8_t t6 = load_unaligned_u8_4x1(s + 6 * src_stride);
   1939 
   1940      int16x4_t s0 = vreinterpret_s16_u16(vget_low_u16(vmovl_u8(t0)));
   1941      int16x4_t s1 = vreinterpret_s16_u16(vget_low_u16(vmovl_u8(t1)));
   1942      int16x4_t s2 = vreinterpret_s16_u16(vget_low_u16(vmovl_u8(t2)));
   1943      int16x4_t s3 = vreinterpret_s16_u16(vget_low_u16(vmovl_u8(t3)));
   1944      int16x4_t s4 = vreinterpret_s16_u16(vget_low_u16(vmovl_u8(t4)));
   1945      int16x4_t s5 = vreinterpret_s16_u16(vget_low_u16(vmovl_u8(t5)));
   1946      int16x4_t s6 = vreinterpret_s16_u16(vget_low_u16(vmovl_u8(t6)));
   1947 
   1948      __builtin_prefetch(d + 0 * dst_stride);
   1949      __builtin_prefetch(d + 1 * dst_stride);
   1950      __builtin_prefetch(d + 2 * dst_stride);
   1951      __builtin_prefetch(d + 3 * dst_stride);
   1952 
   1953      s += 7 * src_stride;
   1954 
   1955      do {
   1956 #if AOM_ARCH_AARCH64
   1957        t0 = load_unaligned_u8_4x1(s + 0 * src_stride);
   1958        t1 = load_unaligned_u8_4x1(s + 1 * src_stride);
   1959        t2 = load_unaligned_u8_4x1(s + 2 * src_stride);
   1960        t3 = load_unaligned_u8_4x1(s + 3 * src_stride);
   1961 
   1962        int16x4_t s7 = vreinterpret_s16_u16(vget_low_u16(vmovl_u8(t0)));
   1963        int16x4_t s8 = vreinterpret_s16_u16(vget_low_u16(vmovl_u8(t1)));
   1964        int16x4_t s9 = vreinterpret_s16_u16(vget_low_u16(vmovl_u8(t2)));
   1965        int16x4_t s10 = vreinterpret_s16_u16(vget_low_u16(vmovl_u8(t3)));
   1966 
   1967        uint16x4_t d0 = convolve8_4_y(s0, s1, s2, s3, s4, s5, s6, s7, y_filter,
   1968                                      vget_low_s16(round_offset_vec));
   1969        uint16x4_t d1 = convolve8_4_y(s1, s2, s3, s4, s5, s6, s7, s8, y_filter,
   1970                                      vget_low_s16(round_offset_vec));
   1971        uint16x4_t d2 = convolve8_4_y(s2, s3, s4, s5, s6, s7, s8, s9, y_filter,
   1972                                      vget_low_s16(round_offset_vec));
   1973        uint16x4_t d3 = convolve8_4_y(s3, s4, s5, s6, s7, s8, s9, s10, y_filter,
   1974                                      vget_low_s16(round_offset_vec));
   1975 
   1976        __builtin_prefetch(d + 0 * dst_stride);
   1977        __builtin_prefetch(d + 1 * dst_stride);
   1978        __builtin_prefetch(d + 2 * dst_stride);
   1979        __builtin_prefetch(d + 3 * dst_stride);
   1980 
   1981        __builtin_prefetch(d_u8 + 0 * dst8_stride);
   1982        __builtin_prefetch(d_u8 + 1 * dst8_stride);
   1983        __builtin_prefetch(d_u8 + 2 * dst8_stride);
   1984        __builtin_prefetch(d_u8 + 3 * dst8_stride);
   1985 
   1986        uint16x4_t dd0, dd1, dd2, dd3;
   1987        load_u16_4x4(d, dst_stride, &dd0, &dd1, &dd2, &dd3);
   1988 
   1989        uint8x8_t d01, d23;
   1990        compute_dist_wtd_avg_4x4(dd0, dd1, dd2, dd3, d0, d1, d2, d3, fwd_offset,
   1991                                 bck_offset, round_offset_vec, &d01, &d23);
   1992 
   1993        store_u8x4_strided_x2(d_u8 + 0 * dst8_stride, dst8_stride, d01);
   1994        store_u8x4_strided_x2(d_u8 + 2 * dst8_stride, dst8_stride, d23);
   1995 
   1996        s0 = s4;
   1997        s1 = s5;
   1998        s2 = s6;
   1999        s3 = s7;
   2000        s4 = s8;
   2001        s5 = s9;
   2002        s6 = s10;
   2003        s += 4 * src_stride;
   2004        d += 4 * dst_stride;
   2005        d_u8 += 4 * dst8_stride;
   2006        height -= 4;
   2007 #else   // !AOM_ARCH_AARCH64
   2008        t0 = load_unaligned_u8_4x1(s);
   2009        int16x4_t s7 = vreinterpret_s16_u16(vget_low_u16(vmovl_u8(t0)));
   2010 
   2011        uint16x4_t d0 = convolve8_4_y(s0, s1, s2, s3, s4, s5, s6, s7, y_filter,
   2012                                      vget_low_s16(round_offset_vec));
   2013 
   2014        __builtin_prefetch(d);
   2015 
   2016        uint16x4_t dd0 = vld1_u16(d);
   2017 
   2018        uint8x8_t d01;
   2019        compute_dist_wtd_avg_4x1(dd0, d0, fwd_offset, bck_offset,
   2020                                 vget_low_s16(round_offset_vec), &d01);
   2021 
   2022        store_u8_4x1(d_u8, d01);
   2023 
   2024        s0 = s1;
   2025        s1 = s2;
   2026        s2 = s3;
   2027        s3 = s4;
   2028        s4 = s5;
   2029        s5 = s6;
   2030        s6 = s7;
   2031        s += src_stride;
   2032        d += dst_stride;
   2033        d_u8 += dst8_stride;
   2034        height--;
   2035 #endif  // AOM_ARCH_AARCH64
   2036      } while (height != 0);
   2037      src_ptr += 4;
   2038      dst_ptr += 4;
   2039      dst8_ptr += 4;
   2040      width -= 4;
   2041    } while (width != 0);
   2042  } else {
   2043    do {
   2044      const uint8_t *s = src_ptr;
   2045      CONV_BUF_TYPE *d = dst_ptr;
   2046      uint8_t *d_u8 = dst8_ptr;
   2047      int height = h;
   2048 
   2049      __builtin_prefetch(s + 0 * src_stride);
   2050      __builtin_prefetch(s + 1 * src_stride);
   2051      __builtin_prefetch(s + 2 * src_stride);
   2052      __builtin_prefetch(s + 3 * src_stride);
   2053      __builtin_prefetch(s + 4 * src_stride);
   2054      __builtin_prefetch(s + 5 * src_stride);
   2055      __builtin_prefetch(s + 6 * src_stride);
   2056      __builtin_prefetch(s + 7 * src_stride);
   2057 
   2058      uint8x8_t t0, t1, t2, t3, t4, t5, t6;
   2059      load_u8_8x7(s, src_stride, &t0, &t1, &t2, &t3, &t4, &t5, &t6);
   2060 
   2061      int16x8_t s0 = vreinterpretq_s16_u16(vmovl_u8(t0));
   2062      int16x8_t s1 = vreinterpretq_s16_u16(vmovl_u8(t1));
   2063      int16x8_t s2 = vreinterpretq_s16_u16(vmovl_u8(t2));
   2064      int16x8_t s3 = vreinterpretq_s16_u16(vmovl_u8(t3));
   2065      int16x8_t s4 = vreinterpretq_s16_u16(vmovl_u8(t4));
   2066      int16x8_t s5 = vreinterpretq_s16_u16(vmovl_u8(t5));
   2067      int16x8_t s6 = vreinterpretq_s16_u16(vmovl_u8(t6));
   2068 
   2069      s += 7 * src_stride;
   2070 
   2071      do {
   2072 #if AOM_ARCH_AARCH64
   2073        uint8x8_t t7;
   2074        load_u8_8x8(s, src_stride, &t0, &t1, &t2, &t3, &t4, &t5, &t6, &t7);
   2075 
   2076        int16x8_t s7 = vreinterpretq_s16_u16(vmovl_u8(t0));
   2077        int16x8_t s8 = vreinterpretq_s16_u16(vmovl_u8(t1));
   2078        int16x8_t s9 = vreinterpretq_s16_u16(vmovl_u8(t2));
   2079        int16x8_t s10 = vreinterpretq_s16_u16(vmovl_u8(t3));
   2080        int16x8_t s11 = vreinterpretq_s16_u16(vmovl_u8(t4));
   2081        int16x8_t s12 = vreinterpretq_s16_u16(vmovl_u8(t5));
   2082        int16x8_t s13 = vreinterpretq_s16_u16(vmovl_u8(t6));
   2083        int16x8_t s14 = vreinterpretq_s16_u16(vmovl_u8(t7));
   2084 
   2085        __builtin_prefetch(dst_ptr + 0 * dst_stride);
   2086        __builtin_prefetch(dst_ptr + 1 * dst_stride);
   2087        __builtin_prefetch(dst_ptr + 2 * dst_stride);
   2088        __builtin_prefetch(dst_ptr + 3 * dst_stride);
   2089 
   2090        uint16x8_t d0 = convolve8_8_y(s0, s1, s2, s3, s4, s5, s6, s7, y_filter,
   2091                                      round_offset_vec);
   2092        uint16x8_t d1 = convolve8_8_y(s1, s2, s3, s4, s5, s6, s7, s8, y_filter,
   2093                                      round_offset_vec);
   2094        uint16x8_t d2 = convolve8_8_y(s2, s3, s4, s5, s6, s7, s8, s9, y_filter,
   2095                                      round_offset_vec);
   2096        uint16x8_t d3 = convolve8_8_y(s3, s4, s5, s6, s7, s8, s9, s10, y_filter,
   2097                                      round_offset_vec);
   2098        uint16x8_t d4 = convolve8_8_y(s4, s5, s6, s7, s8, s9, s10, s11,
   2099                                      y_filter, round_offset_vec);
   2100        uint16x8_t d5 = convolve8_8_y(s5, s6, s7, s8, s9, s10, s11, s12,
   2101                                      y_filter, round_offset_vec);
   2102        uint16x8_t d6 = convolve8_8_y(s6, s7, s8, s9, s10, s11, s12, s13,
   2103                                      y_filter, round_offset_vec);
   2104        uint16x8_t d7 = convolve8_8_y(s7, s8, s9, s10, s11, s12, s13, s14,
   2105                                      y_filter, round_offset_vec);
   2106 
   2107        __builtin_prefetch(d + 0 * dst8_stride);
   2108        __builtin_prefetch(d + 1 * dst8_stride);
   2109        __builtin_prefetch(d + 2 * dst8_stride);
   2110        __builtin_prefetch(d + 3 * dst8_stride);
   2111 
   2112        uint16x8_t dd0, dd1, dd2, dd3;
   2113        load_u16_8x4(d, dst_stride, &dd0, &dd1, &dd2, &dd3);
   2114 
   2115        uint8x8_t d0_u8, d1_u8, d2_u8, d3_u8;
   2116        compute_dist_wtd_avg_8x4(dd0, dd1, dd2, dd3, d0, d1, d2, d3, fwd_offset,
   2117                                 bck_offset, round_offset_vec, &d0_u8, &d1_u8,
   2118                                 &d2_u8, &d3_u8);
   2119 
   2120        store_u8_8x4(d_u8, dst8_stride, d0_u8, d1_u8, d2_u8, d3_u8);
   2121        d_u8 += 4 * dst8_stride;
   2122 
   2123        uint16x8_t dd4, dd5, dd6, dd7;
   2124        load_u16_8x4(d + 4 * dst_stride, dst_stride, &dd4, &dd5, &dd6, &dd7);
   2125 
   2126        uint8x8_t d4_u8, d5_u8, d6_u8, d7_u8;
   2127        compute_dist_wtd_avg_8x4(dd4, dd5, dd6, dd7, d4, d5, d6, d7, fwd_offset,
   2128                                 bck_offset, round_offset_vec, &d4_u8, &d5_u8,
   2129                                 &d6_u8, &d7_u8);
   2130 
   2131        store_u8_8x4(d_u8, dst8_stride, d4_u8, d5_u8, d6_u8, d7_u8);
   2132        d_u8 += 4 * dst8_stride;
   2133 
   2134        s0 = s8;
   2135        s1 = s9;
   2136        s2 = s10;
   2137        s3 = s11;
   2138        s4 = s12;
   2139        s5 = s13;
   2140        s6 = s14;
   2141        s += 8 * src_stride;
   2142        d += 8 * dst_stride;
   2143        height -= 8;
   2144 #else   // !AOM_ARCH_AARCH64
   2145        int16x8_t s7 = vreinterpretq_s16_u16(vmovl_u8(vld1_u8(s)));
   2146 
   2147        __builtin_prefetch(dst_ptr);
   2148 
   2149        uint16x8_t d0 = convolve8_8_y(s0, s1, s2, s3, s4, s5, s6, s7, y_filter,
   2150                                      round_offset_vec);
   2151 
   2152        s0 = s1;
   2153        s1 = s2;
   2154        s2 = s3;
   2155        s3 = s4;
   2156        s4 = s5;
   2157        s5 = s6;
   2158        s6 = s7;
   2159 
   2160        __builtin_prefetch(d);
   2161 
   2162        uint16x8_t dd0 = vld1q_u16(d);
   2163 
   2164        uint8x8_t d0_u8;
   2165        compute_dist_wtd_avg_8x1(dd0, d0, fwd_offset, bck_offset,
   2166                                 round_offset_vec, &d0_u8);
   2167 
   2168        vst1_u8(d_u8, d0_u8);
   2169        d_u8 += dst8_stride;
   2170 
   2171        s += src_stride;
   2172        d += dst_stride;
   2173        height--;
   2174 #endif  // AOM_ARCH_AARCH64
   2175      } while (height != 0);
   2176      src_ptr += 8;
   2177      dst_ptr += 8;
   2178      dst8_ptr += 8;
   2179      width -= 8;
   2180    } while (width != 0);
   2181  }
   2182 }
   2183 
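// 8-tap vertical convolution with basic compound averaging.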
   2184 static inline void dist_wtd_convolve_y_8tap_avg_neon(
   2185    const uint8_t *src_ptr, int src_stride, uint8_t *dst8_ptr,
   2186    const int dst8_stride, int w, int h, const int16x8_t y_filter,
   2187    ConvolveParams *conv_params) {
   2188  const int bd = 8;
   2189  const int offset_bits = bd + 2 * FILTER_BITS - ROUND0_BITS;
   2190  const int16_t round_offset = (1 << (offset_bits - COMPOUND_ROUND1_BITS)) +
   2191                               (1 << (offset_bits - COMPOUND_ROUND1_BITS - 1));
   2192  const int16x8_t round_offset_vec = vdupq_n_s16(round_offset);
   2193 
   2194  CONV_BUF_TYPE *dst_ptr = conv_params->dst;
   2195  const int dst_stride = conv_params->dst_stride;
   2196  int width = w;
   2197 
   2198  if (w == 4 || h == 4) {
   2199    do {
   2200      const uint8_t *s = src_ptr;
   2201      CONV_BUF_TYPE *d = dst_ptr;
   2202      uint8_t *d_u8 = dst8_ptr;
   2203      int height = h;
   2204 
   2205      __builtin_prefetch(s + 0 * src_stride);
   2206      __builtin_prefetch(s + 1 * src_stride);
   2207      __builtin_prefetch(s + 2 * src_stride);
   2208      __builtin_prefetch(s + 3 * src_stride);
   2209 
   2210      uint8x8_t t0 = load_unaligned_u8_4x1(s + 0 * src_stride);
   2211      uint8x8_t t1 = load_unaligned_u8_4x1(s + 1 * src_stride);
   2212      uint8x8_t t2 = load_unaligned_u8_4x1(s + 2 * src_stride);
   2213      uint8x8_t t3 = load_unaligned_u8_4x1(s + 3 * src_stride);
   2214      uint8x8_t t4 = load_unaligned_u8_4x1(s + 4 * src_stride);
   2215      uint8x8_t t5 = load_unaligned_u8_4x1(s + 5 * src_stride);
   2216      uint8x8_t t6 = load_unaligned_u8_4x1(s + 6 * src_stride);
   2217 
   2218      int16x4_t s0 = vreinterpret_s16_u16(vget_low_u16(vmovl_u8(t0)));
   2219      int16x4_t s1 = vreinterpret_s16_u16(vget_low_u16(vmovl_u8(t1)));
   2220      int16x4_t s2 = vreinterpret_s16_u16(vget_low_u16(vmovl_u8(t2)));
   2221      int16x4_t s3 = vreinterpret_s16_u16(vget_low_u16(vmovl_u8(t3)));
   2222      int16x4_t s4 = vreinterpret_s16_u16(vget_low_u16(vmovl_u8(t4)));
   2223      int16x4_t s5 = vreinterpret_s16_u16(vget_low_u16(vmovl_u8(t5)));
   2224      int16x4_t s6 = vreinterpret_s16_u16(vget_low_u16(vmovl_u8(t6)));
   2225 
   2226      __builtin_prefetch(d + 0 * dst_stride);
   2227      __builtin_prefetch(d + 1 * dst_stride);
   2228      __builtin_prefetch(d + 2 * dst_stride);
   2229      __builtin_prefetch(d + 3 * dst_stride);
   2230 
   2231      s += 7 * src_stride;
   2232 
   2233      do {
   2234 #if AOM_ARCH_AARCH64
   2235        t0 = load_unaligned_u8_4x1(s + 0 * src_stride);
   2236        t1 = load_unaligned_u8_4x1(s + 1 * src_stride);
   2237        t2 = load_unaligned_u8_4x1(s + 2 * src_stride);
   2238        t3 = load_unaligned_u8_4x1(s + 3 * src_stride);
   2239 
   2240        int16x4_t s7 = vreinterpret_s16_u16(vget_low_u16(vmovl_u8(t0)));
   2241        int16x4_t s8 = vreinterpret_s16_u16(vget_low_u16(vmovl_u8(t1)));
   2242        int16x4_t s9 = vreinterpret_s16_u16(vget_low_u16(vmovl_u8(t2)));
   2243        int16x4_t s10 = vreinterpret_s16_u16(vget_low_u16(vmovl_u8(t3)));
   2244 
   2245        uint16x4_t d0 = convolve8_4_y(s0, s1, s2, s3, s4, s5, s6, s7, y_filter,
   2246                                      vget_low_s16(round_offset_vec));
   2247        uint16x4_t d1 = convolve8_4_y(s1, s2, s3, s4, s5, s6, s7, s8, y_filter,
   2248                                      vget_low_s16(round_offset_vec));
   2249        uint16x4_t d2 = convolve8_4_y(s2, s3, s4, s5, s6, s7, s8, s9, y_filter,
   2250                                      vget_low_s16(round_offset_vec));
   2251        uint16x4_t d3 = convolve8_4_y(s3, s4, s5, s6, s7, s8, s9, s10, y_filter,
   2252                                      vget_low_s16(round_offset_vec));
   2253 
   2254        __builtin_prefetch(d + 0 * dst_stride);
   2255        __builtin_prefetch(d + 1 * dst_stride);
   2256        __builtin_prefetch(d + 2 * dst_stride);
   2257        __builtin_prefetch(d + 3 * dst_stride);
   2258 
   2259        __builtin_prefetch(d_u8 + 0 * dst8_stride);
   2260        __builtin_prefetch(d_u8 + 1 * dst8_stride);
   2261        __builtin_prefetch(d_u8 + 2 * dst8_stride);
   2262        __builtin_prefetch(d_u8 + 3 * dst8_stride);
   2263 
   2264        uint16x4_t dd0, dd1, dd2, dd3;
   2265        load_u16_4x4(d, dst_stride, &dd0, &dd1, &dd2, &dd3);
   2266 
   2267        uint8x8_t d01, d23;
   2268        compute_basic_avg_4x4(dd0, dd1, dd2, dd3, d0, d1, d2, d3,
   2269                              round_offset_vec, &d01, &d23);
   2270 
   2271        store_u8x4_strided_x2(d_u8 + 0 * dst8_stride, dst8_stride, d01);
   2272        store_u8x4_strided_x2(d_u8 + 2 * dst8_stride, dst8_stride, d23);
   2273 
   2274        s0 = s4;
   2275        s1 = s5;
   2276        s2 = s6;
   2277        s3 = s7;
   2278        s4 = s8;
   2279        s5 = s9;
   2280        s6 = s10;
   2281        s += 4 * src_stride;
   2282        d += 4 * dst_stride;
   2283        d_u8 += 4 * dst8_stride;
   2284        height -= 4;
   2285 #else   // !AOM_ARCH_AARCH64
   2286        t0 = load_unaligned_u8_4x1(s);
   2287        int16x4_t s7 = vreinterpret_s16_u16(vget_low_u16(vmovl_u8(t0)));
   2288 
   2289        uint16x4_t d0 = convolve8_4_y(s0, s1, s2, s3, s4, s5, s6, s7, y_filter,
   2290                                      vget_low_s16(round_offset_vec));
   2291 
   2292        __builtin_prefetch(d);
   2293 
   2294        uint16x4_t dd0 = vld1_u16(d);
   2295 
   2296        uint8x8_t d01;
   2297        compute_basic_avg_4x1(dd0, d0, vget_low_s16(round_offset_vec), &d01);
   2298 
   2299        store_u8_4x1(d_u8, d01);
   2300 
   2301        s0 = s1;
   2302        s1 = s2;
   2303        s2 = s3;
   2304        s3 = s4;
   2305        s4 = s5;
   2306        s5 = s6;
   2307        s6 = s7;
   2308        s += src_stride;
   2309        d += dst_stride;
   2310        d_u8 += dst8_stride;
   2311        height--;
   2312 #endif  // AOM_ARCH_AARCH64
   2313      } while (height != 0);
   2314      src_ptr += 4;
   2315      dst_ptr += 4;
   2316      dst8_ptr += 4;
   2317      width -= 4;
   2318    } while (width != 0);
   2319  } else {
   2320    do {
   2321      const uint8_t *s = src_ptr;
   2322      CONV_BUF_TYPE *d = dst_ptr;
   2323      uint8_t *d_u8 = dst8_ptr;
   2324      int height = h;
   2325 
   2326      __builtin_prefetch(s + 0 * src_stride);
   2327      __builtin_prefetch(s + 1 * src_stride);
   2328      __builtin_prefetch(s + 2 * src_stride);
   2329      __builtin_prefetch(s + 3 * src_stride);
   2330      __builtin_prefetch(s + 4 * src_stride);
   2331      __builtin_prefetch(s + 5 * src_stride);
   2332      __builtin_prefetch(s + 6 * src_stride);
   2333      __builtin_prefetch(s + 7 * src_stride);
   2334 
   2335      uint8x8_t t0, t1, t2, t3, t4, t5, t6;
   2336      load_u8_8x7(s, src_stride, &t0, &t1, &t2, &t3, &t4, &t5, &t6);
   2337 
   2338      int16x8_t s0 = vreinterpretq_s16_u16(vmovl_u8(t0));
   2339      int16x8_t s1 = vreinterpretq_s16_u16(vmovl_u8(t1));
   2340      int16x8_t s2 = vreinterpretq_s16_u16(vmovl_u8(t2));
   2341      int16x8_t s3 = vreinterpretq_s16_u16(vmovl_u8(t3));
   2342      int16x8_t s4 = vreinterpretq_s16_u16(vmovl_u8(t4));
   2343      int16x8_t s5 = vreinterpretq_s16_u16(vmovl_u8(t5));
   2344      int16x8_t s6 = vreinterpretq_s16_u16(vmovl_u8(t6));
   2345 
   2346      s += 7 * src_stride;
   2347 
   2348      do {
   2349 #if AOM_ARCH_AARCH64
   2350        uint8x8_t t7;
   2351        load_u8_8x8(s, src_stride, &t0, &t1, &t2, &t3, &t4, &t5, &t6, &t7);
   2352 
   2353        int16x8_t s7 = vreinterpretq_s16_u16(vmovl_u8(t0));
   2354        int16x8_t s8 = vreinterpretq_s16_u16(vmovl_u8(t1));
   2355        int16x8_t s9 = vreinterpretq_s16_u16(vmovl_u8(t2));
   2356        int16x8_t s10 = vreinterpretq_s16_u16(vmovl_u8(t3));
   2357        int16x8_t s11 = vreinterpretq_s16_u16(vmovl_u8(t4));
   2358        int16x8_t s12 = vreinterpretq_s16_u16(vmovl_u8(t5));
   2359        int16x8_t s13 = vreinterpretq_s16_u16(vmovl_u8(t6));
   2360        int16x8_t s14 = vreinterpretq_s16_u16(vmovl_u8(t7));
   2361 
   2362        __builtin_prefetch(dst_ptr + 0 * dst_stride);
   2363        __builtin_prefetch(dst_ptr + 1 * dst_stride);
   2364        __builtin_prefetch(dst_ptr + 2 * dst_stride);
   2365        __builtin_prefetch(dst_ptr + 3 * dst_stride);
   2366 
   2367        uint16x8_t d0 = convolve8_8_y(s0, s1, s2, s3, s4, s5, s6, s7, y_filter,
   2368                                      round_offset_vec);
   2369        uint16x8_t d1 = convolve8_8_y(s1, s2, s3, s4, s5, s6, s7, s8, y_filter,
   2370                                      round_offset_vec);
   2371        uint16x8_t d2 = convolve8_8_y(s2, s3, s4, s5, s6, s7, s8, s9, y_filter,
   2372                                      round_offset_vec);
   2373        uint16x8_t d3 = convolve8_8_y(s3, s4, s5, s6, s7, s8, s9, s10, y_filter,
   2374                                      round_offset_vec);
   2375        uint16x8_t d4 = convolve8_8_y(s4, s5, s6, s7, s8, s9, s10, s11,
   2376                                      y_filter, round_offset_vec);
   2377        uint16x8_t d5 = convolve8_8_y(s5, s6, s7, s8, s9, s10, s11, s12,
   2378                                      y_filter, round_offset_vec);
   2379        uint16x8_t d6 = convolve8_8_y(s6, s7, s8, s9, s10, s11, s12, s13,
   2380                                      y_filter, round_offset_vec);
   2381        uint16x8_t d7 = convolve8_8_y(s7, s8, s9, s10, s11, s12, s13, s14,
   2382                                      y_filter, round_offset_vec);
   2383 
   2384        __builtin_prefetch(d + 0 * dst8_stride);
   2385        __builtin_prefetch(d + 1 * dst8_stride);
   2386        __builtin_prefetch(d + 2 * dst8_stride);
   2387        __builtin_prefetch(d + 3 * dst8_stride);
   2388 
   2389        uint16x8_t dd0, dd1, dd2, dd3;
   2390        load_u16_8x4(d, dst_stride, &dd0, &dd1, &dd2, &dd3);
   2391 
   2392        uint8x8_t d0_u8, d1_u8, d2_u8, d3_u8;
   2393        compute_basic_avg_8x4(dd0, dd1, dd2, dd3, d0, d1, d2, d3,
   2394                              round_offset_vec, &d0_u8, &d1_u8, &d2_u8, &d3_u8);
   2395 
   2396        store_u8_8x4(d_u8, dst8_stride, d0_u8, d1_u8, d2_u8, d3_u8);
   2397        d_u8 += 4 * dst8_stride;
   2398 
   2399        uint16x8_t dd4, dd5, dd6, dd7;
   2400        load_u16_8x4(d + 4 * dst_stride, dst_stride, &dd4, &dd5, &dd6, &dd7);
   2401 
   2402        uint8x8_t d4_u8, d5_u8, d6_u8, d7_u8;
   2403        compute_basic_avg_8x4(dd4, dd5, dd6, dd7, d4, d5, d6, d7,
   2404                              round_offset_vec, &d4_u8, &d5_u8, &d6_u8, &d7_u8);
   2405 
   2406        store_u8_8x4(d_u8, dst8_stride, d4_u8, d5_u8, d6_u8, d7_u8);
   2407        d_u8 += 4 * dst8_stride;
   2408 
   2409        s0 = s8;
   2410        s1 = s9;
   2411        s2 = s10;
   2412        s3 = s11;
   2413        s4 = s12;
   2414        s5 = s13;
   2415        s6 = s14;
   2416        s += 8 * src_stride;
   2417        d += 8 * dst_stride;
   2418        height -= 8;
   2419 #else   // !AOM_ARCH_AARCH64
   2420        int16x8_t s7 = vreinterpretq_s16_u16(vmovl_u8(vld1_u8(s)));
   2421 
   2422        __builtin_prefetch(dst_ptr);
   2423 
   2424        uint16x8_t d0 = convolve8_8_y(s0, s1, s2, s3, s4, s5, s6, s7, y_filter,
   2425                                      round_offset_vec);
   2426 
   2427        s0 = s1;
   2428        s1 = s2;
   2429        s2 = s3;
   2430        s3 = s4;
   2431        s4 = s5;
   2432        s5 = s6;
   2433        s6 = s7;
   2434 
   2435        __builtin_prefetch(d);
   2436 
   2437        uint16x8_t dd0 = vld1q_u16(d);
   2438 
   2439        uint8x8_t d0_u8;
   2440        compute_basic_avg_8x1(dd0, d0, round_offset_vec, &d0_u8);
   2441 
   2442        vst1_u8(d_u8, d0_u8);
   2443        d_u8 += dst8_stride;
   2444 
   2445        s += src_stride;
   2446        d += dst_stride;
   2447        height--;
   2448 #endif  // AOM_ARCH_AARCH64
   2449      } while (height != 0);
   2450      src_ptr += 8;
   2451      dst_ptr += 8;
   2452      dst8_ptr += 8;
   2453      width -= 8;
   2454    } while (width != 0);
   2455  }
   2456 }
   2457 
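// 8-tap vertical convolution, first pass only: convolved rows are written
// straight to the CONV_BUF_TYPE buffer.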
   2458 static inline void dist_wtd_convolve_y_8tap_neon(const uint8_t *src_ptr,
   2459                                                 int src_stride, int w, int h,
   2460                                                 const int16x8_t y_filter,
   2461                                                 ConvolveParams *conv_params) {
   2462  const int bd = 8;
   2463  const int offset_bits = bd + 2 * FILTER_BITS - ROUND0_BITS;
   2464  const int16_t round_offset = (1 << (offset_bits - COMPOUND_ROUND1_BITS)) +
   2465                               (1 << (offset_bits - COMPOUND_ROUND1_BITS - 1));
   2466  const int16x8_t round_offset_vec = vdupq_n_s16(round_offset);
   2467 
   2468  CONV_BUF_TYPE *dst_ptr = conv_params->dst;
   2469  const int dst_stride = conv_params->dst_stride;
   2470  int width = w;
   2471 
   2472  if (w == 4 || h == 4) {
   2473    do {
   2474      const uint8_t *s = src_ptr;
   2475      CONV_BUF_TYPE *d = dst_ptr;
   2476      int height = h;
   2477 
   2478      __builtin_prefetch(s + 0 * src_stride);
   2479      __builtin_prefetch(s + 1 * src_stride);
   2480      __builtin_prefetch(s + 2 * src_stride);
   2481      __builtin_prefetch(s + 3 * src_stride);
   2482 
   2483      uint8x8_t t0 = load_unaligned_u8_4x1(s + 0 * src_stride);
   2484      uint8x8_t t1 = load_unaligned_u8_4x1(s + 1 * src_stride);
   2485      uint8x8_t t2 = load_unaligned_u8_4x1(s + 2 * src_stride);
   2486      uint8x8_t t3 = load_unaligned_u8_4x1(s + 3 * src_stride);
   2487      uint8x8_t t4 = load_unaligned_u8_4x1(s + 4 * src_stride);
   2488      uint8x8_t t5 = load_unaligned_u8_4x1(s + 5 * src_stride);
   2489      uint8x8_t t6 = load_unaligned_u8_4x1(s + 6 * src_stride);
   2490 
   2491      int16x4_t s0 = vreinterpret_s16_u16(vget_low_u16(vmovl_u8(t0)));
   2492      int16x4_t s1 = vreinterpret_s16_u16(vget_low_u16(vmovl_u8(t1)));
   2493      int16x4_t s2 = vreinterpret_s16_u16(vget_low_u16(vmovl_u8(t2)));
   2494      int16x4_t s3 = vreinterpret_s16_u16(vget_low_u16(vmovl_u8(t3)));
   2495      int16x4_t s4 = vreinterpret_s16_u16(vget_low_u16(vmovl_u8(t4)));
   2496      int16x4_t s5 = vreinterpret_s16_u16(vget_low_u16(vmovl_u8(t5)));
   2497      int16x4_t s6 = vreinterpret_s16_u16(vget_low_u16(vmovl_u8(t6)));
   2498 
   2499      __builtin_prefetch(d + 0 * dst_stride);
   2500      __builtin_prefetch(d + 1 * dst_stride);
   2501      __builtin_prefetch(d + 2 * dst_stride);
   2502      __builtin_prefetch(d + 3 * dst_stride);
   2503 
   2504      s += 7 * src_stride;
   2505 
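             // The 7 rows of filter context are in s0-s6; the loop below loads
             // new rows and filters with a sliding 8-row window.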
   2506      do {
   2507 #if AOM_ARCH_AARCH64
   2508        t0 = load_unaligned_u8_4x1(s + 0 * src_stride);
   2509        t1 = load_unaligned_u8_4x1(s + 1 * src_stride);
   2510        t2 = load_unaligned_u8_4x1(s + 2 * src_stride);
   2511        t3 = load_unaligned_u8_4x1(s + 3 * src_stride);
   2512 
   2513        int16x4_t s7 = vreinterpret_s16_u16(vget_low_u16(vmovl_u8(t0)));
   2514        int16x4_t s8 = vreinterpret_s16_u16(vget_low_u16(vmovl_u8(t1)));
   2515        int16x4_t s9 = vreinterpret_s16_u16(vget_low_u16(vmovl_u8(t2)));
   2516        int16x4_t s10 = vreinterpret_s16_u16(vget_low_u16(vmovl_u8(t3)));
   2517 
   2518        uint16x4_t d0 = convolve8_4_y(s0, s1, s2, s3, s4, s5, s6, s7, y_filter,
   2519                                      vget_low_s16(round_offset_vec));
   2520        uint16x4_t d1 = convolve8_4_y(s1, s2, s3, s4, s5, s6, s7, s8, y_filter,
   2521                                      vget_low_s16(round_offset_vec));
   2522        uint16x4_t d2 = convolve8_4_y(s2, s3, s4, s5, s6, s7, s8, s9, y_filter,
   2523                                      vget_low_s16(round_offset_vec));
   2524        uint16x4_t d3 = convolve8_4_y(s3, s4, s5, s6, s7, s8, s9, s10, y_filter,
   2525                                      vget_low_s16(round_offset_vec));
   2526 
   2527        store_u16_4x4(d, dst_stride, d0, d1, d2, d3);
   2528 
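                // Slide the input window down by the 4 rows just consumed.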
   2529        s0 = s4;
   2530        s1 = s5;
   2531        s2 = s6;
   2532        s3 = s7;
   2533        s4 = s8;
   2534        s5 = s9;
   2535        s6 = s10;
   2536        s += 4 * src_stride;
   2537        d += 4 * dst_stride;
   2538        height -= 4;
   2539 #else   // !AOM_ARCH_AARCH64
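                // Without AArch64's extra NEON registers, process a single
                // row per iteration.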
   2540        t0 = load_unaligned_u8_4x1(s);
   2541        int16x4_t s7 = vreinterpret_s16_u16(vget_low_u16(vmovl_u8(t0)));
   2542 
   2543        uint16x4_t d0 = convolve8_4_y(s0, s1, s2, s3, s4, s5, s6, s7, y_filter,
   2544                                      vget_low_s16(round_offset_vec));
   2545 
   2546        vst1_u16(d, d0);
   2547 
   2548        s0 = s1;
   2549        s1 = s2;
   2550        s2 = s3;
   2551        s3 = s4;
   2552        s4 = s5;
   2553        s5 = s6;
   2554        s6 = s7;
   2555        s += src_stride;
   2556        d += dst_stride;
   2557        height--;
   2558 #endif  // AOM_ARCH_AARCH64
   2559      } while (height != 0);
   2560      src_ptr += 4;
   2561      dst_ptr += 4;
   2562      width -= 4;
   2563    } while (width != 0);
   2564  } else {
   2565    do {
   2566      const uint8_t *s = src_ptr;
   2567      CONV_BUF_TYPE *d = dst_ptr;
   2568      int height = h;
   2569 
   2570      __builtin_prefetch(s + 0 * src_stride);
   2571      __builtin_prefetch(s + 1 * src_stride);
   2572      __builtin_prefetch(s + 2 * src_stride);
   2573      __builtin_prefetch(s + 3 * src_stride);
   2574      __builtin_prefetch(s + 4 * src_stride);
   2575      __builtin_prefetch(s + 5 * src_stride);
   2576      __builtin_prefetch(s + 6 * src_stride);
   2577      __builtin_prefetch(s + 7 * src_stride);
   2578 
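             // Load the first 7 rows and widen them to 16 bits; an 8-tap
             // filter needs 7 rows of context before the first output row.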
   2579      uint8x8_t t0, t1, t2, t3, t4, t5, t6;
   2580      load_u8_8x7(s, src_stride, &t0, &t1, &t2, &t3, &t4, &t5, &t6);
   2581 
   2582      int16x8_t s0 = vreinterpretq_s16_u16(vmovl_u8(t0));
   2583      int16x8_t s1 = vreinterpretq_s16_u16(vmovl_u8(t1));
   2584      int16x8_t s2 = vreinterpretq_s16_u16(vmovl_u8(t2));
   2585      int16x8_t s3 = vreinterpretq_s16_u16(vmovl_u8(t3));
   2586      int16x8_t s4 = vreinterpretq_s16_u16(vmovl_u8(t4));
   2587      int16x8_t s5 = vreinterpretq_s16_u16(vmovl_u8(t5));
   2588      int16x8_t s6 = vreinterpretq_s16_u16(vmovl_u8(t6));
   2589 
   2590      s += 7 * src_stride;
   2591 
   2592      do {
   2593 #if AOM_ARCH_AARCH64
   2594        uint8x8_t t7;
   2595        load_u8_8x8(s, src_stride, &t0, &t1, &t2, &t3, &t4, &t5, &t6, &t7);
   2596 
   2597        int16x8_t s7 = vreinterpretq_s16_u16(vmovl_u8(t0));
   2598        int16x8_t s8 = vreinterpretq_s16_u16(vmovl_u8(t1));
   2599        int16x8_t s9 = vreinterpretq_s16_u16(vmovl_u8(t2));
   2600        int16x8_t s10 = vreinterpretq_s16_u16(vmovl_u8(t3));
   2601        int16x8_t s11 = vreinterpretq_s16_u16(vmovl_u8(t4));
   2602        int16x8_t s12 = vreinterpretq_s16_u16(vmovl_u8(t5));
   2603        int16x8_t s13 = vreinterpretq_s16_u16(vmovl_u8(t6));
   2604        int16x8_t s14 = vreinterpretq_s16_u16(vmovl_u8(t7));
   2605 
   2606        __builtin_prefetch(dst_ptr + 0 * dst_stride);
   2607        __builtin_prefetch(dst_ptr + 1 * dst_stride);
   2608        __builtin_prefetch(dst_ptr + 2 * dst_stride);
   2609        __builtin_prefetch(dst_ptr + 3 * dst_stride);
   2610 
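                // Compute 8 output rows, sliding the 8-row input window down
                // by one row per output row.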
   2611        uint16x8_t d0 = convolve8_8_y(s0, s1, s2, s3, s4, s5, s6, s7, y_filter,
   2612                                      round_offset_vec);
   2613        uint16x8_t d1 = convolve8_8_y(s1, s2, s3, s4, s5, s6, s7, s8, y_filter,
   2614                                      round_offset_vec);
   2615        uint16x8_t d2 = convolve8_8_y(s2, s3, s4, s5, s6, s7, s8, s9, y_filter,
   2616                                      round_offset_vec);
   2617        uint16x8_t d3 = convolve8_8_y(s3, s4, s5, s6, s7, s8, s9, s10, y_filter,
   2618                                      round_offset_vec);
   2619        uint16x8_t d4 = convolve8_8_y(s4, s5, s6, s7, s8, s9, s10, s11,
   2620                                      y_filter, round_offset_vec);
   2621        uint16x8_t d5 = convolve8_8_y(s5, s6, s7, s8, s9, s10, s11, s12,
   2622                                      y_filter, round_offset_vec);
   2623        uint16x8_t d6 = convolve8_8_y(s6, s7, s8, s9, s10, s11, s12, s13,
   2624                                      y_filter, round_offset_vec);
   2625        uint16x8_t d7 = convolve8_8_y(s7, s8, s9, s10, s11, s12, s13, s14,
   2626                                      y_filter, round_offset_vec);
   2627 
   2628        store_u16_8x8(d, dst_stride, d0, d1, d2, d3, d4, d5, d6, d7);
   2629 
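                // Rows s8-s14 become the context (s0-s6) for the next 8
                // output rows.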
   2630        s0 = s8;
   2631        s1 = s9;
   2632        s2 = s10;
   2633        s3 = s11;
   2634        s4 = s12;
   2635        s5 = s13;
   2636        s6 = s14;
   2637        s += 8 * src_stride;
   2638        d += 8 * dst_stride;
   2639        height -= 8;
   2640 #else   // !AOM_ARCH_AARCH64
   2641        int16x8_t s7 = vreinterpretq_s16_u16(vmovl_u8(vld1_u8(s)));
   2642 
   2643        __builtin_prefetch(dst_ptr);
   2644 
   2645        uint16x8_t d0 = convolve8_8_y(s0, s1, s2, s3, s4, s5, s6, s7, y_filter,
   2646                                      round_offset_vec);
   2647 
   2648        s0 = s1;
   2649        s1 = s2;
   2650        s2 = s3;
   2651        s3 = s4;
   2652        s4 = s5;
   2653        s5 = s6;
   2654        s6 = s7;
   2655 
   2656        vst1q_u16(d, d0);
   2657 
   2658        s += src_stride;
   2659        d += dst_stride;
   2660        height--;
   2661 #endif  // AOM_ARCH_AARCH64
   2662      } while (height != 0);
   2663      src_ptr += 8;
   2664      dst_ptr += 8;
   2665      width -= 8;
   2666    } while (width != 0);
   2667  }
   2668 }
   2669 
   2670 void av1_dist_wtd_convolve_y_neon(const uint8_t *src, int src_stride,
   2671                                  uint8_t *dst8, int dst8_stride, int w, int h,
   2672                                  const InterpFilterParams *filter_params_y,
   2673                                  const int subpel_y_qn,
   2674                                  ConvolveParams *conv_params) {
   2675  assert(w % 4 == 0);
   2676  assert(h % 4 == 0);
   2677 
   2678  // Vertical filter.
   2679  const int16_t *y_filter_ptr = av1_get_interp_filter_subpel_kernel(
   2680      filter_params_y, subpel_y_qn & SUBPEL_MASK);
   2681  // Filter values are even, so downshift by 1 to reduce intermediate
   2682  // precision requirements.
   2683  const int16x8_t y_filter = vshrq_n_s16(vld1q_s16(y_filter_ptr), 1);
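          // Halving the taps keeps their sum at 64 rather than 128 (kernels
          // sum to 1 << FILTER_BITS), leaving more headroom in the int16_t
          // accumulators.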
   2684 
   2685  const int vert_offset = filter_params_y->taps / 2 - 1;
   2686  const uint8_t *src_ptr = src - (vert_offset * src_stride);
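          // For the 8-tap case vert_offset == 3, centering the kernel on the
          // row being filtered.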
   2687 
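          // Kernels with at most 6 non-zero taps have zero outer coefficients,
          // so the 6-tap variants start one row later (src_ptr + src_stride)
          // and skip the two dead taps.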
   2688  if (get_filter_tap(filter_params_y, subpel_y_qn) <= 6) {
   2689    if (conv_params->do_average) {
   2690      if (UNLIKELY(conv_params->use_dist_wtd_comp_avg)) {
   2691        dist_wtd_convolve_y_6tap_dist_wtd_avg_neon(
   2692            src_ptr + src_stride, src_stride, dst8, dst8_stride, w, h, y_filter,
   2693            conv_params);
   2694      } else {
   2695        dist_wtd_convolve_y_6tap_avg_neon(src_ptr + src_stride, src_stride,
   2696                                          dst8, dst8_stride, w, h, y_filter,
   2697                                          conv_params);
   2698      }
   2699    } else {
   2700      dist_wtd_convolve_y_6tap_neon(src_ptr + src_stride, src_stride, w, h,
   2701                                    y_filter, conv_params);
   2702    }
   2703  } else {
   2704    if (conv_params->do_average) {
   2705      if (UNLIKELY(conv_params->use_dist_wtd_comp_avg)) {
   2706        dist_wtd_convolve_y_8tap_dist_wtd_avg_neon(src_ptr, src_stride, dst8,
   2707                                                   dst8_stride, w, h, y_filter,
   2708                                                   conv_params);
   2709      } else {
   2710        dist_wtd_convolve_y_8tap_avg_neon(src_ptr, src_stride, dst8,
   2711                                          dst8_stride, w, h, y_filter,
   2712                                          conv_params);
   2713      }
   2714    } else {
   2715      dist_wtd_convolve_y_8tap_neon(src_ptr, src_stride, w, h, y_filter,
   2716                                    conv_params);
   2717    }
   2718  }
   2719 }