tor-browser

The Tor Browser
git clone https://git.dasho.dev/tor-browser.git
Log | Files | Refs | README | LICENSE

highbd_compound_convolve_neon.c (75719B)


      1 /*
      2 * Copyright (c) 2023, Alliance for Open Media. All rights reserved.
      3 *
      4 * This source code is subject to the terms of the BSD 2 Clause License and
      5 * the Alliance for Open Media Patent License 1.0. If the BSD 2 Clause License
      6 * was not distributed with this source code in the LICENSE file, you can
      7 * obtain it at www.aomedia.org/license/software. If the Alliance for Open
      8 * Media Patent License 1.0 was not distributed with this source code in the
      9 * PATENTS file, you can obtain it at www.aomedia.org/license/patent.
     10 */
     11 
     12 #include <assert.h>
     13 #include <arm_neon.h>
     14 
     15 #include "config/aom_config.h"
     16 #include "config/av1_rtcd.h"
     17 
     18 #include "aom_dsp/aom_dsp_common.h"
     19 #include "aom_dsp/arm/mem_neon.h"
     20 #include "aom_ports/mem.h"
     21 #include "av1/common/convolve.h"
     22 #include "av1/common/filter.h"
     23 #include "av1/common/arm/highbd_compound_convolve_neon.h"
     24 #include "av1/common/arm/highbd_convolve_neon.h"
     25 
     26 static inline uint16x4_t highbd_12_convolve6_4(
     27    const int16x4_t s0, const int16x4_t s1, const int16x4_t s2,
     28    const int16x4_t s3, const int16x4_t s4, const int16x4_t s5,
     29    const int16x8_t filter, const int32x4_t offset) {
     30  // Values at indices 0 and 7 of y_filter are zero.
     31  const int16x4_t filter_0_3 = vget_low_s16(filter);
     32  const int16x4_t filter_4_7 = vget_high_s16(filter);
     33 
     34  int32x4_t sum = vmlal_lane_s16(offset, s0, filter_0_3, 1);
     35  sum = vmlal_lane_s16(sum, s1, filter_0_3, 2);
     36  sum = vmlal_lane_s16(sum, s2, filter_0_3, 3);
     37  sum = vmlal_lane_s16(sum, s3, filter_4_7, 0);
     38  sum = vmlal_lane_s16(sum, s4, filter_4_7, 1);
     39  sum = vmlal_lane_s16(sum, s5, filter_4_7, 2);
     40 
     41  return vqshrun_n_s32(sum, ROUND0_BITS + 2);
     42 }
     43 
     44 static inline uint16x4_t highbd_convolve6_4(
     45    const int16x4_t s0, const int16x4_t s1, const int16x4_t s2,
     46    const int16x4_t s3, const int16x4_t s4, const int16x4_t s5,
     47    const int16x8_t filter, const int32x4_t offset) {
     48  // Values at indices 0 and 7 of y_filter are zero.
     49  const int16x4_t filter_0_3 = vget_low_s16(filter);
     50  const int16x4_t filter_4_7 = vget_high_s16(filter);
     51 
     52  int32x4_t sum = vmlal_lane_s16(offset, s0, filter_0_3, 1);
     53  sum = vmlal_lane_s16(sum, s1, filter_0_3, 2);
     54  sum = vmlal_lane_s16(sum, s2, filter_0_3, 3);
     55  sum = vmlal_lane_s16(sum, s3, filter_4_7, 0);
     56  sum = vmlal_lane_s16(sum, s4, filter_4_7, 1);
     57  sum = vmlal_lane_s16(sum, s5, filter_4_7, 2);
     58 
     59  return vqshrun_n_s32(sum, ROUND0_BITS);
     60 }
     61 
     62 static inline uint16x8_t highbd_12_convolve6_8(
     63    const int16x8_t s0, const int16x8_t s1, const int16x8_t s2,
     64    const int16x8_t s3, const int16x8_t s4, const int16x8_t s5,
     65    const int16x8_t filter, const int32x4_t offset) {
     66  // Values at indices 0 and 7 of y_filter are zero.
     67  const int16x4_t filter_0_3 = vget_low_s16(filter);
     68  const int16x4_t filter_4_7 = vget_high_s16(filter);
     69 
     70  int32x4_t sum0 = vmlal_lane_s16(offset, vget_low_s16(s0), filter_0_3, 1);
     71  sum0 = vmlal_lane_s16(sum0, vget_low_s16(s1), filter_0_3, 2);
     72  sum0 = vmlal_lane_s16(sum0, vget_low_s16(s2), filter_0_3, 3);
     73  sum0 = vmlal_lane_s16(sum0, vget_low_s16(s3), filter_4_7, 0);
     74  sum0 = vmlal_lane_s16(sum0, vget_low_s16(s4), filter_4_7, 1);
     75  sum0 = vmlal_lane_s16(sum0, vget_low_s16(s5), filter_4_7, 2);
     76 
     77  int32x4_t sum1 = vmlal_lane_s16(offset, vget_high_s16(s0), filter_0_3, 1);
     78  sum1 = vmlal_lane_s16(sum1, vget_high_s16(s1), filter_0_3, 2);
     79  sum1 = vmlal_lane_s16(sum1, vget_high_s16(s2), filter_0_3, 3);
     80  sum1 = vmlal_lane_s16(sum1, vget_high_s16(s3), filter_4_7, 0);
     81  sum1 = vmlal_lane_s16(sum1, vget_high_s16(s4), filter_4_7, 1);
     82  sum1 = vmlal_lane_s16(sum1, vget_high_s16(s5), filter_4_7, 2);
     83 
     84  return vcombine_u16(vqshrun_n_s32(sum0, ROUND0_BITS + 2),
     85                      vqshrun_n_s32(sum1, ROUND0_BITS + 2));
     86 }
     87 
     88 static inline uint16x8_t highbd_convolve6_8(
     89    const int16x8_t s0, const int16x8_t s1, const int16x8_t s2,
     90    const int16x8_t s3, const int16x8_t s4, const int16x8_t s5,
     91    const int16x8_t filter, const int32x4_t offset) {
     92  // Values at indices 0 and 7 of y_filter are zero.
     93  const int16x4_t filter_0_3 = vget_low_s16(filter);
     94  const int16x4_t filter_4_7 = vget_high_s16(filter);
     95 
     96  int32x4_t sum0 = vmlal_lane_s16(offset, vget_low_s16(s0), filter_0_3, 1);
     97  sum0 = vmlal_lane_s16(sum0, vget_low_s16(s1), filter_0_3, 2);
     98  sum0 = vmlal_lane_s16(sum0, vget_low_s16(s2), filter_0_3, 3);
     99  sum0 = vmlal_lane_s16(sum0, vget_low_s16(s3), filter_4_7, 0);
    100  sum0 = vmlal_lane_s16(sum0, vget_low_s16(s4), filter_4_7, 1);
    101  sum0 = vmlal_lane_s16(sum0, vget_low_s16(s5), filter_4_7, 2);
    102 
    103  int32x4_t sum1 = vmlal_lane_s16(offset, vget_high_s16(s0), filter_0_3, 1);
    104  sum1 = vmlal_lane_s16(sum1, vget_high_s16(s1), filter_0_3, 2);
    105  sum1 = vmlal_lane_s16(sum1, vget_high_s16(s2), filter_0_3, 3);
    106  sum1 = vmlal_lane_s16(sum1, vget_high_s16(s3), filter_4_7, 0);
    107  sum1 = vmlal_lane_s16(sum1, vget_high_s16(s4), filter_4_7, 1);
    108  sum1 = vmlal_lane_s16(sum1, vget_high_s16(s5), filter_4_7, 2);
    109 
    110  return vcombine_u16(vqshrun_n_s32(sum0, 3), vqshrun_n_s32(sum1, ROUND0_BITS));
    111 }
    112 
// Horizontal 6-tap dist-wtd convolution for 12-bit input. The block is
// processed in 8x4 tiles: the inner loop steps 8 columns at a time, the
// outer loop 4 rows at a time. Assumes w is a multiple of 8 and h a
// multiple of 4 (the caller only takes this path when w != 4) — TODO
// confirm against callers.
static inline void highbd_12_dist_wtd_convolve_x_6tap_neon(
    const uint16_t *src_ptr, int src_stride, uint16_t *dst_ptr, int dst_stride,
    int w, int h, const int16_t *x_filter_ptr, const int offset) {
  const int32x4_t offset_vec = vdupq_n_s32(offset);

  // Full 8-tap vector; the convolve kernel only uses lanes 1..6.
  const int16x8_t x_filter = vld1q_s16(x_filter_ptr);

  int height = h;

  do {
    int width = w;
    const int16_t *s = (const int16_t *)src_ptr;
    uint16_t *d = dst_ptr;

    do {
      // For each of 4 rows, load 6 overlapping 8-lane windows (stride 1 in
      // x) so a single kernel call produces 8 horizontal outputs.
      int16x8_t s0[6], s1[6], s2[6], s3[6];
      load_s16_8x6(s + 0 * src_stride, 1, &s0[0], &s0[1], &s0[2], &s0[3],
                   &s0[4], &s0[5]);
      load_s16_8x6(s + 1 * src_stride, 1, &s1[0], &s1[1], &s1[2], &s1[3],
                   &s1[4], &s1[5]);
      load_s16_8x6(s + 2 * src_stride, 1, &s2[0], &s2[1], &s2[2], &s2[3],
                   &s2[4], &s2[5]);
      load_s16_8x6(s + 3 * src_stride, 1, &s3[0], &s3[1], &s3[2], &s3[3],
                   &s3[4], &s3[5]);

      uint16x8_t d0 = highbd_12_convolve6_8(s0[0], s0[1], s0[2], s0[3], s0[4],
                                            s0[5], x_filter, offset_vec);
      uint16x8_t d1 = highbd_12_convolve6_8(s1[0], s1[1], s1[2], s1[3], s1[4],
                                            s1[5], x_filter, offset_vec);
      uint16x8_t d2 = highbd_12_convolve6_8(s2[0], s2[1], s2[2], s2[3], s2[4],
                                            s2[5], x_filter, offset_vec);
      uint16x8_t d3 = highbd_12_convolve6_8(s3[0], s3[1], s3[2], s3[3], s3[4],
                                            s3[5], x_filter, offset_vec);

      store_u16_8x4(d, dst_stride, d0, d1, d2, d3);

      s += 8;
      d += 8;
      width -= 8;
    } while (width != 0);
    src_ptr += 4 * src_stride;
    dst_ptr += 4 * dst_stride;
    height -= 4;
  } while (height != 0);
}
    158 
// Horizontal 6-tap dist-wtd convolution for 8/10-bit input. Identical tiling
// to highbd_12_dist_wtd_convolve_x_6tap_neon (8x4 tiles) but uses the
// ROUND0_BITS kernel shift. Assumes w % 8 == 0 and h % 4 == 0 — TODO
// confirm against callers.
static inline void highbd_dist_wtd_convolve_x_6tap_neon(
    const uint16_t *src_ptr, int src_stride, uint16_t *dst_ptr, int dst_stride,
    int w, int h, const int16_t *x_filter_ptr, const int offset) {
  const int32x4_t offset_vec = vdupq_n_s32(offset);

  // Full 8-tap vector; the convolve kernel only uses lanes 1..6.
  const int16x8_t x_filter = vld1q_s16(x_filter_ptr);

  int height = h;

  do {
    int width = w;
    const int16_t *s = (const int16_t *)src_ptr;
    uint16_t *d = dst_ptr;

    do {
      // Six overlapping 8-lane windows per row feed one kernel call that
      // yields 8 horizontal outputs.
      int16x8_t s0[6], s1[6], s2[6], s3[6];
      load_s16_8x6(s + 0 * src_stride, 1, &s0[0], &s0[1], &s0[2], &s0[3],
                   &s0[4], &s0[5]);
      load_s16_8x6(s + 1 * src_stride, 1, &s1[0], &s1[1], &s1[2], &s1[3],
                   &s1[4], &s1[5]);
      load_s16_8x6(s + 2 * src_stride, 1, &s2[0], &s2[1], &s2[2], &s2[3],
                   &s2[4], &s2[5]);
      load_s16_8x6(s + 3 * src_stride, 1, &s3[0], &s3[1], &s3[2], &s3[3],
                   &s3[4], &s3[5]);

      uint16x8_t d0 = highbd_convolve6_8(s0[0], s0[1], s0[2], s0[3], s0[4],
                                         s0[5], x_filter, offset_vec);
      uint16x8_t d1 = highbd_convolve6_8(s1[0], s1[1], s1[2], s1[3], s1[4],
                                         s1[5], x_filter, offset_vec);
      uint16x8_t d2 = highbd_convolve6_8(s2[0], s2[1], s2[2], s2[3], s2[4],
                                         s2[5], x_filter, offset_vec);
      uint16x8_t d3 = highbd_convolve6_8(s3[0], s3[1], s3[2], s3[3], s3[4],
                                         s3[5], x_filter, offset_vec);

      store_u16_8x4(d, dst_stride, d0, d1, d2, d3);

      s += 8;
      d += 8;
      width -= 8;
    } while (width != 0);
    src_ptr += 4 * src_stride;
    dst_ptr += 4 * dst_stride;
    height -= 4;
  } while (height != 0);
}
    204 
    205 static inline uint16x4_t highbd_12_convolve8_4(
    206    const int16x4_t s0, const int16x4_t s1, const int16x4_t s2,
    207    const int16x4_t s3, const int16x4_t s4, const int16x4_t s5,
    208    const int16x4_t s6, const int16x4_t s7, const int16x8_t filter,
    209    const int32x4_t offset) {
    210  const int16x4_t filter_0_3 = vget_low_s16(filter);
    211  const int16x4_t filter_4_7 = vget_high_s16(filter);
    212 
    213  int32x4_t sum = vmlal_lane_s16(offset, s0, filter_0_3, 0);
    214  sum = vmlal_lane_s16(sum, s1, filter_0_3, 1);
    215  sum = vmlal_lane_s16(sum, s2, filter_0_3, 2);
    216  sum = vmlal_lane_s16(sum, s3, filter_0_3, 3);
    217  sum = vmlal_lane_s16(sum, s4, filter_4_7, 0);
    218  sum = vmlal_lane_s16(sum, s5, filter_4_7, 1);
    219  sum = vmlal_lane_s16(sum, s6, filter_4_7, 2);
    220  sum = vmlal_lane_s16(sum, s7, filter_4_7, 3);
    221 
    222  return vqshrun_n_s32(sum, ROUND0_BITS + 2);
    223 }
    224 
    225 static inline uint16x4_t highbd_convolve8_4(
    226    const int16x4_t s0, const int16x4_t s1, const int16x4_t s2,
    227    const int16x4_t s3, const int16x4_t s4, const int16x4_t s5,
    228    const int16x4_t s6, const int16x4_t s7, const int16x8_t filter,
    229    const int32x4_t offset) {
    230  const int16x4_t filter_0_3 = vget_low_s16(filter);
    231  const int16x4_t filter_4_7 = vget_high_s16(filter);
    232 
    233  int32x4_t sum = vmlal_lane_s16(offset, s0, filter_0_3, 0);
    234  sum = vmlal_lane_s16(sum, s1, filter_0_3, 1);
    235  sum = vmlal_lane_s16(sum, s2, filter_0_3, 2);
    236  sum = vmlal_lane_s16(sum, s3, filter_0_3, 3);
    237  sum = vmlal_lane_s16(sum, s4, filter_4_7, 0);
    238  sum = vmlal_lane_s16(sum, s5, filter_4_7, 1);
    239  sum = vmlal_lane_s16(sum, s6, filter_4_7, 2);
    240  sum = vmlal_lane_s16(sum, s7, filter_4_7, 3);
    241 
    242  return vqshrun_n_s32(sum, ROUND0_BITS);
    243 }
    244 
    245 static inline uint16x8_t highbd_12_convolve8_8(
    246    const int16x8_t s0, const int16x8_t s1, const int16x8_t s2,
    247    const int16x8_t s3, const int16x8_t s4, const int16x8_t s5,
    248    const int16x8_t s6, const int16x8_t s7, const int16x8_t filter,
    249    const int32x4_t offset) {
    250  const int16x4_t filter_0_3 = vget_low_s16(filter);
    251  const int16x4_t filter_4_7 = vget_high_s16(filter);
    252 
    253  int32x4_t sum0 = vmlal_lane_s16(offset, vget_low_s16(s0), filter_0_3, 0);
    254  sum0 = vmlal_lane_s16(sum0, vget_low_s16(s1), filter_0_3, 1);
    255  sum0 = vmlal_lane_s16(sum0, vget_low_s16(s2), filter_0_3, 2);
    256  sum0 = vmlal_lane_s16(sum0, vget_low_s16(s3), filter_0_3, 3);
    257  sum0 = vmlal_lane_s16(sum0, vget_low_s16(s4), filter_4_7, 0);
    258  sum0 = vmlal_lane_s16(sum0, vget_low_s16(s5), filter_4_7, 1);
    259  sum0 = vmlal_lane_s16(sum0, vget_low_s16(s6), filter_4_7, 2);
    260  sum0 = vmlal_lane_s16(sum0, vget_low_s16(s7), filter_4_7, 3);
    261 
    262  int32x4_t sum1 = vmlal_lane_s16(offset, vget_high_s16(s0), filter_0_3, 0);
    263  sum1 = vmlal_lane_s16(sum1, vget_high_s16(s1), filter_0_3, 1);
    264  sum1 = vmlal_lane_s16(sum1, vget_high_s16(s2), filter_0_3, 2);
    265  sum1 = vmlal_lane_s16(sum1, vget_high_s16(s3), filter_0_3, 3);
    266  sum1 = vmlal_lane_s16(sum1, vget_high_s16(s4), filter_4_7, 0);
    267  sum1 = vmlal_lane_s16(sum1, vget_high_s16(s5), filter_4_7, 1);
    268  sum1 = vmlal_lane_s16(sum1, vget_high_s16(s6), filter_4_7, 2);
    269  sum1 = vmlal_lane_s16(sum1, vget_high_s16(s7), filter_4_7, 3);
    270 
    271  return vcombine_u16(vqshrun_n_s32(sum0, ROUND0_BITS + 2),
    272                      vqshrun_n_s32(sum1, ROUND0_BITS + 2));
    273 }
    274 
    275 static inline uint16x8_t highbd_convolve8_8(
    276    const int16x8_t s0, const int16x8_t s1, const int16x8_t s2,
    277    const int16x8_t s3, const int16x8_t s4, const int16x8_t s5,
    278    const int16x8_t s6, const int16x8_t s7, const int16x8_t filter,
    279    const int32x4_t offset) {
    280  const int16x4_t filter_0_3 = vget_low_s16(filter);
    281  const int16x4_t filter_4_7 = vget_high_s16(filter);
    282 
    283  int32x4_t sum0 = vmlal_lane_s16(offset, vget_low_s16(s0), filter_0_3, 0);
    284  sum0 = vmlal_lane_s16(sum0, vget_low_s16(s1), filter_0_3, 1);
    285  sum0 = vmlal_lane_s16(sum0, vget_low_s16(s2), filter_0_3, 2);
    286  sum0 = vmlal_lane_s16(sum0, vget_low_s16(s3), filter_0_3, 3);
    287  sum0 = vmlal_lane_s16(sum0, vget_low_s16(s4), filter_4_7, 0);
    288  sum0 = vmlal_lane_s16(sum0, vget_low_s16(s5), filter_4_7, 1);
    289  sum0 = vmlal_lane_s16(sum0, vget_low_s16(s6), filter_4_7, 2);
    290  sum0 = vmlal_lane_s16(sum0, vget_low_s16(s7), filter_4_7, 3);
    291 
    292  int32x4_t sum1 = vmlal_lane_s16(offset, vget_high_s16(s0), filter_0_3, 0);
    293  sum1 = vmlal_lane_s16(sum1, vget_high_s16(s1), filter_0_3, 1);
    294  sum1 = vmlal_lane_s16(sum1, vget_high_s16(s2), filter_0_3, 2);
    295  sum1 = vmlal_lane_s16(sum1, vget_high_s16(s3), filter_0_3, 3);
    296  sum1 = vmlal_lane_s16(sum1, vget_high_s16(s4), filter_4_7, 0);
    297  sum1 = vmlal_lane_s16(sum1, vget_high_s16(s5), filter_4_7, 1);
    298  sum1 = vmlal_lane_s16(sum1, vget_high_s16(s6), filter_4_7, 2);
    299  sum1 = vmlal_lane_s16(sum1, vget_high_s16(s7), filter_4_7, 3);
    300 
    301  return vcombine_u16(vqshrun_n_s32(sum0, ROUND0_BITS),
    302                      vqshrun_n_s32(sum1, ROUND0_BITS));
    303 }
    304 
    305 static inline uint16x4_t highbd_12_convolve4_4_x(const int16x4_t s[4],
    306                                                 const int16x4_t x_filter,
    307                                                 const int32x4_t offset) {
    308  int32x4_t sum = vmlal_lane_s16(offset, s[0], x_filter, 0);
    309  sum = vmlal_lane_s16(sum, s[1], x_filter, 1);
    310  sum = vmlal_lane_s16(sum, s[2], x_filter, 2);
    311  sum = vmlal_lane_s16(sum, s[3], x_filter, 3);
    312 
    313  return vqshrun_n_s32(sum, 5);
    314 }
    315 
    316 static inline uint16x4_t highbd_convolve4_4_x(const int16x4_t s[4],
    317                                              const int16x4_t x_filter,
    318                                              const int32x4_t offset) {
    319  int32x4_t sum = vmlal_lane_s16(offset, s[0], x_filter, 0);
    320  sum = vmlal_lane_s16(sum, s[1], x_filter, 1);
    321  sum = vmlal_lane_s16(sum, s[2], x_filter, 2);
    322  sum = vmlal_lane_s16(sum, s[3], x_filter, 3);
    323 
    324  return vqshrun_n_s32(sum, ROUND0_BITS);
    325 }
    326 
// Horizontal dist-wtd convolution for 12-bit input. w == 4 blocks use a
// 4-tap filter (the middle four taps of the 8-tap kernel); wider blocks use
// the full 8-tap filter in 8x4 tiles. Assumes h % 4 == 0 and, on the wide
// path, w % 8 == 0 — TODO confirm against callers.
static inline void highbd_12_dist_wtd_convolve_x_neon(
    const uint16_t *src_ptr, int src_stride, uint16_t *dst_ptr, int dst_stride,
    int w, int h, const int16_t *x_filter_ptr, const int offset) {
  const int32x4_t offset_vec = vdupq_n_s32(offset);

  if (w == 4) {
    // 4-tap filters are used for blocks having width == 4.
    // Skip the two zero outer taps: load taps 2..5 and start the source
    // window two samples in so the filter stays aligned with the samples.
    const int16x4_t x_filter = vld1_s16(x_filter_ptr + 2);
    const int16_t *s = (const int16_t *)(src_ptr + 2);
    uint16_t *d = dst_ptr;

    do {
      // Four overlapping 4-lane windows per row (stride 1 in x).
      int16x4_t s0[4], s1[4], s2[4], s3[4];
      load_s16_4x4(s + 0 * src_stride, 1, &s0[0], &s0[1], &s0[2], &s0[3]);
      load_s16_4x4(s + 1 * src_stride, 1, &s1[0], &s1[1], &s1[2], &s1[3]);
      load_s16_4x4(s + 2 * src_stride, 1, &s2[0], &s2[1], &s2[2], &s2[3]);
      load_s16_4x4(s + 3 * src_stride, 1, &s3[0], &s3[1], &s3[2], &s3[3]);

      uint16x4_t d0 = highbd_12_convolve4_4_x(s0, x_filter, offset_vec);
      uint16x4_t d1 = highbd_12_convolve4_4_x(s1, x_filter, offset_vec);
      uint16x4_t d2 = highbd_12_convolve4_4_x(s2, x_filter, offset_vec);
      uint16x4_t d3 = highbd_12_convolve4_4_x(s3, x_filter, offset_vec);

      store_u16_4x4(d, dst_stride, d0, d1, d2, d3);

      s += 4 * src_stride;
      d += 4 * dst_stride;
      h -= 4;
    } while (h != 0);
  } else {
    const int16x8_t x_filter = vld1q_s16(x_filter_ptr);
    int height = h;

    do {
      int width = w;
      const int16_t *s = (const int16_t *)src_ptr;
      uint16_t *d = dst_ptr;

      do {
        // Eight overlapping 8-lane windows per row feed one 8-tap kernel
        // call that yields 8 horizontal outputs.
        int16x8_t s0[8], s1[8], s2[8], s3[8];
        load_s16_8x8(s + 0 * src_stride, 1, &s0[0], &s0[1], &s0[2], &s0[3],
                     &s0[4], &s0[5], &s0[6], &s0[7]);
        load_s16_8x8(s + 1 * src_stride, 1, &s1[0], &s1[1], &s1[2], &s1[3],
                     &s1[4], &s1[5], &s1[6], &s1[7]);
        load_s16_8x8(s + 2 * src_stride, 1, &s2[0], &s2[1], &s2[2], &s2[3],
                     &s2[4], &s2[5], &s2[6], &s2[7]);
        load_s16_8x8(s + 3 * src_stride, 1, &s3[0], &s3[1], &s3[2], &s3[3],
                     &s3[4], &s3[5], &s3[6], &s3[7]);

        uint16x8_t d0 =
            highbd_12_convolve8_8(s0[0], s0[1], s0[2], s0[3], s0[4], s0[5],
                                  s0[6], s0[7], x_filter, offset_vec);
        uint16x8_t d1 =
            highbd_12_convolve8_8(s1[0], s1[1], s1[2], s1[3], s1[4], s1[5],
                                  s1[6], s1[7], x_filter, offset_vec);
        uint16x8_t d2 =
            highbd_12_convolve8_8(s2[0], s2[1], s2[2], s2[3], s2[4], s2[5],
                                  s2[6], s2[7], x_filter, offset_vec);
        uint16x8_t d3 =
            highbd_12_convolve8_8(s3[0], s3[1], s3[2], s3[3], s3[4], s3[5],
                                  s3[6], s3[7], x_filter, offset_vec);

        store_u16_8x4(d, dst_stride, d0, d1, d2, d3);

        s += 8;
        d += 8;
        width -= 8;
      } while (width != 0);
      src_ptr += 4 * src_stride;
      dst_ptr += 4 * dst_stride;
      height -= 4;
    } while (height != 0);
  }
}
    401 
// Horizontal dist-wtd convolution for 8/10-bit input. Same structure as
// highbd_12_dist_wtd_convolve_x_neon: a 4-tap path for w == 4 and an 8-tap
// 8x4-tile path otherwise, differing only in the kernel shift.
static inline void highbd_dist_wtd_convolve_x_neon(
    const uint16_t *src_ptr, int src_stride, uint16_t *dst_ptr, int dst_stride,
    int w, int h, const int16_t *x_filter_ptr, const int offset) {
  const int32x4_t offset_vec = vdupq_n_s32(offset);

  if (w == 4) {
    // 4-tap filters are used for blocks having width == 4.
    // Skip the two zero outer taps: load taps 2..5 and offset the source by
    // two samples to keep filter and samples aligned.
    const int16x4_t x_filter = vld1_s16(x_filter_ptr + 2);
    const int16_t *s = (const int16_t *)(src_ptr + 2);
    uint16_t *d = dst_ptr;

    do {
      // Four overlapping 4-lane windows per row (stride 1 in x).
      int16x4_t s0[4], s1[4], s2[4], s3[4];
      load_s16_4x4(s + 0 * src_stride, 1, &s0[0], &s0[1], &s0[2], &s0[3]);
      load_s16_4x4(s + 1 * src_stride, 1, &s1[0], &s1[1], &s1[2], &s1[3]);
      load_s16_4x4(s + 2 * src_stride, 1, &s2[0], &s2[1], &s2[2], &s2[3]);
      load_s16_4x4(s + 3 * src_stride, 1, &s3[0], &s3[1], &s3[2], &s3[3]);

      uint16x4_t d0 = highbd_convolve4_4_x(s0, x_filter, offset_vec);
      uint16x4_t d1 = highbd_convolve4_4_x(s1, x_filter, offset_vec);
      uint16x4_t d2 = highbd_convolve4_4_x(s2, x_filter, offset_vec);
      uint16x4_t d3 = highbd_convolve4_4_x(s3, x_filter, offset_vec);

      store_u16_4x4(d, dst_stride, d0, d1, d2, d3);

      s += 4 * src_stride;
      d += 4 * dst_stride;
      h -= 4;
    } while (h != 0);
  } else {
    const int16x8_t x_filter = vld1q_s16(x_filter_ptr);
    int height = h;

    do {
      int width = w;
      const int16_t *s = (const int16_t *)src_ptr;
      uint16_t *d = dst_ptr;

      do {
        // Eight overlapping 8-lane windows per row feed one 8-tap kernel
        // call that yields 8 horizontal outputs.
        int16x8_t s0[8], s1[8], s2[8], s3[8];
        load_s16_8x8(s + 0 * src_stride, 1, &s0[0], &s0[1], &s0[2], &s0[3],
                     &s0[4], &s0[5], &s0[6], &s0[7]);
        load_s16_8x8(s + 1 * src_stride, 1, &s1[0], &s1[1], &s1[2], &s1[3],
                     &s1[4], &s1[5], &s1[6], &s1[7]);
        load_s16_8x8(s + 2 * src_stride, 1, &s2[0], &s2[1], &s2[2], &s2[3],
                     &s2[4], &s2[5], &s2[6], &s2[7]);
        load_s16_8x8(s + 3 * src_stride, 1, &s3[0], &s3[1], &s3[2], &s3[3],
                     &s3[4], &s3[5], &s3[6], &s3[7]);

        uint16x8_t d0 =
            highbd_convolve8_8(s0[0], s0[1], s0[2], s0[3], s0[4], s0[5], s0[6],
                               s0[7], x_filter, offset_vec);
        uint16x8_t d1 =
            highbd_convolve8_8(s1[0], s1[1], s1[2], s1[3], s1[4], s1[5], s1[6],
                               s1[7], x_filter, offset_vec);
        uint16x8_t d2 =
            highbd_convolve8_8(s2[0], s2[1], s2[2], s2[3], s2[4], s2[5], s2[6],
                               s2[7], x_filter, offset_vec);
        uint16x8_t d3 =
            highbd_convolve8_8(s3[0], s3[1], s3[2], s3[3], s3[4], s3[5], s3[6],
                               s3[7], x_filter, offset_vec);

        store_u16_8x4(d, dst_stride, d0, d1, d2, d3);

        s += 8;
        d += 8;
        width -= 8;
      } while (width != 0);
      src_ptr += 4 * src_stride;
      dst_ptr += 4 * dst_stride;
      height -= 4;
    } while (height != 0);
  }
}
    476 
// Entry point for the horizontal pass of high bit-depth distance-weighted
// compound convolution. Dispatches on bit depth (dedicated 12-bit kernels
// vs generic 8/10-bit), filter length (<= 6 taps vs 8 taps), and whether
// the result must be averaged with conv_params->dst. When averaging, the
// convolution result goes to a local intermediate buffer first and is then
// blended into dst; otherwise it is written straight to conv_params->dst.
void av1_highbd_dist_wtd_convolve_x_neon(
    const uint16_t *src, int src_stride, uint16_t *dst, int dst_stride, int w,
    int h, const InterpFilterParams *filter_params_x, const int subpel_x_qn,
    ConvolveParams *conv_params, int bd) {
  DECLARE_ALIGNED(16, uint16_t,
                  im_block[(MAX_SB_SIZE + MAX_FILTER_TAP) * MAX_SB_SIZE]);
  CONV_BUF_TYPE *dst16 = conv_params->dst;
  const int x_filter_taps = get_filter_tap(filter_params_x, subpel_x_qn);
  int dst16_stride = conv_params->dst_stride;
  const int im_stride = MAX_SB_SIZE;
  const int horiz_offset = filter_params_x->taps / 2 - 1;
  assert(FILTER_BITS == COMPOUND_ROUND1_BITS);
  // Rounding term for the round-0 shift plus the compound offsets, all
  // folded into a single constant added inside the convolve kernels.
  const int offset_convolve = (1 << (conv_params->round_0 - 1)) +
                              (1 << (bd + FILTER_BITS)) +
                              (1 << (bd + FILTER_BITS - 1));

  const int16_t *x_filter_ptr = av1_get_interp_filter_subpel_kernel(
      filter_params_x, subpel_x_qn & SUBPEL_MASK);

  // Center the filter window on the output position.
  src -= horiz_offset;

  // horizontal filter
  if (bd == 12) {
    if (conv_params->do_average) {
      if (x_filter_taps <= 6 && w != 4) {
        // Tap 0 of the 8-tap kernel is zero for <= 6-tap filters, so start
        // one sample later and let the 6-tap kernel skip taps 0 and 7.
        highbd_12_dist_wtd_convolve_x_6tap_neon(src + 1, src_stride, im_block,
                                                im_stride, w, h, x_filter_ptr,
                                                offset_convolve);
      } else {
        highbd_12_dist_wtd_convolve_x_neon(src, src_stride, im_block, im_stride,
                                           w, h, x_filter_ptr, offset_convolve);
      }
      if (conv_params->use_dist_wtd_comp_avg) {
        highbd_12_dist_wtd_comp_avg_neon(im_block, im_stride, dst, dst_stride,
                                         w, h, conv_params);
      } else {
        highbd_12_comp_avg_neon(im_block, im_stride, dst, dst_stride, w, h,
                                conv_params);
      }
    } else {
      if (x_filter_taps <= 6 && w != 4) {
        highbd_12_dist_wtd_convolve_x_6tap_neon(src + 1, src_stride, dst16,
                                                dst16_stride, w, h,
                                                x_filter_ptr, offset_convolve);
      } else {
        highbd_12_dist_wtd_convolve_x_neon(src, src_stride, dst16, dst16_stride,
                                           w, h, x_filter_ptr, offset_convolve);
      }
    }
  } else {
    if (conv_params->do_average) {
      if (x_filter_taps <= 6 && w != 4) {
        highbd_dist_wtd_convolve_x_6tap_neon(src + 1, src_stride, im_block,
                                             im_stride, w, h, x_filter_ptr,
                                             offset_convolve);
      } else {
        highbd_dist_wtd_convolve_x_neon(src, src_stride, im_block, im_stride, w,
                                        h, x_filter_ptr, offset_convolve);
      }
      if (conv_params->use_dist_wtd_comp_avg) {
        highbd_dist_wtd_comp_avg_neon(im_block, im_stride, dst, dst_stride, w,
                                      h, conv_params, bd);
      } else {
        highbd_comp_avg_neon(im_block, im_stride, dst, dst_stride, w, h,
                             conv_params, bd);
      }
    } else {
      if (x_filter_taps <= 6 && w != 4) {
        highbd_dist_wtd_convolve_x_6tap_neon(src + 1, src_stride, dst16,
                                             dst16_stride, w, h, x_filter_ptr,
                                             offset_convolve);
      } else {
        highbd_dist_wtd_convolve_x_neon(src, src_stride, dst16, dst16_stride, w,
                                        h, x_filter_ptr, offset_convolve);
      }
    }
  }
}
    555 
// Vertical 6-tap dist-wtd convolution for 12-bit input. w == 4 blocks walk
// the whole height four rows at a time; wider blocks are processed in
// 8-wide columns, each walked top to bottom. Assumes h % 4 == 0 and, on the
// wide path, w % 8 == 0 — TODO confirm against callers.
static inline void highbd_12_dist_wtd_convolve_y_6tap_neon(
    const uint16_t *src_ptr, int src_stride, uint16_t *dst_ptr, int dst_stride,
    int w, int h, const int16_t *y_filter_ptr, const int offset) {
  // Full 8-tap vector; the convolve kernel only uses lanes 1..6.
  const int16x8_t y_filter = vld1q_s16(y_filter_ptr);
  const int32x4_t offset_vec = vdupq_n_s32(offset);

  if (w == 4) {
    const int16_t *s = (const int16_t *)src_ptr;
    uint16_t *d = dst_ptr;

    // Prime the 6-row filter window with the first five rows; each loop
    // iteration loads four more rows and emits four output rows.
    int16x4_t s0, s1, s2, s3, s4;
    load_s16_4x5(s, src_stride, &s0, &s1, &s2, &s3, &s4);
    s += 5 * src_stride;

    do {
      int16x4_t s5, s6, s7, s8;
      load_s16_4x4(s, src_stride, &s5, &s6, &s7, &s8);

      uint16x4_t d0 =
          highbd_12_convolve6_4(s0, s1, s2, s3, s4, s5, y_filter, offset_vec);
      uint16x4_t d1 =
          highbd_12_convolve6_4(s1, s2, s3, s4, s5, s6, y_filter, offset_vec);
      uint16x4_t d2 =
          highbd_12_convolve6_4(s2, s3, s4, s5, s6, s7, y_filter, offset_vec);
      uint16x4_t d3 =
          highbd_12_convolve6_4(s3, s4, s5, s6, s7, s8, y_filter, offset_vec);

      store_u16_4x4(d, dst_stride, d0, d1, d2, d3);

      // Slide the row window down by four rows.
      s0 = s4;
      s1 = s5;
      s2 = s6;
      s3 = s7;
      s4 = s8;
      s += 4 * src_stride;
      d += 4 * dst_stride;
      h -= 4;
    } while (h != 0);
  } else {
    do {
      int height = h;
      const int16_t *s = (const int16_t *)src_ptr;
      uint16_t *d = dst_ptr;

      // Prime the filter window for this 8-wide column.
      int16x8_t s0, s1, s2, s3, s4;
      load_s16_8x5(s, src_stride, &s0, &s1, &s2, &s3, &s4);
      s += 5 * src_stride;

      do {
        int16x8_t s5, s6, s7, s8;
        load_s16_8x4(s, src_stride, &s5, &s6, &s7, &s8);

        uint16x8_t d0 =
            highbd_12_convolve6_8(s0, s1, s2, s3, s4, s5, y_filter, offset_vec);
        uint16x8_t d1 =
            highbd_12_convolve6_8(s1, s2, s3, s4, s5, s6, y_filter, offset_vec);
        uint16x8_t d2 =
            highbd_12_convolve6_8(s2, s3, s4, s5, s6, s7, y_filter, offset_vec);
        uint16x8_t d3 =
            highbd_12_convolve6_8(s3, s4, s5, s6, s7, s8, y_filter, offset_vec);

        store_u16_8x4(d, dst_stride, d0, d1, d2, d3);

        // Slide the row window down by four rows.
        s0 = s4;
        s1 = s5;
        s2 = s6;
        s3 = s7;
        s4 = s8;
        s += 4 * src_stride;
        d += 4 * dst_stride;
        height -= 4;
      } while (height != 0);
      src_ptr += 8;
      dst_ptr += 8;
      w -= 8;
    } while (w != 0);
  }
}
    634 
// Vertical 6-tap dist-wtd convolution for 8/10-bit input. Same structure as
// highbd_12_dist_wtd_convolve_y_6tap_neon (sliding 6-row window, 4 output
// rows per iteration), differing only in the kernel shift.
static inline void highbd_dist_wtd_convolve_y_6tap_neon(
    const uint16_t *src_ptr, int src_stride, uint16_t *dst_ptr, int dst_stride,
    int w, int h, const int16_t *y_filter_ptr, const int offset) {
  // Full 8-tap vector; the convolve kernel only uses lanes 1..6.
  const int16x8_t y_filter = vld1q_s16(y_filter_ptr);
  const int32x4_t offset_vec = vdupq_n_s32(offset);

  if (w == 4) {
    const int16_t *s = (const int16_t *)src_ptr;
    uint16_t *d = dst_ptr;

    // Prime the 6-row filter window with the first five rows.
    int16x4_t s0, s1, s2, s3, s4;
    load_s16_4x5(s, src_stride, &s0, &s1, &s2, &s3, &s4);
    s += 5 * src_stride;

    do {
      int16x4_t s5, s6, s7, s8;
      load_s16_4x4(s, src_stride, &s5, &s6, &s7, &s8);

      uint16x4_t d0 =
          highbd_convolve6_4(s0, s1, s2, s3, s4, s5, y_filter, offset_vec);
      uint16x4_t d1 =
          highbd_convolve6_4(s1, s2, s3, s4, s5, s6, y_filter, offset_vec);
      uint16x4_t d2 =
          highbd_convolve6_4(s2, s3, s4, s5, s6, s7, y_filter, offset_vec);
      uint16x4_t d3 =
          highbd_convolve6_4(s3, s4, s5, s6, s7, s8, y_filter, offset_vec);

      store_u16_4x4(d, dst_stride, d0, d1, d2, d3);

      // Slide the row window down by four rows.
      s0 = s4;
      s1 = s5;
      s2 = s6;
      s3 = s7;
      s4 = s8;
      s += 4 * src_stride;
      d += 4 * dst_stride;
      h -= 4;
    } while (h != 0);
  } else {
    do {
      int height = h;
      const int16_t *s = (const int16_t *)src_ptr;
      uint16_t *d = dst_ptr;

      // Prime the filter window for this 8-wide column.
      int16x8_t s0, s1, s2, s3, s4;
      load_s16_8x5(s, src_stride, &s0, &s1, &s2, &s3, &s4);
      s += 5 * src_stride;

      do {
        int16x8_t s5, s6, s7, s8;
        load_s16_8x4(s, src_stride, &s5, &s6, &s7, &s8);

        uint16x8_t d0 =
            highbd_convolve6_8(s0, s1, s2, s3, s4, s5, y_filter, offset_vec);
        uint16x8_t d1 =
            highbd_convolve6_8(s1, s2, s3, s4, s5, s6, y_filter, offset_vec);
        uint16x8_t d2 =
            highbd_convolve6_8(s2, s3, s4, s5, s6, s7, y_filter, offset_vec);
        uint16x8_t d3 =
            highbd_convolve6_8(s3, s4, s5, s6, s7, s8, y_filter, offset_vec);

        store_u16_8x4(d, dst_stride, d0, d1, d2, d3);

        // Slide the row window down by four rows.
        s0 = s4;
        s1 = s5;
        s2 = s6;
        s3 = s7;
        s4 = s8;
        s += 4 * src_stride;
        d += 4 * dst_stride;
        height -= 4;
      } while (height != 0);
      src_ptr += 8;
      dst_ptr += 8;
      w -= 8;
    } while (w != 0);
  }
}
    713 
    714 static inline uint16x4_t highbd_12_convolve4_4(
    715    const int16x4_t s0, const int16x4_t s1, const int16x4_t s2,
    716    const int16x4_t s3, const int16x4_t filter, const int32x4_t offset) {
    717  int32x4_t sum = vmlal_lane_s16(offset, s0, filter, 0);
    718  sum = vmlal_lane_s16(sum, s1, filter, 1);
    719  sum = vmlal_lane_s16(sum, s2, filter, 2);
    720  sum = vmlal_lane_s16(sum, s3, filter, 3);
    721 
    722  return vqshrun_n_s32(sum, ROUND0_BITS + 2);
    723 }
    724 
    725 static inline uint16x8_t highbd_12_convolve4_8(
    726    const int16x8_t s0, const int16x8_t s1, const int16x8_t s2,
    727    const int16x8_t s3, const int16x4_t filter, const int32x4_t offset) {
    728  int32x4_t sum0 = vmlal_lane_s16(offset, vget_low_s16(s0), filter, 0);
    729  sum0 = vmlal_lane_s16(sum0, vget_low_s16(s1), filter, 1);
    730  sum0 = vmlal_lane_s16(sum0, vget_low_s16(s2), filter, 2);
    731  sum0 = vmlal_lane_s16(sum0, vget_low_s16(s3), filter, 3);
    732 
    733  int32x4_t sum1 = vmlal_lane_s16(offset, vget_high_s16(s0), filter, 0);
    734  sum1 = vmlal_lane_s16(sum1, vget_high_s16(s1), filter, 1);
    735  sum1 = vmlal_lane_s16(sum1, vget_high_s16(s2), filter, 2);
    736  sum1 = vmlal_lane_s16(sum1, vget_high_s16(s3), filter, 3);
    737 
    738  return vcombine_u16(vqshrun_n_s32(sum0, ROUND0_BITS + 2),
    739                      vqshrun_n_s32(sum1, ROUND0_BITS + 2));
    740 }
    741 
static inline void highbd_12_dist_wtd_convolve_y_4tap_neon(
    const uint16_t *src_ptr, int src_stride, uint16_t *dst_ptr, int dst_stride,
    int w, int h, const int16_t *y_filter_ptr, const int offset) {
  // Vertical 4-tap convolution for the 12-bit dist_wtd (compound) path.
  // Only the middle four taps of the 8-tap kernel are used, hence the +2.
  // NOTE(review): loops assume h % 4 == 0 and, in the wide path, w % 8 == 0 —
  // confirm against callers.
  const int16x4_t y_filter = vld1_s16(y_filter_ptr + 2);
  const int32x4_t offset_vec = vdupq_n_s32(offset);

  if (w == 4) {
    const int16_t *s = (const int16_t *)src_ptr;
    uint16_t *d = dst_ptr;

    // Prime the three-row sliding window.
    int16x4_t s0, s1, s2;
    load_s16_4x3(s, src_stride, &s0, &s1, &s2);
    s += 3 * src_stride;

    do {
      int16x4_t s3, s4, s5, s6;
      load_s16_4x4(s, src_stride, &s3, &s4, &s5, &s6);

      uint16x4_t d0 =
          highbd_12_convolve4_4(s0, s1, s2, s3, y_filter, offset_vec);
      uint16x4_t d1 =
          highbd_12_convolve4_4(s1, s2, s3, s4, y_filter, offset_vec);
      uint16x4_t d2 =
          highbd_12_convolve4_4(s2, s3, s4, s5, y_filter, offset_vec);
      uint16x4_t d3 =
          highbd_12_convolve4_4(s3, s4, s5, s6, y_filter, offset_vec);

      store_u16_4x4(d, dst_stride, d0, d1, d2, d3);

      // Rotate the window down by the four rows just consumed.
      s0 = s4;
      s1 = s5;
      s2 = s6;

      s += 4 * src_stride;
      d += 4 * dst_stride;
      h -= 4;
    } while (h != 0);
  } else {
    // Process the block in 8-wide column strips.
    do {
      int height = h;
      const int16_t *s = (const int16_t *)src_ptr;
      uint16_t *d = dst_ptr;

      int16x8_t s0, s1, s2;
      load_s16_8x3(s, src_stride, &s0, &s1, &s2);
      s += 3 * src_stride;

      do {
        int16x8_t s3, s4, s5, s6;
        load_s16_8x4(s, src_stride, &s3, &s4, &s5, &s6);

        uint16x8_t d0 =
            highbd_12_convolve4_8(s0, s1, s2, s3, y_filter, offset_vec);
        uint16x8_t d1 =
            highbd_12_convolve4_8(s1, s2, s3, s4, y_filter, offset_vec);
        uint16x8_t d2 =
            highbd_12_convolve4_8(s2, s3, s4, s5, y_filter, offset_vec);
        uint16x8_t d3 =
            highbd_12_convolve4_8(s3, s4, s5, s6, y_filter, offset_vec);

        store_u16_8x4(d, dst_stride, d0, d1, d2, d3);

        // Rotate the window down by the four rows just consumed.
        s0 = s4;
        s1 = s5;
        s2 = s6;

        s += 4 * src_stride;
        d += 4 * dst_stride;
        height -= 4;
      } while (height != 0);
      src_ptr += 8;
      dst_ptr += 8;
      w -= 8;
    } while (w != 0);
  }
}
    818 
    819 static inline uint16x4_t highbd_convolve4_4(
    820    const int16x4_t s0, const int16x4_t s1, const int16x4_t s2,
    821    const int16x4_t s3, const int16x4_t filter, const int32x4_t offset) {
    822  int32x4_t sum = vmlal_lane_s16(offset, s0, filter, 0);
    823  sum = vmlal_lane_s16(sum, s1, filter, 1);
    824  sum = vmlal_lane_s16(sum, s2, filter, 2);
    825  sum = vmlal_lane_s16(sum, s3, filter, 3);
    826 
    827  return vqshrun_n_s32(sum, ROUND0_BITS);
    828 }
    829 
    830 static inline uint16x8_t highbd_convolve4_8(
    831    const int16x8_t s0, const int16x8_t s1, const int16x8_t s2,
    832    const int16x8_t s3, const int16x4_t filter, const int32x4_t offset) {
    833  int32x4_t sum0 = vmlal_lane_s16(offset, vget_low_s16(s0), filter, 0);
    834  sum0 = vmlal_lane_s16(sum0, vget_low_s16(s1), filter, 1);
    835  sum0 = vmlal_lane_s16(sum0, vget_low_s16(s2), filter, 2);
    836  sum0 = vmlal_lane_s16(sum0, vget_low_s16(s3), filter, 3);
    837 
    838  int32x4_t sum1 = vmlal_lane_s16(offset, vget_high_s16(s0), filter, 0);
    839  sum1 = vmlal_lane_s16(sum1, vget_high_s16(s1), filter, 1);
    840  sum1 = vmlal_lane_s16(sum1, vget_high_s16(s2), filter, 2);
    841  sum1 = vmlal_lane_s16(sum1, vget_high_s16(s3), filter, 3);
    842 
    843  return vcombine_u16(vqshrun_n_s32(sum0, ROUND0_BITS),
    844                      vqshrun_n_s32(sum1, ROUND0_BITS));
    845 }
    846 
    847 static inline void highbd_dist_wtd_convolve_y_4tap_neon(
    848    const uint16_t *src_ptr, int src_stride, uint16_t *dst_ptr, int dst_stride,
    849    int w, int h, const int16_t *y_filter_ptr, const int offset) {
    850  const int16x4_t y_filter = vld1_s16(y_filter_ptr + 2);
    851  const int32x4_t offset_vec = vdupq_n_s32(offset);
    852 
    853  if (w == 4) {
    854    const int16_t *s = (const int16_t *)src_ptr;
    855    uint16_t *d = dst_ptr;
    856 
    857    int16x4_t s0, s1, s2;
    858    load_s16_4x3(s, src_stride, &s0, &s1, &s2);
    859    s += 3 * src_stride;
    860 
    861    do {
    862      int16x4_t s3, s4, s5, s6;
    863      load_s16_4x4(s, src_stride, &s3, &s4, &s5, &s6);
    864 
    865      uint16x4_t d0 = highbd_convolve4_4(s0, s1, s2, s3, y_filter, offset_vec);
    866      uint16x4_t d1 = highbd_convolve4_4(s1, s2, s3, s4, y_filter, offset_vec);
    867      uint16x4_t d2 = highbd_convolve4_4(s2, s3, s4, s5, y_filter, offset_vec);
    868      uint16x4_t d3 = highbd_convolve4_4(s3, s4, s5, s6, y_filter, offset_vec);
    869 
    870      store_u16_4x4(d, dst_stride, d0, d1, d2, d3);
    871 
    872      s0 = s4;
    873      s1 = s5;
    874      s2 = s6;
    875 
    876      s += 4 * src_stride;
    877      d += 4 * dst_stride;
    878      h -= 4;
    879    } while (h != 0);
    880  } else {
    881    do {
    882      int height = h;
    883      const int16_t *s = (const int16_t *)src_ptr;
    884      uint16_t *d = dst_ptr;
    885 
    886      int16x8_t s0, s1, s2;
    887      load_s16_8x3(s, src_stride, &s0, &s1, &s2);
    888      s += 3 * src_stride;
    889 
    890      do {
    891        int16x8_t s3, s4, s5, s6;
    892        load_s16_8x4(s, src_stride, &s3, &s4, &s5, &s6);
    893 
    894        uint16x8_t d0 =
    895            highbd_convolve4_8(s0, s1, s2, s3, y_filter, offset_vec);
    896        uint16x8_t d1 =
    897            highbd_convolve4_8(s1, s2, s3, s4, y_filter, offset_vec);
    898        uint16x8_t d2 =
    899            highbd_convolve4_8(s2, s3, s4, s5, y_filter, offset_vec);
    900        uint16x8_t d3 =
    901            highbd_convolve4_8(s3, s4, s5, s6, y_filter, offset_vec);
    902 
    903        store_u16_8x4(d, dst_stride, d0, d1, d2, d3);
    904 
    905        s0 = s4;
    906        s1 = s5;
    907        s2 = s6;
    908 
    909        s += 4 * src_stride;
    910        d += 4 * dst_stride;
    911        height -= 4;
    912      } while (height != 0);
    913      src_ptr += 8;
    914      dst_ptr += 8;
    915      w -= 8;
    916    } while (w != 0);
    917  }
    918 }
    919 
static inline void highbd_12_dist_wtd_convolve_y_8tap_neon(
    const uint16_t *src_ptr, int src_stride, uint16_t *dst_ptr, int dst_stride,
    int w, int h, const int16_t *y_filter_ptr, const int offset) {
  // Vertical 8-tap convolution for the 12-bit dist_wtd (compound) path.
  // NOTE(review): loops assume h % 4 == 0 and, in the wide path, w % 8 == 0 —
  // confirm against callers.
  const int16x8_t y_filter = vld1q_s16(y_filter_ptr);
  const int32x4_t offset_vec = vdupq_n_s32(offset);

  if (w == 4) {
    const int16_t *s = (const int16_t *)src_ptr;
    uint16_t *d = dst_ptr;

    // Prime the seven-row sliding window (8 taps need 7 old rows + 1 new).
    int16x4_t s0, s1, s2, s3, s4, s5, s6;
    load_s16_4x7(s, src_stride, &s0, &s1, &s2, &s3, &s4, &s5, &s6);
    s += 7 * src_stride;

    do {
      int16x4_t s7, s8, s9, s10;
      load_s16_4x4(s, src_stride, &s7, &s8, &s9, &s10);

      uint16x4_t d0 = highbd_12_convolve8_4(s0, s1, s2, s3, s4, s5, s6, s7,
                                            y_filter, offset_vec);
      uint16x4_t d1 = highbd_12_convolve8_4(s1, s2, s3, s4, s5, s6, s7, s8,
                                            y_filter, offset_vec);
      uint16x4_t d2 = highbd_12_convolve8_4(s2, s3, s4, s5, s6, s7, s8, s9,
                                            y_filter, offset_vec);
      uint16x4_t d3 = highbd_12_convolve8_4(s3, s4, s5, s6, s7, s8, s9, s10,
                                            y_filter, offset_vec);

      store_u16_4x4(d, dst_stride, d0, d1, d2, d3);

      // Rotate the window down by the four rows just consumed.
      s0 = s4;
      s1 = s5;
      s2 = s6;
      s3 = s7;
      s4 = s8;
      s5 = s9;
      s6 = s10;
      s += 4 * src_stride;
      d += 4 * dst_stride;
      h -= 4;
    } while (h != 0);
  } else {
    // Process the block in 8-wide column strips.
    do {
      int height = h;
      const int16_t *s = (const int16_t *)src_ptr;
      uint16_t *d = dst_ptr;

      int16x8_t s0, s1, s2, s3, s4, s5, s6;
      load_s16_8x7(s, src_stride, &s0, &s1, &s2, &s3, &s4, &s5, &s6);
      s += 7 * src_stride;

      do {
        int16x8_t s7, s8, s9, s10;
        load_s16_8x4(s, src_stride, &s7, &s8, &s9, &s10);

        uint16x8_t d0 = highbd_12_convolve8_8(s0, s1, s2, s3, s4, s5, s6, s7,
                                              y_filter, offset_vec);
        uint16x8_t d1 = highbd_12_convolve8_8(s1, s2, s3, s4, s5, s6, s7, s8,
                                              y_filter, offset_vec);
        uint16x8_t d2 = highbd_12_convolve8_8(s2, s3, s4, s5, s6, s7, s8, s9,
                                              y_filter, offset_vec);
        uint16x8_t d3 = highbd_12_convolve8_8(s3, s4, s5, s6, s7, s8, s9, s10,
                                              y_filter, offset_vec);

        store_u16_8x4(d, dst_stride, d0, d1, d2, d3);

        // Rotate the window down by the four rows just consumed.
        s0 = s4;
        s1 = s5;
        s2 = s6;
        s3 = s7;
        s4 = s8;
        s5 = s9;
        s6 = s10;
        s += 4 * src_stride;
        d += 4 * dst_stride;
        height -= 4;
      } while (height != 0);
      src_ptr += 8;
      dst_ptr += 8;
      w -= 8;
    } while (w != 0);
  }
}
static inline void highbd_dist_wtd_convolve_y_8tap_neon(
    const uint16_t *src_ptr, int src_stride, uint16_t *dst_ptr, int dst_stride,
    int w, int h, const int16_t *y_filter_ptr, const int offset) {
  // Vertical 8-tap convolution for the 8/10-bit dist_wtd (compound) path.
  // NOTE(review): loops assume h % 4 == 0 and, in the wide path, w % 8 == 0 —
  // confirm against callers.
  const int16x8_t y_filter = vld1q_s16(y_filter_ptr);
  const int32x4_t offset_vec = vdupq_n_s32(offset);

  if (w == 4) {
    const int16_t *s = (const int16_t *)src_ptr;
    uint16_t *d = dst_ptr;

    // Prime the seven-row sliding window (8 taps need 7 old rows + 1 new).
    int16x4_t s0, s1, s2, s3, s4, s5, s6;
    load_s16_4x7(s, src_stride, &s0, &s1, &s2, &s3, &s4, &s5, &s6);
    s += 7 * src_stride;

    do {
      int16x4_t s7, s8, s9, s10;
      load_s16_4x4(s, src_stride, &s7, &s8, &s9, &s10);

      uint16x4_t d0 = highbd_convolve8_4(s0, s1, s2, s3, s4, s5, s6, s7,
                                         y_filter, offset_vec);
      uint16x4_t d1 = highbd_convolve8_4(s1, s2, s3, s4, s5, s6, s7, s8,
                                         y_filter, offset_vec);
      uint16x4_t d2 = highbd_convolve8_4(s2, s3, s4, s5, s6, s7, s8, s9,
                                         y_filter, offset_vec);
      uint16x4_t d3 = highbd_convolve8_4(s3, s4, s5, s6, s7, s8, s9, s10,
                                         y_filter, offset_vec);

      store_u16_4x4(d, dst_stride, d0, d1, d2, d3);

      // Rotate the window down by the four rows just consumed.
      s0 = s4;
      s1 = s5;
      s2 = s6;
      s3 = s7;
      s4 = s8;
      s5 = s9;
      s6 = s10;
      s += 4 * src_stride;
      d += 4 * dst_stride;
      h -= 4;
    } while (h != 0);
  } else {
    // Process the block in 8-wide column strips.
    do {
      int height = h;
      const int16_t *s = (const int16_t *)src_ptr;
      uint16_t *d = dst_ptr;

      int16x8_t s0, s1, s2, s3, s4, s5, s6;
      load_s16_8x7(s, src_stride, &s0, &s1, &s2, &s3, &s4, &s5, &s6);
      s += 7 * src_stride;

      do {
        int16x8_t s7, s8, s9, s10;
        load_s16_8x4(s, src_stride, &s7, &s8, &s9, &s10);

        uint16x8_t d0 = highbd_convolve8_8(s0, s1, s2, s3, s4, s5, s6, s7,
                                           y_filter, offset_vec);
        uint16x8_t d1 = highbd_convolve8_8(s1, s2, s3, s4, s5, s6, s7, s8,
                                           y_filter, offset_vec);
        uint16x8_t d2 = highbd_convolve8_8(s2, s3, s4, s5, s6, s7, s8, s9,
                                           y_filter, offset_vec);
        uint16x8_t d3 = highbd_convolve8_8(s3, s4, s5, s6, s7, s8, s9, s10,
                                           y_filter, offset_vec);

        store_u16_8x4(d, dst_stride, d0, d1, d2, d3);

        // Rotate the window down by the four rows just consumed.
        s0 = s4;
        s1 = s5;
        s2 = s6;
        s3 = s7;
        s4 = s8;
        s5 = s9;
        s6 = s10;
        s += 4 * src_stride;
        d += 4 * dst_stride;
        height -= 4;
      } while (height != 0);
      src_ptr += 8;
      dst_ptr += 8;
      w -= 8;
    } while (w != 0);
  }
}
   1084 
void av1_highbd_dist_wtd_convolve_y_neon(
    const uint16_t *src, int src_stride, uint16_t *dst, int dst_stride, int w,
    int h, const InterpFilterParams *filter_params_y, const int subpel_y_qn,
    ConvolveParams *conv_params, int bd) {
  // Vertical-only high-bitdepth compound (dist_wtd) convolution dispatcher.
  // Picks a 4-, 6- or 8-tap kernel and a 12-bit or 8/10-bit variant, then
  // either writes the intermediate compound values straight to
  // conv_params->dst, or (do_average) convolves into im_block and averages
  // against the values already in conv_params->dst, writing pixels to dst.
  DECLARE_ALIGNED(16, uint16_t,
                  im_block[(MAX_SB_SIZE + MAX_FILTER_TAP) * MAX_SB_SIZE]);
  CONV_BUF_TYPE *dst16 = conv_params->dst;
  const int y_filter_taps = get_filter_tap(filter_params_y, subpel_y_qn);
  int dst16_stride = conv_params->dst_stride;
  const int im_stride = MAX_SB_SIZE;
  const int vert_offset = filter_params_y->taps / 2 - 1;
  assert(FILTER_BITS == COMPOUND_ROUND1_BITS);
  // Rounding constant plus the compound offset, folded into a single value
  // that the kernels add before narrowing.
  const int round_offset_conv = (1 << (conv_params->round_0 - 1)) +
                                (1 << (bd + FILTER_BITS)) +
                                (1 << (bd + FILTER_BITS - 1));

  const int16_t *y_filter_ptr = av1_get_interp_filter_subpel_kernel(
      filter_params_y, subpel_y_qn & SUBPEL_MASK);

  // Step back to the first source row covered by the filter window.
  src -= vert_offset * src_stride;

  if (bd == 12) {
    if (conv_params->do_average) {
      // Shorter kernels skip the leading zero taps, so their source pointer
      // starts further down (+2 rows for 4-tap, +1 row for 6-tap).
      if (y_filter_taps <= 4) {
        highbd_12_dist_wtd_convolve_y_4tap_neon(
            src + 2 * src_stride, src_stride, im_block, im_stride, w, h,
            y_filter_ptr, round_offset_conv);
      } else if (y_filter_taps == 6) {
        highbd_12_dist_wtd_convolve_y_6tap_neon(
            src + src_stride, src_stride, im_block, im_stride, w, h,
            y_filter_ptr, round_offset_conv);
      } else {
        highbd_12_dist_wtd_convolve_y_8tap_neon(src, src_stride, im_block,
                                                im_stride, w, h, y_filter_ptr,
                                                round_offset_conv);
      }
      // Blend the intermediate block with the other reference.
      if (conv_params->use_dist_wtd_comp_avg) {
        highbd_12_dist_wtd_comp_avg_neon(im_block, im_stride, dst, dst_stride,
                                         w, h, conv_params);
      } else {
        highbd_12_comp_avg_neon(im_block, im_stride, dst, dst_stride, w, h,
                                conv_params);
      }
    } else {
      if (y_filter_taps <= 4) {
        highbd_12_dist_wtd_convolve_y_4tap_neon(
            src + 2 * src_stride, src_stride, dst16, dst16_stride, w, h,
            y_filter_ptr, round_offset_conv);
      } else if (y_filter_taps == 6) {
        highbd_12_dist_wtd_convolve_y_6tap_neon(
            src + src_stride, src_stride, dst16, dst16_stride, w, h,
            y_filter_ptr, round_offset_conv);
      } else {
        highbd_12_dist_wtd_convolve_y_8tap_neon(
            src, src_stride, dst16, dst16_stride, w, h, y_filter_ptr,
            round_offset_conv);
      }
    }
  } else {
    if (conv_params->do_average) {
      if (y_filter_taps <= 4) {
        highbd_dist_wtd_convolve_y_4tap_neon(src + 2 * src_stride, src_stride,
                                             im_block, im_stride, w, h,
                                             y_filter_ptr, round_offset_conv);
      } else if (y_filter_taps == 6) {
        highbd_dist_wtd_convolve_y_6tap_neon(src + src_stride, src_stride,
                                             im_block, im_stride, w, h,
                                             y_filter_ptr, round_offset_conv);
      } else {
        highbd_dist_wtd_convolve_y_8tap_neon(src, src_stride, im_block,
                                             im_stride, w, h, y_filter_ptr,
                                             round_offset_conv);
      }
      // Blend the intermediate block with the other reference.
      if (conv_params->use_dist_wtd_comp_avg) {
        highbd_dist_wtd_comp_avg_neon(im_block, im_stride, dst, dst_stride, w,
                                      h, conv_params, bd);
      } else {
        highbd_comp_avg_neon(im_block, im_stride, dst, dst_stride, w, h,
                             conv_params, bd);
      }
    } else {
      if (y_filter_taps <= 4) {
        highbd_dist_wtd_convolve_y_4tap_neon(src + 2 * src_stride, src_stride,
                                             dst16, dst16_stride, w, h,
                                             y_filter_ptr, round_offset_conv);
      } else if (y_filter_taps == 6) {
        highbd_dist_wtd_convolve_y_6tap_neon(src + src_stride, src_stride,
                                             dst16, dst16_stride, w, h,
                                             y_filter_ptr, round_offset_conv);
      } else {
        highbd_dist_wtd_convolve_y_8tap_neon(src, src_stride, dst16,
                                             dst16_stride, w, h, y_filter_ptr,
                                             round_offset_conv);
      }
    }
  }
}
   1182 
   1183 static inline void highbd_2d_copy_neon(const uint16_t *src_ptr, int src_stride,
   1184                                       uint16_t *dst_ptr, int dst_stride, int w,
   1185                                       int h, const int round_bits,
   1186                                       const int offset) {
   1187  if (w <= 4) {
   1188    const int16x4_t round_shift_s16 = vdup_n_s16(round_bits);
   1189    const uint16x4_t offset_u16 = vdup_n_u16(offset);
   1190 
   1191    for (int y = 0; y < h; ++y) {
   1192      const uint16x4_t s = vld1_u16(src_ptr + y * src_stride);
   1193      uint16x4_t d = vshl_u16(s, round_shift_s16);
   1194      d = vadd_u16(d, offset_u16);
   1195      if (w == 2) {
   1196        store_u16_2x1(dst_ptr + y * dst_stride, d);
   1197      } else {
   1198        vst1_u16(dst_ptr + y * dst_stride, d);
   1199      }
   1200    }
   1201  } else {
   1202    const int16x8_t round_shift_s16 = vdupq_n_s16(round_bits);
   1203    const uint16x8_t offset_u16 = vdupq_n_u16(offset);
   1204 
   1205    for (int y = 0; y < h; ++y) {
   1206      for (int x = 0; x < w; x += 8) {
   1207        const uint16x8_t s = vld1q_u16(src_ptr + y * src_stride + x);
   1208        uint16x8_t d = vshlq_u16(s, round_shift_s16);
   1209        d = vaddq_u16(d, offset_u16);
   1210        vst1q_u16(dst_ptr + y * dst_stride + x, d);
   1211      }
   1212    }
   1213  }
   1214 }
   1215 
   1216 void av1_highbd_dist_wtd_convolve_2d_copy_neon(const uint16_t *src,
   1217                                               int src_stride, uint16_t *dst,
   1218                                               int dst_stride, int w, int h,
   1219                                               ConvolveParams *conv_params,
   1220                                               int bd) {
   1221  DECLARE_ALIGNED(16, uint16_t,
   1222                  im_block[(MAX_SB_SIZE + MAX_FILTER_TAP) * MAX_SB_SIZE]);
   1223 
   1224  const int im_stride = MAX_SB_SIZE;
   1225  CONV_BUF_TYPE *dst16 = conv_params->dst;
   1226  int dst16_stride = conv_params->dst_stride;
   1227  const int offset_bits = bd + 2 * FILTER_BITS - conv_params->round_0;
   1228  const int round_offset = (1 << (offset_bits - conv_params->round_1)) +
   1229                           (1 << (offset_bits - conv_params->round_1 - 1));
   1230  const int round_bits =
   1231      2 * FILTER_BITS - conv_params->round_0 - conv_params->round_1;
   1232  assert(round_bits >= 0);
   1233 
   1234  if (conv_params->do_average) {
   1235    highbd_2d_copy_neon(src, src_stride, im_block, im_stride, w, h, round_bits,
   1236                        round_offset);
   1237  } else {
   1238    highbd_2d_copy_neon(src, src_stride, dst16, dst16_stride, w, h, round_bits,
   1239                        round_offset);
   1240  }
   1241 
   1242  if (conv_params->do_average) {
   1243    if (conv_params->use_dist_wtd_comp_avg) {
   1244      if (bd == 12) {
   1245        highbd_12_dist_wtd_comp_avg_neon(im_block, im_stride, dst, dst_stride,
   1246                                         w, h, conv_params);
   1247      } else {
   1248        highbd_dist_wtd_comp_avg_neon(im_block, im_stride, dst, dst_stride, w,
   1249                                      h, conv_params, bd);
   1250      }
   1251    } else {
   1252      if (bd == 12) {
   1253        highbd_12_comp_avg_neon(im_block, im_stride, dst, dst_stride, w, h,
   1254                                conv_params);
   1255      } else {
   1256        highbd_comp_avg_neon(im_block, im_stride, dst, dst_stride, w, h,
   1257                             conv_params, bd);
   1258      }
   1259    }
   1260  }
   1261 }
   1262 
   1263 static inline uint16x4_t highbd_convolve6_4_2d_v(
   1264    const int16x4_t s0, const int16x4_t s1, const int16x4_t s2,
   1265    const int16x4_t s3, const int16x4_t s4, const int16x4_t s5,
   1266    const int16x8_t y_filter, const int32x4_t offset) {
   1267  // Values at indices 0 and 7 of y_filter are zero.
   1268  const int16x4_t y_filter_0_3 = vget_low_s16(y_filter);
   1269  const int16x4_t y_filter_4_7 = vget_high_s16(y_filter);
   1270 
   1271  int32x4_t sum = vmlal_lane_s16(offset, s0, y_filter_0_3, 1);
   1272  sum = vmlal_lane_s16(sum, s1, y_filter_0_3, 2);
   1273  sum = vmlal_lane_s16(sum, s2, y_filter_0_3, 3);
   1274  sum = vmlal_lane_s16(sum, s3, y_filter_4_7, 0);
   1275  sum = vmlal_lane_s16(sum, s4, y_filter_4_7, 1);
   1276  sum = vmlal_lane_s16(sum, s5, y_filter_4_7, 2);
   1277 
   1278  return vqrshrun_n_s32(sum, COMPOUND_ROUND1_BITS);
   1279 }
   1280 
   1281 static inline uint16x8_t highbd_convolve6_8_2d_v(
   1282    const int16x8_t s0, const int16x8_t s1, const int16x8_t s2,
   1283    const int16x8_t s3, const int16x8_t s4, const int16x8_t s5,
   1284    const int16x8_t y_filter, const int32x4_t offset) {
   1285  // Values at indices 0 and 7 of y_filter are zero.
   1286  const int16x4_t y_filter_0_3 = vget_low_s16(y_filter);
   1287  const int16x4_t y_filter_4_7 = vget_high_s16(y_filter);
   1288 
   1289  int32x4_t sum0 = vmlal_lane_s16(offset, vget_low_s16(s0), y_filter_0_3, 1);
   1290  sum0 = vmlal_lane_s16(sum0, vget_low_s16(s1), y_filter_0_3, 2);
   1291  sum0 = vmlal_lane_s16(sum0, vget_low_s16(s2), y_filter_0_3, 3);
   1292  sum0 = vmlal_lane_s16(sum0, vget_low_s16(s3), y_filter_4_7, 0);
   1293  sum0 = vmlal_lane_s16(sum0, vget_low_s16(s4), y_filter_4_7, 1);
   1294  sum0 = vmlal_lane_s16(sum0, vget_low_s16(s5), y_filter_4_7, 2);
   1295 
   1296  int32x4_t sum1 = vmlal_lane_s16(offset, vget_high_s16(s0), y_filter_0_3, 1);
   1297  sum1 = vmlal_lane_s16(sum1, vget_high_s16(s1), y_filter_0_3, 2);
   1298  sum1 = vmlal_lane_s16(sum1, vget_high_s16(s2), y_filter_0_3, 3);
   1299  sum1 = vmlal_lane_s16(sum1, vget_high_s16(s3), y_filter_4_7, 0);
   1300  sum1 = vmlal_lane_s16(sum1, vget_high_s16(s4), y_filter_4_7, 1);
   1301  sum1 = vmlal_lane_s16(sum1, vget_high_s16(s5), y_filter_4_7, 2);
   1302 
   1303  return vcombine_u16(vqrshrun_n_s32(sum0, COMPOUND_ROUND1_BITS),
   1304                      vqrshrun_n_s32(sum1, COMPOUND_ROUND1_BITS));
   1305 }
   1306 
static inline void highbd_dist_wtd_convolve_2d_vert_6tap_neon(
    const uint16_t *src_ptr, int src_stride, uint16_t *dst_ptr, int dst_stride,
    int w, int h, const int16_t *y_filter_ptr, int offset) {
  // Second (vertical) pass of the 2D compound convolution with a 6-tap
  // kernel; narrowing by COMPOUND_ROUND1_BITS happens inside
  // highbd_convolve6_*_2d_v.
  // NOTE(review): loops assume h % 4 == 0 and, in the wide path, w % 8 == 0 —
  // confirm against callers.
  const int16x8_t y_filter = vld1q_s16(y_filter_ptr);
  const int32x4_t offset_vec = vdupq_n_s32(offset);

  if (w == 4) {
    const int16_t *s = (const int16_t *)src_ptr;
    uint16_t *d = dst_ptr;

    // Prime the five-row sliding window.
    int16x4_t s0, s1, s2, s3, s4;
    load_s16_4x5(s, src_stride, &s0, &s1, &s2, &s3, &s4);
    s += 5 * src_stride;

    do {
      int16x4_t s5, s6, s7, s8;
      load_s16_4x4(s, src_stride, &s5, &s6, &s7, &s8);

      uint16x4_t d0 =
          highbd_convolve6_4_2d_v(s0, s1, s2, s3, s4, s5, y_filter, offset_vec);
      uint16x4_t d1 =
          highbd_convolve6_4_2d_v(s1, s2, s3, s4, s5, s6, y_filter, offset_vec);
      uint16x4_t d2 =
          highbd_convolve6_4_2d_v(s2, s3, s4, s5, s6, s7, y_filter, offset_vec);
      uint16x4_t d3 =
          highbd_convolve6_4_2d_v(s3, s4, s5, s6, s7, s8, y_filter, offset_vec);

      store_u16_4x4(d, dst_stride, d0, d1, d2, d3);

      // Rotate the window down by the four rows just consumed.
      s0 = s4;
      s1 = s5;
      s2 = s6;
      s3 = s7;
      s4 = s8;
      s += 4 * src_stride;
      d += 4 * dst_stride;
      h -= 4;
    } while (h != 0);
  } else {
    // Process the block in 8-wide column strips.
    do {
      int height = h;
      const int16_t *s = (const int16_t *)src_ptr;
      uint16_t *d = dst_ptr;

      int16x8_t s0, s1, s2, s3, s4;
      load_s16_8x5(s, src_stride, &s0, &s1, &s2, &s3, &s4);
      s += 5 * src_stride;

      do {
        int16x8_t s5, s6, s7, s8;
        load_s16_8x4(s, src_stride, &s5, &s6, &s7, &s8);

        uint16x8_t d0 = highbd_convolve6_8_2d_v(s0, s1, s2, s3, s4, s5,
                                                y_filter, offset_vec);
        uint16x8_t d1 = highbd_convolve6_8_2d_v(s1, s2, s3, s4, s5, s6,
                                                y_filter, offset_vec);
        uint16x8_t d2 = highbd_convolve6_8_2d_v(s2, s3, s4, s5, s6, s7,
                                                y_filter, offset_vec);
        uint16x8_t d3 = highbd_convolve6_8_2d_v(s3, s4, s5, s6, s7, s8,
                                                y_filter, offset_vec);

        store_u16_8x4(d, dst_stride, d0, d1, d2, d3);

        // Rotate the window down by the four rows just consumed.
        s0 = s4;
        s1 = s5;
        s2 = s6;
        s3 = s7;
        s4 = s8;
        s += 4 * src_stride;
        d += 4 * dst_stride;
        height -= 4;
      } while (height != 0);
      src_ptr += 8;
      dst_ptr += 8;
      w -= 8;
    } while (w != 0);
  }
}
   1385 
   1386 static inline uint16x4_t highbd_convolve8_4_2d_v(
   1387    const int16x4_t s0, const int16x4_t s1, const int16x4_t s2,
   1388    const int16x4_t s3, const int16x4_t s4, const int16x4_t s5,
   1389    const int16x4_t s6, const int16x4_t s7, const int16x8_t y_filter,
   1390    const int32x4_t offset) {
   1391  const int16x4_t y_filter_0_3 = vget_low_s16(y_filter);
   1392  const int16x4_t y_filter_4_7 = vget_high_s16(y_filter);
   1393 
   1394  int32x4_t sum = vmlal_lane_s16(offset, s0, y_filter_0_3, 0);
   1395  sum = vmlal_lane_s16(sum, s1, y_filter_0_3, 1);
   1396  sum = vmlal_lane_s16(sum, s2, y_filter_0_3, 2);
   1397  sum = vmlal_lane_s16(sum, s3, y_filter_0_3, 3);
   1398  sum = vmlal_lane_s16(sum, s4, y_filter_4_7, 0);
   1399  sum = vmlal_lane_s16(sum, s5, y_filter_4_7, 1);
   1400  sum = vmlal_lane_s16(sum, s6, y_filter_4_7, 2);
   1401  sum = vmlal_lane_s16(sum, s7, y_filter_4_7, 3);
   1402 
   1403  return vqrshrun_n_s32(sum, COMPOUND_ROUND1_BITS);
   1404 }
   1405 
   1406 static inline uint16x8_t highbd_convolve8_8_2d_v(
   1407    const int16x8_t s0, const int16x8_t s1, const int16x8_t s2,
   1408    const int16x8_t s3, const int16x8_t s4, const int16x8_t s5,
   1409    const int16x8_t s6, const int16x8_t s7, const int16x8_t y_filter,
   1410    const int32x4_t offset) {
   1411  const int16x4_t y_filter_0_3 = vget_low_s16(y_filter);
   1412  const int16x4_t y_filter_4_7 = vget_high_s16(y_filter);
   1413 
   1414  int32x4_t sum0 = vmlal_lane_s16(offset, vget_low_s16(s0), y_filter_0_3, 0);
   1415  sum0 = vmlal_lane_s16(sum0, vget_low_s16(s1), y_filter_0_3, 1);
   1416  sum0 = vmlal_lane_s16(sum0, vget_low_s16(s2), y_filter_0_3, 2);
   1417  sum0 = vmlal_lane_s16(sum0, vget_low_s16(s3), y_filter_0_3, 3);
   1418  sum0 = vmlal_lane_s16(sum0, vget_low_s16(s4), y_filter_4_7, 0);
   1419  sum0 = vmlal_lane_s16(sum0, vget_low_s16(s5), y_filter_4_7, 1);
   1420  sum0 = vmlal_lane_s16(sum0, vget_low_s16(s6), y_filter_4_7, 2);
   1421  sum0 = vmlal_lane_s16(sum0, vget_low_s16(s7), y_filter_4_7, 3);
   1422 
   1423  int32x4_t sum1 = vmlal_lane_s16(offset, vget_high_s16(s0), y_filter_0_3, 0);
   1424  sum1 = vmlal_lane_s16(sum1, vget_high_s16(s1), y_filter_0_3, 1);
   1425  sum1 = vmlal_lane_s16(sum1, vget_high_s16(s2), y_filter_0_3, 2);
   1426  sum1 = vmlal_lane_s16(sum1, vget_high_s16(s3), y_filter_0_3, 3);
   1427  sum1 = vmlal_lane_s16(sum1, vget_high_s16(s4), y_filter_4_7, 0);
   1428  sum1 = vmlal_lane_s16(sum1, vget_high_s16(s5), y_filter_4_7, 1);
   1429  sum1 = vmlal_lane_s16(sum1, vget_high_s16(s6), y_filter_4_7, 2);
   1430  sum1 = vmlal_lane_s16(sum1, vget_high_s16(s7), y_filter_4_7, 3);
   1431 
   1432  return vcombine_u16(vqrshrun_n_s32(sum0, COMPOUND_ROUND1_BITS),
   1433                      vqrshrun_n_s32(sum1, COMPOUND_ROUND1_BITS));
   1434 }
   1435 
// 8-tap vertical pass of the high-bitdepth dist-wtd compound 2D convolution.
// Reads the horizontally-filtered intermediate block at src_ptr (treated as
// signed 16-bit) and writes the round-1 output to dst_ptr. `offset` is the
// compound rounding offset folded into every accumulator.
static inline void highbd_dist_wtd_convolve_2d_vert_8tap_neon(
    const uint16_t *src_ptr, int src_stride, uint16_t *dst_ptr, int dst_stride,
    int w, int h, const int16_t *y_filter_ptr, int offset) {
  const int16x8_t y_filter = vld1q_s16(y_filter_ptr);
  const int32x4_t offset_vec = vdupq_n_s32(offset);

  if (w <= 4) {
    // Narrow blocks: one 4-wide column, 4 output rows per iteration.
    const int16_t *s = (const int16_t *)src_ptr;
    uint16_t *d = dst_ptr;

    // Prime the sliding window with the first 7 rows; each output row
    // consumes 8 consecutive input rows.
    int16x4_t s0, s1, s2, s3, s4, s5, s6;
    load_s16_4x7(s, src_stride, &s0, &s1, &s2, &s3, &s4, &s5, &s6);
    s += 7 * src_stride;

    do {
      int16x4_t s7, s8, s9, s10;
      load_s16_4x4(s, src_stride, &s7, &s8, &s9, &s10);

      uint16x4_t d0 = highbd_convolve8_4_2d_v(s0, s1, s2, s3, s4, s5, s6, s7,
                                              y_filter, offset_vec);
      uint16x4_t d1 = highbd_convolve8_4_2d_v(s1, s2, s3, s4, s5, s6, s7, s8,
                                              y_filter, offset_vec);
      uint16x4_t d2 = highbd_convolve8_4_2d_v(s2, s3, s4, s5, s6, s7, s8, s9,
                                              y_filter, offset_vec);
      uint16x4_t d3 = highbd_convolve8_4_2d_v(s3, s4, s5, s6, s7, s8, s9, s10,
                                              y_filter, offset_vec);

      store_u16_4x4(d, dst_stride, d0, d1, d2, d3);

      // Slide the 8-row window down by the 4 rows just produced.
      s0 = s4;
      s1 = s5;
      s2 = s6;
      s3 = s7;
      s4 = s8;
      s5 = s9;
      s6 = s10;
      s += 4 * src_stride;
      d += 4 * dst_stride;
      h -= 4;
    } while (h != 0);  // h is assumed to be a multiple of 4 here.
  } else {
    // Wider blocks: process 8-wide column strips, 4 output rows at a time.
    do {
      int height = h;
      const int16_t *s = (const int16_t *)src_ptr;
      uint16_t *d = dst_ptr;

      // Prime the 8-row window for this strip.
      int16x8_t s0, s1, s2, s3, s4, s5, s6;
      load_s16_8x7(s, src_stride, &s0, &s1, &s2, &s3, &s4, &s5, &s6);
      s += 7 * src_stride;

      do {
        int16x8_t s7, s8, s9, s10;
        load_s16_8x4(s, src_stride, &s7, &s8, &s9, &s10);

        uint16x8_t d0 = highbd_convolve8_8_2d_v(s0, s1, s2, s3, s4, s5, s6, s7,
                                                y_filter, offset_vec);
        uint16x8_t d1 = highbd_convolve8_8_2d_v(s1, s2, s3, s4, s5, s6, s7, s8,
                                                y_filter, offset_vec);
        uint16x8_t d2 = highbd_convolve8_8_2d_v(s2, s3, s4, s5, s6, s7, s8, s9,
                                                y_filter, offset_vec);
        uint16x8_t d3 = highbd_convolve8_8_2d_v(s3, s4, s5, s6, s7, s8, s9, s10,
                                                y_filter, offset_vec);

        store_u16_8x4(d, dst_stride, d0, d1, d2, d3);

        // Rotate the row registers down by 4 for the next iteration.
        s0 = s4;
        s1 = s5;
        s2 = s6;
        s3 = s7;
        s4 = s8;
        s5 = s9;
        s6 = s10;
        s += 4 * src_stride;
        d += 4 * dst_stride;
        height -= 4;
      } while (height != 0);
      // Advance to the next 8-wide strip.
      src_ptr += 8;
      dst_ptr += 8;
      w -= 8;
    } while (w != 0);
  }
}
   1518 
// 6-tap horizontal pass of the dist-wtd compound 2D convolution,
// 12-bit-depth variant (uses the highbd_12_* narrowing helper). Writes the
// intermediate block consumed by the vertical pass.
static inline void highbd_12_dist_wtd_convolve_2d_horiz_6tap_neon(
    const uint16_t *src_ptr, int src_stride, uint16_t *dst_ptr, int dst_stride,
    int w, int h, const int16_t *x_filter_ptr, const int offset) {
  // The smallest block height is 4, and the horizontal convolution needs to
  // process an extra (filter_taps/2 - 1) lines for the vertical convolution.
  assert(h >= 5);
  const int32x4_t offset_vec = vdupq_n_s32(offset);

  const int16x8_t x_filter = vld1q_s16(x_filter_ptr);

  int height = h;

  // Main loop: 4 rows x 8 columns per iteration while more than 4 rows
  // remain. h is not a multiple of 4 here (see assert), so the leftover
  // 1-4 rows are handled one at a time by the tail loop below.
  do {
    int width = w;
    const int16_t *s = (const int16_t *)src_ptr;
    uint16_t *d = dst_ptr;

    do {
      // For each of the 4 rows, load 6 overlapping 8-wide windows (stride
      // 1) - one per filter tap position.
      int16x8_t s0[6], s1[6], s2[6], s3[6];
      load_s16_8x6(s + 0 * src_stride, 1, &s0[0], &s0[1], &s0[2], &s0[3],
                   &s0[4], &s0[5]);
      load_s16_8x6(s + 1 * src_stride, 1, &s1[0], &s1[1], &s1[2], &s1[3],
                   &s1[4], &s1[5]);
      load_s16_8x6(s + 2 * src_stride, 1, &s2[0], &s2[1], &s2[2], &s2[3],
                   &s2[4], &s2[5]);
      load_s16_8x6(s + 3 * src_stride, 1, &s3[0], &s3[1], &s3[2], &s3[3],
                   &s3[4], &s3[5]);

      uint16x8_t d0 = highbd_12_convolve6_8(s0[0], s0[1], s0[2], s0[3], s0[4],
                                            s0[5], x_filter, offset_vec);
      uint16x8_t d1 = highbd_12_convolve6_8(s1[0], s1[1], s1[2], s1[3], s1[4],
                                            s1[5], x_filter, offset_vec);
      uint16x8_t d2 = highbd_12_convolve6_8(s2[0], s2[1], s2[2], s2[3], s2[4],
                                            s2[5], x_filter, offset_vec);
      uint16x8_t d3 = highbd_12_convolve6_8(s3[0], s3[1], s3[2], s3[3], s3[4],
                                            s3[5], x_filter, offset_vec);

      store_u16_8x4(d, dst_stride, d0, d1, d2, d3);

      s += 8;
      d += 8;
      width -= 8;
    } while (width != 0);
    src_ptr += 4 * src_stride;
    dst_ptr += 4 * dst_stride;
    height -= 4;
  } while (height > 4);

  // Tail loop: remaining rows, one at a time.
  do {
    int width = w;
    const int16_t *s = (const int16_t *)src_ptr;
    uint16_t *d = dst_ptr;

    do {
      int16x8_t s0[6];
      load_s16_8x6(s, 1, &s0[0], &s0[1], &s0[2], &s0[3], &s0[4], &s0[5]);

      uint16x8_t d0 = highbd_12_convolve6_8(s0[0], s0[1], s0[2], s0[3], s0[4],
                                            s0[5], x_filter, offset_vec);
      vst1q_u16(d, d0);

      s += 8;
      d += 8;
      width -= 8;
    } while (width != 0);
    src_ptr += src_stride;
    dst_ptr += dst_stride;
  } while (--height != 0);
}
   1588 
// 6-tap horizontal pass of the dist-wtd compound 2D convolution for
// bit depths other than 12 (mirrors the highbd_12_* variant above but uses
// highbd_convolve6_8, which applies a different narrowing shift).
static inline void highbd_dist_wtd_convolve_2d_horiz_6tap_neon(
    const uint16_t *src_ptr, int src_stride, uint16_t *dst_ptr, int dst_stride,
    int w, int h, const int16_t *x_filter_ptr, const int offset) {
  // The smallest block height is 4, and the horizontal convolution needs to
  // process an extra (filter_taps/2 - 1) lines for the vertical convolution.
  assert(h >= 5);
  const int32x4_t offset_vec = vdupq_n_s32(offset);

  const int16x8_t x_filter = vld1q_s16(x_filter_ptr);

  int height = h;

  // Main loop: 4 rows x 8 columns per iteration while more than 4 rows
  // remain; the leftover rows are handled by the single-row tail loop.
  do {
    int width = w;
    const int16_t *s = (const int16_t *)src_ptr;
    uint16_t *d = dst_ptr;

    do {
      // 6 overlapping 8-wide windows per row (stride 1), one per tap.
      int16x8_t s0[6], s1[6], s2[6], s3[6];
      load_s16_8x6(s + 0 * src_stride, 1, &s0[0], &s0[1], &s0[2], &s0[3],
                   &s0[4], &s0[5]);
      load_s16_8x6(s + 1 * src_stride, 1, &s1[0], &s1[1], &s1[2], &s1[3],
                   &s1[4], &s1[5]);
      load_s16_8x6(s + 2 * src_stride, 1, &s2[0], &s2[1], &s2[2], &s2[3],
                   &s2[4], &s2[5]);
      load_s16_8x6(s + 3 * src_stride, 1, &s3[0], &s3[1], &s3[2], &s3[3],
                   &s3[4], &s3[5]);

      uint16x8_t d0 = highbd_convolve6_8(s0[0], s0[1], s0[2], s0[3], s0[4],
                                         s0[5], x_filter, offset_vec);
      uint16x8_t d1 = highbd_convolve6_8(s1[0], s1[1], s1[2], s1[3], s1[4],
                                         s1[5], x_filter, offset_vec);
      uint16x8_t d2 = highbd_convolve6_8(s2[0], s2[1], s2[2], s2[3], s2[4],
                                         s2[5], x_filter, offset_vec);
      uint16x8_t d3 = highbd_convolve6_8(s3[0], s3[1], s3[2], s3[3], s3[4],
                                         s3[5], x_filter, offset_vec);

      store_u16_8x4(d, dst_stride, d0, d1, d2, d3);

      s += 8;
      d += 8;
      width -= 8;
    } while (width != 0);
    src_ptr += 4 * src_stride;
    dst_ptr += 4 * dst_stride;
    height -= 4;
  } while (height > 4);

  // Tail loop: remaining rows, one at a time.
  do {
    int width = w;
    const int16_t *s = (const int16_t *)src_ptr;
    uint16_t *d = dst_ptr;

    do {
      int16x8_t s0[6];
      load_s16_8x6(s, 1, &s0[0], &s0[1], &s0[2], &s0[3], &s0[4], &s0[5]);

      uint16x8_t d0 = highbd_convolve6_8(s0[0], s0[1], s0[2], s0[3], s0[4],
                                         s0[5], x_filter, offset_vec);
      vst1q_u16(d, d0);

      s += 8;
      d += 8;
      width -= 8;
    } while (width != 0);
    src_ptr += src_stride;
    dst_ptr += dst_stride;
  } while (--height != 0);
}
   1658 
// General (4-tap or 8-tap) horizontal pass of the dist-wtd compound 2D
// convolution, 12-bit-depth variant. Width-4 blocks use a 4-tap kernel;
// everything else uses the full 8-tap kernel.
static inline void highbd_12_dist_wtd_convolve_2d_horiz_neon(
    const uint16_t *src_ptr, int src_stride, uint16_t *dst_ptr, int dst_stride,
    int w, int h, const int16_t *x_filter_ptr, const int offset) {
  // The smallest block height is 4, and the horizontal convolution needs to
  // process an extra (filter_taps/2 - 1) lines for the vertical convolution.
  assert(h >= 5);
  const int32x4_t offset_vec = vdupq_n_s32(offset);

  if (w == 4) {
    // 4-tap filters are used for blocks having width == 4.
    // The 4 taps used live at indices 2..5 of the 8-tap kernel, hence the
    // +2 on the filter pointer and the +1 source adjustment to recentre.
    const int16x4_t x_filter = vld1_s16(x_filter_ptr + 2);
    const int16_t *s = (const int16_t *)(src_ptr + 1);
    uint16_t *d = dst_ptr;

    // 4 rows per iteration while more than 4 rows remain.
    do {
      int16x4_t s0[4], s1[4], s2[4], s3[4];
      load_s16_4x4(s + 0 * src_stride, 1, &s0[0], &s0[1], &s0[2], &s0[3]);
      load_s16_4x4(s + 1 * src_stride, 1, &s1[0], &s1[1], &s1[2], &s1[3]);
      load_s16_4x4(s + 2 * src_stride, 1, &s2[0], &s2[1], &s2[2], &s2[3]);
      load_s16_4x4(s + 3 * src_stride, 1, &s3[0], &s3[1], &s3[2], &s3[3]);

      uint16x4_t d0 = highbd_12_convolve4_4_x(s0, x_filter, offset_vec);
      uint16x4_t d1 = highbd_12_convolve4_4_x(s1, x_filter, offset_vec);
      uint16x4_t d2 = highbd_12_convolve4_4_x(s2, x_filter, offset_vec);
      uint16x4_t d3 = highbd_12_convolve4_4_x(s3, x_filter, offset_vec);

      store_u16_4x4(d, dst_stride, d0, d1, d2, d3);

      s += 4 * src_stride;
      d += 4 * dst_stride;
      h -= 4;
    } while (h > 4);

    // Tail: remaining rows, one at a time.
    do {
      int16x4_t s0[4];
      load_s16_4x4(s, 1, &s0[0], &s0[1], &s0[2], &s0[3]);

      uint16x4_t d0 = highbd_12_convolve4_4_x(s0, x_filter, offset_vec);
      vst1_u16(d, d0);

      s += src_stride;
      d += dst_stride;
    } while (--h != 0);
  } else {
    // 8-tap path for all wider blocks.
    const int16x8_t x_filter = vld1q_s16(x_filter_ptr);
    int height = h;

    // Main loop: 4 rows x 8 columns per iteration.
    do {
      int width = w;
      const int16_t *s = (const int16_t *)src_ptr;
      uint16_t *d = dst_ptr;

      do {
        // 8 overlapping 8-wide windows per row (stride 1), one per tap.
        int16x8_t s0[8], s1[8], s2[8], s3[8];
        load_s16_8x8(s + 0 * src_stride, 1, &s0[0], &s0[1], &s0[2], &s0[3],
                     &s0[4], &s0[5], &s0[6], &s0[7]);
        load_s16_8x8(s + 1 * src_stride, 1, &s1[0], &s1[1], &s1[2], &s1[3],
                     &s1[4], &s1[5], &s1[6], &s1[7]);
        load_s16_8x8(s + 2 * src_stride, 1, &s2[0], &s2[1], &s2[2], &s2[3],
                     &s2[4], &s2[5], &s2[6], &s2[7]);
        load_s16_8x8(s + 3 * src_stride, 1, &s3[0], &s3[1], &s3[2], &s3[3],
                     &s3[4], &s3[5], &s3[6], &s3[7]);

        uint16x8_t d0 =
            highbd_12_convolve8_8(s0[0], s0[1], s0[2], s0[3], s0[4], s0[5],
                                  s0[6], s0[7], x_filter, offset_vec);
        uint16x8_t d1 =
            highbd_12_convolve8_8(s1[0], s1[1], s1[2], s1[3], s1[4], s1[5],
                                  s1[6], s1[7], x_filter, offset_vec);
        uint16x8_t d2 =
            highbd_12_convolve8_8(s2[0], s2[1], s2[2], s2[3], s2[4], s2[5],
                                  s2[6], s2[7], x_filter, offset_vec);
        uint16x8_t d3 =
            highbd_12_convolve8_8(s3[0], s3[1], s3[2], s3[3], s3[4], s3[5],
                                  s3[6], s3[7], x_filter, offset_vec);

        store_u16_8x4(d, dst_stride, d0, d1, d2, d3);

        s += 8;
        d += 8;
        width -= 8;
      } while (width != 0);
      src_ptr += 4 * src_stride;
      dst_ptr += 4 * dst_stride;
      height -= 4;
    } while (height > 4);

    // Tail: remaining rows, one at a time.
    do {
      int width = w;
      const int16_t *s = (const int16_t *)src_ptr;
      uint16_t *d = dst_ptr;

      do {
        int16x8_t s0[8];
        load_s16_8x8(s + 0 * src_stride, 1, &s0[0], &s0[1], &s0[2], &s0[3],
                     &s0[4], &s0[5], &s0[6], &s0[7]);

        uint16x8_t d0 =
            highbd_12_convolve8_8(s0[0], s0[1], s0[2], s0[3], s0[4], s0[5],
                                  s0[6], s0[7], x_filter, offset_vec);
        vst1q_u16(d, d0);

        s += 8;
        d += 8;
        width -= 8;
      } while (width != 0);
      src_ptr += src_stride;
      dst_ptr += dst_stride;
    } while (--height != 0);
  }
}
   1770 
// General (4-tap or 8-tap) horizontal pass of the dist-wtd compound 2D
// convolution for bit depths other than 12 (mirrors the highbd_12_* variant
// above but uses the non-12-bit narrowing helpers).
static inline void highbd_dist_wtd_convolve_2d_horiz_neon(
    const uint16_t *src_ptr, int src_stride, uint16_t *dst_ptr, int dst_stride,
    int w, int h, const int16_t *x_filter_ptr, const int offset) {
  // The smallest block height is 4, and the horizontal convolution needs to
  // process an extra (filter_taps/2 - 1) lines for the vertical convolution.
  assert(h >= 5);
  const int32x4_t offset_vec = vdupq_n_s32(offset);

  if (w == 4) {
    // 4-tap filters are used for blocks having width == 4.
    // Taps 2..5 of the 8-tap kernel are used, hence the +2 filter pointer
    // and the +1 source adjustment to recentre the window.
    const int16x4_t x_filter = vld1_s16(x_filter_ptr + 2);
    const int16_t *s = (const int16_t *)(src_ptr + 1);
    uint16_t *d = dst_ptr;

    // 4 rows per iteration while more than 4 rows remain.
    do {
      int16x4_t s0[4], s1[4], s2[4], s3[4];
      load_s16_4x4(s + 0 * src_stride, 1, &s0[0], &s0[1], &s0[2], &s0[3]);
      load_s16_4x4(s + 1 * src_stride, 1, &s1[0], &s1[1], &s1[2], &s1[3]);
      load_s16_4x4(s + 2 * src_stride, 1, &s2[0], &s2[1], &s2[2], &s2[3]);
      load_s16_4x4(s + 3 * src_stride, 1, &s3[0], &s3[1], &s3[2], &s3[3]);

      uint16x4_t d0 = highbd_convolve4_4_x(s0, x_filter, offset_vec);
      uint16x4_t d1 = highbd_convolve4_4_x(s1, x_filter, offset_vec);
      uint16x4_t d2 = highbd_convolve4_4_x(s2, x_filter, offset_vec);
      uint16x4_t d3 = highbd_convolve4_4_x(s3, x_filter, offset_vec);

      store_u16_4x4(d, dst_stride, d0, d1, d2, d3);

      s += 4 * src_stride;
      d += 4 * dst_stride;
      h -= 4;
    } while (h > 4);

    // Tail: remaining rows, one at a time.
    do {
      int16x4_t s0[4];
      load_s16_4x4(s, 1, &s0[0], &s0[1], &s0[2], &s0[3]);

      uint16x4_t d0 = highbd_convolve4_4_x(s0, x_filter, offset_vec);
      vst1_u16(d, d0);

      s += src_stride;
      d += dst_stride;
    } while (--h != 0);
  } else {
    // 8-tap path for all wider blocks.
    const int16x8_t x_filter = vld1q_s16(x_filter_ptr);
    int height = h;

    // Main loop: 4 rows x 8 columns per iteration.
    do {
      int width = w;
      const int16_t *s = (const int16_t *)src_ptr;
      uint16_t *d = dst_ptr;

      do {
        // 8 overlapping 8-wide windows per row (stride 1), one per tap.
        int16x8_t s0[8], s1[8], s2[8], s3[8];
        load_s16_8x8(s + 0 * src_stride, 1, &s0[0], &s0[1], &s0[2], &s0[3],
                     &s0[4], &s0[5], &s0[6], &s0[7]);
        load_s16_8x8(s + 1 * src_stride, 1, &s1[0], &s1[1], &s1[2], &s1[3],
                     &s1[4], &s1[5], &s1[6], &s1[7]);
        load_s16_8x8(s + 2 * src_stride, 1, &s2[0], &s2[1], &s2[2], &s2[3],
                     &s2[4], &s2[5], &s2[6], &s2[7]);
        load_s16_8x8(s + 3 * src_stride, 1, &s3[0], &s3[1], &s3[2], &s3[3],
                     &s3[4], &s3[5], &s3[6], &s3[7]);

        uint16x8_t d0 =
            highbd_convolve8_8(s0[0], s0[1], s0[2], s0[3], s0[4], s0[5], s0[6],
                               s0[7], x_filter, offset_vec);
        uint16x8_t d1 =
            highbd_convolve8_8(s1[0], s1[1], s1[2], s1[3], s1[4], s1[5], s1[6],
                               s1[7], x_filter, offset_vec);
        uint16x8_t d2 =
            highbd_convolve8_8(s2[0], s2[1], s2[2], s2[3], s2[4], s2[5], s2[6],
                               s2[7], x_filter, offset_vec);
        uint16x8_t d3 =
            highbd_convolve8_8(s3[0], s3[1], s3[2], s3[3], s3[4], s3[5], s3[6],
                               s3[7], x_filter, offset_vec);

        store_u16_8x4(d, dst_stride, d0, d1, d2, d3);

        s += 8;
        d += 8;
        width -= 8;
      } while (width != 0);
      src_ptr += 4 * src_stride;
      dst_ptr += 4 * dst_stride;
      height -= 4;
    } while (height > 4);

    // Tail: remaining rows, one at a time.
    do {
      int width = w;
      const int16_t *s = (const int16_t *)src_ptr;
      uint16_t *d = dst_ptr;

      do {
        int16x8_t s0[8];
        load_s16_8x8(s + 0 * src_stride, 1, &s0[0], &s0[1], &s0[2], &s0[3],
                     &s0[4], &s0[5], &s0[6], &s0[7]);

        uint16x8_t d0 =
            highbd_convolve8_8(s0[0], s0[1], s0[2], s0[3], s0[4], s0[5], s0[6],
                               s0[7], x_filter, offset_vec);
        vst1q_u16(d, d0);

        s += 8;
        d += 8;
        width -= 8;
      } while (width != 0);
      src_ptr += src_stride;
      dst_ptr += dst_stride;
    } while (--height != 0);
  }
}
   1882 
// Top-level NEON implementation of the high-bitdepth distance-weighted
// compound 2D (horizontal + vertical) convolution. Runs the horizontal pass
// into an intermediate buffer, the vertical pass into either the compound
// destination or a second scratch buffer, and finally (when averaging) the
// compound average into `dst`.
void av1_highbd_dist_wtd_convolve_2d_neon(
    const uint16_t *src, int src_stride, uint16_t *dst, int dst_stride, int w,
    int h, const InterpFilterParams *filter_params_x,
    const InterpFilterParams *filter_params_y, const int subpel_x_qn,
    const int subpel_y_qn, ConvolveParams *conv_params, int bd) {
  // im_block: horizontal-pass output; im_block2: vertical-pass output when
  // compound averaging is required.
  DECLARE_ALIGNED(16, uint16_t,
                  im_block[(MAX_SB_SIZE + MAX_FILTER_TAP) * MAX_SB_SIZE]);
  DECLARE_ALIGNED(16, uint16_t,
                  im_block2[(MAX_SB_SIZE + MAX_FILTER_TAP) * MAX_SB_SIZE]);

  CONV_BUF_TYPE *dst16 = conv_params->dst;
  int dst16_stride = conv_params->dst_stride;
  // Clamp tap counts to at least 6: only 6-tap and 8-tap kernels are
  // dispatched below (shorter filters are handled within those paths).
  const int x_filter_taps = get_filter_tap(filter_params_x, subpel_x_qn);
  const int clamped_x_taps = x_filter_taps < 6 ? 6 : x_filter_taps;
  const int y_filter_taps = get_filter_tap(filter_params_y, subpel_y_qn);
  const int clamped_y_taps = y_filter_taps < 6 ? 6 : y_filter_taps;

  // The horizontal pass must produce filter-length - 1 extra rows for the
  // vertical pass.
  const int im_h = h + clamped_y_taps - 1;
  const int im_stride = MAX_SB_SIZE;
  const int vert_offset = clamped_y_taps / 2 - 1;
  const int horiz_offset = clamped_x_taps / 2 - 1;
  // The extra shim of (1 << (conv_params->round_0 - 1)) allows us to use a
  // faster non-rounding non-saturating left shift.
  const int round_offset_conv_x =
      (1 << (bd + FILTER_BITS - 1)) + (1 << (conv_params->round_0 - 1));
  const int y_offset_bits = bd + 2 * FILTER_BITS - conv_params->round_0;
  const int round_offset_conv_y = (1 << y_offset_bits);

  // Back the source pointer up to the first pixel the filters touch.
  const uint16_t *src_ptr = src - vert_offset * src_stride - horiz_offset;

  const int16_t *x_filter_ptr = av1_get_interp_filter_subpel_kernel(
      filter_params_x, subpel_x_qn & SUBPEL_MASK);
  const int16_t *y_filter_ptr = av1_get_interp_filter_subpel_kernel(
      filter_params_y, subpel_y_qn & SUBPEL_MASK);

  // horizontal filter
  if (bd == 12) {
    // 12-bit content needs the highbd_12_* kernels (different shifts).
    if (x_filter_taps <= 6 && w != 4) {
      highbd_12_dist_wtd_convolve_2d_horiz_6tap_neon(
          src_ptr, src_stride, im_block, im_stride, w, im_h, x_filter_ptr,
          round_offset_conv_x);
    } else {
      highbd_12_dist_wtd_convolve_2d_horiz_neon(
          src_ptr, src_stride, im_block, im_stride, w, im_h, x_filter_ptr,
          round_offset_conv_x);
    }
  } else {
    if (x_filter_taps <= 6 && w != 4) {
      highbd_dist_wtd_convolve_2d_horiz_6tap_neon(
          src_ptr, src_stride, im_block, im_stride, w, im_h, x_filter_ptr,
          round_offset_conv_x);
    } else {
      highbd_dist_wtd_convolve_2d_horiz_neon(src_ptr, src_stride, im_block,
                                             im_stride, w, im_h, x_filter_ptr,
                                             round_offset_conv_x);
    }
  }

  // vertical filter
  // When averaging, write into im_block2 so the compound average below can
  // combine it with the contents of conv_params->dst; otherwise write the
  // round-1 result straight into the compound destination buffer.
  if (y_filter_taps <= 6) {
    if (conv_params->do_average) {
      highbd_dist_wtd_convolve_2d_vert_6tap_neon(im_block, im_stride, im_block2,
                                                 im_stride, w, h, y_filter_ptr,
                                                 round_offset_conv_y);
    } else {
      highbd_dist_wtd_convolve_2d_vert_6tap_neon(
          im_block, im_stride, dst16, dst16_stride, w, h, y_filter_ptr,
          round_offset_conv_y);
    }
  } else {
    if (conv_params->do_average) {
      highbd_dist_wtd_convolve_2d_vert_8tap_neon(im_block, im_stride, im_block2,
                                                 im_stride, w, h, y_filter_ptr,
                                                 round_offset_conv_y);
    } else {
      highbd_dist_wtd_convolve_2d_vert_8tap_neon(
          im_block, im_stride, dst16, dst16_stride, w, h, y_filter_ptr,
          round_offset_conv_y);
    }
  }

  // Do the compound averaging outside the loop, avoids branching within the
  // main loop
  if (conv_params->do_average) {
    if (conv_params->use_dist_wtd_comp_avg) {
      // Distance-weighted average (per-reference weights in conv_params).
      if (bd == 12) {
        highbd_12_dist_wtd_comp_avg_neon(im_block2, im_stride, dst, dst_stride,
                                         w, h, conv_params);
      } else {
        highbd_dist_wtd_comp_avg_neon(im_block2, im_stride, dst, dst_stride, w,
                                      h, conv_params, bd);
      }
    } else {
      // Plain (equal-weight) compound average.
      if (bd == 12) {
        highbd_12_comp_avg_neon(im_block2, im_stride, dst, dst_stride, w, h,
                                conv_params);
      } else {
        highbd_comp_avg_neon(im_block2, im_stride, dst, dst_stride, w, h,
                             conv_params, bd);
      }
    }
  }
}