tor-browser

The Tor Browser
git clone https://git.dasho.dev/tor-browser.git
Log | Files | Refs | README | LICENSE

highbd_convolve_neon.c (81586B)


      1 /*
      2 * Copyright (c) 2023, Alliance for Open Media. All rights reserved.
      3 *
      4 * This source code is subject to the terms of the BSD 2 Clause License and
      5 * the Alliance for Open Media Patent License 1.0. If the BSD 2 Clause License
      6 * was not distributed with this source code in the LICENSE file, you can
      7 * obtain it at www.aomedia.org/license/software. If the Alliance for Open
      8 * Media Patent License 1.0 was not distributed with this source code in the
      9 * PATENTS file, you can obtain it at www.aomedia.org/license/patent.
     10 */
     11 
     12 #include <assert.h>
     13 #include <arm_neon.h>
     14 
     15 #include "config/aom_config.h"
     16 #include "config/av1_rtcd.h"
     17 
     18 #include "aom_dsp/aom_dsp_common.h"
     19 #include "aom_dsp/arm/mem_neon.h"
     20 #include "aom_ports/mem.h"
     21 #include "av1/common/convolve.h"
     22 #include "av1/common/filter.h"
     23 
     24 static inline uint16x4_t highbd_convolve6_4_y(
     25    const int16x4_t s0, const int16x4_t s1, const int16x4_t s2,
     26    const int16x4_t s3, const int16x4_t s4, const int16x4_t s5,
     27    const int16x8_t y_filter, const uint16x4_t max) {
     28  // Values at indices 0 and 7 of y_filter are zero.
     29  const int16x4_t y_filter_0_3 = vget_low_s16(y_filter);
     30  const int16x4_t y_filter_4_7 = vget_high_s16(y_filter);
     31 
     32  int32x4_t sum = vmull_lane_s16(s0, y_filter_0_3, 1);
     33  sum = vmlal_lane_s16(sum, s1, y_filter_0_3, 2);
     34  sum = vmlal_lane_s16(sum, s2, y_filter_0_3, 3);
     35  sum = vmlal_lane_s16(sum, s3, y_filter_4_7, 0);
     36  sum = vmlal_lane_s16(sum, s4, y_filter_4_7, 1);
     37  sum = vmlal_lane_s16(sum, s5, y_filter_4_7, 2);
     38 
     39  uint16x4_t res = vqrshrun_n_s32(sum, COMPOUND_ROUND1_BITS);
     40  return vmin_u16(res, max);
     41 }
     42 
     43 static inline uint16x8_t highbd_convolve6_8_y(
     44    const int16x8_t s0, const int16x8_t s1, const int16x8_t s2,
     45    const int16x8_t s3, const int16x8_t s4, const int16x8_t s5,
     46    const int16x8_t y_filter, const uint16x8_t max) {
     47  // Values at indices 0 and 7 of y_filter are zero.
     48  const int16x4_t y_filter_0_3 = vget_low_s16(y_filter);
     49  const int16x4_t y_filter_4_7 = vget_high_s16(y_filter);
     50 
     51  int32x4_t sum0 = vmull_lane_s16(vget_low_s16(s0), y_filter_0_3, 1);
     52  sum0 = vmlal_lane_s16(sum0, vget_low_s16(s1), y_filter_0_3, 2);
     53  sum0 = vmlal_lane_s16(sum0, vget_low_s16(s2), y_filter_0_3, 3);
     54  sum0 = vmlal_lane_s16(sum0, vget_low_s16(s3), y_filter_4_7, 0);
     55  sum0 = vmlal_lane_s16(sum0, vget_low_s16(s4), y_filter_4_7, 1);
     56  sum0 = vmlal_lane_s16(sum0, vget_low_s16(s5), y_filter_4_7, 2);
     57 
     58  int32x4_t sum1 = vmull_lane_s16(vget_high_s16(s0), y_filter_0_3, 1);
     59  sum1 = vmlal_lane_s16(sum1, vget_high_s16(s1), y_filter_0_3, 2);
     60  sum1 = vmlal_lane_s16(sum1, vget_high_s16(s2), y_filter_0_3, 3);
     61  sum1 = vmlal_lane_s16(sum1, vget_high_s16(s3), y_filter_4_7, 0);
     62  sum1 = vmlal_lane_s16(sum1, vget_high_s16(s4), y_filter_4_7, 1);
     63  sum1 = vmlal_lane_s16(sum1, vget_high_s16(s5), y_filter_4_7, 2);
     64 
     65  uint16x8_t res = vcombine_u16(vqrshrun_n_s32(sum0, COMPOUND_ROUND1_BITS),
     66                                vqrshrun_n_s32(sum1, COMPOUND_ROUND1_BITS));
     67  return vminq_u16(res, max);
     68 }
     69 
// Vertical single-reference convolution using a 6-tap filter. Output is
// produced in 4-row strips; width is exactly 4 or a multiple of 8 (height a
// multiple of 4) -- smaller blocks are handled by the C fallback in the
// caller. Five input rows are kept live in registers and rotated as each
// strip of four output rows is computed.
static inline void highbd_convolve_y_sr_6tap_neon(
    const uint16_t *src_ptr, int src_stride, uint16_t *dst_ptr, int dst_stride,
    int w, int h, const int16_t *y_filter_ptr, const int bd) {
  const int16x8_t y_filter_0_7 = vld1q_s16(y_filter_ptr);

  if (w == 4) {
    // Output pixels are clamped to (1 << bd) - 1 after rounding.
    const uint16x4_t max = vdup_n_u16((1 << bd) - 1);
    // Skip one row: the caller offsets src for an 8-tap window
    // (taps / 2 - 1 rows), but a 6-tap kernel needs one row less of context
    // above. NOTE(review): assumes filter_params_y->taps == 8 here.
    const int16_t *s = (const int16_t *)(src_ptr + src_stride);
    uint16_t *d = dst_ptr;

    // Prime the sliding window with the first five rows.
    int16x4_t s0, s1, s2, s3, s4;
    load_s16_4x5(s, src_stride, &s0, &s1, &s2, &s3, &s4);
    s += 5 * src_stride;

    do {
      // Load four new rows and emit four output rows per iteration.
      int16x4_t s5, s6, s7, s8;
      load_s16_4x4(s, src_stride, &s5, &s6, &s7, &s8);

      uint16x4_t d0 =
          highbd_convolve6_4_y(s0, s1, s2, s3, s4, s5, y_filter_0_7, max);
      uint16x4_t d1 =
          highbd_convolve6_4_y(s1, s2, s3, s4, s5, s6, y_filter_0_7, max);
      uint16x4_t d2 =
          highbd_convolve6_4_y(s2, s3, s4, s5, s6, s7, y_filter_0_7, max);
      uint16x4_t d3 =
          highbd_convolve6_4_y(s3, s4, s5, s6, s7, s8, y_filter_0_7, max);

      store_u16_4x4(d, dst_stride, d0, d1, d2, d3);

      // Rotate the window down by four rows for the next strip.
      s0 = s4;
      s1 = s5;
      s2 = s6;
      s3 = s7;
      s4 = s8;
      s += 4 * src_stride;
      d += 4 * dst_stride;
      h -= 4;
    } while (h != 0);
  } else {
    const uint16x8_t max = vdupq_n_u16((1 << bd) - 1);
    // Width is a multiple of 8 and height is a multiple of 4.
    do {
      int height = h;
      // See the w == 4 branch for the +src_stride row-skip rationale.
      const int16_t *s = (const int16_t *)(src_ptr + src_stride);
      uint16_t *d = dst_ptr;

      int16x8_t s0, s1, s2, s3, s4;
      load_s16_8x5(s, src_stride, &s0, &s1, &s2, &s3, &s4);
      s += 5 * src_stride;

      do {
        int16x8_t s5, s6, s7, s8;
        load_s16_8x4(s, src_stride, &s5, &s6, &s7, &s8);

        uint16x8_t d0 =
            highbd_convolve6_8_y(s0, s1, s2, s3, s4, s5, y_filter_0_7, max);
        uint16x8_t d1 =
            highbd_convolve6_8_y(s1, s2, s3, s4, s5, s6, y_filter_0_7, max);
        uint16x8_t d2 =
            highbd_convolve6_8_y(s2, s3, s4, s5, s6, s7, y_filter_0_7, max);
        uint16x8_t d3 =
            highbd_convolve6_8_y(s3, s4, s5, s6, s7, s8, y_filter_0_7, max);

        store_u16_8x4(d, dst_stride, d0, d1, d2, d3);

        s0 = s4;
        s1 = s5;
        s2 = s6;
        s3 = s7;
        s4 = s8;
        s += 4 * src_stride;
        d += 4 * dst_stride;
        height -= 4;
      } while (height != 0);

      // Advance to the next 8-wide column.
      src_ptr += 8;
      dst_ptr += 8;
      w -= 8;
    } while (w != 0);
  }
}
    151 
    152 static inline uint16x4_t highbd_convolve8_4_y(
    153    const int16x4_t s0, const int16x4_t s1, const int16x4_t s2,
    154    const int16x4_t s3, const int16x4_t s4, const int16x4_t s5,
    155    const int16x4_t s6, const int16x4_t s7, const int16x8_t y_filter,
    156    const uint16x4_t max) {
    157  const int16x4_t y_filter_0_3 = vget_low_s16(y_filter);
    158  const int16x4_t y_filter_4_7 = vget_high_s16(y_filter);
    159 
    160  int32x4_t sum = vmull_lane_s16(s0, y_filter_0_3, 0);
    161  sum = vmlal_lane_s16(sum, s1, y_filter_0_3, 1);
    162  sum = vmlal_lane_s16(sum, s2, y_filter_0_3, 2);
    163  sum = vmlal_lane_s16(sum, s3, y_filter_0_3, 3);
    164  sum = vmlal_lane_s16(sum, s4, y_filter_4_7, 0);
    165  sum = vmlal_lane_s16(sum, s5, y_filter_4_7, 1);
    166  sum = vmlal_lane_s16(sum, s6, y_filter_4_7, 2);
    167  sum = vmlal_lane_s16(sum, s7, y_filter_4_7, 3);
    168 
    169  uint16x4_t res = vqrshrun_n_s32(sum, COMPOUND_ROUND1_BITS);
    170  return vmin_u16(res, max);
    171 }
    172 
// Apply a full 8-tap vertical filter to eight pixels. The low and high
// halves of the 8-lane inputs are accumulated separately in 32 bits, then
// narrowed with rounding saturation and clamped to the bit-depth maximum
// supplied in `max`.
static inline uint16x8_t highbd_convolve8_8_y(
    const int16x8_t s0, const int16x8_t s1, const int16x8_t s2,
    const int16x8_t s3, const int16x8_t s4, const int16x8_t s5,
    const int16x8_t s6, const int16x8_t s7, const int16x8_t y_filter,
    const uint16x8_t max) {
  const int16x4_t y_filter_0_3 = vget_low_s16(y_filter);
  const int16x4_t y_filter_4_7 = vget_high_s16(y_filter);

  // Accumulate the low four lanes.
  int32x4_t sum0 = vmull_lane_s16(vget_low_s16(s0), y_filter_0_3, 0);
  sum0 = vmlal_lane_s16(sum0, vget_low_s16(s1), y_filter_0_3, 1);
  sum0 = vmlal_lane_s16(sum0, vget_low_s16(s2), y_filter_0_3, 2);
  sum0 = vmlal_lane_s16(sum0, vget_low_s16(s3), y_filter_0_3, 3);
  sum0 = vmlal_lane_s16(sum0, vget_low_s16(s4), y_filter_4_7, 0);
  sum0 = vmlal_lane_s16(sum0, vget_low_s16(s5), y_filter_4_7, 1);
  sum0 = vmlal_lane_s16(sum0, vget_low_s16(s6), y_filter_4_7, 2);
  sum0 = vmlal_lane_s16(sum0, vget_low_s16(s7), y_filter_4_7, 3);

  // Accumulate the high four lanes.
  int32x4_t sum1 = vmull_lane_s16(vget_high_s16(s0), y_filter_0_3, 0);
  sum1 = vmlal_lane_s16(sum1, vget_high_s16(s1), y_filter_0_3, 1);
  sum1 = vmlal_lane_s16(sum1, vget_high_s16(s2), y_filter_0_3, 2);
  sum1 = vmlal_lane_s16(sum1, vget_high_s16(s3), y_filter_0_3, 3);
  sum1 = vmlal_lane_s16(sum1, vget_high_s16(s4), y_filter_4_7, 0);
  sum1 = vmlal_lane_s16(sum1, vget_high_s16(s5), y_filter_4_7, 1);
  sum1 = vmlal_lane_s16(sum1, vget_high_s16(s6), y_filter_4_7, 2);
  sum1 = vmlal_lane_s16(sum1, vget_high_s16(s7), y_filter_4_7, 3);

  uint16x8_t res = vcombine_u16(vqrshrun_n_s32(sum0, COMPOUND_ROUND1_BITS),
                                vqrshrun_n_s32(sum1, COMPOUND_ROUND1_BITS));
  return vminq_u16(res, max);
}
    203 
// Vertical single-reference convolution using an 8-tap filter. Output is
// produced in 4-row strips; width is exactly 4 or a multiple of 8 (height a
// multiple of 4) -- smaller blocks are handled by the C fallback in the
// caller. Seven input rows are kept live in registers and rotated as each
// strip of four output rows is computed.
static inline void highbd_convolve_y_sr_8tap_neon(
    const uint16_t *src_ptr, int src_stride, uint16_t *dst_ptr, int dst_stride,
    int w, int h, const int16_t *y_filter_ptr, int bd) {
  const int16x8_t y_filter = vld1q_s16(y_filter_ptr);

  if (w == 4) {
    // Output pixels are clamped to (1 << bd) - 1 after rounding.
    const uint16x4_t max = vdup_n_u16((1 << bd) - 1);
    const int16_t *s = (const int16_t *)src_ptr;
    uint16_t *d = dst_ptr;

    // Prime the sliding window with the first seven rows.
    int16x4_t s0, s1, s2, s3, s4, s5, s6;
    load_s16_4x7(s, src_stride, &s0, &s1, &s2, &s3, &s4, &s5, &s6);
    s += 7 * src_stride;

    do {
      // Load four new rows and emit four output rows per iteration.
      int16x4_t s7, s8, s9, s10;
      load_s16_4x4(s, src_stride, &s7, &s8, &s9, &s10);

      uint16x4_t d0 =
          highbd_convolve8_4_y(s0, s1, s2, s3, s4, s5, s6, s7, y_filter, max);
      uint16x4_t d1 =
          highbd_convolve8_4_y(s1, s2, s3, s4, s5, s6, s7, s8, y_filter, max);
      uint16x4_t d2 =
          highbd_convolve8_4_y(s2, s3, s4, s5, s6, s7, s8, s9, y_filter, max);
      uint16x4_t d3 =
          highbd_convolve8_4_y(s3, s4, s5, s6, s7, s8, s9, s10, y_filter, max);

      store_u16_4x4(d, dst_stride, d0, d1, d2, d3);

      // Rotate the window down by four rows for the next strip.
      s0 = s4;
      s1 = s5;
      s2 = s6;
      s3 = s7;
      s4 = s8;
      s5 = s9;
      s6 = s10;
      s += 4 * src_stride;
      d += 4 * dst_stride;
      h -= 4;
    } while (h != 0);
  } else {
    const uint16x8_t max = vdupq_n_u16((1 << bd) - 1);

    // Process the block one 8-wide column at a time.
    do {
      int height = h;
      const int16_t *s = (const int16_t *)src_ptr;
      uint16_t *d = dst_ptr;

      int16x8_t s0, s1, s2, s3, s4, s5, s6;
      load_s16_8x7(s, src_stride, &s0, &s1, &s2, &s3, &s4, &s5, &s6);
      s += 7 * src_stride;

      do {
        int16x8_t s7, s8, s9, s10;
        load_s16_8x4(s, src_stride, &s7, &s8, &s9, &s10);

        uint16x8_t d0 =
            highbd_convolve8_8_y(s0, s1, s2, s3, s4, s5, s6, s7, y_filter, max);
        uint16x8_t d1 =
            highbd_convolve8_8_y(s1, s2, s3, s4, s5, s6, s7, s8, y_filter, max);
        uint16x8_t d2 =
            highbd_convolve8_8_y(s2, s3, s4, s5, s6, s7, s8, s9, y_filter, max);
        uint16x8_t d3 = highbd_convolve8_8_y(s3, s4, s5, s6, s7, s8, s9, s10,
                                             y_filter, max);

        store_u16_8x4(d, dst_stride, d0, d1, d2, d3);

        s0 = s4;
        s1 = s5;
        s2 = s6;
        s3 = s7;
        s4 = s8;
        s5 = s9;
        s6 = s10;
        s += 4 * src_stride;
        d += 4 * dst_stride;
        height -= 4;
      } while (height != 0);
      src_ptr += 8;
      dst_ptr += 8;
      w -= 8;
    } while (w != 0);
  }
}
    288 
// Apply a 12-tap vertical filter to four pixels. The first eight taps come
// from y_filter_0_7, the remaining four from y_filter_8_11. The 32-bit
// accumulator is narrowed with rounding saturation and clamped to the
// bit-depth maximum supplied in `max`.
static inline uint16x4_t highbd_convolve12_4_y(
    const int16x4_t s0, const int16x4_t s1, const int16x4_t s2,
    const int16x4_t s3, const int16x4_t s4, const int16x4_t s5,
    const int16x4_t s6, const int16x4_t s7, const int16x4_t s8,
    const int16x4_t s9, const int16x4_t s10, const int16x4_t s11,
    const int16x8_t y_filter_0_7, const int16x4_t y_filter_8_11,
    const uint16x4_t max) {
  const int16x4_t y_filter_0_3 = vget_low_s16(y_filter_0_7);
  const int16x4_t y_filter_4_7 = vget_high_s16(y_filter_0_7);

  int32x4_t sum = vmull_lane_s16(s0, y_filter_0_3, 0);
  sum = vmlal_lane_s16(sum, s1, y_filter_0_3, 1);
  sum = vmlal_lane_s16(sum, s2, y_filter_0_3, 2);
  sum = vmlal_lane_s16(sum, s3, y_filter_0_3, 3);
  sum = vmlal_lane_s16(sum, s4, y_filter_4_7, 0);
  sum = vmlal_lane_s16(sum, s5, y_filter_4_7, 1);
  sum = vmlal_lane_s16(sum, s6, y_filter_4_7, 2);
  sum = vmlal_lane_s16(sum, s7, y_filter_4_7, 3);
  sum = vmlal_lane_s16(sum, s8, y_filter_8_11, 0);
  sum = vmlal_lane_s16(sum, s9, y_filter_8_11, 1);
  sum = vmlal_lane_s16(sum, s10, y_filter_8_11, 2);
  sum = vmlal_lane_s16(sum, s11, y_filter_8_11, 3);

  uint16x4_t res = vqrshrun_n_s32(sum, COMPOUND_ROUND1_BITS);
  return vmin_u16(res, max);
}
    315 
// Apply a 12-tap vertical filter to eight pixels. The first eight taps come
// from y_filter_0_7, the remaining four from y_filter_8_11. Low and high
// halves of the 8-lane inputs are accumulated separately in 32 bits, then
// narrowed with rounding saturation and clamped to the bit-depth maximum
// supplied in `max`.
static inline uint16x8_t highbd_convolve12_8_y(
    const int16x8_t s0, const int16x8_t s1, const int16x8_t s2,
    const int16x8_t s3, const int16x8_t s4, const int16x8_t s5,
    const int16x8_t s6, const int16x8_t s7, const int16x8_t s8,
    const int16x8_t s9, const int16x8_t s10, const int16x8_t s11,
    const int16x8_t y_filter_0_7, const int16x4_t y_filter_8_11,
    const uint16x8_t max) {
  const int16x4_t y_filter_0_3 = vget_low_s16(y_filter_0_7);
  const int16x4_t y_filter_4_7 = vget_high_s16(y_filter_0_7);

  // Accumulate the low four lanes.
  int32x4_t sum0 = vmull_lane_s16(vget_low_s16(s0), y_filter_0_3, 0);
  sum0 = vmlal_lane_s16(sum0, vget_low_s16(s1), y_filter_0_3, 1);
  sum0 = vmlal_lane_s16(sum0, vget_low_s16(s2), y_filter_0_3, 2);
  sum0 = vmlal_lane_s16(sum0, vget_low_s16(s3), y_filter_0_3, 3);
  sum0 = vmlal_lane_s16(sum0, vget_low_s16(s4), y_filter_4_7, 0);
  sum0 = vmlal_lane_s16(sum0, vget_low_s16(s5), y_filter_4_7, 1);
  sum0 = vmlal_lane_s16(sum0, vget_low_s16(s6), y_filter_4_7, 2);
  sum0 = vmlal_lane_s16(sum0, vget_low_s16(s7), y_filter_4_7, 3);
  sum0 = vmlal_lane_s16(sum0, vget_low_s16(s8), y_filter_8_11, 0);
  sum0 = vmlal_lane_s16(sum0, vget_low_s16(s9), y_filter_8_11, 1);
  sum0 = vmlal_lane_s16(sum0, vget_low_s16(s10), y_filter_8_11, 2);
  sum0 = vmlal_lane_s16(sum0, vget_low_s16(s11), y_filter_8_11, 3);

  // Accumulate the high four lanes.
  int32x4_t sum1 = vmull_lane_s16(vget_high_s16(s0), y_filter_0_3, 0);
  sum1 = vmlal_lane_s16(sum1, vget_high_s16(s1), y_filter_0_3, 1);
  sum1 = vmlal_lane_s16(sum1, vget_high_s16(s2), y_filter_0_3, 2);
  sum1 = vmlal_lane_s16(sum1, vget_high_s16(s3), y_filter_0_3, 3);
  sum1 = vmlal_lane_s16(sum1, vget_high_s16(s4), y_filter_4_7, 0);
  sum1 = vmlal_lane_s16(sum1, vget_high_s16(s5), y_filter_4_7, 1);
  sum1 = vmlal_lane_s16(sum1, vget_high_s16(s6), y_filter_4_7, 2);
  sum1 = vmlal_lane_s16(sum1, vget_high_s16(s7), y_filter_4_7, 3);
  sum1 = vmlal_lane_s16(sum1, vget_high_s16(s8), y_filter_8_11, 0);
  sum1 = vmlal_lane_s16(sum1, vget_high_s16(s9), y_filter_8_11, 1);
  sum1 = vmlal_lane_s16(sum1, vget_high_s16(s10), y_filter_8_11, 2);
  sum1 = vmlal_lane_s16(sum1, vget_high_s16(s11), y_filter_8_11, 3);

  uint16x8_t res = vcombine_u16(vqrshrun_n_s32(sum0, COMPOUND_ROUND1_BITS),
                                vqrshrun_n_s32(sum1, COMPOUND_ROUND1_BITS));
  return vminq_u16(res, max);
}
    356 
// Vertical single-reference convolution using a 12-tap filter. Output is
// produced in 4-row strips; width is exactly 4 or a multiple of 8 (height a
// multiple of 4) -- smaller blocks are handled by the C fallback in the
// caller. Eleven input rows are kept live in registers and rotated as each
// strip of four output rows is computed.
static inline void highbd_convolve_y_sr_12tap_neon(
    const uint16_t *src_ptr, int src_stride, uint16_t *dst_ptr, int dst_stride,
    int w, int h, const int16_t *y_filter_ptr, int bd) {
  // The 12 coefficients are split 8 + 4 across two vectors.
  const int16x8_t y_filter_0_7 = vld1q_s16(y_filter_ptr);
  const int16x4_t y_filter_8_11 = vld1_s16(y_filter_ptr + 8);

  if (w == 4) {
    // Output pixels are clamped to (1 << bd) - 1 after rounding.
    const uint16x4_t max = vdup_n_u16((1 << bd) - 1);
    const int16_t *s = (const int16_t *)src_ptr;
    uint16_t *d = dst_ptr;

    // Prime the sliding window with the first eleven rows.
    int16x4_t s0, s1, s2, s3, s4, s5, s6, s7, s8, s9, s10;
    load_s16_4x11(s, src_stride, &s0, &s1, &s2, &s3, &s4, &s5, &s6, &s7, &s8,
                  &s9, &s10);
    s += 11 * src_stride;

    do {
      // Load four new rows and emit four output rows per iteration.
      int16x4_t s11, s12, s13, s14;
      load_s16_4x4(s, src_stride, &s11, &s12, &s13, &s14);

      uint16x4_t d0 =
          highbd_convolve12_4_y(s0, s1, s2, s3, s4, s5, s6, s7, s8, s9, s10,
                                s11, y_filter_0_7, y_filter_8_11, max);
      uint16x4_t d1 =
          highbd_convolve12_4_y(s1, s2, s3, s4, s5, s6, s7, s8, s9, s10, s11,
                                s12, y_filter_0_7, y_filter_8_11, max);
      uint16x4_t d2 =
          highbd_convolve12_4_y(s2, s3, s4, s5, s6, s7, s8, s9, s10, s11, s12,
                                s13, y_filter_0_7, y_filter_8_11, max);
      uint16x4_t d3 =
          highbd_convolve12_4_y(s3, s4, s5, s6, s7, s8, s9, s10, s11, s12, s13,
                                s14, y_filter_0_7, y_filter_8_11, max);

      store_u16_4x4(d, dst_stride, d0, d1, d2, d3);

      // Rotate the window down by four rows for the next strip.
      s0 = s4;
      s1 = s5;
      s2 = s6;
      s3 = s7;
      s4 = s8;
      s5 = s9;
      s6 = s10;
      s7 = s11;
      s8 = s12;
      s9 = s13;
      s10 = s14;
      s += 4 * src_stride;
      d += 4 * dst_stride;
      h -= 4;
    } while (h != 0);
  } else {
    const uint16x8_t max = vdupq_n_u16((1 << bd) - 1);

    // Process the block one 8-wide column at a time.
    do {
      int height = h;
      const int16_t *s = (const int16_t *)src_ptr;
      uint16_t *d = dst_ptr;

      int16x8_t s0, s1, s2, s3, s4, s5, s6, s7, s8, s9, s10;
      load_s16_8x11(s, src_stride, &s0, &s1, &s2, &s3, &s4, &s5, &s6, &s7, &s8,
                    &s9, &s10);
      s += 11 * src_stride;

      do {
        int16x8_t s11, s12, s13, s14;
        load_s16_8x4(s, src_stride, &s11, &s12, &s13, &s14);

        uint16x8_t d0 =
            highbd_convolve12_8_y(s0, s1, s2, s3, s4, s5, s6, s7, s8, s9, s10,
                                  s11, y_filter_0_7, y_filter_8_11, max);
        uint16x8_t d1 =
            highbd_convolve12_8_y(s1, s2, s3, s4, s5, s6, s7, s8, s9, s10, s11,
                                  s12, y_filter_0_7, y_filter_8_11, max);
        uint16x8_t d2 =
            highbd_convolve12_8_y(s2, s3, s4, s5, s6, s7, s8, s9, s10, s11, s12,
                                  s13, y_filter_0_7, y_filter_8_11, max);
        uint16x8_t d3 =
            highbd_convolve12_8_y(s3, s4, s5, s6, s7, s8, s9, s10, s11, s12,
                                  s13, s14, y_filter_0_7, y_filter_8_11, max);

        store_u16_8x4(d, dst_stride, d0, d1, d2, d3);

        s0 = s4;
        s1 = s5;
        s2 = s6;
        s3 = s7;
        s4 = s8;
        s5 = s9;
        s6 = s10;
        s7 = s11;
        s8 = s12;
        s9 = s13;
        s10 = s14;
        s += 4 * src_stride;
        d += 4 * dst_stride;
        height -= 4;
      } while (height != 0);

      src_ptr += 8;
      dst_ptr += 8;
      w -= 8;
    } while (w != 0);
  }
}
    461 
    462 void av1_highbd_convolve_y_sr_neon(const uint16_t *src, int src_stride,
    463                                   uint16_t *dst, int dst_stride, int w, int h,
    464                                   const InterpFilterParams *filter_params_y,
    465                                   const int subpel_y_qn, int bd) {
    466  if (w == 2 || h == 2) {
    467    av1_highbd_convolve_y_sr_c(src, src_stride, dst, dst_stride, w, h,
    468                               filter_params_y, subpel_y_qn, bd);
    469    return;
    470  }
    471  const int y_filter_taps = get_filter_tap(filter_params_y, subpel_y_qn);
    472  const int vert_offset = filter_params_y->taps / 2 - 1;
    473  const int16_t *y_filter_ptr = av1_get_interp_filter_subpel_kernel(
    474      filter_params_y, subpel_y_qn & SUBPEL_MASK);
    475 
    476  src -= vert_offset * src_stride;
    477 
    478  if (y_filter_taps > 8) {
    479    highbd_convolve_y_sr_12tap_neon(src, src_stride, dst, dst_stride, w, h,
    480                                    y_filter_ptr, bd);
    481    return;
    482  }
    483  if (y_filter_taps < 8) {
    484    highbd_convolve_y_sr_6tap_neon(src, src_stride, dst, dst_stride, w, h,
    485                                   y_filter_ptr, bd);
    486    return;
    487  }
    488 
    489  highbd_convolve_y_sr_8tap_neon(src, src_stride, dst, dst_stride, w, h,
    490                                 y_filter_ptr, bd);
    491 }
    492 
// Apply the six non-zero taps (lanes 1..6) of an 8-tap horizontal filter to
// eight pixels. `offset` carries the pre-added rounding shim so only one
// final rounding shift is needed; the result is clamped to the bit-depth
// maximum supplied in `max`.
static inline uint16x8_t highbd_convolve6_8_x(const int16x8_t s[6],
                                              const int16x8_t x_filter,
                                              const int32x4_t offset,
                                              const uint16x8_t max) {
  // Values at indices 0 and 7 of x_filter are zero.
  const int16x4_t x_filter_0_3 = vget_low_s16(x_filter);
  const int16x4_t x_filter_4_7 = vget_high_s16(x_filter);

  // Accumulate the low four lanes.
  int32x4_t sum0 = offset;
  sum0 = vmlal_lane_s16(sum0, vget_low_s16(s[0]), x_filter_0_3, 1);
  sum0 = vmlal_lane_s16(sum0, vget_low_s16(s[1]), x_filter_0_3, 2);
  sum0 = vmlal_lane_s16(sum0, vget_low_s16(s[2]), x_filter_0_3, 3);
  sum0 = vmlal_lane_s16(sum0, vget_low_s16(s[3]), x_filter_4_7, 0);
  sum0 = vmlal_lane_s16(sum0, vget_low_s16(s[4]), x_filter_4_7, 1);
  sum0 = vmlal_lane_s16(sum0, vget_low_s16(s[5]), x_filter_4_7, 2);

  // Accumulate the high four lanes.
  int32x4_t sum1 = offset;
  sum1 = vmlal_lane_s16(sum1, vget_high_s16(s[0]), x_filter_0_3, 1);
  sum1 = vmlal_lane_s16(sum1, vget_high_s16(s[1]), x_filter_0_3, 2);
  sum1 = vmlal_lane_s16(sum1, vget_high_s16(s[2]), x_filter_0_3, 3);
  sum1 = vmlal_lane_s16(sum1, vget_high_s16(s[3]), x_filter_4_7, 0);
  sum1 = vmlal_lane_s16(sum1, vget_high_s16(s[4]), x_filter_4_7, 1);
  sum1 = vmlal_lane_s16(sum1, vget_high_s16(s[5]), x_filter_4_7, 2);

  uint16x8_t res = vcombine_u16(vqrshrun_n_s32(sum0, FILTER_BITS),
                                vqrshrun_n_s32(sum1, FILTER_BITS));
  return vminq_u16(res, max);
}
    521 
// Horizontal single-reference convolution using a 6-tap filter. Processes
// the block in 4-row by 8-column tiles; width is a multiple of 8 and height
// a multiple of 4 on this path.
static inline void highbd_convolve_x_sr_6tap_neon(
    const uint16_t *src_ptr, int src_stride, uint16_t *dst_ptr, int dst_stride,
    int w, int h, const int16_t *x_filter_ptr, ConvolveParams *conv_params,
    int bd) {
  const int16x8_t x_filter = vld1q_s16(x_filter_ptr);
  const uint16x8_t max = vdupq_n_u16((1 << bd) - 1);
  // Pre-adding this rounding shim lets us do a single rounding shift
  // instead of two.
  const int32x4_t offset = vdupq_n_s32(1 << (conv_params->round_0 - 1));

  int height = h;

  do {
    int width = w;
    const int16_t *s = (const int16_t *)src_ptr;
    uint16_t *d = dst_ptr;

    do {
      // Load six horizontally-overlapping 8-lane vectors (stride 1) per row
      // for each of the four rows in the tile.
      int16x8_t s0[6], s1[6], s2[6], s3[6];
      load_s16_8x6(s + 0 * src_stride, 1, &s0[0], &s0[1], &s0[2], &s0[3],
                   &s0[4], &s0[5]);
      load_s16_8x6(s + 1 * src_stride, 1, &s1[0], &s1[1], &s1[2], &s1[3],
                   &s1[4], &s1[5]);
      load_s16_8x6(s + 2 * src_stride, 1, &s2[0], &s2[1], &s2[2], &s2[3],
                   &s2[4], &s2[5]);
      load_s16_8x6(s + 3 * src_stride, 1, &s3[0], &s3[1], &s3[2], &s3[3],
                   &s3[4], &s3[5]);

      uint16x8_t d0 = highbd_convolve6_8_x(s0, x_filter, offset, max);
      uint16x8_t d1 = highbd_convolve6_8_x(s1, x_filter, offset, max);
      uint16x8_t d2 = highbd_convolve6_8_x(s2, x_filter, offset, max);
      uint16x8_t d3 = highbd_convolve6_8_x(s3, x_filter, offset, max);

      store_u16_8x4(d, dst_stride, d0, d1, d2, d3);

      s += 8;
      d += 8;
      width -= 8;
    } while (width != 0);

    src_ptr += 4 * src_stride;
    dst_ptr += 4 * dst_stride;
    height -= 4;
  } while (height != 0);
}
    566 
    567 static inline uint16x4_t highbd_convolve4_4_x(const int16x4_t s[4],
    568                                              const int16x4_t x_filter,
    569                                              const int32x4_t offset,
    570                                              const uint16x4_t max) {
    571  int32x4_t sum = offset;
    572  sum = vmlal_lane_s16(sum, s[0], x_filter, 0);
    573  sum = vmlal_lane_s16(sum, s[1], x_filter, 1);
    574  sum = vmlal_lane_s16(sum, s[2], x_filter, 2);
    575  sum = vmlal_lane_s16(sum, s[3], x_filter, 3);
    576 
    577  uint16x4_t res = vqrshrun_n_s32(sum, FILTER_BITS);
    578  return vmin_u16(res, max);
    579 }
    580 
// Apply a full 8-tap horizontal filter to eight pixels. `offset` carries the
// pre-added rounding shim so only one final rounding shift is needed; the
// result is clamped to the bit-depth maximum supplied in `max`.
static inline uint16x8_t highbd_convolve8_8_x(const int16x8_t s[8],
                                              const int16x8_t x_filter,
                                              const int32x4_t offset,
                                              const uint16x8_t max) {
  const int16x4_t x_filter_0_3 = vget_low_s16(x_filter);
  const int16x4_t x_filter_4_7 = vget_high_s16(x_filter);

  // Accumulate the low four lanes.
  int32x4_t sum0 = offset;
  sum0 = vmlal_lane_s16(sum0, vget_low_s16(s[0]), x_filter_0_3, 0);
  sum0 = vmlal_lane_s16(sum0, vget_low_s16(s[1]), x_filter_0_3, 1);
  sum0 = vmlal_lane_s16(sum0, vget_low_s16(s[2]), x_filter_0_3, 2);
  sum0 = vmlal_lane_s16(sum0, vget_low_s16(s[3]), x_filter_0_3, 3);
  sum0 = vmlal_lane_s16(sum0, vget_low_s16(s[4]), x_filter_4_7, 0);
  sum0 = vmlal_lane_s16(sum0, vget_low_s16(s[5]), x_filter_4_7, 1);
  sum0 = vmlal_lane_s16(sum0, vget_low_s16(s[6]), x_filter_4_7, 2);
  sum0 = vmlal_lane_s16(sum0, vget_low_s16(s[7]), x_filter_4_7, 3);

  // Accumulate the high four lanes.
  int32x4_t sum1 = offset;
  sum1 = vmlal_lane_s16(sum1, vget_high_s16(s[0]), x_filter_0_3, 0);
  sum1 = vmlal_lane_s16(sum1, vget_high_s16(s[1]), x_filter_0_3, 1);
  sum1 = vmlal_lane_s16(sum1, vget_high_s16(s[2]), x_filter_0_3, 2);
  sum1 = vmlal_lane_s16(sum1, vget_high_s16(s[3]), x_filter_0_3, 3);
  sum1 = vmlal_lane_s16(sum1, vget_high_s16(s[4]), x_filter_4_7, 0);
  sum1 = vmlal_lane_s16(sum1, vget_high_s16(s[5]), x_filter_4_7, 1);
  sum1 = vmlal_lane_s16(sum1, vget_high_s16(s[6]), x_filter_4_7, 2);
  sum1 = vmlal_lane_s16(sum1, vget_high_s16(s[7]), x_filter_4_7, 3);

  uint16x8_t res = vcombine_u16(vqrshrun_n_s32(sum0, FILTER_BITS),
                                vqrshrun_n_s32(sum1, FILTER_BITS));
  return vminq_u16(res, max);
}
    612 
// Horizontal single-reference convolution for 4-tap (width == 4) and 8-tap
// (width a multiple of 8) filters. Output is produced in 4-row strips.
static inline void highbd_convolve_x_sr_neon(const uint16_t *src_ptr,
                                             int src_stride, uint16_t *dst_ptr,
                                             int dst_stride, int w, int h,
                                             const int16_t *x_filter_ptr,
                                             ConvolveParams *conv_params,
                                             int bd) {
  // Pre-adding this rounding shim lets us do a single rounding shift
  // instead of two.
  const int32x4_t offset = vdupq_n_s32(1 << (conv_params->round_0 - 1));

  if (w == 4) {
    const uint16x4_t max = vdup_n_u16((1 << bd) - 1);
    // 4-tap filters are used for blocks having width == 4.
    // The middle four coefficients of the 8-tap kernel are taken, so both
    // the filter pointer and the source are advanced by 2.
    const int16x4_t x_filter = vld1_s16(x_filter_ptr + 2);
    const int16_t *s = (const int16_t *)(src_ptr + 2);
    uint16_t *d = dst_ptr;

    do {
      // Load four horizontally-overlapping vectors (stride 1) per row.
      int16x4_t s0[4], s1[4], s2[4], s3[4];
      load_s16_4x4(s + 0 * src_stride, 1, &s0[0], &s0[1], &s0[2], &s0[3]);
      load_s16_4x4(s + 1 * src_stride, 1, &s1[0], &s1[1], &s1[2], &s1[3]);
      load_s16_4x4(s + 2 * src_stride, 1, &s2[0], &s2[1], &s2[2], &s2[3]);
      load_s16_4x4(s + 3 * src_stride, 1, &s3[0], &s3[1], &s3[2], &s3[3]);

      uint16x4_t d0 = highbd_convolve4_4_x(s0, x_filter, offset, max);
      uint16x4_t d1 = highbd_convolve4_4_x(s1, x_filter, offset, max);
      uint16x4_t d2 = highbd_convolve4_4_x(s2, x_filter, offset, max);
      uint16x4_t d3 = highbd_convolve4_4_x(s3, x_filter, offset, max);

      store_u16_4x4(d, dst_stride, d0, d1, d2, d3);

      s += 4 * src_stride;
      d += 4 * dst_stride;
      h -= 4;
    } while (h != 0);
  } else {
    const uint16x8_t max = vdupq_n_u16((1 << bd) - 1);
    const int16x8_t x_filter = vld1q_s16(x_filter_ptr);
    int height = h;

    do {
      int width = w;
      const int16_t *s = (const int16_t *)src_ptr;
      uint16_t *d = dst_ptr;

      do {
        // Load eight horizontally-overlapping 8-lane vectors per row for
        // each of the four rows in the tile.
        int16x8_t s0[8], s1[8], s2[8], s3[8];
        load_s16_8x8(s + 0 * src_stride, 1, &s0[0], &s0[1], &s0[2], &s0[3],
                     &s0[4], &s0[5], &s0[6], &s0[7]);
        load_s16_8x8(s + 1 * src_stride, 1, &s1[0], &s1[1], &s1[2], &s1[3],
                     &s1[4], &s1[5], &s1[6], &s1[7]);
        load_s16_8x8(s + 2 * src_stride, 1, &s2[0], &s2[1], &s2[2], &s2[3],
                     &s2[4], &s2[5], &s2[6], &s2[7]);
        load_s16_8x8(s + 3 * src_stride, 1, &s3[0], &s3[1], &s3[2], &s3[3],
                     &s3[4], &s3[5], &s3[6], &s3[7]);

        uint16x8_t d0 = highbd_convolve8_8_x(s0, x_filter, offset, max);
        uint16x8_t d1 = highbd_convolve8_8_x(s1, x_filter, offset, max);
        uint16x8_t d2 = highbd_convolve8_8_x(s2, x_filter, offset, max);
        uint16x8_t d3 = highbd_convolve8_8_x(s3, x_filter, offset, max);

        store_u16_8x4(d, dst_stride, d0, d1, d2, d3);

        s += 8;
        d += 8;
        width -= 8;
      } while (width != 0);
      src_ptr += 4 * src_stride;
      dst_ptr += 4 * dst_stride;
      height -= 4;
    } while (height != 0);
  }
}
    685 
    686 static inline uint16x4_t highbd_convolve12_4_x(const int16x4_t s[12],
    687                                               const int16x8_t x_filter_0_7,
    688                                               const int16x4_t x_filter_8_11,
    689                                               const int32x4_t offset,
    690                                               const uint16x4_t max) {
    691  const int16x4_t x_filter_0_3 = vget_low_s16(x_filter_0_7);
    692  const int16x4_t x_filter_4_7 = vget_high_s16(x_filter_0_7);
    693 
    694  int32x4_t sum = offset;
    695  sum = vmlal_lane_s16(sum, s[0], x_filter_0_3, 0);
    696  sum = vmlal_lane_s16(sum, s[1], x_filter_0_3, 1);
    697  sum = vmlal_lane_s16(sum, s[2], x_filter_0_3, 2);
    698  sum = vmlal_lane_s16(sum, s[3], x_filter_0_3, 3);
    699  sum = vmlal_lane_s16(sum, s[4], x_filter_4_7, 0);
    700  sum = vmlal_lane_s16(sum, s[5], x_filter_4_7, 1);
    701  sum = vmlal_lane_s16(sum, s[6], x_filter_4_7, 2);
    702  sum = vmlal_lane_s16(sum, s[7], x_filter_4_7, 3);
    703  sum = vmlal_lane_s16(sum, s[8], x_filter_8_11, 0);
    704  sum = vmlal_lane_s16(sum, s[9], x_filter_8_11, 1);
    705  sum = vmlal_lane_s16(sum, s[10], x_filter_8_11, 2);
    706  sum = vmlal_lane_s16(sum, s[11], x_filter_8_11, 3);
    707 
    708  uint16x4_t res = vqrshrun_n_s32(sum, FILTER_BITS);
    709  return vmin_u16(res, max);
    710 }
    711 
    712 static inline uint16x8_t highbd_convolve12_8_x(const int16x8_t s[12],
    713                                               const int16x8_t x_filter_0_7,
    714                                               const int16x4_t x_filter_8_11,
    715                                               const int32x4_t offset,
    716                                               const uint16x8_t max) {
    717  const int16x4_t x_filter_0_3 = vget_low_s16(x_filter_0_7);
    718  const int16x4_t x_filter_4_7 = vget_high_s16(x_filter_0_7);
    719 
    720  int32x4_t sum0 = offset;
    721  sum0 = vmlal_lane_s16(sum0, vget_low_s16(s[0]), x_filter_0_3, 0);
    722  sum0 = vmlal_lane_s16(sum0, vget_low_s16(s[1]), x_filter_0_3, 1);
    723  sum0 = vmlal_lane_s16(sum0, vget_low_s16(s[2]), x_filter_0_3, 2);
    724  sum0 = vmlal_lane_s16(sum0, vget_low_s16(s[3]), x_filter_0_3, 3);
    725  sum0 = vmlal_lane_s16(sum0, vget_low_s16(s[4]), x_filter_4_7, 0);
    726  sum0 = vmlal_lane_s16(sum0, vget_low_s16(s[5]), x_filter_4_7, 1);
    727  sum0 = vmlal_lane_s16(sum0, vget_low_s16(s[6]), x_filter_4_7, 2);
    728  sum0 = vmlal_lane_s16(sum0, vget_low_s16(s[7]), x_filter_4_7, 3);
    729  sum0 = vmlal_lane_s16(sum0, vget_low_s16(s[8]), x_filter_8_11, 0);
    730  sum0 = vmlal_lane_s16(sum0, vget_low_s16(s[9]), x_filter_8_11, 1);
    731  sum0 = vmlal_lane_s16(sum0, vget_low_s16(s[10]), x_filter_8_11, 2);
    732  sum0 = vmlal_lane_s16(sum0, vget_low_s16(s[11]), x_filter_8_11, 3);
    733 
    734  int32x4_t sum1 = offset;
    735  sum1 = vmlal_lane_s16(sum1, vget_high_s16(s[0]), x_filter_0_3, 0);
    736  sum1 = vmlal_lane_s16(sum1, vget_high_s16(s[1]), x_filter_0_3, 1);
    737  sum1 = vmlal_lane_s16(sum1, vget_high_s16(s[2]), x_filter_0_3, 2);
    738  sum1 = vmlal_lane_s16(sum1, vget_high_s16(s[3]), x_filter_0_3, 3);
    739  sum1 = vmlal_lane_s16(sum1, vget_high_s16(s[4]), x_filter_4_7, 0);
    740  sum1 = vmlal_lane_s16(sum1, vget_high_s16(s[5]), x_filter_4_7, 1);
    741  sum1 = vmlal_lane_s16(sum1, vget_high_s16(s[6]), x_filter_4_7, 2);
    742  sum1 = vmlal_lane_s16(sum1, vget_high_s16(s[7]), x_filter_4_7, 3);
    743  sum1 = vmlal_lane_s16(sum1, vget_high_s16(s[8]), x_filter_8_11, 0);
    744  sum1 = vmlal_lane_s16(sum1, vget_high_s16(s[9]), x_filter_8_11, 1);
    745  sum1 = vmlal_lane_s16(sum1, vget_high_s16(s[10]), x_filter_8_11, 2);
    746  sum1 = vmlal_lane_s16(sum1, vget_high_s16(s[11]), x_filter_8_11, 3);
    747 
    748  uint16x8_t res = vcombine_u16(vqrshrun_n_s32(sum0, FILTER_BITS),
    749                                vqrshrun_n_s32(sum1, FILTER_BITS));
    750  return vminq_u16(res, max);
    751 }
    752 
// 12-tap horizontal single-reference convolution for high bitdepth.
// Processes four rows per outer iteration. The loop structure only
// terminates when h is a multiple of 4 and w is 4 or a multiple of 8 --
// presumably guaranteed by the callers; TODO(review): confirm.
static inline void highbd_convolve_x_sr_12tap_neon(
    const uint16_t *src_ptr, int src_stride, uint16_t *dst_ptr, int dst_stride,
    int w, int h, const int16_t *x_filter_ptr, ConvolveParams *conv_params,
    int bd) {
  // This shim allows to do only one rounding shift instead of two.
  const int32x4_t offset = vdupq_n_s32(1 << (conv_params->round_0 - 1));
  // The 12 filter taps are split 8 + 4 across a q-register and a d-register.
  const int16x8_t x_filter_0_7 = vld1q_s16(x_filter_ptr);
  const int16x4_t x_filter_8_11 = vld1_s16(x_filter_ptr + 8);

  if (w == 4) {
    // Clamp bound: pixel values cannot exceed (1 << bd) - 1.
    const uint16x4_t max = vdup_n_u16((1 << bd) - 1);
    const int16_t *s = (const int16_t *)src_ptr;
    uint16_t *d = dst_ptr;

    do {
      // Load 12 consecutive horizontal samples for each of the four rows.
      int16x4_t s0[12], s1[12], s2[12], s3[12];
      load_s16_4x12(s + 0 * src_stride, 1, &s0[0], &s0[1], &s0[2], &s0[3],
                    &s0[4], &s0[5], &s0[6], &s0[7], &s0[8], &s0[9], &s0[10],
                    &s0[11]);
      load_s16_4x12(s + 1 * src_stride, 1, &s1[0], &s1[1], &s1[2], &s1[3],
                    &s1[4], &s1[5], &s1[6], &s1[7], &s1[8], &s1[9], &s1[10],
                    &s1[11]);
      load_s16_4x12(s + 2 * src_stride, 1, &s2[0], &s2[1], &s2[2], &s2[3],
                    &s2[4], &s2[5], &s2[6], &s2[7], &s2[8], &s2[9], &s2[10],
                    &s2[11]);
      load_s16_4x12(s + 3 * src_stride, 1, &s3[0], &s3[1], &s3[2], &s3[3],
                    &s3[4], &s3[5], &s3[6], &s3[7], &s3[8], &s3[9], &s3[10],
                    &s3[11]);

      uint16x4_t d0 =
          highbd_convolve12_4_x(s0, x_filter_0_7, x_filter_8_11, offset, max);
      uint16x4_t d1 =
          highbd_convolve12_4_x(s1, x_filter_0_7, x_filter_8_11, offset, max);
      uint16x4_t d2 =
          highbd_convolve12_4_x(s2, x_filter_0_7, x_filter_8_11, offset, max);
      uint16x4_t d3 =
          highbd_convolve12_4_x(s3, x_filter_0_7, x_filter_8_11, offset, max);

      store_u16_4x4(d, dst_stride, d0, d1, d2, d3);

      s += 4 * src_stride;
      d += 4 * dst_stride;
      h -= 4;
    } while (h != 0);
  } else {
    const uint16x8_t max = vdupq_n_u16((1 << bd) - 1);
    int height = h;

    // Tile the block in 4-row by 8-column chunks: rows in the outer loop,
    // columns in the inner loop.
    do {
      int width = w;
      const int16_t *s = (const int16_t *)src_ptr;
      uint16_t *d = dst_ptr;

      do {
        int16x8_t s0[12], s1[12], s2[12], s3[12];
        load_s16_8x12(s + 0 * src_stride, 1, &s0[0], &s0[1], &s0[2], &s0[3],
                      &s0[4], &s0[5], &s0[6], &s0[7], &s0[8], &s0[9], &s0[10],
                      &s0[11]);
        load_s16_8x12(s + 1 * src_stride, 1, &s1[0], &s1[1], &s1[2], &s1[3],
                      &s1[4], &s1[5], &s1[6], &s1[7], &s1[8], &s1[9], &s1[10],
                      &s1[11]);
        load_s16_8x12(s + 2 * src_stride, 1, &s2[0], &s2[1], &s2[2], &s2[3],
                      &s2[4], &s2[5], &s2[6], &s2[7], &s2[8], &s2[9], &s2[10],
                      &s2[11]);
        load_s16_8x12(s + 3 * src_stride, 1, &s3[0], &s3[1], &s3[2], &s3[3],
                      &s3[4], &s3[5], &s3[6], &s3[7], &s3[8], &s3[9], &s3[10],
                      &s3[11]);

        uint16x8_t d0 =
            highbd_convolve12_8_x(s0, x_filter_0_7, x_filter_8_11, offset, max);
        uint16x8_t d1 =
            highbd_convolve12_8_x(s1, x_filter_0_7, x_filter_8_11, offset, max);
        uint16x8_t d2 =
            highbd_convolve12_8_x(s2, x_filter_0_7, x_filter_8_11, offset, max);
        uint16x8_t d3 =
            highbd_convolve12_8_x(s3, x_filter_0_7, x_filter_8_11, offset, max);

        store_u16_8x4(d, dst_stride, d0, d1, d2, d3);

        s += 8;
        d += 8;
        width -= 8;
      } while (width != 0);
      src_ptr += 4 * src_stride;
      dst_ptr += 4 * dst_stride;
      height -= 4;
    } while (height != 0);
  }
}
    842 
// Entry point for the high-bitdepth horizontal single-reference convolution.
// Dispatches on filter length and block size:
//  - 2-wide or 2-tall blocks fall back to the C implementation,
//  - filters with more than 8 taps use the dedicated 12-tap path,
//  - <= 6-tap filters (except w == 4) use the 6-tap path; src is advanced by
//    one, presumably because the outer taps of a short kernel stored in an
//    8-entry array are zero -- TODO(review): confirm against filter tables,
//  - everything else goes through the generic path (static helper defined
//    earlier in this file; note this is not a recursive call).
void av1_highbd_convolve_x_sr_neon(const uint16_t *src, int src_stride,
                                   uint16_t *dst, int dst_stride, int w, int h,
                                   const InterpFilterParams *filter_params_x,
                                   const int subpel_x_qn,
                                   ConvolveParams *conv_params, int bd) {
  if (w == 2 || h == 2) {
    av1_highbd_convolve_x_sr_c(src, src_stride, dst, dst_stride, w, h,
                               filter_params_x, subpel_x_qn, conv_params, bd);
    return;
  }
  const int x_filter_taps = get_filter_tap(filter_params_x, subpel_x_qn);
  // Center the filter window on the source pixel.
  const int horiz_offset = filter_params_x->taps / 2 - 1;
  const int16_t *x_filter_ptr = av1_get_interp_filter_subpel_kernel(
      filter_params_x, subpel_x_qn & SUBPEL_MASK);

  src -= horiz_offset;

  if (x_filter_taps > 8) {
    highbd_convolve_x_sr_12tap_neon(src, src_stride, dst, dst_stride, w, h,
                                    x_filter_ptr, conv_params, bd);
    return;
  }
  if (x_filter_taps <= 6 && w != 4) {
    highbd_convolve_x_sr_6tap_neon(src + 1, src_stride, dst, dst_stride, w, h,
                                   x_filter_ptr, conv_params, bd);
    return;
  }

  highbd_convolve_x_sr_neon(src, src_stride, dst, dst_stride, w, h,
                            x_filter_ptr, conv_params, bd);
}
    874 
    875 static inline uint16x4_t highbd_convolve6_4_2d_v(
    876    const int16x4_t s0, const int16x4_t s1, const int16x4_t s2,
    877    const int16x4_t s3, const int16x4_t s4, const int16x4_t s5,
    878    const int16x8_t y_filter, const int32x4_t round_shift,
    879    const int32x4_t offset, const uint16x4_t max) {
    880  // Values at indices 0 and 7 of y_filter are zero.
    881  const int16x4_t y_filter_0_3 = vget_low_s16(y_filter);
    882  const int16x4_t y_filter_4_7 = vget_high_s16(y_filter);
    883 
    884  int32x4_t sum = vmlal_lane_s16(offset, s0, y_filter_0_3, 1);
    885  sum = vmlal_lane_s16(sum, s1, y_filter_0_3, 2);
    886  sum = vmlal_lane_s16(sum, s2, y_filter_0_3, 3);
    887  sum = vmlal_lane_s16(sum, s3, y_filter_4_7, 0);
    888  sum = vmlal_lane_s16(sum, s4, y_filter_4_7, 1);
    889  sum = vmlal_lane_s16(sum, s5, y_filter_4_7, 2);
    890 
    891  sum = vshlq_s32(sum, round_shift);
    892  uint16x4_t res = vqmovun_s32(sum);
    893  return vmin_u16(res, max);
    894 }
    895 
    896 static inline uint16x8_t highbd_convolve6_8_2d_v(
    897    const int16x8_t s0, const int16x8_t s1, const int16x8_t s2,
    898    const int16x8_t s3, const int16x8_t s4, const int16x8_t s5,
    899    const int16x8_t y_filter, const int32x4_t round_shift,
    900    const int32x4_t offset, const uint16x8_t max) {
    901  // Values at indices 0 and 7 of y_filter are zero.
    902  const int16x4_t y_filter_0_3 = vget_low_s16(y_filter);
    903  const int16x4_t y_filter_4_7 = vget_high_s16(y_filter);
    904 
    905  int32x4_t sum0 = vmlal_lane_s16(offset, vget_low_s16(s0), y_filter_0_3, 1);
    906  sum0 = vmlal_lane_s16(sum0, vget_low_s16(s1), y_filter_0_3, 2);
    907  sum0 = vmlal_lane_s16(sum0, vget_low_s16(s2), y_filter_0_3, 3);
    908  sum0 = vmlal_lane_s16(sum0, vget_low_s16(s3), y_filter_4_7, 0);
    909  sum0 = vmlal_lane_s16(sum0, vget_low_s16(s4), y_filter_4_7, 1);
    910  sum0 = vmlal_lane_s16(sum0, vget_low_s16(s5), y_filter_4_7, 2);
    911 
    912  int32x4_t sum1 = vmlal_lane_s16(offset, vget_high_s16(s0), y_filter_0_3, 1);
    913  sum1 = vmlal_lane_s16(sum1, vget_high_s16(s1), y_filter_0_3, 2);
    914  sum1 = vmlal_lane_s16(sum1, vget_high_s16(s2), y_filter_0_3, 3);
    915  sum1 = vmlal_lane_s16(sum1, vget_high_s16(s3), y_filter_4_7, 0);
    916  sum1 = vmlal_lane_s16(sum1, vget_high_s16(s4), y_filter_4_7, 1);
    917  sum1 = vmlal_lane_s16(sum1, vget_high_s16(s5), y_filter_4_7, 2);
    918 
    919  sum0 = vshlq_s32(sum0, round_shift);
    920  sum1 = vshlq_s32(sum1, round_shift);
    921 
    922  uint16x8_t res = vcombine_u16(vqmovun_s32(sum0), vqmovun_s32(sum1));
    923  return vminq_u16(res, max);
    924 }
    925 
// Vertical pass of the high-bitdepth 2D 6-tap convolution. `offset` is
// added to every accumulator before the round_1 down-shift (its derivation
// is in the caller, not visible here). Maintains a sliding window of five
// source rows and produces four output rows per iteration; the loops only
// terminate when h is a multiple of 4 and w is 4 or a multiple of 8 --
// presumably guaranteed by the callers.
static inline void highbd_convolve_2d_sr_vert_6tap_neon(
    const uint16_t *src_ptr, int src_stride, uint16_t *dst_ptr, int dst_stride,
    int w, int h, const int16_t *y_filter_ptr, ConvolveParams *conv_params,
    int bd, const int offset) {
  const int16x8_t y_filter = vld1q_s16(y_filter_ptr);
  const int32x4_t offset_s32 = vdupq_n_s32(offset);
  const int round1_shift = conv_params->round_1;
  // Negated so that vshlq_s32 inside the helpers performs a right shift.
  const int32x4_t round1_shift_s32 = vdupq_n_s32(-round1_shift);

  if (w == 4) {
    const uint16x4_t max = vdup_n_u16((1 << bd) - 1);
    const int16_t *s = (const int16_t *)src_ptr;
    uint16_t *d = dst_ptr;
    // Prime the window with the first five rows.
    int16x4_t s0, s1, s2, s3, s4;
    load_s16_4x5(s, src_stride, &s0, &s1, &s2, &s3, &s4);
    s += 5 * src_stride;

    do {
      // Load four new rows; together with the window this yields the
      // 6-row inputs for four consecutive output rows.
      int16x4_t s5, s6, s7, s8;
      load_s16_4x4(s, src_stride, &s5, &s6, &s7, &s8);

      uint16x4_t d0 = highbd_convolve6_4_2d_v(
          s0, s1, s2, s3, s4, s5, y_filter, round1_shift_s32, offset_s32, max);
      uint16x4_t d1 = highbd_convolve6_4_2d_v(
          s1, s2, s3, s4, s5, s6, y_filter, round1_shift_s32, offset_s32, max);
      uint16x4_t d2 = highbd_convolve6_4_2d_v(
          s2, s3, s4, s5, s6, s7, y_filter, round1_shift_s32, offset_s32, max);
      uint16x4_t d3 = highbd_convolve6_4_2d_v(
          s3, s4, s5, s6, s7, s8, y_filter, round1_shift_s32, offset_s32, max);

      store_u16_4x4(d, dst_stride, d0, d1, d2, d3);

      // Rotate the window down by four rows.
      s0 = s4;
      s1 = s5;
      s2 = s6;
      s3 = s7;
      s4 = s8;
      s += 4 * src_stride;
      d += 4 * dst_stride;
      h -= 4;
    } while (h != 0);
  } else {
    const uint16x8_t max = vdupq_n_u16((1 << bd) - 1);

    // Process 8-wide columns full-height, one column strip at a time.
    do {
      int height = h;
      const int16_t *s = (const int16_t *)src_ptr;
      uint16_t *d = dst_ptr;
      int16x8_t s0, s1, s2, s3, s4;
      load_s16_8x5(s, src_stride, &s0, &s1, &s2, &s3, &s4);
      s += 5 * src_stride;

      do {
        int16x8_t s5, s6, s7, s8;
        load_s16_8x4(s, src_stride, &s5, &s6, &s7, &s8);

        uint16x8_t d0 =
            highbd_convolve6_8_2d_v(s0, s1, s2, s3, s4, s5, y_filter,
                                    round1_shift_s32, offset_s32, max);
        uint16x8_t d1 =
            highbd_convolve6_8_2d_v(s1, s2, s3, s4, s5, s6, y_filter,
                                    round1_shift_s32, offset_s32, max);
        uint16x8_t d2 =
            highbd_convolve6_8_2d_v(s2, s3, s4, s5, s6, s7, y_filter,
                                    round1_shift_s32, offset_s32, max);
        uint16x8_t d3 =
            highbd_convolve6_8_2d_v(s3, s4, s5, s6, s7, s8, y_filter,
                                    round1_shift_s32, offset_s32, max);

        store_u16_8x4(d, dst_stride, d0, d1, d2, d3);

        s0 = s4;
        s1 = s5;
        s2 = s6;
        s3 = s7;
        s4 = s8;
        s += 4 * src_stride;
        d += 4 * dst_stride;
        height -= 4;
      } while (height != 0);
      src_ptr += 8;
      dst_ptr += 8;
      w -= 8;
    } while (w != 0);
  }
}
   1012 
   1013 static inline uint16x4_t highbd_convolve8_4_2d_v(
   1014    const int16x4_t s0, const int16x4_t s1, const int16x4_t s2,
   1015    const int16x4_t s3, const int16x4_t s4, const int16x4_t s5,
   1016    const int16x4_t s6, const int16x4_t s7, const int16x8_t y_filter,
   1017    const int32x4_t round_shift, const int32x4_t offset, const uint16x4_t max) {
   1018  const int16x4_t y_filter_lo = vget_low_s16(y_filter);
   1019  const int16x4_t y_filter_hi = vget_high_s16(y_filter);
   1020 
   1021  int32x4_t sum = vmlal_lane_s16(offset, s0, y_filter_lo, 0);
   1022  sum = vmlal_lane_s16(sum, s1, y_filter_lo, 1);
   1023  sum = vmlal_lane_s16(sum, s2, y_filter_lo, 2);
   1024  sum = vmlal_lane_s16(sum, s3, y_filter_lo, 3);
   1025  sum = vmlal_lane_s16(sum, s4, y_filter_hi, 0);
   1026  sum = vmlal_lane_s16(sum, s5, y_filter_hi, 1);
   1027  sum = vmlal_lane_s16(sum, s6, y_filter_hi, 2);
   1028  sum = vmlal_lane_s16(sum, s7, y_filter_hi, 3);
   1029 
   1030  sum = vshlq_s32(sum, round_shift);
   1031  uint16x4_t res = vqmovun_s32(sum);
   1032  return vmin_u16(res, max);
   1033 }
   1034 
   1035 static inline uint16x8_t highbd_convolve8_8_2d_v(
   1036    const int16x8_t s0, const int16x8_t s1, const int16x8_t s2,
   1037    const int16x8_t s3, const int16x8_t s4, const int16x8_t s5,
   1038    const int16x8_t s6, const int16x8_t s7, const int16x8_t y_filter,
   1039    const int32x4_t round_shift, const int32x4_t offset, const uint16x8_t max) {
   1040  const int16x4_t y_filter_lo = vget_low_s16(y_filter);
   1041  const int16x4_t y_filter_hi = vget_high_s16(y_filter);
   1042 
   1043  int32x4_t sum0 = vmlal_lane_s16(offset, vget_low_s16(s0), y_filter_lo, 0);
   1044  sum0 = vmlal_lane_s16(sum0, vget_low_s16(s1), y_filter_lo, 1);
   1045  sum0 = vmlal_lane_s16(sum0, vget_low_s16(s2), y_filter_lo, 2);
   1046  sum0 = vmlal_lane_s16(sum0, vget_low_s16(s3), y_filter_lo, 3);
   1047  sum0 = vmlal_lane_s16(sum0, vget_low_s16(s4), y_filter_hi, 0);
   1048  sum0 = vmlal_lane_s16(sum0, vget_low_s16(s5), y_filter_hi, 1);
   1049  sum0 = vmlal_lane_s16(sum0, vget_low_s16(s6), y_filter_hi, 2);
   1050  sum0 = vmlal_lane_s16(sum0, vget_low_s16(s7), y_filter_hi, 3);
   1051 
   1052  int32x4_t sum1 = vmlal_lane_s16(offset, vget_high_s16(s0), y_filter_lo, 0);
   1053  sum1 = vmlal_lane_s16(sum1, vget_high_s16(s1), y_filter_lo, 1);
   1054  sum1 = vmlal_lane_s16(sum1, vget_high_s16(s2), y_filter_lo, 2);
   1055  sum1 = vmlal_lane_s16(sum1, vget_high_s16(s3), y_filter_lo, 3);
   1056  sum1 = vmlal_lane_s16(sum1, vget_high_s16(s4), y_filter_hi, 0);
   1057  sum1 = vmlal_lane_s16(sum1, vget_high_s16(s5), y_filter_hi, 1);
   1058  sum1 = vmlal_lane_s16(sum1, vget_high_s16(s6), y_filter_hi, 2);
   1059  sum1 = vmlal_lane_s16(sum1, vget_high_s16(s7), y_filter_hi, 3);
   1060 
   1061  sum0 = vshlq_s32(sum0, round_shift);
   1062  sum1 = vshlq_s32(sum1, round_shift);
   1063 
   1064  uint16x8_t res = vcombine_u16(vqmovun_s32(sum0), vqmovun_s32(sum1));
   1065  return vminq_u16(res, max);
   1066 }
   1067 
// Vertical pass of the high-bitdepth 2D 8-tap convolution. `offset` is
// added to every accumulator before the round_1 down-shift (its derivation
// is in the caller, not visible here). Maintains a sliding window of seven
// source rows and produces four output rows per iteration; the loops only
// terminate when h is a multiple of 4 and w is 4 or a multiple of 8 --
// presumably guaranteed by the callers.
static inline void highbd_convolve_2d_sr_vert_8tap_neon(
    const uint16_t *src_ptr, int src_stride, uint16_t *dst_ptr, int dst_stride,
    int w, int h, const int16_t *y_filter_ptr, ConvolveParams *conv_params,
    int bd, const int offset) {
  const int16x8_t y_filter = vld1q_s16(y_filter_ptr);
  const int32x4_t offset_s32 = vdupq_n_s32(offset);
  const int round1_shift = conv_params->round_1;
  // Negated so that vshlq_s32 inside the helpers performs a right shift.
  const int32x4_t round1_shift_s32 = vdupq_n_s32(-round1_shift);

  if (w == 4) {
    const uint16x4_t max = vdup_n_u16((1 << bd) - 1);
    const int16_t *s = (const int16_t *)src_ptr;
    uint16_t *d = dst_ptr;

    // Prime the window with the first seven rows.
    int16x4_t s0, s1, s2, s3, s4, s5, s6;
    load_s16_4x7(s, src_stride, &s0, &s1, &s2, &s3, &s4, &s5, &s6);
    s += 7 * src_stride;

    do {
      // Four new rows complete the 8-row inputs for four output rows.
      int16x4_t s7, s8, s9, s10;
      load_s16_4x4(s, src_stride, &s7, &s8, &s9, &s10);

      uint16x4_t d0 =
          highbd_convolve8_4_2d_v(s0, s1, s2, s3, s4, s5, s6, s7, y_filter,
                                  round1_shift_s32, offset_s32, max);
      uint16x4_t d1 =
          highbd_convolve8_4_2d_v(s1, s2, s3, s4, s5, s6, s7, s8, y_filter,
                                  round1_shift_s32, offset_s32, max);
      uint16x4_t d2 =
          highbd_convolve8_4_2d_v(s2, s3, s4, s5, s6, s7, s8, s9, y_filter,
                                  round1_shift_s32, offset_s32, max);
      uint16x4_t d3 =
          highbd_convolve8_4_2d_v(s3, s4, s5, s6, s7, s8, s9, s10, y_filter,
                                  round1_shift_s32, offset_s32, max);

      store_u16_4x4(d, dst_stride, d0, d1, d2, d3);

      // Rotate the window down by four rows.
      s0 = s4;
      s1 = s5;
      s2 = s6;
      s3 = s7;
      s4 = s8;
      s5 = s9;
      s6 = s10;
      s += 4 * src_stride;
      d += 4 * dst_stride;
      h -= 4;
    } while (h != 0);
  } else {
    const uint16x8_t max = vdupq_n_u16((1 << bd) - 1);

    // Process 8-wide columns full-height, one column strip at a time.
    do {
      int height = h;
      const int16_t *s = (const int16_t *)src_ptr;
      uint16_t *d = dst_ptr;

      int16x8_t s0, s1, s2, s3, s4, s5, s6;
      load_s16_8x7(s, src_stride, &s0, &s1, &s2, &s3, &s4, &s5, &s6);
      s += 7 * src_stride;

      do {
        int16x8_t s7, s8, s9, s10;
        load_s16_8x4(s, src_stride, &s7, &s8, &s9, &s10);

        uint16x8_t d0 =
            highbd_convolve8_8_2d_v(s0, s1, s2, s3, s4, s5, s6, s7, y_filter,
                                    round1_shift_s32, offset_s32, max);
        uint16x8_t d1 =
            highbd_convolve8_8_2d_v(s1, s2, s3, s4, s5, s6, s7, s8, y_filter,
                                    round1_shift_s32, offset_s32, max);
        uint16x8_t d2 =
            highbd_convolve8_8_2d_v(s2, s3, s4, s5, s6, s7, s8, s9, y_filter,
                                    round1_shift_s32, offset_s32, max);
        uint16x8_t d3 =
            highbd_convolve8_8_2d_v(s3, s4, s5, s6, s7, s8, s9, s10, y_filter,
                                    round1_shift_s32, offset_s32, max);

        store_u16_8x4(d, dst_stride, d0, d1, d2, d3);

        s0 = s4;
        s1 = s5;
        s2 = s6;
        s3 = s7;
        s4 = s8;
        s5 = s9;
        s6 = s10;
        s += 4 * src_stride;
        d += 4 * dst_stride;
        height -= 4;
      } while (height != 0);
      src_ptr += 8;
      dst_ptr += 8;
      w -= 8;
    } while (w != 0);
  }
}
   1164 
   1165 static inline uint16x4_t highbd_convolve12_4_2d_v(
   1166    const int16x4_t s0, const int16x4_t s1, const int16x4_t s2,
   1167    const int16x4_t s3, const int16x4_t s4, const int16x4_t s5,
   1168    const int16x4_t s6, const int16x4_t s7, const int16x4_t s8,
   1169    const int16x4_t s9, const int16x4_t s10, const int16x4_t s11,
   1170    const int16x8_t y_filter_0_7, const int16x4_t y_filter_8_11,
   1171    const int32x4_t round_shift, const int32x4_t offset, const uint16x4_t max) {
   1172  const int16x4_t y_filter_0_3 = vget_low_s16(y_filter_0_7);
   1173  const int16x4_t y_filter_4_7 = vget_high_s16(y_filter_0_7);
   1174 
   1175  int32x4_t sum = vmlal_lane_s16(offset, s0, y_filter_0_3, 0);
   1176  sum = vmlal_lane_s16(sum, s1, y_filter_0_3, 1);
   1177  sum = vmlal_lane_s16(sum, s2, y_filter_0_3, 2);
   1178  sum = vmlal_lane_s16(sum, s3, y_filter_0_3, 3);
   1179  sum = vmlal_lane_s16(sum, s4, y_filter_4_7, 0);
   1180  sum = vmlal_lane_s16(sum, s5, y_filter_4_7, 1);
   1181  sum = vmlal_lane_s16(sum, s6, y_filter_4_7, 2);
   1182  sum = vmlal_lane_s16(sum, s7, y_filter_4_7, 3);
   1183  sum = vmlal_lane_s16(sum, s8, y_filter_8_11, 0);
   1184  sum = vmlal_lane_s16(sum, s9, y_filter_8_11, 1);
   1185  sum = vmlal_lane_s16(sum, s10, y_filter_8_11, 2);
   1186  sum = vmlal_lane_s16(sum, s11, y_filter_8_11, 3);
   1187 
   1188  sum = vshlq_s32(sum, round_shift);
   1189  uint16x4_t res = vqmovun_s32(sum);
   1190  return vmin_u16(res, max);
   1191 }
   1192 
   1193 static inline uint16x8_t highbd_convolve12_8_2d_v(
   1194    const int16x8_t s0, const int16x8_t s1, const int16x8_t s2,
   1195    const int16x8_t s3, const int16x8_t s4, const int16x8_t s5,
   1196    const int16x8_t s6, const int16x8_t s7, const int16x8_t s8,
   1197    const int16x8_t s9, const int16x8_t s10, const int16x8_t s11,
   1198    const int16x8_t y_filter_0_7, const int16x4_t y_filter_8_11,
   1199    const int32x4_t round_shift, const int32x4_t offset, const uint16x8_t max) {
   1200  const int16x4_t y_filter_0_3 = vget_low_s16(y_filter_0_7);
   1201  const int16x4_t y_filter_4_7 = vget_high_s16(y_filter_0_7);
   1202 
   1203  int32x4_t sum0 = vmlal_lane_s16(offset, vget_low_s16(s0), y_filter_0_3, 0);
   1204  sum0 = vmlal_lane_s16(sum0, vget_low_s16(s1), y_filter_0_3, 1);
   1205  sum0 = vmlal_lane_s16(sum0, vget_low_s16(s2), y_filter_0_3, 2);
   1206  sum0 = vmlal_lane_s16(sum0, vget_low_s16(s3), y_filter_0_3, 3);
   1207  sum0 = vmlal_lane_s16(sum0, vget_low_s16(s4), y_filter_4_7, 0);
   1208  sum0 = vmlal_lane_s16(sum0, vget_low_s16(s5), y_filter_4_7, 1);
   1209  sum0 = vmlal_lane_s16(sum0, vget_low_s16(s6), y_filter_4_7, 2);
   1210  sum0 = vmlal_lane_s16(sum0, vget_low_s16(s7), y_filter_4_7, 3);
   1211  sum0 = vmlal_lane_s16(sum0, vget_low_s16(s8), y_filter_8_11, 0);
   1212  sum0 = vmlal_lane_s16(sum0, vget_low_s16(s9), y_filter_8_11, 1);
   1213  sum0 = vmlal_lane_s16(sum0, vget_low_s16(s10), y_filter_8_11, 2);
   1214  sum0 = vmlal_lane_s16(sum0, vget_low_s16(s11), y_filter_8_11, 3);
   1215 
   1216  int32x4_t sum1 = vmlal_lane_s16(offset, vget_high_s16(s0), y_filter_0_3, 0);
   1217  sum1 = vmlal_lane_s16(sum1, vget_high_s16(s1), y_filter_0_3, 1);
   1218  sum1 = vmlal_lane_s16(sum1, vget_high_s16(s2), y_filter_0_3, 2);
   1219  sum1 = vmlal_lane_s16(sum1, vget_high_s16(s3), y_filter_0_3, 3);
   1220  sum1 = vmlal_lane_s16(sum1, vget_high_s16(s4), y_filter_4_7, 0);
   1221  sum1 = vmlal_lane_s16(sum1, vget_high_s16(s5), y_filter_4_7, 1);
   1222  sum1 = vmlal_lane_s16(sum1, vget_high_s16(s6), y_filter_4_7, 2);
   1223  sum1 = vmlal_lane_s16(sum1, vget_high_s16(s7), y_filter_4_7, 3);
   1224  sum1 = vmlal_lane_s16(sum1, vget_high_s16(s8), y_filter_8_11, 0);
   1225  sum1 = vmlal_lane_s16(sum1, vget_high_s16(s9), y_filter_8_11, 1);
   1226  sum1 = vmlal_lane_s16(sum1, vget_high_s16(s10), y_filter_8_11, 2);
   1227  sum1 = vmlal_lane_s16(sum1, vget_high_s16(s11), y_filter_8_11, 3);
   1228 
   1229  sum0 = vshlq_s32(sum0, round_shift);
   1230  sum1 = vshlq_s32(sum1, round_shift);
   1231 
   1232  uint16x8_t res = vcombine_u16(vqmovun_s32(sum0), vqmovun_s32(sum1));
   1233  return vminq_u16(res, max);
   1234 }
   1235 
// Vertical pass of the high-bitdepth 2D 12-tap convolution. `offset` is
// added to every accumulator before the round_1 down-shift (its derivation
// is in the caller, not visible here). Maintains a sliding window of eleven
// source rows and produces four output rows per iteration; the loops only
// terminate when h is a multiple of 4 and w is 4 or a multiple of 8 --
// presumably guaranteed by the callers.
static inline void highbd_convolve_2d_sr_vert_12tap_neon(
    const uint16_t *src_ptr, int src_stride, uint16_t *dst_ptr, int dst_stride,
    int w, int h, const int16_t *y_filter_ptr, ConvolveParams *conv_params,
    const int bd, const int offset) {
  // The 12 filter taps are split 8 + 4 across a q-register and a d-register.
  const int16x8_t y_filter_0_7 = vld1q_s16(y_filter_ptr);
  const int16x4_t y_filter_8_11 = vld1_s16(y_filter_ptr + 8);
  const int32x4_t offset_s32 = vdupq_n_s32(offset);
  const int round1_shift = conv_params->round_1;
  // Negated so that vshlq_s32 inside the helpers performs a right shift.
  const int32x4_t round1_shift_s32 = vdupq_n_s32(-round1_shift);

  if (w == 4) {
    const uint16x4_t max = vdup_n_u16((1 << bd) - 1);
    const int16_t *s = (const int16_t *)src_ptr;
    uint16_t *d = dst_ptr;

    // Prime the window with the first eleven rows.
    int16x4_t s0, s1, s2, s3, s4, s5, s6, s7, s8, s9, s10;
    load_s16_4x11(s, src_stride, &s0, &s1, &s2, &s3, &s4, &s5, &s6, &s7, &s8,
                  &s9, &s10);
    s += 11 * src_stride;

    do {
      // Four new rows complete the 12-row inputs for four output rows.
      int16x4_t s11, s12, s13, s14;
      load_s16_4x4(s, src_stride, &s11, &s12, &s13, &s14);

      uint16x4_t d0 = highbd_convolve12_4_2d_v(
          s0, s1, s2, s3, s4, s5, s6, s7, s8, s9, s10, s11, y_filter_0_7,
          y_filter_8_11, round1_shift_s32, offset_s32, max);
      uint16x4_t d1 = highbd_convolve12_4_2d_v(
          s1, s2, s3, s4, s5, s6, s7, s8, s9, s10, s11, s12, y_filter_0_7,
          y_filter_8_11, round1_shift_s32, offset_s32, max);
      uint16x4_t d2 = highbd_convolve12_4_2d_v(
          s2, s3, s4, s5, s6, s7, s8, s9, s10, s11, s12, s13, y_filter_0_7,
          y_filter_8_11, round1_shift_s32, offset_s32, max);
      uint16x4_t d3 = highbd_convolve12_4_2d_v(
          s3, s4, s5, s6, s7, s8, s9, s10, s11, s12, s13, s14, y_filter_0_7,
          y_filter_8_11, round1_shift_s32, offset_s32, max);

      store_u16_4x4(d, dst_stride, d0, d1, d2, d3);

      // Rotate the window down by four rows.
      s0 = s4;
      s1 = s5;
      s2 = s6;
      s3 = s7;
      s4 = s8;
      s5 = s9;
      s6 = s10;
      s7 = s11;
      s8 = s12;
      s9 = s13;
      s10 = s14;
      s += 4 * src_stride;
      d += 4 * dst_stride;
      h -= 4;
    } while (h != 0);
  } else {
    const uint16x8_t max = vdupq_n_u16((1 << bd) - 1);

    // Process 8-wide columns full-height, one column strip at a time.
    do {
      int height = h;
      const int16_t *s = (const int16_t *)src_ptr;
      uint16_t *d = dst_ptr;

      int16x8_t s0, s1, s2, s3, s4, s5, s6, s7, s8, s9, s10;
      load_s16_8x11(s, src_stride, &s0, &s1, &s2, &s3, &s4, &s5, &s6, &s7, &s8,
                    &s9, &s10);
      s += 11 * src_stride;

      do {
        int16x8_t s11, s12, s13, s14;
        load_s16_8x4(s, src_stride, &s11, &s12, &s13, &s14);

        uint16x8_t d0 = highbd_convolve12_8_2d_v(
            s0, s1, s2, s3, s4, s5, s6, s7, s8, s9, s10, s11, y_filter_0_7,
            y_filter_8_11, round1_shift_s32, offset_s32, max);
        uint16x8_t d1 = highbd_convolve12_8_2d_v(
            s1, s2, s3, s4, s5, s6, s7, s8, s9, s10, s11, s12, y_filter_0_7,
            y_filter_8_11, round1_shift_s32, offset_s32, max);
        uint16x8_t d2 = highbd_convolve12_8_2d_v(
            s2, s3, s4, s5, s6, s7, s8, s9, s10, s11, s12, s13, y_filter_0_7,
            y_filter_8_11, round1_shift_s32, offset_s32, max);
        uint16x8_t d3 = highbd_convolve12_8_2d_v(
            s3, s4, s5, s6, s7, s8, s9, s10, s11, s12, s13, s14, y_filter_0_7,
            y_filter_8_11, round1_shift_s32, offset_s32, max);

        store_u16_8x4(d, dst_stride, d0, d1, d2, d3);

        s0 = s4;
        s1 = s5;
        s2 = s6;
        s3 = s7;
        s4 = s8;
        s5 = s9;
        s6 = s10;
        s7 = s11;
        s8 = s12;
        s9 = s13;
        s10 = s14;
        s += 4 * src_stride;
        d += 4 * dst_stride;
        height -= 4;
      } while (height != 0);

      src_ptr += 8;
      dst_ptr += 8;
      w -= 8;
    } while (w != 0);
  }
}
   1344 
   1345 static inline uint16x8_t highbd_convolve6_8_2d_h(const int16x8_t s[6],
   1346                                                 const int16x8_t x_filter,
   1347                                                 const int32x4_t shift_s32,
   1348                                                 const int32x4_t offset) {
   1349  // Values at indices 0 and 7 of y_filter are zero.
   1350  const int16x4_t x_filter_0_3 = vget_low_s16(x_filter);
   1351  const int16x4_t x_filter_4_7 = vget_high_s16(x_filter);
   1352 
   1353  int32x4_t sum0 = vmlal_lane_s16(offset, vget_low_s16(s[0]), x_filter_0_3, 1);
   1354  sum0 = vmlal_lane_s16(sum0, vget_low_s16(s[1]), x_filter_0_3, 2);
   1355  sum0 = vmlal_lane_s16(sum0, vget_low_s16(s[2]), x_filter_0_3, 3);
   1356  sum0 = vmlal_lane_s16(sum0, vget_low_s16(s[3]), x_filter_4_7, 0);
   1357  sum0 = vmlal_lane_s16(sum0, vget_low_s16(s[4]), x_filter_4_7, 1);
   1358  sum0 = vmlal_lane_s16(sum0, vget_low_s16(s[5]), x_filter_4_7, 2);
   1359 
   1360  int32x4_t sum1 = vmlal_lane_s16(offset, vget_high_s16(s[0]), x_filter_0_3, 1);
   1361  sum1 = vmlal_lane_s16(sum1, vget_high_s16(s[1]), x_filter_0_3, 2);
   1362  sum1 = vmlal_lane_s16(sum1, vget_high_s16(s[2]), x_filter_0_3, 3);
   1363  sum1 = vmlal_lane_s16(sum1, vget_high_s16(s[3]), x_filter_4_7, 0);
   1364  sum1 = vmlal_lane_s16(sum1, vget_high_s16(s[4]), x_filter_4_7, 1);
   1365  sum1 = vmlal_lane_s16(sum1, vget_high_s16(s[5]), x_filter_4_7, 2);
   1366 
   1367  sum0 = vqrshlq_s32(sum0, shift_s32);
   1368  sum1 = vqrshlq_s32(sum1, shift_s32);
   1369 
   1370  return vcombine_u16(vqmovun_s32(sum0), vqmovun_s32(sum1));
   1371 }
   1372 
// Horizontal pass of the 6-tap 2D separable high bit-depth convolution.
// Writes the (offset-biased) intermediate rows to dst_ptr for the later
// vertical pass. The caller only takes this path when w != 4, so w is
// assumed to be a multiple of 8 by the inner column loops.
static inline void highbd_convolve_2d_sr_horiz_6tap_neon(
    const uint16_t *src_ptr, int src_stride, uint16_t *dst_ptr, int dst_stride,
    int w, int h, const int16_t *x_filter_ptr, ConvolveParams *conv_params,
    const int offset) {
  // The smallest block height processed by the SIMD functions is 4, and the
  // horizontal convolution needs to process an extra (filter_taps/2 - 1) lines
  // for the vertical convolution.
  assert(h >= 5);
  // Negative shift count: vqrshlq_s32 then performs a rounding right shift by
  // conv_params->round_0.
  const int32x4_t shift_s32 = vdupq_n_s32(-conv_params->round_0);
  const int32x4_t offset_s32 = vdupq_n_s32(offset);

  const int16x8_t x_filter = vld1q_s16(x_filter_ptr);
  int height = h;

  // Main loop: process 4 rows x 8 columns per iteration while more than 4
  // rows remain.
  do {
    int width = w;
    const int16_t *s = (const int16_t *)src_ptr;
    uint16_t *d = dst_ptr;

    do {
      // s0..s3 each hold the 6 shifted-by-one-pixel source vectors needed by
      // the 6-tap filter for one row.
      int16x8_t s0[6], s1[6], s2[6], s3[6];
      load_s16_8x6(s + 0 * src_stride, 1, &s0[0], &s0[1], &s0[2], &s0[3],
                   &s0[4], &s0[5]);
      load_s16_8x6(s + 1 * src_stride, 1, &s1[0], &s1[1], &s1[2], &s1[3],
                   &s1[4], &s1[5]);
      load_s16_8x6(s + 2 * src_stride, 1, &s2[0], &s2[1], &s2[2], &s2[3],
                   &s2[4], &s2[5]);
      load_s16_8x6(s + 3 * src_stride, 1, &s3[0], &s3[1], &s3[2], &s3[3],
                   &s3[4], &s3[5]);

      uint16x8_t d0 =
          highbd_convolve6_8_2d_h(s0, x_filter, shift_s32, offset_s32);
      uint16x8_t d1 =
          highbd_convolve6_8_2d_h(s1, x_filter, shift_s32, offset_s32);
      uint16x8_t d2 =
          highbd_convolve6_8_2d_h(s2, x_filter, shift_s32, offset_s32);
      uint16x8_t d3 =
          highbd_convolve6_8_2d_h(s3, x_filter, shift_s32, offset_s32);

      store_u16_8x4(d, dst_stride, d0, d1, d2, d3);

      s += 8;
      d += 8;
      width -= 8;
    } while (width != 0);
    src_ptr += 4 * src_stride;
    dst_ptr += 4 * dst_stride;
    height -= 4;
  } while (height > 4);
  // Tail loop: the remaining 1..4 rows, one row at a time.
  do {
    int width = w;
    const int16_t *s = (const int16_t *)src_ptr;
    uint16_t *d = dst_ptr;

    do {
      int16x8_t s0[6];
      load_s16_8x6(s, 1, &s0[0], &s0[1], &s0[2], &s0[3], &s0[4], &s0[5]);

      uint16x8_t d0 =
          highbd_convolve6_8_2d_h(s0, x_filter, shift_s32, offset_s32);
      vst1q_u16(d, d0);

      s += 8;
      d += 8;
      width -= 8;
    } while (width != 0);
    src_ptr += src_stride;
    dst_ptr += dst_stride;
  } while (--height != 0);
}
   1443 
   1444 static inline uint16x4_t highbd_convolve4_4_2d_h(const int16x4_t s[4],
   1445                                                 const int16x4_t x_filter,
   1446                                                 const int32x4_t shift_s32,
   1447                                                 const int32x4_t offset) {
   1448  int32x4_t sum = vmlal_lane_s16(offset, s[0], x_filter, 0);
   1449  sum = vmlal_lane_s16(sum, s[1], x_filter, 1);
   1450  sum = vmlal_lane_s16(sum, s[2], x_filter, 2);
   1451  sum = vmlal_lane_s16(sum, s[3], x_filter, 3);
   1452 
   1453  sum = vqrshlq_s32(sum, shift_s32);
   1454  return vqmovun_s32(sum);
   1455 }
   1456 
   1457 static inline uint16x8_t highbd_convolve8_8_2d_h(const int16x8_t s[8],
   1458                                                 const int16x8_t x_filter,
   1459                                                 const int32x4_t shift_s32,
   1460                                                 const int32x4_t offset) {
   1461  const int16x4_t x_filter_0_3 = vget_low_s16(x_filter);
   1462  const int16x4_t x_filter_4_7 = vget_high_s16(x_filter);
   1463 
   1464  int32x4_t sum0 = vmlal_lane_s16(offset, vget_low_s16(s[0]), x_filter_0_3, 0);
   1465  sum0 = vmlal_lane_s16(sum0, vget_low_s16(s[1]), x_filter_0_3, 1);
   1466  sum0 = vmlal_lane_s16(sum0, vget_low_s16(s[2]), x_filter_0_3, 2);
   1467  sum0 = vmlal_lane_s16(sum0, vget_low_s16(s[3]), x_filter_0_3, 3);
   1468  sum0 = vmlal_lane_s16(sum0, vget_low_s16(s[4]), x_filter_4_7, 0);
   1469  sum0 = vmlal_lane_s16(sum0, vget_low_s16(s[5]), x_filter_4_7, 1);
   1470  sum0 = vmlal_lane_s16(sum0, vget_low_s16(s[6]), x_filter_4_7, 2);
   1471  sum0 = vmlal_lane_s16(sum0, vget_low_s16(s[7]), x_filter_4_7, 3);
   1472 
   1473  int32x4_t sum1 = vmlal_lane_s16(offset, vget_high_s16(s[0]), x_filter_0_3, 0);
   1474  sum1 = vmlal_lane_s16(sum1, vget_high_s16(s[1]), x_filter_0_3, 1);
   1475  sum1 = vmlal_lane_s16(sum1, vget_high_s16(s[2]), x_filter_0_3, 2);
   1476  sum1 = vmlal_lane_s16(sum1, vget_high_s16(s[3]), x_filter_0_3, 3);
   1477  sum1 = vmlal_lane_s16(sum1, vget_high_s16(s[4]), x_filter_4_7, 0);
   1478  sum1 = vmlal_lane_s16(sum1, vget_high_s16(s[5]), x_filter_4_7, 1);
   1479  sum1 = vmlal_lane_s16(sum1, vget_high_s16(s[6]), x_filter_4_7, 2);
   1480  sum1 = vmlal_lane_s16(sum1, vget_high_s16(s[7]), x_filter_4_7, 3);
   1481 
   1482  sum0 = vqrshlq_s32(sum0, shift_s32);
   1483  sum1 = vqrshlq_s32(sum1, shift_s32);
   1484 
   1485  return vcombine_u16(vqmovun_s32(sum0), vqmovun_s32(sum1));
   1486 }
   1487 
// Horizontal pass of the 4/8-tap 2D separable high bit-depth convolution.
// Writes the (offset-biased) intermediate rows to dst_ptr for the later
// vertical pass. A 4-tap filter is used for w == 4; otherwise an 8-tap
// filter is applied and w is assumed to be a multiple of 8.
static inline void highbd_convolve_2d_sr_horiz_neon(
    const uint16_t *src_ptr, int src_stride, uint16_t *dst_ptr, int dst_stride,
    int w, int h, const int16_t *x_filter_ptr, ConvolveParams *conv_params,
    const int offset) {
  // The smallest block height processed by the SIMD functions is 4, and the
  // horizontal convolution needs to process an extra (filter_taps/2 - 1) lines
  // for the vertical convolution.
  assert(h >= 5);
  // Negative shift count: vqrshlq_s32 then performs a rounding right shift by
  // conv_params->round_0.
  const int32x4_t shift_s32 = vdupq_n_s32(-conv_params->round_0);
  const int32x4_t offset_s32 = vdupq_n_s32(offset);

  if (w == 4) {
    // 4-tap filters are used for blocks having width <= 4.
    // Skip the two outermost taps of the 8-tap kernel (+2) and start one
    // pixel further right (+1) so the 4-tap window stays centered.
    const int16x4_t x_filter = vld1_s16(x_filter_ptr + 2);
    const int16_t *s = (const int16_t *)(src_ptr + 1);
    uint16_t *d = dst_ptr;

    // Main loop: 4 rows per iteration while more than 4 rows remain.
    do {
      int16x4_t s0[4], s1[4], s2[4], s3[4];
      load_s16_4x4(s + 0 * src_stride, 1, &s0[0], &s0[1], &s0[2], &s0[3]);
      load_s16_4x4(s + 1 * src_stride, 1, &s1[0], &s1[1], &s1[2], &s1[3]);
      load_s16_4x4(s + 2 * src_stride, 1, &s2[0], &s2[1], &s2[2], &s2[3]);
      load_s16_4x4(s + 3 * src_stride, 1, &s3[0], &s3[1], &s3[2], &s3[3]);

      uint16x4_t d0 =
          highbd_convolve4_4_2d_h(s0, x_filter, shift_s32, offset_s32);
      uint16x4_t d1 =
          highbd_convolve4_4_2d_h(s1, x_filter, shift_s32, offset_s32);
      uint16x4_t d2 =
          highbd_convolve4_4_2d_h(s2, x_filter, shift_s32, offset_s32);
      uint16x4_t d3 =
          highbd_convolve4_4_2d_h(s3, x_filter, shift_s32, offset_s32);

      store_u16_4x4(d, dst_stride, d0, d1, d2, d3);

      s += 4 * src_stride;
      d += 4 * dst_stride;
      h -= 4;
    } while (h > 4);

    // Tail loop: the remaining 1..4 rows, one row at a time.
    do {
      int16x4_t s0[4];
      load_s16_4x4(s, 1, &s0[0], &s0[1], &s0[2], &s0[3]);

      uint16x4_t d0 =
          highbd_convolve4_4_2d_h(s0, x_filter, shift_s32, offset_s32);

      vst1_u16(d, d0);

      s += src_stride;
      d += dst_stride;
    } while (--h != 0);
  } else {
    const int16x8_t x_filter = vld1q_s16(x_filter_ptr);
    int height = h;

    // Main loop: 4 rows x 8 columns per iteration while more than 4 rows
    // remain.
    do {
      int width = w;
      const int16_t *s = (const int16_t *)src_ptr;
      uint16_t *d = dst_ptr;

      do {
        // s0..s3 each hold the 8 shifted-by-one-pixel source vectors needed
        // by the 8-tap filter for one row.
        int16x8_t s0[8], s1[8], s2[8], s3[8];
        load_s16_8x8(s + 0 * src_stride, 1, &s0[0], &s0[1], &s0[2], &s0[3],
                     &s0[4], &s0[5], &s0[6], &s0[7]);
        load_s16_8x8(s + 1 * src_stride, 1, &s1[0], &s1[1], &s1[2], &s1[3],
                     &s1[4], &s1[5], &s1[6], &s1[7]);
        load_s16_8x8(s + 2 * src_stride, 1, &s2[0], &s2[1], &s2[2], &s2[3],
                     &s2[4], &s2[5], &s2[6], &s2[7]);
        load_s16_8x8(s + 3 * src_stride, 1, &s3[0], &s3[1], &s3[2], &s3[3],
                     &s3[4], &s3[5], &s3[6], &s3[7]);

        uint16x8_t d0 =
            highbd_convolve8_8_2d_h(s0, x_filter, shift_s32, offset_s32);
        uint16x8_t d1 =
            highbd_convolve8_8_2d_h(s1, x_filter, shift_s32, offset_s32);
        uint16x8_t d2 =
            highbd_convolve8_8_2d_h(s2, x_filter, shift_s32, offset_s32);
        uint16x8_t d3 =
            highbd_convolve8_8_2d_h(s3, x_filter, shift_s32, offset_s32);

        store_u16_8x4(d, dst_stride, d0, d1, d2, d3);

        s += 8;
        d += 8;
        width -= 8;
      } while (width != 0);
      src_ptr += 4 * src_stride;
      dst_ptr += 4 * dst_stride;
      height -= 4;
    } while (height > 4);

    // Tail loop: the remaining 1..4 rows, one row at a time.
    do {
      int width = w;
      const int16_t *s = (const int16_t *)src_ptr;
      uint16_t *d = dst_ptr;

      do {
        int16x8_t s0[8];
        load_s16_8x8(s + 0 * src_stride, 1, &s0[0], &s0[1], &s0[2], &s0[3],
                     &s0[4], &s0[5], &s0[6], &s0[7]);

        uint16x8_t d0 =
            highbd_convolve8_8_2d_h(s0, x_filter, shift_s32, offset_s32);
        vst1q_u16(d, d0);

        s += 8;
        d += 8;
        width -= 8;
      } while (width != 0);
      src_ptr += src_stride;
      dst_ptr += dst_stride;
    } while (--height != 0);
  }
}
   1603 
   1604 static inline uint16x4_t highbd_convolve12_4_2d_h(const int16x4_t s[12],
   1605                                                  const int16x8_t x_filter_0_7,
   1606                                                  const int16x4_t x_filter_8_11,
   1607                                                  const int32x4_t shift_s32,
   1608                                                  const int32x4_t offset) {
   1609  const int16x4_t x_filter_0_3 = vget_low_s16(x_filter_0_7);
   1610  const int16x4_t x_filter_4_7 = vget_high_s16(x_filter_0_7);
   1611 
   1612  int32x4_t sum = vmlal_lane_s16(offset, s[0], x_filter_0_3, 0);
   1613  sum = vmlal_lane_s16(sum, s[1], x_filter_0_3, 1);
   1614  sum = vmlal_lane_s16(sum, s[2], x_filter_0_3, 2);
   1615  sum = vmlal_lane_s16(sum, s[3], x_filter_0_3, 3);
   1616  sum = vmlal_lane_s16(sum, s[4], x_filter_4_7, 0);
   1617  sum = vmlal_lane_s16(sum, s[5], x_filter_4_7, 1);
   1618  sum = vmlal_lane_s16(sum, s[6], x_filter_4_7, 2);
   1619  sum = vmlal_lane_s16(sum, s[7], x_filter_4_7, 3);
   1620  sum = vmlal_lane_s16(sum, s[8], x_filter_8_11, 0);
   1621  sum = vmlal_lane_s16(sum, s[9], x_filter_8_11, 1);
   1622  sum = vmlal_lane_s16(sum, s[10], x_filter_8_11, 2);
   1623  sum = vmlal_lane_s16(sum, s[11], x_filter_8_11, 3);
   1624 
   1625  sum = vqrshlq_s32(sum, shift_s32);
   1626  return vqmovun_s32(sum);
   1627 }
   1628 
   1629 static inline uint16x8_t highbd_convolve12_8_2d_h(const int16x8_t s[12],
   1630                                                  const int16x8_t x_filter_0_7,
   1631                                                  const int16x4_t x_filter_8_11,
   1632                                                  const int32x4_t shift_s32,
   1633                                                  const int32x4_t offset) {
   1634  const int16x4_t x_filter_0_3 = vget_low_s16(x_filter_0_7);
   1635  const int16x4_t x_filter_4_7 = vget_high_s16(x_filter_0_7);
   1636 
   1637  int32x4_t sum0 = vmlal_lane_s16(offset, vget_low_s16(s[0]), x_filter_0_3, 0);
   1638  sum0 = vmlal_lane_s16(sum0, vget_low_s16(s[1]), x_filter_0_3, 1);
   1639  sum0 = vmlal_lane_s16(sum0, vget_low_s16(s[2]), x_filter_0_3, 2);
   1640  sum0 = vmlal_lane_s16(sum0, vget_low_s16(s[3]), x_filter_0_3, 3);
   1641  sum0 = vmlal_lane_s16(sum0, vget_low_s16(s[4]), x_filter_4_7, 0);
   1642  sum0 = vmlal_lane_s16(sum0, vget_low_s16(s[5]), x_filter_4_7, 1);
   1643  sum0 = vmlal_lane_s16(sum0, vget_low_s16(s[6]), x_filter_4_7, 2);
   1644  sum0 = vmlal_lane_s16(sum0, vget_low_s16(s[7]), x_filter_4_7, 3);
   1645  sum0 = vmlal_lane_s16(sum0, vget_low_s16(s[8]), x_filter_8_11, 0);
   1646  sum0 = vmlal_lane_s16(sum0, vget_low_s16(s[9]), x_filter_8_11, 1);
   1647  sum0 = vmlal_lane_s16(sum0, vget_low_s16(s[10]), x_filter_8_11, 2);
   1648  sum0 = vmlal_lane_s16(sum0, vget_low_s16(s[11]), x_filter_8_11, 3);
   1649 
   1650  int32x4_t sum1 = vmlal_lane_s16(offset, vget_high_s16(s[0]), x_filter_0_3, 0);
   1651  sum1 = vmlal_lane_s16(sum1, vget_high_s16(s[1]), x_filter_0_3, 1);
   1652  sum1 = vmlal_lane_s16(sum1, vget_high_s16(s[2]), x_filter_0_3, 2);
   1653  sum1 = vmlal_lane_s16(sum1, vget_high_s16(s[3]), x_filter_0_3, 3);
   1654  sum1 = vmlal_lane_s16(sum1, vget_high_s16(s[4]), x_filter_4_7, 0);
   1655  sum1 = vmlal_lane_s16(sum1, vget_high_s16(s[5]), x_filter_4_7, 1);
   1656  sum1 = vmlal_lane_s16(sum1, vget_high_s16(s[6]), x_filter_4_7, 2);
   1657  sum1 = vmlal_lane_s16(sum1, vget_high_s16(s[7]), x_filter_4_7, 3);
   1658  sum1 = vmlal_lane_s16(sum1, vget_high_s16(s[8]), x_filter_8_11, 0);
   1659  sum1 = vmlal_lane_s16(sum1, vget_high_s16(s[9]), x_filter_8_11, 1);
   1660  sum1 = vmlal_lane_s16(sum1, vget_high_s16(s[10]), x_filter_8_11, 2);
   1661  sum1 = vmlal_lane_s16(sum1, vget_high_s16(s[11]), x_filter_8_11, 3);
   1662 
   1663  sum0 = vqrshlq_s32(sum0, shift_s32);
   1664  sum1 = vqrshlq_s32(sum1, shift_s32);
   1665 
   1666  return vcombine_u16(vqmovun_s32(sum0), vqmovun_s32(sum1));
   1667 }
   1668 
   1669 static inline void highbd_convolve_2d_sr_horiz_12tap_neon(
   1670    const uint16_t *src_ptr, int src_stride, uint16_t *dst_ptr, int dst_stride,
   1671    int w, int h, const int16_t *x_filter_ptr, ConvolveParams *conv_params,
   1672    const int offset) {
   1673  // The smallest block height processed by the SIMD functions is 4, and the
   1674  // horizontal convolution needs to process an extra (filter_taps/2 - 1) lines
   1675  // for the vertical convolution.
   1676  assert(h >= 5);
   1677  const int32x4_t shift_s32 = vdupq_n_s32(-conv_params->round_0);
   1678  const int16x8_t x_filter_0_7 = vld1q_s16(x_filter_ptr);
   1679  const int16x4_t x_filter_8_11 = vld1_s16(x_filter_ptr + 8);
   1680  const int32x4_t offset_s32 = vdupq_n_s32(offset);
   1681 
   1682  if (w == 4) {
   1683    const int16_t *s = (const int16_t *)src_ptr;
   1684    uint16_t *d = dst_ptr;
   1685 
   1686    do {
   1687      int16x4_t s0[12], s1[12], s2[12], s3[12];
   1688      load_s16_4x12(s + 0 * src_stride, 1, &s0[0], &s0[1], &s0[2], &s0[3],
   1689                    &s0[4], &s0[5], &s0[6], &s0[7], &s0[8], &s0[9], &s0[10],
   1690                    &s0[11]);
   1691      load_s16_4x12(s + 1 * src_stride, 1, &s1[0], &s1[1], &s1[2], &s1[3],
   1692                    &s1[4], &s1[5], &s1[6], &s1[7], &s1[8], &s1[9], &s1[10],
   1693                    &s1[11]);
   1694      load_s16_4x12(s + 2 * src_stride, 1, &s2[0], &s2[1], &s2[2], &s2[3],
   1695                    &s2[4], &s2[5], &s2[6], &s2[7], &s2[8], &s2[9], &s2[10],
   1696                    &s2[11]);
   1697      load_s16_4x12(s + 3 * src_stride, 1, &s3[0], &s3[1], &s3[2], &s3[3],
   1698                    &s3[4], &s3[5], &s3[6], &s3[7], &s3[8], &s3[9], &s3[10],
   1699                    &s3[11]);
   1700 
   1701      uint16x4_t d0 = highbd_convolve12_4_2d_h(s0, x_filter_0_7, x_filter_8_11,
   1702                                               shift_s32, offset_s32);
   1703      uint16x4_t d1 = highbd_convolve12_4_2d_h(s1, x_filter_0_7, x_filter_8_11,
   1704                                               shift_s32, offset_s32);
   1705      uint16x4_t d2 = highbd_convolve12_4_2d_h(s2, x_filter_0_7, x_filter_8_11,
   1706                                               shift_s32, offset_s32);
   1707      uint16x4_t d3 = highbd_convolve12_4_2d_h(s3, x_filter_0_7, x_filter_8_11,
   1708                                               shift_s32, offset_s32);
   1709 
   1710      store_u16_4x4(d, dst_stride, d0, d1, d2, d3);
   1711 
   1712      s += 4 * src_stride;
   1713      d += 4 * dst_stride;
   1714      h -= 4;
   1715    } while (h > 4);
   1716 
   1717    do {
   1718      int16x4_t s0[12];
   1719      load_s16_4x12(s, 1, &s0[0], &s0[1], &s0[2], &s0[3], &s0[4], &s0[5],
   1720                    &s0[6], &s0[7], &s0[8], &s0[9], &s0[10], &s0[11]);
   1721 
   1722      uint16x4_t d0 = highbd_convolve12_4_2d_h(s0, x_filter_0_7, x_filter_8_11,
   1723                                               shift_s32, offset_s32);
   1724 
   1725      vst1_u16(d, d0);
   1726 
   1727      s += src_stride;
   1728      d += dst_stride;
   1729    } while (--h != 0);
   1730  } else {
   1731    int height = h;
   1732 
   1733    do {
   1734      int width = w;
   1735      const int16_t *s = (const int16_t *)src_ptr;
   1736      uint16_t *d = dst_ptr;
   1737 
   1738      do {
   1739        int16x8_t s0[12], s1[12], s2[12], s3[12];
   1740        load_s16_8x12(s + 0 * src_stride, 1, &s0[0], &s0[1], &s0[2], &s0[3],
   1741                      &s0[4], &s0[5], &s0[6], &s0[7], &s0[8], &s0[9], &s0[10],
   1742                      &s0[11]);
   1743        load_s16_8x12(s + 1 * src_stride, 1, &s1[0], &s1[1], &s1[2], &s1[3],
   1744                      &s1[4], &s1[5], &s1[6], &s1[7], &s1[8], &s1[9], &s1[10],
   1745                      &s1[11]);
   1746        load_s16_8x12(s + 2 * src_stride, 1, &s2[0], &s2[1], &s2[2], &s2[3],
   1747                      &s2[4], &s2[5], &s2[6], &s2[7], &s2[8], &s2[9], &s2[10],
   1748                      &s2[11]);
   1749        load_s16_8x12(s + 3 * src_stride, 1, &s3[0], &s3[1], &s3[2], &s3[3],
   1750                      &s3[4], &s3[5], &s3[6], &s3[7], &s3[8], &s3[9], &s3[10],
   1751                      &s3[11]);
   1752 
   1753        uint16x8_t d0 = highbd_convolve12_8_2d_h(
   1754            s0, x_filter_0_7, x_filter_8_11, shift_s32, offset_s32);
   1755        uint16x8_t d1 = highbd_convolve12_8_2d_h(
   1756            s1, x_filter_0_7, x_filter_8_11, shift_s32, offset_s32);
   1757        uint16x8_t d2 = highbd_convolve12_8_2d_h(
   1758            s2, x_filter_0_7, x_filter_8_11, shift_s32, offset_s32);
   1759        uint16x8_t d3 = highbd_convolve12_8_2d_h(
   1760            s3, x_filter_0_7, x_filter_8_11, shift_s32, offset_s32);
   1761 
   1762        store_u16_8x4(d, dst_stride, d0, d1, d2, d3);
   1763 
   1764        s += 8;
   1765        d += 8;
   1766        width -= 8;
   1767      } while (width != 0);
   1768      src_ptr += 4 * src_stride;
   1769      dst_ptr += 4 * dst_stride;
   1770      height -= 4;
   1771    } while (height > 4);
   1772 
   1773    do {
   1774      int width = w;
   1775      const int16_t *s = (const int16_t *)src_ptr;
   1776      uint16_t *d = dst_ptr;
   1777 
   1778      do {
   1779        int16x8_t s0[12];
   1780        load_s16_8x12(s + 0 * src_stride, 1, &s0[0], &s0[1], &s0[2], &s0[3],
   1781                      &s0[4], &s0[5], &s0[6], &s0[7], &s0[8], &s0[9], &s0[10],
   1782                      &s0[11]);
   1783 
   1784        uint16x8_t d0 = highbd_convolve12_8_2d_h(
   1785            s0, x_filter_0_7, x_filter_8_11, shift_s32, offset_s32);
   1786        vst1q_u16(d, d0);
   1787 
   1788        s += 8;
   1789        d += 8;
   1790        width -= 8;
   1791      } while (width > 0);
   1792      src_ptr += src_stride;
   1793      dst_ptr += dst_stride;
   1794    } while (--height != 0);
   1795  }
   1796 }
   1797 
// High bit-depth 2D (horizontal then vertical) single-reference convolution.
// Runs the horizontal pass into an intermediate buffer, then the vertical
// pass into dst, dispatching to tap-count-specialized NEON kernels.
void av1_highbd_convolve_2d_sr_neon(const uint16_t *src, int src_stride,
                                   uint16_t *dst, int dst_stride, int w, int h,
                                   const InterpFilterParams *filter_params_x,
                                   const InterpFilterParams *filter_params_y,
                                   const int subpel_x_qn,
                                   const int subpel_y_qn,
                                   ConvolveParams *conv_params, int bd) {
  // Blocks with a dimension of 2 are below the SIMD minimum; use the C
  // fallback.
  if (w == 2 || h == 2) {
    av1_highbd_convolve_2d_sr_c(src, src_stride, dst, dst_stride, w, h,
                                filter_params_x, filter_params_y, subpel_x_qn,
                                subpel_y_qn, conv_params, bd);
    return;
  }
  // Intermediate buffer holding the horizontally filtered rows.
  DECLARE_ALIGNED(16, uint16_t,
                  im_block[(MAX_SB_SIZE + MAX_FILTER_TAP) * MAX_SB_SIZE]);
  // Clamp tap counts to a minimum of 6 since the smallest specialized SIMD
  // kernels handle 6 taps.
  const int x_filter_taps = get_filter_tap(filter_params_x, subpel_x_qn);
  const int clamped_x_taps = x_filter_taps < 6 ? 6 : x_filter_taps;

  const int y_filter_taps = get_filter_tap(filter_params_y, subpel_y_qn);
  const int clamped_y_taps = y_filter_taps < 6 ? 6 : y_filter_taps;
  // The horizontal pass must produce (taps - 1) extra rows for the vertical
  // pass.
  const int im_h = h + clamped_y_taps - 1;
  const int im_stride = MAX_SB_SIZE;
  const int vert_offset = clamped_y_taps / 2 - 1;
  const int horiz_offset = clamped_x_taps / 2 - 1;
  const int x_offset_initial = (1 << (bd + FILTER_BITS - 1));
  const int y_offset_bits = bd + 2 * FILTER_BITS - conv_params->round_0;
  // The extra shim of (1 << (conv_params->round_1 - 1)) allows us to do a
  // simple shift left instead of a rounding saturating shift left.
  const int y_offset =
      (1 << (conv_params->round_1 - 1)) - (1 << (y_offset_bits - 1));

  // Step back to the first source sample covered by the filter windows.
  const uint16_t *src_ptr = src - vert_offset * src_stride - horiz_offset;

  const int16_t *x_filter_ptr = av1_get_interp_filter_subpel_kernel(
      filter_params_x, subpel_x_qn & SUBPEL_MASK);
  const int16_t *y_filter_ptr = av1_get_interp_filter_subpel_kernel(
      filter_params_y, subpel_y_qn & SUBPEL_MASK);

  // 12-tap horizontal filters pair with the 12-tap vertical kernel.
  if (x_filter_taps > 8) {
    highbd_convolve_2d_sr_horiz_12tap_neon(src_ptr, src_stride, im_block,
                                           im_stride, w, im_h, x_filter_ptr,
                                           conv_params, x_offset_initial);

    highbd_convolve_2d_sr_vert_12tap_neon(im_block, im_stride, dst, dst_stride,
                                          w, h, y_filter_ptr, conv_params, bd,
                                          y_offset);
    return;
  }
  // The 6-tap horizontal kernel requires w to be a multiple of 8, so w == 4
  // falls through to the generic 4/8-tap path.
  if (x_filter_taps <= 6 && w != 4) {
    highbd_convolve_2d_sr_horiz_6tap_neon(src_ptr, src_stride, im_block,
                                          im_stride, w, im_h, x_filter_ptr,
                                          conv_params, x_offset_initial);
  } else {
    highbd_convolve_2d_sr_horiz_neon(src_ptr, src_stride, im_block, im_stride,
                                     w, im_h, x_filter_ptr, conv_params,
                                     x_offset_initial);
  }

  if (y_filter_taps <= 6) {
    highbd_convolve_2d_sr_vert_6tap_neon(im_block, im_stride, dst, dst_stride,
                                         w, h, y_filter_ptr, conv_params, bd,
                                         y_offset);
  } else {
    highbd_convolve_2d_sr_vert_8tap_neon(im_block, im_stride, dst, dst_stride,
                                         w, h, y_filter_ptr, conv_params, bd,
                                         y_offset);
  }
}
   1866 
   1867 // Filter used is [64, 64].
   1868 void av1_highbd_convolve_x_sr_intrabc_neon(
   1869    const uint16_t *src, int src_stride, uint16_t *dst, int dst_stride, int w,
   1870    int h, const InterpFilterParams *filter_params_x, const int subpel_x_qn,
   1871    ConvolveParams *conv_params, int bd) {
   1872  assert(subpel_x_qn == 8);
   1873  assert(filter_params_x->taps == 2);
   1874  assert((conv_params->round_0 + conv_params->round_1) == 2 * FILTER_BITS);
   1875  (void)filter_params_x;
   1876  (void)subpel_x_qn;
   1877  (void)conv_params;
   1878  (void)bd;
   1879 
   1880  if (w <= 4) {
   1881    do {
   1882      uint16x4_t s0 = vld1_u16(src);
   1883      uint16x4_t s1 = vld1_u16(src + 1);
   1884 
   1885      uint16x4_t d0 = vrhadd_u16(s0, s1);
   1886 
   1887      if (w == 2) {
   1888        store_u16_2x1(dst, d0);
   1889      } else {
   1890        vst1_u16(dst, d0);
   1891      }
   1892 
   1893      src += src_stride;
   1894      dst += dst_stride;
   1895    } while (--h != 0);
   1896  } else {
   1897    do {
   1898      const uint16_t *src_ptr = src;
   1899      uint16_t *dst_ptr = dst;
   1900      int width = w;
   1901 
   1902      do {
   1903        uint16x8_t s0 = vld1q_u16(src_ptr);
   1904        uint16x8_t s1 = vld1q_u16(src_ptr + 1);
   1905 
   1906        uint16x8_t d0 = vrhaddq_u16(s0, s1);
   1907 
   1908        vst1q_u16(dst_ptr, d0);
   1909 
   1910        src_ptr += 8;
   1911        dst_ptr += 8;
   1912        width -= 8;
   1913      } while (width != 0);
   1914      src += src_stride;
   1915      dst += dst_stride;
   1916    } while (--h != 0);
   1917  }
   1918 }
   1919 
   1920 // Filter used is [64, 64].
   1921 void av1_highbd_convolve_y_sr_intrabc_neon(
   1922    const uint16_t *src, int src_stride, uint16_t *dst, int dst_stride, int w,
   1923    int h, const InterpFilterParams *filter_params_y, const int subpel_y_qn,
   1924    int bd) {
   1925  assert(subpel_y_qn == 8);
   1926  assert(filter_params_y->taps == 2);
   1927  (void)filter_params_y;
   1928  (void)subpel_y_qn;
   1929  (void)bd;
   1930 
   1931  if (w <= 4) {
   1932    do {
   1933      uint16x4_t s0 = vld1_u16(src);
   1934      uint16x4_t s1 = vld1_u16(src + src_stride);
   1935 
   1936      uint16x4_t d0 = vrhadd_u16(s0, s1);
   1937 
   1938      if (w == 2) {
   1939        store_u16_2x1(dst, d0);
   1940      } else {
   1941        vst1_u16(dst, d0);
   1942      }
   1943 
   1944      src += src_stride;
   1945      dst += dst_stride;
   1946    } while (--h != 0);
   1947  } else {
   1948    do {
   1949      const uint16_t *src_ptr = src;
   1950      uint16_t *dst_ptr = dst;
   1951      int height = h;
   1952 
   1953      do {
   1954        uint16x8_t s0 = vld1q_u16(src_ptr);
   1955        uint16x8_t s1 = vld1q_u16(src_ptr + src_stride);
   1956 
   1957        uint16x8_t d0 = vrhaddq_u16(s0, s1);
   1958 
   1959        vst1q_u16(dst_ptr, d0);
   1960 
   1961        src_ptr += src_stride;
   1962        dst_ptr += dst_stride;
   1963      } while (--height != 0);
   1964      src += 8;
   1965      dst += 8;
   1966      w -= 8;
   1967    } while (w != 0);
   1968  }
   1969 }
   1970 
// Both horizontal and vertical passes use the same 2-tap filter: [64, 64].
// The horizontal pass stores unscaled sums (a = s0 + s1); the vertical pass
// combines two such sums a, b as ((a >> 1) + (b >> 1) style halving adds)
// which works out to exactly (a + b + 2) >> 2, i.e. the 2D average with
// rounding.
void av1_highbd_convolve_2d_sr_intrabc_neon(
   const uint16_t *src, int src_stride, uint16_t *dst, int dst_stride, int w,
   int h, const InterpFilterParams *filter_params_x,
   const InterpFilterParams *filter_params_y, const int subpel_x_qn,
   const int subpel_y_qn, ConvolveParams *conv_params, int bd) {
  assert(subpel_x_qn == 8);
  assert(subpel_y_qn == 8);
  assert(filter_params_x->taps == 2 && filter_params_y->taps == 2);
  assert((conv_params->round_0 + conv_params->round_1) == 2 * FILTER_BITS);
  assert(w <= MAX_SB_SIZE && h <= MAX_SB_SIZE);
  (void)filter_params_x;
  (void)subpel_x_qn;
  (void)filter_params_y;
  (void)subpel_y_qn;
  (void)conv_params;
  (void)bd;

  // Intermediate buffer holding the horizontal sums (one extra row for the
  // 2-tap vertical pass).
  DECLARE_ALIGNED(16, uint16_t,
                  im_block[(MAX_SB_SIZE + MAX_FILTER_TAP - 1) * MAX_SB_SIZE]);
  int im_h = h + 1;
  int im_stride = MAX_SB_SIZE;

  // Rounding term applied in the vertical pass via a halving add.
  uint16x8_t vert_offset = vdupq_n_u16(1);

  uint16_t *im = im_block;

  // Horizontal filter: store the raw sum of horizontally adjacent pixels.
  if (w <= 4) {
    do {
      uint16x4_t s0 = vld1_u16(src);
      uint16x4_t s1 = vld1_u16(src + 1);

      uint16x4_t d0 = vadd_u16(s0, s1);

      // Safe to store the whole vector, the im buffer is big enough.
      vst1_u16(im, d0);

      src += src_stride;
      im += im_stride;
    } while (--im_h != 0);
  } else {
    do {
      const uint16_t *src_ptr = src;
      uint16_t *im_ptr = im;
      int width = w;

      do {
        uint16x8_t s0 = vld1q_u16(src_ptr);
        uint16x8_t s1 = vld1q_u16(src_ptr + 1);

        uint16x8_t d0 = vaddq_u16(s0, s1);

        vst1q_u16(im_ptr, d0);

        src_ptr += 8;
        im_ptr += 8;
        width -= 8;
      } while (width != 0);
      src += src_stride;
      im += im_stride;
    } while (--im_h != 0);
  }

  im = im_block;

  // Vertical filter: combine vertically adjacent sums with two halving adds.
  // vhadd(a, b) = (a + b) >> 1, so vhadd(vhadd(s0, s1), 1) == (s0+s1+2) >> 2
  // without overflowing 16 bits.
  if (w <= 4) {
    do {
      uint16x4_t s0 = vld1_u16(im);
      uint16x4_t s1 = vld1_u16(im + im_stride);

      uint16x4_t d0 = vhadd_u16(s0, s1);
      d0 = vhadd_u16(d0, vget_low_u16(vert_offset));

      if (w == 2) {
        store_u16_2x1(dst, d0);
      } else {
        vst1_u16(dst, d0);
      }

      im += im_stride;
      dst += dst_stride;
    } while (--h != 0);
  } else {
    do {
      uint16_t *im_ptr = im;
      uint16_t *dst_ptr = dst;
      int height = h;

      do {
        uint16x8_t s0 = vld1q_u16(im_ptr);
        uint16x8_t s1 = vld1q_u16(im_ptr + im_stride);

        uint16x8_t d0 = vhaddq_u16(s0, s1);
        d0 = vhaddq_u16(d0, vert_offset);

        vst1q_u16(dst_ptr, d0);

        im_ptr += im_stride;
        dst_ptr += dst_stride;
      } while (--height != 0);
      im += 8;
      dst += 8;
      w -= 8;
    } while (w != 0);
  }
}