tor-browser

The Tor Browser
git clone https://git.dasho.dev/tor-browser.git
Log | Files | Refs | README | LICENSE

highbd_convolve_scale_neon.c (21022B)


      1 /*
      2 * Copyright (c) 2023, Alliance for Open Media. All rights reserved.
      3 *
      4 * This source code is subject to the terms of the BSD 2 Clause License and
      5 * the Alliance for Open Media Patent License 1.0. If the BSD 2 Clause License
      6 * was not distributed with this source code in the LICENSE file, you can
      7 * obtain it at www.aomedia.org/license/software. If the Alliance for Open
      8 * Media Patent License 1.0 was not distributed with this source code in the
      9 * PATENTS file, you can obtain it at www.aomedia.org/license/patent.
     10 */
     11 
     12 #include <assert.h>
     13 #include <arm_neon.h>
     14 
     15 #include "config/aom_config.h"
     16 #include "config/av1_rtcd.h"
     17 
     18 #include "aom_dsp/aom_dsp_common.h"
     19 #include "aom_dsp/arm/mem_neon.h"
     20 #include "aom_dsp/arm/transpose_neon.h"
     21 #include "aom_ports/mem.h"
     22 #include "av1/common/convolve.h"
     23 #include "av1/common/filter.h"
     24 #include "av1/common/arm/highbd_convolve_neon.h"
     25 
     26 static inline void highbd_dist_wtd_comp_avg_neon(
     27    const uint16_t *src_ptr, int src_stride, uint16_t *dst_ptr, int dst_stride,
     28    int w, int h, ConvolveParams *conv_params, const int round_bits,
     29    const int offset, const int bd) {
     30  CONV_BUF_TYPE *ref_ptr = conv_params->dst;
     31  const int ref_stride = conv_params->dst_stride;
     32  const int32x4_t round_shift = vdupq_n_s32(-round_bits);
     33  const uint32x4_t offset_vec = vdupq_n_u32(offset);
     34  const uint16x8_t max = vdupq_n_u16((1 << bd) - 1);
     35  uint16x4_t fwd_offset = vdup_n_u16(conv_params->fwd_offset);
     36  uint16x4_t bck_offset = vdup_n_u16(conv_params->bck_offset);
     37 
     38  // Weighted averaging
     39  if (w <= 4) {
     40    do {
     41      const uint16x4_t src = vld1_u16(src_ptr);
     42      const uint16x4_t ref = vld1_u16(ref_ptr);
     43 
     44      uint32x4_t wtd_avg = vmull_u16(ref, fwd_offset);
     45      wtd_avg = vmlal_u16(wtd_avg, src, bck_offset);
     46      wtd_avg = vshrq_n_u32(wtd_avg, DIST_PRECISION_BITS);
     47      int32x4_t d0 = vreinterpretq_s32_u32(vsubq_u32(wtd_avg, offset_vec));
     48      d0 = vqrshlq_s32(d0, round_shift);
     49 
     50      uint16x4_t d0_u16 = vqmovun_s32(d0);
     51      d0_u16 = vmin_u16(d0_u16, vget_low_u16(max));
     52 
     53      if (w == 2) {
     54        store_u16_2x1(dst_ptr, d0_u16);
     55      } else {
     56        vst1_u16(dst_ptr, d0_u16);
     57      }
     58 
     59      src_ptr += src_stride;
     60      dst_ptr += dst_stride;
     61      ref_ptr += ref_stride;
     62    } while (--h != 0);
     63  } else {
     64    do {
     65      int width = w;
     66      const uint16_t *src = src_ptr;
     67      const uint16_t *ref = ref_ptr;
     68      uint16_t *dst = dst_ptr;
     69      do {
     70        const uint16x8_t s = vld1q_u16(src);
     71        const uint16x8_t r = vld1q_u16(ref);
     72 
     73        uint32x4_t wtd_avg0 = vmull_u16(vget_low_u16(r), fwd_offset);
     74        wtd_avg0 = vmlal_u16(wtd_avg0, vget_low_u16(s), bck_offset);
     75        wtd_avg0 = vshrq_n_u32(wtd_avg0, DIST_PRECISION_BITS);
     76        int32x4_t d0 = vreinterpretq_s32_u32(vsubq_u32(wtd_avg0, offset_vec));
     77        d0 = vqrshlq_s32(d0, round_shift);
     78 
     79        uint32x4_t wtd_avg1 = vmull_u16(vget_high_u16(r), fwd_offset);
     80        wtd_avg1 = vmlal_u16(wtd_avg1, vget_high_u16(s), bck_offset);
     81        wtd_avg1 = vshrq_n_u32(wtd_avg1, DIST_PRECISION_BITS);
     82        int32x4_t d1 = vreinterpretq_s32_u32(vsubq_u32(wtd_avg1, offset_vec));
     83        d1 = vqrshlq_s32(d1, round_shift);
     84 
     85        uint16x8_t d01 = vcombine_u16(vqmovun_s32(d0), vqmovun_s32(d1));
     86        d01 = vminq_u16(d01, max);
     87        vst1q_u16(dst, d01);
     88 
     89        src += 8;
     90        ref += 8;
     91        dst += 8;
     92        width -= 8;
     93      } while (width != 0);
     94      src_ptr += src_stride;
     95      dst_ptr += dst_stride;
     96      ref_ptr += ref_stride;
     97    } while (--h != 0);
     98  }
     99 }
    100 
    101 static inline void highbd_comp_avg_neon(const uint16_t *src_ptr, int src_stride,
    102                                        uint16_t *dst_ptr, int dst_stride,
    103                                        int w, int h,
    104                                        ConvolveParams *conv_params,
    105                                        const int round_bits, const int offset,
    106                                        const int bd) {
    107  CONV_BUF_TYPE *ref_ptr = conv_params->dst;
    108  const int ref_stride = conv_params->dst_stride;
    109  const int32x4_t round_shift = vdupq_n_s32(-round_bits);
    110  const uint16x4_t offset_vec = vdup_n_u16(offset);
    111  const uint16x8_t max = vdupq_n_u16((1 << bd) - 1);
    112 
    113  if (w <= 4) {
    114    do {
    115      const uint16x4_t src = vld1_u16(src_ptr);
    116      const uint16x4_t ref = vld1_u16(ref_ptr);
    117 
    118      uint16x4_t avg = vhadd_u16(src, ref);
    119      int32x4_t d0 = vreinterpretq_s32_u32(vsubl_u16(avg, offset_vec));
    120      d0 = vqrshlq_s32(d0, round_shift);
    121 
    122      uint16x4_t d0_u16 = vqmovun_s32(d0);
    123      d0_u16 = vmin_u16(d0_u16, vget_low_u16(max));
    124 
    125      if (w == 2) {
    126        store_u16_2x1(dst_ptr, d0_u16);
    127      } else {
    128        vst1_u16(dst_ptr, d0_u16);
    129      }
    130 
    131      src_ptr += src_stride;
    132      ref_ptr += ref_stride;
    133      dst_ptr += dst_stride;
    134    } while (--h != 0);
    135  } else {
    136    do {
    137      int width = w;
    138      const uint16_t *src = src_ptr;
    139      const uint16_t *ref = ref_ptr;
    140      uint16_t *dst = dst_ptr;
    141      do {
    142        const uint16x8_t s = vld1q_u16(src);
    143        const uint16x8_t r = vld1q_u16(ref);
    144 
    145        uint16x8_t avg = vhaddq_u16(s, r);
    146        int32x4_t d0_lo =
    147            vreinterpretq_s32_u32(vsubl_u16(vget_low_u16(avg), offset_vec));
    148        int32x4_t d0_hi =
    149            vreinterpretq_s32_u32(vsubl_u16(vget_high_u16(avg), offset_vec));
    150        d0_lo = vqrshlq_s32(d0_lo, round_shift);
    151        d0_hi = vqrshlq_s32(d0_hi, round_shift);
    152 
    153        uint16x8_t d0 = vcombine_u16(vqmovun_s32(d0_lo), vqmovun_s32(d0_hi));
    154        d0 = vminq_u16(d0, max);
    155        vst1q_u16(dst, d0);
    156 
    157        src += 8;
    158        ref += 8;
    159        dst += 8;
    160        width -= 8;
    161      } while (width != 0);
    162 
    163      src_ptr += src_stride;
    164      ref_ptr += ref_stride;
    165      dst_ptr += dst_stride;
    166    } while (--h != 0);
    167  }
    168 }
    169 
    170 static inline void highbd_convolve_2d_x_scale_8tap_neon(
    171    const uint16_t *src_ptr, int src_stride, uint16_t *dst_ptr, int dst_stride,
    172    int w, int h, const int subpel_x_qn, const int x_step_qn,
    173    const InterpFilterParams *filter_params, ConvolveParams *conv_params,
    174    const int offset) {
    175  static const uint32_t kIdx[4] = { 0, 1, 2, 3 };
    176  const uint32x4_t idx = vld1q_u32(kIdx);
    177  const uint32x4_t subpel_mask = vdupq_n_u32(SCALE_SUBPEL_MASK);
    178  const int32x4_t shift_s32 = vdupq_n_s32(-conv_params->round_0);
    179  const int32x4_t offset_s32 = vdupq_n_s32(offset);
    180 
    181  if (w <= 4) {
    182    int height = h;
    183    uint16_t *d = dst_ptr;
    184 
    185    do {
    186      int x_qn = subpel_x_qn;
    187 
    188      // Load 4 src vectors at a time, they might be the same, but we have to
    189      // calculate the indices anyway. Doing it in SIMD and then storing the
    190      // indices is faster than having to calculate the expression
    191      // &src_ptr[((x_qn + 0*x_step_qn) >> SCALE_SUBPEL_BITS)] 4 times
    192      // Ideally this should be a gather using the indices, but NEON does not
    193      // have that, so have to emulate
    194      const uint32x4_t xqn_idx = vmlaq_n_u32(vdupq_n_u32(x_qn), idx, x_step_qn);
    195      // We have to multiply x2 to get the actual pointer as sizeof(uint16_t) =
    196      // 2
    197      const uint32x4_t src_idx_u32 =
    198          vshlq_n_u32(vshrq_n_u32(xqn_idx, SCALE_SUBPEL_BITS), 1);
    199 #if AOM_ARCH_AARCH64
    200      uint64x2_t src4[2];
    201      src4[0] = vaddw_u32(vdupq_n_u64((const uint64_t)src_ptr),
    202                          vget_low_u32(src_idx_u32));
    203      src4[1] = vaddw_u32(vdupq_n_u64((const uint64_t)src_ptr),
    204                          vget_high_u32(src_idx_u32));
    205      int16_t *src4_ptr[4];
    206      uint64_t *tmp_ptr = (uint64_t *)&src4_ptr;
    207      vst1q_u64(tmp_ptr, src4[0]);
    208      vst1q_u64(tmp_ptr + 2, src4[1]);
    209 #else
    210      uint32x4_t src4;
    211      src4 = vaddq_u32(vdupq_n_u32((const uint32_t)src_ptr), src_idx_u32);
    212      int16_t *src4_ptr[4];
    213      uint32_t *tmp_ptr = (uint32_t *)&src4_ptr;
    214      vst1q_u32(tmp_ptr, src4);
    215 #endif  // AOM_ARCH_AARCH64
    216      // Same for the filter vectors
    217      const int32x4_t filter_idx_s32 = vreinterpretq_s32_u32(
    218          vshrq_n_u32(vandq_u32(xqn_idx, subpel_mask), SCALE_EXTRA_BITS));
    219      int32_t x_filter4_idx[4];
    220      vst1q_s32(x_filter4_idx, filter_idx_s32);
    221      const int16_t *x_filter4_ptr[4];
    222 
    223      // Load source
    224      int16x8_t s0 = vld1q_s16(src4_ptr[0]);
    225      int16x8_t s1 = vld1q_s16(src4_ptr[1]);
    226      int16x8_t s2 = vld1q_s16(src4_ptr[2]);
    227      int16x8_t s3 = vld1q_s16(src4_ptr[3]);
    228 
    229      // We could easily do this using SIMD as well instead of calling the
    230      // inline function 4 times.
    231      x_filter4_ptr[0] =
    232          av1_get_interp_filter_subpel_kernel(filter_params, x_filter4_idx[0]);
    233      x_filter4_ptr[1] =
    234          av1_get_interp_filter_subpel_kernel(filter_params, x_filter4_idx[1]);
    235      x_filter4_ptr[2] =
    236          av1_get_interp_filter_subpel_kernel(filter_params, x_filter4_idx[2]);
    237      x_filter4_ptr[3] =
    238          av1_get_interp_filter_subpel_kernel(filter_params, x_filter4_idx[3]);
    239 
    240      // Actually load the filters
    241      const int16x8_t x_filter0 = vld1q_s16(x_filter4_ptr[0]);
    242      const int16x8_t x_filter1 = vld1q_s16(x_filter4_ptr[1]);
    243      const int16x8_t x_filter2 = vld1q_s16(x_filter4_ptr[2]);
    244      const int16x8_t x_filter3 = vld1q_s16(x_filter4_ptr[3]);
    245 
    246      // Group low and high parts and transpose
    247      int16x4_t filters_lo[] = { vget_low_s16(x_filter0),
    248                                 vget_low_s16(x_filter1),
    249                                 vget_low_s16(x_filter2),
    250                                 vget_low_s16(x_filter3) };
    251      int16x4_t filters_hi[] = { vget_high_s16(x_filter0),
    252                                 vget_high_s16(x_filter1),
    253                                 vget_high_s16(x_filter2),
    254                                 vget_high_s16(x_filter3) };
    255      transpose_array_inplace_u16_4x4((uint16x4_t *)filters_lo);
    256      transpose_array_inplace_u16_4x4((uint16x4_t *)filters_hi);
    257 
    258      // Run the 2D Scale convolution
    259      uint16x4_t d0 = highbd_convolve8_2d_scale_horiz4x8_s32_s16(
    260          s0, s1, s2, s3, filters_lo, filters_hi, shift_s32, offset_s32);
    261 
    262      if (w == 2) {
    263        store_u16_2x1(d, d0);
    264      } else {
    265        vst1_u16(d, d0);
    266      }
    267 
    268      src_ptr += src_stride;
    269      d += dst_stride;
    270      height--;
    271    } while (height > 0);
    272  } else {
    273    int height = h;
    274 
    275    do {
    276      int width = w;
    277      int x_qn = subpel_x_qn;
    278      uint16_t *d = dst_ptr;
    279      const uint16_t *s = src_ptr;
    280 
    281      do {
    282        // Load 4 src vectors at a time, they might be the same, but we have to
    283        // calculate the indices anyway. Doing it in SIMD and then storing the
    284        // indices is faster than having to calculate the expression
    285        // &src_ptr[((x_qn + 0*x_step_qn) >> SCALE_SUBPEL_BITS)] 4 times
    286        // Ideally this should be a gather using the indices, but NEON does not
    287        // have that, so have to emulate
    288        const uint32x4_t xqn_idx =
    289            vmlaq_n_u32(vdupq_n_u32(x_qn), idx, x_step_qn);
    290        // We have to multiply x2 to get the actual pointer as sizeof(uint16_t)
    291        // = 2
    292        const uint32x4_t src_idx_u32 =
    293            vshlq_n_u32(vshrq_n_u32(xqn_idx, SCALE_SUBPEL_BITS), 1);
    294 #if AOM_ARCH_AARCH64
    295        uint64x2_t src4[2];
    296        src4[0] = vaddw_u32(vdupq_n_u64((const uint64_t)s),
    297                            vget_low_u32(src_idx_u32));
    298        src4[1] = vaddw_u32(vdupq_n_u64((const uint64_t)s),
    299                            vget_high_u32(src_idx_u32));
    300        int16_t *src4_ptr[4];
    301        uint64_t *tmp_ptr = (uint64_t *)&src4_ptr;
    302        vst1q_u64(tmp_ptr, src4[0]);
    303        vst1q_u64(tmp_ptr + 2, src4[1]);
    304 #else
    305        uint32x4_t src4;
    306        src4 = vaddq_u32(vdupq_n_u32((const uint32_t)s), src_idx_u32);
    307        int16_t *src4_ptr[4];
    308        uint32_t *tmp_ptr = (uint32_t *)&src4_ptr;
    309        vst1q_u32(tmp_ptr, src4);
    310 #endif  // AOM_ARCH_AARCH64
    311        // Same for the filter vectors
    312        const int32x4_t filter_idx_s32 = vreinterpretq_s32_u32(
    313            vshrq_n_u32(vandq_u32(xqn_idx, subpel_mask), SCALE_EXTRA_BITS));
    314        int32_t x_filter4_idx[4];
    315        vst1q_s32(x_filter4_idx, filter_idx_s32);
    316        const int16_t *x_filter4_ptr[4];
    317 
    318        // Load source
    319        int16x8_t s0 = vld1q_s16(src4_ptr[0]);
    320        int16x8_t s1 = vld1q_s16(src4_ptr[1]);
    321        int16x8_t s2 = vld1q_s16(src4_ptr[2]);
    322        int16x8_t s3 = vld1q_s16(src4_ptr[3]);
    323 
    324        // We could easily do this using SIMD as well instead of calling the
    325        // inline function 4 times.
    326        x_filter4_ptr[0] = av1_get_interp_filter_subpel_kernel(
    327            filter_params, x_filter4_idx[0]);
    328        x_filter4_ptr[1] = av1_get_interp_filter_subpel_kernel(
    329            filter_params, x_filter4_idx[1]);
    330        x_filter4_ptr[2] = av1_get_interp_filter_subpel_kernel(
    331            filter_params, x_filter4_idx[2]);
    332        x_filter4_ptr[3] = av1_get_interp_filter_subpel_kernel(
    333            filter_params, x_filter4_idx[3]);
    334 
    335        // Actually load the filters
    336        const int16x8_t x_filter0 = vld1q_s16(x_filter4_ptr[0]);
    337        const int16x8_t x_filter1 = vld1q_s16(x_filter4_ptr[1]);
    338        const int16x8_t x_filter2 = vld1q_s16(x_filter4_ptr[2]);
    339        const int16x8_t x_filter3 = vld1q_s16(x_filter4_ptr[3]);
    340 
    341        // Group low and high parts and transpose
    342        int16x4_t filters_lo[] = { vget_low_s16(x_filter0),
    343                                   vget_low_s16(x_filter1),
    344                                   vget_low_s16(x_filter2),
    345                                   vget_low_s16(x_filter3) };
    346        int16x4_t filters_hi[] = { vget_high_s16(x_filter0),
    347                                   vget_high_s16(x_filter1),
    348                                   vget_high_s16(x_filter2),
    349                                   vget_high_s16(x_filter3) };
    350        transpose_array_inplace_u16_4x4((uint16x4_t *)filters_lo);
    351        transpose_array_inplace_u16_4x4((uint16x4_t *)filters_hi);
    352 
    353        // Run the 2D Scale X convolution
    354        uint16x4_t d0 = highbd_convolve8_2d_scale_horiz4x8_s32_s16(
    355            s0, s1, s2, s3, filters_lo, filters_hi, shift_s32, offset_s32);
    356 
    357        vst1_u16(d, d0);
    358 
    359        x_qn += 4 * x_step_qn;
    360        d += 4;
    361        width -= 4;
    362      } while (width > 0);
    363 
    364      src_ptr += src_stride;
    365      dst_ptr += dst_stride;
    366      height--;
    367    } while (height > 0);
    368  }
    369 }
    370 
    371 static inline void highbd_convolve_2d_y_scale_8tap_neon(
    372    const uint16_t *src_ptr, int src_stride, uint16_t *dst_ptr, int dst_stride,
    373    int w, int h, const int subpel_y_qn, const int y_step_qn,
    374    const InterpFilterParams *filter_params, const int round1_bits,
    375    const int offset) {
    376  const int32x4_t offset_s32 = vdupq_n_s32(1 << offset);
    377 
    378  const int32x4_t round1_shift_s32 = vdupq_n_s32(-round1_bits);
    379  if (w <= 4) {
    380    int height = h;
    381    uint16_t *d = dst_ptr;
    382    int y_qn = subpel_y_qn;
    383 
    384    do {
    385      const int16_t *s =
    386          (const int16_t *)&src_ptr[(y_qn >> SCALE_SUBPEL_BITS) * src_stride];
    387 
    388      int16x4_t s0, s1, s2, s3, s4, s5, s6, s7;
    389      load_s16_4x8(s, src_stride, &s0, &s1, &s2, &s3, &s4, &s5, &s6, &s7);
    390 
    391      const int y_filter_idx = (y_qn & SCALE_SUBPEL_MASK) >> SCALE_EXTRA_BITS;
    392      const int16_t *y_filter_ptr =
    393          av1_get_interp_filter_subpel_kernel(filter_params, y_filter_idx);
    394      const int16x8_t y_filter = vld1q_s16(y_filter_ptr);
    395 
    396      uint16x4_t d0 = highbd_convolve8_4_srsub_s32_s16(
    397          s0, s1, s2, s3, s4, s5, s6, s7, y_filter, round1_shift_s32,
    398          offset_s32, vdupq_n_s32(0));
    399 
    400      if (w == 2) {
    401        store_u16_2x1(d, d0);
    402      } else {
    403        vst1_u16(d, d0);
    404      }
    405 
    406      y_qn += y_step_qn;
    407      d += dst_stride;
    408      height--;
    409    } while (height > 0);
    410  } else {
    411    int width = w;
    412 
    413    do {
    414      int height = h;
    415      int y_qn = subpel_y_qn;
    416 
    417      uint16_t *d = dst_ptr;
    418 
    419      do {
    420        const int16_t *s =
    421            (const int16_t *)&src_ptr[(y_qn >> SCALE_SUBPEL_BITS) * src_stride];
    422        int16x8_t s0, s1, s2, s3, s4, s5, s6, s7;
    423        load_s16_8x8(s, src_stride, &s0, &s1, &s2, &s3, &s4, &s5, &s6, &s7);
    424 
    425        const int y_filter_idx = (y_qn & SCALE_SUBPEL_MASK) >> SCALE_EXTRA_BITS;
    426        const int16_t *y_filter_ptr =
    427            av1_get_interp_filter_subpel_kernel(filter_params, y_filter_idx);
    428        const int16x8_t y_filter = vld1q_s16(y_filter_ptr);
    429 
    430        uint16x8_t d0 = highbd_convolve8_8_srsub_s32_s16(
    431            s0, s1, s2, s3, s4, s5, s6, s7, y_filter, round1_shift_s32,
    432            offset_s32, vdupq_n_s32(0));
    433        vst1q_u16(d, d0);
    434 
    435        y_qn += y_step_qn;
    436        d += dst_stride;
    437        height--;
    438      } while (height > 0);
    439      src_ptr += 8;
    440      dst_ptr += 8;
    441      width -= 8;
    442    } while (width > 0);
    443  }
    444 }
    445 
    446 static inline void highbd_convolve_correct_offset_neon(
    447    const uint16_t *src_ptr, int src_stride, uint16_t *dst_ptr, int dst_stride,
    448    int w, int h, const int round_bits, const int offset, const int bd) {
    449  const int32x4_t round_shift_s32 = vdupq_n_s32(-round_bits);
    450  const int16x4_t offset_s16 = vdup_n_s16(offset);
    451  const uint16x8_t max = vdupq_n_u16((1 << bd) - 1);
    452 
    453  if (w <= 4) {
    454    for (int y = 0; y < h; ++y) {
    455      const int16x4_t s = vld1_s16((const int16_t *)src_ptr + y * src_stride);
    456      const int32x4_t d0 =
    457          vqrshlq_s32(vsubl_s16(s, offset_s16), round_shift_s32);
    458      uint16x4_t d = vqmovun_s32(d0);
    459      d = vmin_u16(d, vget_low_u16(max));
    460      if (w == 2) {
    461        store_u16_2x1(dst_ptr + y * dst_stride, d);
    462      } else {
    463        vst1_u16(dst_ptr + y * dst_stride, d);
    464      }
    465    }
    466  } else {
    467    for (int y = 0; y < h; ++y) {
    468      for (int x = 0; x < w; x += 8) {
    469        // Subtract round offset and convolve round
    470        const int16x8_t s =
    471            vld1q_s16((const int16_t *)src_ptr + y * src_stride + x);
    472        const int32x4_t d0 = vqrshlq_s32(vsubl_s16(vget_low_s16(s), offset_s16),
    473                                         round_shift_s32);
    474        const int32x4_t d1 = vqrshlq_s32(
    475            vsubl_s16(vget_high_s16(s), offset_s16), round_shift_s32);
    476        uint16x8_t d01 = vcombine_u16(vqmovun_s32(d0), vqmovun_s32(d1));
    477        d01 = vminq_u16(d01, max);
    478        vst1q_u16(dst_ptr + y * dst_stride + x, d01);
    479      }
    480    }
    481  }
    482 }
    483 
    484 void av1_highbd_convolve_2d_scale_neon(
    485    const uint16_t *src, int src_stride, uint16_t *dst, int dst_stride, int w,
    486    int h, const InterpFilterParams *filter_params_x,
    487    const InterpFilterParams *filter_params_y, const int subpel_x_qn,
    488    const int x_step_qn, const int subpel_y_qn, const int y_step_qn,
    489    ConvolveParams *conv_params, int bd) {
    490  uint16_t *im_block = (uint16_t *)aom_memalign(
    491      16, 2 * sizeof(uint16_t) * MAX_SB_SIZE * (MAX_SB_SIZE + MAX_FILTER_TAP));
    492  if (!im_block) return;
    493  uint16_t *im_block2 = (uint16_t *)aom_memalign(
    494      16, 2 * sizeof(uint16_t) * MAX_SB_SIZE * (MAX_SB_SIZE + MAX_FILTER_TAP));
    495  if (!im_block2) {
    496    aom_free(im_block);  // free the first block and return.
    497    return;
    498  }
    499 
    500  int im_h = (((h - 1) * y_step_qn + subpel_y_qn) >> SCALE_SUBPEL_BITS) +
    501             filter_params_y->taps;
    502  const int im_stride = MAX_SB_SIZE;
    503  const int bits =
    504      FILTER_BITS * 2 - conv_params->round_0 - conv_params->round_1;
    505  assert(bits >= 0);
    506 
    507  const int vert_offset = filter_params_y->taps / 2 - 1;
    508  const int horiz_offset = filter_params_x->taps / 2 - 1;
    509  const int x_offset_bits = (1 << (bd + FILTER_BITS - 1));
    510  const int y_offset_bits = bd + 2 * FILTER_BITS - conv_params->round_0;
    511  const int y_offset_correction =
    512      ((1 << (y_offset_bits - conv_params->round_1)) +
    513       (1 << (y_offset_bits - conv_params->round_1 - 1)));
    514 
    515  CONV_BUF_TYPE *dst16 = conv_params->dst;
    516  const int dst16_stride = conv_params->dst_stride;
    517 
    518  const uint16_t *src_ptr = src - vert_offset * src_stride - horiz_offset;
    519 
    520  highbd_convolve_2d_x_scale_8tap_neon(
    521      src_ptr, src_stride, im_block, im_stride, w, im_h, subpel_x_qn, x_step_qn,
    522      filter_params_x, conv_params, x_offset_bits);
    523  if (conv_params->is_compound && !conv_params->do_average) {
    524    highbd_convolve_2d_y_scale_8tap_neon(
    525        im_block, im_stride, dst16, dst16_stride, w, h, subpel_y_qn, y_step_qn,
    526        filter_params_y, conv_params->round_1, y_offset_bits);
    527  } else {
    528    highbd_convolve_2d_y_scale_8tap_neon(
    529        im_block, im_stride, im_block2, im_stride, w, h, subpel_y_qn, y_step_qn,
    530        filter_params_y, conv_params->round_1, y_offset_bits);
    531  }
    532 
    533  // Do the compound averaging outside the loop, avoids branching within the
    534  // main loop
    535  if (conv_params->is_compound) {
    536    if (conv_params->do_average) {
    537      if (conv_params->use_dist_wtd_comp_avg) {
    538        highbd_dist_wtd_comp_avg_neon(im_block2, im_stride, dst, dst_stride, w,
    539                                      h, conv_params, bits, y_offset_correction,
    540                                      bd);
    541      } else {
    542        highbd_comp_avg_neon(im_block2, im_stride, dst, dst_stride, w, h,
    543                             conv_params, bits, y_offset_correction, bd);
    544      }
    545    }
    546  } else {
    547    highbd_convolve_correct_offset_neon(im_block2, im_stride, dst, dst_stride,
    548                                        w, h, bits, y_offset_correction, bd);
    549  }
    550  aom_free(im_block);
    551  aom_free(im_block2);
    552 }