tor-browser

The Tor Browser
git clone https://git.dasho.dev/tor-browser.git

highbd_convolve_sve2.c (68969B)


      1 /*
      2 * Copyright (c) 2024, Alliance for Open Media. All rights reserved.
      3 *
      4 * This source code is subject to the terms of the BSD 2 Clause License and
      5 * the Alliance for Open Media Patent License 1.0. If the BSD 2 Clause License
      6 * was not distributed with this source code in the LICENSE file, you can
      7 * obtain it at www.aomedia.org/license/software. If the Alliance for Open
      8 * Media Patent License 1.0 was not distributed with this source code in the
      9 * PATENTS file, you can obtain it at www.aomedia.org/license/patent.
     10 */
     11 
     12 #include <assert.h>
     13 #include <arm_neon.h>
     14 
     15 #include "config/aom_config.h"
     16 #include "config/av1_rtcd.h"
     17 
     18 #include "aom_dsp/aom_dsp_common.h"
     19 #include "aom_dsp/arm/aom_neon_sve_bridge.h"
     20 #include "aom_dsp/arm/aom_neon_sve2_bridge.h"
     21 #include "aom_dsp/arm/mem_neon.h"
     22 #include "aom_dsp/arm/transpose_neon.h"
     23 #include "aom_ports/mem.h"
     24 #include "av1/common/convolve.h"
     25 #include "av1/common/filter.h"
     26 #include "av1/common/arm/highbd_convolve_sve2.h"
     27 
     28 DECLARE_ALIGNED(16, static const uint16_t, kDotProdTbl[32]) = {
     29  0, 1, 2, 3, 1, 2, 3, 4, 2, 3, 4, 5, 3, 4, 5, 6,
     30  4, 5, 6, 7, 5, 6, 7, 0, 6, 7, 0, 1, 7, 0, 1, 2,
     31 };
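        // Each group of four indices above selects the four consecutive samples that
        // feed one 64-bit lane of an SVE dot product, advancing by one output pixel
        // per group. Entries in the second half that would exceed 7 are stored
        // modulo 8 and fixed up at run time with the svcnth()-based corrections
        // below, so that they index into the second source vector of a two-vector
        // TBL regardless of the machine's SVE vector length.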
     32 
     33 static inline uint16x4_t convolve12_4_x(
     34    int16x8_t s0, int16x8_t s1, int16x8_t filter_0_7, int16x8_t filter_4_11,
     35    const int64x2_t offset, uint16x8x4_t permute_tbl, uint16x4_t max) {
     36  int16x8_t permuted_samples[6];
     37  permuted_samples[0] = aom_tbl_s16(s0, permute_tbl.val[0]);
     38  permuted_samples[1] = aom_tbl_s16(s0, permute_tbl.val[1]);
     39  permuted_samples[2] = aom_tbl2_s16(s0, s1, permute_tbl.val[2]);
     40  permuted_samples[3] = aom_tbl2_s16(s0, s1, permute_tbl.val[3]);
     41  permuted_samples[4] = aom_tbl_s16(s1, permute_tbl.val[0]);
     42  permuted_samples[5] = aom_tbl_s16(s1, permute_tbl.val[1]);
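          // With the corrected table, permuted_samples[2i] and [2i+1] hold the sample
          // windows for output pixels 0/1 and 2/3 respectively, advanced by four
          // samples per step, e.g. permuted_samples[0] = { x0..x3, x1..x4 } and
          // permuted_samples[4] = { x8..x11, x9..x12 }, so the three dot products
          // below accumulate taps 0-3, 4-7 and 8-11 of the 12-tap filter.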
     43 
     44  int64x2_t sum01 =
     45      aom_svdot_lane_s16(offset, permuted_samples[0], filter_0_7, 0);
     46  sum01 = aom_svdot_lane_s16(sum01, permuted_samples[2], filter_0_7, 1);
     47  sum01 = aom_svdot_lane_s16(sum01, permuted_samples[4], filter_4_11, 1);
     48 
     49  int64x2_t sum23 =
     50      aom_svdot_lane_s16(offset, permuted_samples[1], filter_0_7, 0);
     51  sum23 = aom_svdot_lane_s16(sum23, permuted_samples[3], filter_0_7, 1);
     52  sum23 = aom_svdot_lane_s16(sum23, permuted_samples[5], filter_4_11, 1);
     53 
     54  int32x4_t res0123 = vcombine_s32(vmovn_s64(sum01), vmovn_s64(sum23));
     55  uint16x4_t res = vqrshrun_n_s32(res0123, FILTER_BITS);
     56 
     57  return vmin_u16(res, max);
     58 }
     59 
     60 static inline uint16x8_t convolve12_8_x(int16x8_t s0, int16x8_t s1,
     61                                        int16x8_t s2, int16x8_t filter_0_7,
     62                                        int16x8_t filter_4_11, int64x2_t offset,
     63                                        uint16x8x4_t permute_tbl,
     64                                        uint16x8_t max) {
     65  int16x8_t permuted_samples[8];
     66  permuted_samples[0] = aom_tbl_s16(s0, permute_tbl.val[0]);
     67  permuted_samples[1] = aom_tbl_s16(s0, permute_tbl.val[1]);
     68  permuted_samples[2] = aom_tbl2_s16(s0, s1, permute_tbl.val[2]);
     69  permuted_samples[3] = aom_tbl2_s16(s0, s1, permute_tbl.val[3]);
     70  permuted_samples[4] = aom_tbl_s16(s1, permute_tbl.val[0]);
     71  permuted_samples[5] = aom_tbl_s16(s1, permute_tbl.val[1]);
     72  permuted_samples[6] = aom_tbl2_s16(s1, s2, permute_tbl.val[2]);
     73  permuted_samples[7] = aom_tbl2_s16(s1, s2, permute_tbl.val[3]);
     74 
     75  int64x2_t sum01 =
     76      aom_svdot_lane_s16(offset, permuted_samples[0], filter_0_7, 0);
     77  sum01 = aom_svdot_lane_s16(sum01, permuted_samples[2], filter_0_7, 1);
     78  sum01 = aom_svdot_lane_s16(sum01, permuted_samples[4], filter_4_11, 1);
     79 
     80  int64x2_t sum23 =
     81      aom_svdot_lane_s16(offset, permuted_samples[1], filter_0_7, 0);
     82  sum23 = aom_svdot_lane_s16(sum23, permuted_samples[3], filter_0_7, 1);
     83  sum23 = aom_svdot_lane_s16(sum23, permuted_samples[5], filter_4_11, 1);
     84 
     85  int64x2_t sum45 =
     86      aom_svdot_lane_s16(offset, permuted_samples[2], filter_0_7, 0);
     87  sum45 = aom_svdot_lane_s16(sum45, permuted_samples[4], filter_0_7, 1);
     88  sum45 = aom_svdot_lane_s16(sum45, permuted_samples[6], filter_4_11, 1);
     89 
     90  int64x2_t sum67 =
     91      aom_svdot_lane_s16(offset, permuted_samples[3], filter_0_7, 0);
     92  sum67 = aom_svdot_lane_s16(sum67, permuted_samples[5], filter_0_7, 1);
     93  sum67 = aom_svdot_lane_s16(sum67, permuted_samples[7], filter_4_11, 1);
     94 
     95  int32x4_t sum0123 = vcombine_s32(vmovn_s64(sum01), vmovn_s64(sum23));
     96  int32x4_t sum4567 = vcombine_s32(vmovn_s64(sum45), vmovn_s64(sum67));
     97 
     98  uint16x8_t res = vcombine_u16(vqrshrun_n_s32(sum0123, FILTER_BITS),
     99                                vqrshrun_n_s32(sum4567, FILTER_BITS));
    100 
    101  return vminq_u16(res, max);
    102 }
    103 
    104 static inline void highbd_convolve_x_sr_12tap_sve2(
    105    const uint16_t *src, int src_stride, uint16_t *dst, int dst_stride,
    106    int width, int height, const int16_t *y_filter_ptr,
    107    ConvolveParams *conv_params, int bd) {
     108  // This shim allows us to do only one rounding shift instead of two.
    109  const int64x2_t offset = vdupq_n_s64(1 << (conv_params->round_0 - 1));
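          // Rationale (sketch): the straightforward path rounds twice, first by
          // round_0 and then by (FILTER_BITS - round_0). Pre-adding 1 << (round_0 - 1)
          // makes a single rounding shift equivalent, since with b = FILTER_BITS - r0
          //   (((x + 2^(r0-1)) >> r0) + 2^(b-1)) >> b
          //     == (x + 2^(r0-1) + 2^(FILTER_BITS-1)) >> FILTER_BITS,
          // and vqrshrun_n_s32(., FILTER_BITS) in the helpers supplies the
          // 2^(FILTER_BITS-1) rounding term.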
    110 
    111  const int16x8_t y_filter_0_7 = vld1q_s16(y_filter_ptr);
    112  const int16x8_t y_filter_4_11 = vld1q_s16(y_filter_ptr + 4);
    113 
    114  uint16x8x4_t permute_tbl = vld1q_u16_x4(kDotProdTbl);
     115  // Scale indices by the true vector length to avoid reading from an
    116  // 'undefined' portion of a vector on a system with SVE vectors > 128-bit.
    117  uint16x8_t correction0 = vreinterpretq_u16_u64(vcombine_u64(
    118      vdup_n_u64(0), vdup_n_u64(svcnth() * 0x0001000000000000ULL)));
    119  permute_tbl.val[2] = vaddq_u16(permute_tbl.val[2], correction0);
    120 
    121  uint16x8_t correction1 = vreinterpretq_u16_u64(
    122      vcombine_u64(vdup_n_u64(svcnth() * 0x0001000100000000ULL),
    123                   vdup_n_u64(svcnth() * 0x0001000100010000ULL)));
    124  permute_tbl.val[3] = vaddq_u16(permute_tbl.val[3], correction1);
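          // For example, on a 128-bit SVE implementation svcnth() == 8, so the
          // corrected rows become val[2] = { 4, 5, 6, 7, 5, 6, 7, 8 } and
          // val[3] = { 6, 7, 8, 9, 7, 8, 9, 10 }, i.e. the wrapped entries now index
          // into the second operand of aom_tbl2_s16. On longer vectors the offset
          // grows with svcnth() so the same elements of the second operand are still
          // selected.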
    125 
    126  if (width == 4) {
    127    const uint16x4_t max = vdup_n_u16((1 << bd) - 1);
    128    const int16_t *s = (const int16_t *)src;
    129 
    130    do {
    131      int16x8_t s0, s1, s2, s3, s4, s5, s6, s7;
    132      load_s16_8x4(s, src_stride, &s0, &s2, &s4, &s6);
    133      load_s16_8x4(s + 8, src_stride, &s1, &s3, &s5, &s7);
    134 
    135      uint16x4_t d0 = convolve12_4_x(s0, s1, y_filter_0_7, y_filter_4_11,
    136                                     offset, permute_tbl, max);
    137      uint16x4_t d1 = convolve12_4_x(s2, s3, y_filter_0_7, y_filter_4_11,
    138                                     offset, permute_tbl, max);
    139      uint16x4_t d2 = convolve12_4_x(s4, s5, y_filter_0_7, y_filter_4_11,
    140                                     offset, permute_tbl, max);
    141      uint16x4_t d3 = convolve12_4_x(s6, s7, y_filter_0_7, y_filter_4_11,
    142                                     offset, permute_tbl, max);
    143 
    144      store_u16_4x4(dst, dst_stride, d0, d1, d2, d3);
    145 
    146      s += 4 * src_stride;
    147      dst += 4 * dst_stride;
    148      height -= 4;
    149    } while (height != 0);
    150  } else {
    151    const uint16x8_t max = vdupq_n_u16((1 << bd) - 1);
    152 
    153    do {
    154      const int16_t *s = (const int16_t *)src;
    155      uint16_t *d = dst;
    156      int w = width;
    157 
    158      do {
    159        int16x8_t s0, s1, s2, s3, s4, s5, s6, s7, s8, s9, s10, s11;
    160        load_s16_8x4(s, src_stride, &s0, &s3, &s6, &s9);
    161        load_s16_8x4(s + 8, src_stride, &s1, &s4, &s7, &s10);
    162        load_s16_8x4(s + 16, src_stride, &s2, &s5, &s8, &s11);
    163 
    164        uint16x8_t d0 = convolve12_8_x(s0, s1, s2, y_filter_0_7, y_filter_4_11,
    165                                       offset, permute_tbl, max);
    166        uint16x8_t d1 = convolve12_8_x(s3, s4, s5, y_filter_0_7, y_filter_4_11,
    167                                       offset, permute_tbl, max);
    168        uint16x8_t d2 = convolve12_8_x(s6, s7, s8, y_filter_0_7, y_filter_4_11,
    169                                       offset, permute_tbl, max);
    170        uint16x8_t d3 = convolve12_8_x(s9, s10, s11, y_filter_0_7,
    171                                       y_filter_4_11, offset, permute_tbl, max);
    172 
    173        store_u16_8x4(d, dst_stride, d0, d1, d2, d3);
    174 
    175        s += 8;
    176        d += 8;
    177        w -= 8;
    178      } while (w != 0);
    179      src += 4 * src_stride;
    180      dst += 4 * dst_stride;
    181      height -= 4;
    182    } while (height != 0);
    183  }
    184 }
    185 
    186 static inline uint16x8_t convolve8_8_x(int16x8_t s0[8], int16x8_t filter,
    187                                       int64x2_t offset, uint16x8_t max) {
    188  int64x2_t sum[8];
    189  sum[0] = aom_sdotq_s16(offset, s0[0], filter);
    190  sum[1] = aom_sdotq_s16(offset, s0[1], filter);
    191  sum[2] = aom_sdotq_s16(offset, s0[2], filter);
    192  sum[3] = aom_sdotq_s16(offset, s0[3], filter);
    193  sum[4] = aom_sdotq_s16(offset, s0[4], filter);
    194  sum[5] = aom_sdotq_s16(offset, s0[5], filter);
    195  sum[6] = aom_sdotq_s16(offset, s0[6], filter);
    196  sum[7] = aom_sdotq_s16(offset, s0[7], filter);
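          // aom_sdotq_s16 yields one partial sum per 64-bit lane: for sum[i], lane 0
          // holds offset + x[i..i+3] * filter[0..3] and lane 1 holds
          // x[i+4..i+7] * filter[4..7] (the caller places the offset only in lane 0
          // of the accumulator). The pairwise adds below then collapse each pair of
          // lanes into one complete 8-tap sum per output pixel, so the rounding
          // offset is added exactly once.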
    197 
    198  sum[0] = vpaddq_s64(sum[0], sum[1]);
    199  sum[2] = vpaddq_s64(sum[2], sum[3]);
    200  sum[4] = vpaddq_s64(sum[4], sum[5]);
    201  sum[6] = vpaddq_s64(sum[6], sum[7]);
    202 
    203  int32x4_t sum0123 = vcombine_s32(vmovn_s64(sum[0]), vmovn_s64(sum[2]));
    204  int32x4_t sum4567 = vcombine_s32(vmovn_s64(sum[4]), vmovn_s64(sum[6]));
    205 
    206  uint16x8_t res = vcombine_u16(vqrshrun_n_s32(sum0123, FILTER_BITS),
    207                                vqrshrun_n_s32(sum4567, FILTER_BITS));
    208 
    209  return vminq_u16(res, max);
    210 }
    211 
    212 static inline void highbd_convolve_x_sr_8tap_sve2(
    213    const uint16_t *src, int src_stride, uint16_t *dst, int dst_stride,
    214    int width, int height, const int16_t *y_filter_ptr,
    215    ConvolveParams *conv_params, int bd) {
    216  const uint16x8_t max = vdupq_n_u16((1 << bd) - 1);
     217  // This shim allows us to do only one rounding shift instead of two.
    218  const int64_t offset = 1 << (conv_params->round_0 - 1);
    219  const int64x2_t offset_lo = vcombine_s64((int64x1_t)(offset), vdup_n_s64(0));
    220 
    221  const int16x8_t filter = vld1q_s16(y_filter_ptr);
    222 
    223  do {
    224    const int16_t *s = (const int16_t *)src;
    225    uint16_t *d = dst;
    226    int w = width;
    227 
    228    do {
    229      int16x8_t s0[8], s1[8], s2[8], s3[8];
    230      load_s16_8x8(s + 0 * src_stride, 1, &s0[0], &s0[1], &s0[2], &s0[3],
    231                   &s0[4], &s0[5], &s0[6], &s0[7]);
    232      load_s16_8x8(s + 1 * src_stride, 1, &s1[0], &s1[1], &s1[2], &s1[3],
    233                   &s1[4], &s1[5], &s1[6], &s1[7]);
    234      load_s16_8x8(s + 2 * src_stride, 1, &s2[0], &s2[1], &s2[2], &s2[3],
    235                   &s2[4], &s2[5], &s2[6], &s2[7]);
    236      load_s16_8x8(s + 3 * src_stride, 1, &s3[0], &s3[1], &s3[2], &s3[3],
    237                   &s3[4], &s3[5], &s3[6], &s3[7]);
    238 
    239      uint16x8_t d0 = convolve8_8_x(s0, filter, offset_lo, max);
    240      uint16x8_t d1 = convolve8_8_x(s1, filter, offset_lo, max);
    241      uint16x8_t d2 = convolve8_8_x(s2, filter, offset_lo, max);
    242      uint16x8_t d3 = convolve8_8_x(s3, filter, offset_lo, max);
    243 
    244      store_u16_8x4(d, dst_stride, d0, d1, d2, d3);
    245 
    246      s += 8;
    247      d += 8;
    248      w -= 8;
    249    } while (w != 0);
    250    src += 4 * src_stride;
    251    dst += 4 * dst_stride;
    252    height -= 4;
    253  } while (height != 0);
    254 }
    255 
    256 // clang-format off
    257 DECLARE_ALIGNED(16, static const uint16_t, kDeinterleaveTbl[8]) = {
    258  0, 2, 4, 6, 1, 3, 5, 7,
    259 };
    260 // clang-format on
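        // In the 8-wide 4-tap kernels each dot product yields two outputs that are
        // four columns apart (e.g. { out0, out4 }), so the eight results emerge in
        // the order 0, 4, 1, 5, 2, 6, 3, 7; this table restores linear order via
        // aom_tbl_u16 before the results are stored.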
    261 
    262 static inline uint16x4_t convolve4_4_x(int16x8_t s0, int16x8_t filter,
    263                                       int64x2_t offset,
    264                                       uint16x8x2_t permute_tbl,
    265                                       uint16x4_t max) {
    266  int16x8_t permuted_samples0 = aom_tbl_s16(s0, permute_tbl.val[0]);
    267  int16x8_t permuted_samples1 = aom_tbl_s16(s0, permute_tbl.val[1]);
    268 
    269  int64x2_t sum01 = aom_svdot_lane_s16(offset, permuted_samples0, filter, 0);
    270  int64x2_t sum23 = aom_svdot_lane_s16(offset, permuted_samples1, filter, 0);
    271 
    272  int32x4_t sum0123 = vcombine_s32(vmovn_s64(sum01), vmovn_s64(sum23));
    273  uint16x4_t res = vqrshrun_n_s32(sum0123, FILTER_BITS);
    274 
    275  return vmin_u16(res, max);
    276 }
    277 
    278 static inline uint16x8_t convolve4_8_x(int16x8_t s0[4], int16x8_t filter,
    279                                       int64x2_t offset, uint16x8_t tbl,
    280                                       uint16x8_t max) {
    281  int64x2_t sum04 = aom_svdot_lane_s16(offset, s0[0], filter, 0);
    282  int64x2_t sum15 = aom_svdot_lane_s16(offset, s0[1], filter, 0);
    283  int64x2_t sum26 = aom_svdot_lane_s16(offset, s0[2], filter, 0);
    284  int64x2_t sum37 = aom_svdot_lane_s16(offset, s0[3], filter, 0);
    285 
    286  int32x4_t sum0415 = vcombine_s32(vmovn_s64(sum04), vmovn_s64(sum15));
    287  int32x4_t sum2637 = vcombine_s32(vmovn_s64(sum26), vmovn_s64(sum37));
    288 
    289  uint16x8_t res = vcombine_u16(vqrshrun_n_s32(sum0415, FILTER_BITS),
    290                                vqrshrun_n_s32(sum2637, FILTER_BITS));
    291  res = aom_tbl_u16(res, tbl);
    292 
    293  return vminq_u16(res, max);
    294 }
    295 
    296 static inline void highbd_convolve_x_sr_4tap_sve2(
    297    const uint16_t *src, int src_stride, uint16_t *dst, int dst_stride,
    298    int width, int height, const int16_t *x_filter_ptr,
    299    ConvolveParams *conv_params, int bd) {
     300  // This shim allows us to do only one rounding shift instead of two.
    301  const int64x2_t offset = vdupq_n_s64(1 << (conv_params->round_0 - 1));
    302 
    303  const int16x4_t x_filter = vld1_s16(x_filter_ptr + 2);
    304  const int16x8_t filter = vcombine_s16(x_filter, vdup_n_s16(0));
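          // Only the four real taps (x_filter_ptr[2..5]) are non-zero; they are kept
          // in the low half of the vector so that aom_svdot_lane_s16(..., 0) selects
          // them, with the upper half acting purely as zero padding.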
    305 
    306  if (width == 4) {
    307    const uint16x4_t max = vdup_n_u16((1 << bd) - 1);
    308    uint16x8x2_t permute_tbl = vld1q_u16_x2(kDotProdTbl);
    309 
    310    const int16_t *s = (const int16_t *)(src);
    311 
    312    do {
    313      int16x8_t s0, s1, s2, s3;
    314      load_s16_8x4(s, src_stride, &s0, &s1, &s2, &s3);
    315 
    316      uint16x4_t d0 = convolve4_4_x(s0, filter, offset, permute_tbl, max);
    317      uint16x4_t d1 = convolve4_4_x(s1, filter, offset, permute_tbl, max);
    318      uint16x4_t d2 = convolve4_4_x(s2, filter, offset, permute_tbl, max);
    319      uint16x4_t d3 = convolve4_4_x(s3, filter, offset, permute_tbl, max);
    320 
    321      store_u16_4x4(dst, dst_stride, d0, d1, d2, d3);
    322 
    323      s += 4 * src_stride;
    324      dst += 4 * dst_stride;
    325      height -= 4;
    326    } while (height != 0);
    327  } else {
    328    const uint16x8_t max = vdupq_n_u16((1 << bd) - 1);
    329    uint16x8_t idx = vld1q_u16(kDeinterleaveTbl);
    330 
    331    do {
    332      const int16_t *s = (const int16_t *)(src);
    333      uint16_t *d = dst;
    334      int w = width;
    335 
    336      do {
    337        int16x8_t s0[4], s1[4], s2[4], s3[4];
    338        load_s16_8x4(s + 0 * src_stride, 1, &s0[0], &s0[1], &s0[2], &s0[3]);
    339        load_s16_8x4(s + 1 * src_stride, 1, &s1[0], &s1[1], &s1[2], &s1[3]);
    340        load_s16_8x4(s + 2 * src_stride, 1, &s2[0], &s2[1], &s2[2], &s2[3]);
    341        load_s16_8x4(s + 3 * src_stride, 1, &s3[0], &s3[1], &s3[2], &s3[3]);
    342 
    343        uint16x8_t d0 = convolve4_8_x(s0, filter, offset, idx, max);
    344        uint16x8_t d1 = convolve4_8_x(s1, filter, offset, idx, max);
    345        uint16x8_t d2 = convolve4_8_x(s2, filter, offset, idx, max);
    346        uint16x8_t d3 = convolve4_8_x(s3, filter, offset, idx, max);
    347 
    348        store_u16_8x4(d, dst_stride, d0, d1, d2, d3);
    349 
    350        s += 8;
    351        d += 8;
    352        w -= 8;
    353      } while (w != 0);
    354      src += 4 * src_stride;
    355      dst += 4 * dst_stride;
    356      height -= 4;
    357    } while (height != 0);
    358  }
    359 }
    360 
    361 void av1_highbd_convolve_x_sr_sve2(const uint16_t *src, int src_stride,
    362                                   uint16_t *dst, int dst_stride, int w, int h,
    363                                   const InterpFilterParams *filter_params_x,
    364                                   const int subpel_x_qn,
    365                                   ConvolveParams *conv_params, int bd) {
    366  if (w == 2 || h == 2) {
    367    av1_highbd_convolve_x_sr_c(src, src_stride, dst, dst_stride, w, h,
    368                               filter_params_x, subpel_x_qn, conv_params, bd);
    369    return;
    370  }
    371 
    372  const int x_filter_taps = get_filter_tap(filter_params_x, subpel_x_qn);
    373 
    374  if (x_filter_taps == 6) {
    375    av1_highbd_convolve_x_sr_neon(src, src_stride, dst, dst_stride, w, h,
    376                                  filter_params_x, subpel_x_qn, conv_params,
    377                                  bd);
    378    return;
    379  }
    380 
    381  const int horiz_offset = filter_params_x->taps / 2 - 1;
    382  const int16_t *x_filter_ptr = av1_get_interp_filter_subpel_kernel(
    383      filter_params_x, subpel_x_qn & SUBPEL_MASK);
    384 
    385  src -= horiz_offset;
    386 
    387  if (x_filter_taps == 12) {
    388    highbd_convolve_x_sr_12tap_sve2(src, src_stride, dst, dst_stride, w, h,
    389                                    x_filter_ptr, conv_params, bd);
    390    return;
    391  }
    392 
    393  if (x_filter_taps == 8) {
    394    highbd_convolve_x_sr_8tap_sve2(src, src_stride, dst, dst_stride, w, h,
    395                                   x_filter_ptr, conv_params, bd);
    396    return;
    397  }
    398 
    399  highbd_convolve_x_sr_4tap_sve2(src + 2, src_stride, dst, dst_stride, w, h,
    400                                 x_filter_ptr, conv_params, bd);
    401 }
    402 
    403 static inline uint16x4_t highbd_convolve12_4_y(int16x8_t s0[2], int16x8_t s1[2],
    404                                               int16x8_t s2[2],
    405                                               int16x8_t filter_0_7,
    406                                               int16x8_t filter_4_11,
    407                                               uint16x4_t max) {
    408  int64x2_t sum[2];
    409 
    410  sum[0] = aom_svdot_lane_s16(vdupq_n_s64(0), s0[0], filter_0_7, 0);
    411  sum[0] = aom_svdot_lane_s16(sum[0], s1[0], filter_0_7, 1);
    412  sum[0] = aom_svdot_lane_s16(sum[0], s2[0], filter_4_11, 1);
    413 
    414  sum[1] = aom_svdot_lane_s16(vdupq_n_s64(0), s0[1], filter_0_7, 0);
    415  sum[1] = aom_svdot_lane_s16(sum[1], s1[1], filter_0_7, 1);
    416  sum[1] = aom_svdot_lane_s16(sum[1], s2[1], filter_4_11, 1);
    417 
    418  int32x4_t res_s32 = vcombine_s32(vmovn_s64(sum[0]), vmovn_s64(sum[1]));
    419 
    420  uint16x4_t res = vqrshrun_n_s32(res_s32, FILTER_BITS);
    421 
    422  return vmin_u16(res, max);
    423 }
    424 
    425 static inline void highbd_convolve_y_sr_12tap_sve2(
    426    const uint16_t *src, int src_stride, uint16_t *dst, int dst_stride,
    427    int width, int height, const int16_t *y_filter_ptr, int bd) {
    428  const int16x8_t y_filter_0_7 = vld1q_s16(y_filter_ptr);
    429  const int16x8_t y_filter_4_11 = vld1q_s16(y_filter_ptr + 4);
    430 
    431  uint16x8x3_t merge_block_tbl = vld1q_u16_x3(kDotProdMergeBlockTbl);
     432  // Scale indices by the true vector length to avoid reading from an
    433  // 'undefined' portion of a vector on a system with SVE vectors > 128-bit.
    434  uint16x8_t correction0 =
    435      vreinterpretq_u16_u64(vdupq_n_u64(svcnth() * 0x0001000000000000ULL));
    436  merge_block_tbl.val[0] = vaddq_u16(merge_block_tbl.val[0], correction0);
    437 
    438  uint16x8_t correction1 =
    439      vreinterpretq_u16_u64(vdupq_n_u64(svcnth() * 0x0001000100000000ULL));
    440  merge_block_tbl.val[1] = vaddq_u16(merge_block_tbl.val[1], correction1);
    441 
    442  uint16x8_t correction2 =
    443      vreinterpretq_u16_u64(vdupq_n_u64(svcnth() * 0x0001000100010000ULL));
    444  merge_block_tbl.val[2] = vaddq_u16(merge_block_tbl.val[2], correction2);
    445 
    446  const uint16x4_t max = vdup_n_u16((1 << bd) - 1);
    447 
    448  do {
    449    int16_t *s = (int16_t *)src;
    450    uint16_t *d = dst;
    451    int h = height;
    452 
    453    int16x4_t s0, s1, s2, s3, s4, s5, s6, s7, s8, s9, sA;
    454    load_s16_4x11(s, src_stride, &s0, &s1, &s2, &s3, &s4, &s5, &s6, &s7, &s8,
    455                  &s9, &sA);
    456    s += 11 * src_stride;
    457 
    458    int16x8_t s0123[2], s1234[2], s2345[2], s3456[2], s4567[2], s5678[2],
    459        s6789[2], s789A[2];
    460    transpose_concat_elems_s16_4x4(s0, s1, s2, s3, s0123);
    461    transpose_concat_elems_s16_4x4(s1, s2, s3, s4, s1234);
    462    transpose_concat_elems_s16_4x4(s2, s3, s4, s5, s2345);
    463    transpose_concat_elems_s16_4x4(s3, s4, s5, s6, s3456);
    464    transpose_concat_elems_s16_4x4(s4, s5, s6, s7, s4567);
    465    transpose_concat_elems_s16_4x4(s5, s6, s7, s8, s5678);
    466    transpose_concat_elems_s16_4x4(s6, s7, s8, s9, s6789);
    467    transpose_concat_elems_s16_4x4(s7, s8, s9, sA, s789A);
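          // Each int16x8_t produced above holds four consecutive rows of two columns,
          // e.g. s0123[0] = { rows 0-3 of column 0, rows 0-3 of column 1 } and
          // s0123[1] = { rows 0-3 of columns 2 and 3 }, which is the grouping the
          // per-lane dot products in highbd_convolve12_4_y expect for a vertical
          // filter.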
    468 
    469    do {
    470      int16x4_t sB, sC, sD, sE;
    471      load_s16_4x4(s, src_stride, &sB, &sC, &sD, &sE);
    472 
    473      int16x8_t s89AB[2], s9ABC[2], sABCD[2], sBCDE[2];
    474      transpose_concat_elems_s16_4x4(sB, sC, sD, sE, sBCDE);
    475 
    476      // Use the above transpose and reuse data from the previous loop to get
    477      // the rest.
    478      aom_tbl2x2_s16(s789A, sBCDE, merge_block_tbl.val[0], s89AB);
    479      aom_tbl2x2_s16(s789A, sBCDE, merge_block_tbl.val[1], s9ABC);
    480      aom_tbl2x2_s16(s789A, sBCDE, merge_block_tbl.val[2], sABCD);
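          // The corrections applied to merge_block_tbl above push one, two or three
          // indices per four-element group into the second TBL operand, so these
          // merges rebuild s89AB, s9ABC and sABCD from the trailing rows already in
          // s789A plus the leading rows of sBCDE instead of redoing three transposes.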
    481 
    482      uint16x4_t d0 = highbd_convolve12_4_y(s0123, s4567, s89AB, y_filter_0_7,
    483                                            y_filter_4_11, max);
    484      uint16x4_t d1 = highbd_convolve12_4_y(s1234, s5678, s9ABC, y_filter_0_7,
    485                                            y_filter_4_11, max);
    486      uint16x4_t d2 = highbd_convolve12_4_y(s2345, s6789, sABCD, y_filter_0_7,
    487                                            y_filter_4_11, max);
    488      uint16x4_t d3 = highbd_convolve12_4_y(s3456, s789A, sBCDE, y_filter_0_7,
    489                                            y_filter_4_11, max);
    490 
    491      store_u16_4x4(d, dst_stride, d0, d1, d2, d3);
    492 
    493      // Prepare block for next iteration - re-using as much as possible.
    494      // Shuffle everything up four rows.
    495      s0123[0] = s4567[0];
    496      s0123[1] = s4567[1];
    497      s1234[0] = s5678[0];
    498      s1234[1] = s5678[1];
    499      s2345[0] = s6789[0];
    500      s2345[1] = s6789[1];
    501      s3456[0] = s789A[0];
    502      s3456[1] = s789A[1];
    503      s4567[0] = s89AB[0];
    504      s4567[1] = s89AB[1];
    505      s5678[0] = s9ABC[0];
    506      s5678[1] = s9ABC[1];
    507      s6789[0] = sABCD[0];
    508      s6789[1] = sABCD[1];
    509      s789A[0] = sBCDE[0];
    510      s789A[1] = sBCDE[1];
    511 
    512      s += 4 * src_stride;
    513      d += 4 * dst_stride;
    514      h -= 4;
    515    } while (h != 0);
    516    src += 4;
    517    dst += 4;
    518    width -= 4;
    519  } while (width != 0);
    520 }
    521 
    522 static inline uint16x4_t highbd_convolve8_4_y(int16x8_t samples_lo[2],
    523                                              int16x8_t samples_hi[2],
    524                                              int16x8_t filter,
    525                                              uint16x4_t max) {
    526  int64x2_t sum01 =
    527      aom_svdot_lane_s16(vdupq_n_s64(0), samples_lo[0], filter, 0);
    528  sum01 = aom_svdot_lane_s16(sum01, samples_hi[0], filter, 1);
    529 
    530  int64x2_t sum23 =
    531      aom_svdot_lane_s16(vdupq_n_s64(0), samples_lo[1], filter, 0);
    532  sum23 = aom_svdot_lane_s16(sum23, samples_hi[1], filter, 1);
    533 
    534  int32x4_t sum0123 = vcombine_s32(vmovn_s64(sum01), vmovn_s64(sum23));
    535  uint16x4_t res = vqrshrun_n_s32(sum0123, FILTER_BITS);
    536  return vmin_u16(res, max);
    537 }
    538 
    539 static inline uint16x8_t highbd_convolve8_8_y(int16x8_t samples_lo[4],
    540                                              int16x8_t samples_hi[4],
    541                                              int16x8_t filter,
    542                                              uint16x8_t max) {
    543  int64x2_t sum01 =
    544      aom_svdot_lane_s16(vdupq_n_s64(0), samples_lo[0], filter, 0);
    545  sum01 = aom_svdot_lane_s16(sum01, samples_hi[0], filter, 1);
    546 
    547  int64x2_t sum23 =
    548      aom_svdot_lane_s16(vdupq_n_s64(0), samples_lo[1], filter, 0);
    549  sum23 = aom_svdot_lane_s16(sum23, samples_hi[1], filter, 1);
    550 
    551  int64x2_t sum45 =
    552      aom_svdot_lane_s16(vdupq_n_s64(0), samples_lo[2], filter, 0);
    553  sum45 = aom_svdot_lane_s16(sum45, samples_hi[2], filter, 1);
    554 
    555  int64x2_t sum67 =
    556      aom_svdot_lane_s16(vdupq_n_s64(0), samples_lo[3], filter, 0);
    557  sum67 = aom_svdot_lane_s16(sum67, samples_hi[3], filter, 1);
    558 
    559  int32x4_t sum0123 = vcombine_s32(vmovn_s64(sum01), vmovn_s64(sum23));
    560  int32x4_t sum4567 = vcombine_s32(vmovn_s64(sum45), vmovn_s64(sum67));
    561  uint16x8_t res = vcombine_u16(vqrshrun_n_s32(sum0123, FILTER_BITS),
    562                                vqrshrun_n_s32(sum4567, FILTER_BITS));
    563  return vminq_u16(res, max);
    564 }
    565 
    566 static void highbd_convolve_y_sr_8tap_sve2(const uint16_t *src,
    567                                           ptrdiff_t src_stride, uint16_t *dst,
    568                                           ptrdiff_t dst_stride, int width,
    569                                           int height, const int16_t *filter_y,
    570                                           int bd) {
    571  assert(width >= 4 && height >= 4);
    572 
    573  const int16x8_t y_filter = vld1q_s16(filter_y);
    574 
    575  uint16x8x3_t merge_block_tbl = vld1q_u16_x3(kDotProdMergeBlockTbl);
     576  // Scale indices by the true vector length to avoid reading from an
    577  // 'undefined' portion of a vector on a system with SVE vectors > 128-bit.
    578  uint16x8_t correction0 =
    579      vreinterpretq_u16_u64(vdupq_n_u64(svcnth() * 0x0001000000000000ULL));
    580  merge_block_tbl.val[0] = vaddq_u16(merge_block_tbl.val[0], correction0);
    581 
    582  uint16x8_t correction1 =
    583      vreinterpretq_u16_u64(vdupq_n_u64(svcnth() * 0x0001000100000000ULL));
    584  merge_block_tbl.val[1] = vaddq_u16(merge_block_tbl.val[1], correction1);
    585 
    586  uint16x8_t correction2 =
    587      vreinterpretq_u16_u64(vdupq_n_u64(svcnth() * 0x0001000100010000ULL));
    588  merge_block_tbl.val[2] = vaddq_u16(merge_block_tbl.val[2], correction2);
    589 
    590  if (width == 4) {
    591    const uint16x4_t max = vdup_n_u16((1 << bd) - 1);
    592    int16_t *s = (int16_t *)src;
    593 
    594    int16x4_t s0, s1, s2, s3, s4, s5, s6;
    595    load_s16_4x7(s, src_stride, &s0, &s1, &s2, &s3, &s4, &s5, &s6);
    596    s += 7 * src_stride;
    597 
    598    // This operation combines a conventional transpose and the sample permute
    599    // required before computing the dot product.
    600    int16x8_t s0123[2], s1234[2], s2345[2], s3456[2];
    601    transpose_concat_elems_s16_4x4(s0, s1, s2, s3, s0123);
    602    transpose_concat_elems_s16_4x4(s1, s2, s3, s4, s1234);
    603    transpose_concat_elems_s16_4x4(s2, s3, s4, s5, s2345);
    604    transpose_concat_elems_s16_4x4(s3, s4, s5, s6, s3456);
    605 
    606    do {
    607      int16x4_t s7, s8, s9, s10;
    608      load_s16_4x4(s, src_stride, &s7, &s8, &s9, &s10);
    609 
    610      int16x8_t s4567[2], s5678[2], s6789[2], s789A[2];
    611      // Transpose and shuffle the 4 lines that were loaded.
    612      transpose_concat_elems_s16_4x4(s7, s8, s9, s10, s789A);
    613 
    614      // Merge new data into block from previous iteration.
    615      aom_tbl2x2_s16(s3456, s789A, merge_block_tbl.val[0], s4567);
    616      aom_tbl2x2_s16(s3456, s789A, merge_block_tbl.val[1], s5678);
    617      aom_tbl2x2_s16(s3456, s789A, merge_block_tbl.val[2], s6789);
    618 
    619      uint16x4_t d0 = highbd_convolve8_4_y(s0123, s4567, y_filter, max);
    620      uint16x4_t d1 = highbd_convolve8_4_y(s1234, s5678, y_filter, max);
    621      uint16x4_t d2 = highbd_convolve8_4_y(s2345, s6789, y_filter, max);
    622      uint16x4_t d3 = highbd_convolve8_4_y(s3456, s789A, y_filter, max);
    623 
    624      store_u16_4x4(dst, dst_stride, d0, d1, d2, d3);
    625 
    626      // Prepare block for next iteration - re-using as much as possible.
    627      // Shuffle everything up four rows.
    628      s0123[0] = s4567[0];
    629      s0123[1] = s4567[1];
    630      s1234[0] = s5678[0];
    631      s1234[1] = s5678[1];
    632      s2345[0] = s6789[0];
    633      s2345[1] = s6789[1];
    634      s3456[0] = s789A[0];
    635      s3456[1] = s789A[1];
    636      s += 4 * src_stride;
    637      dst += 4 * dst_stride;
    638      height -= 4;
    639    } while (height != 0);
    640  } else {
    641    const uint16x8_t max = vdupq_n_u16((1 << bd) - 1);
    642 
    643    do {
    644      int h = height;
    645      int16_t *s = (int16_t *)src;
    646      uint16_t *d = dst;
    647 
    648      int16x8_t s0, s1, s2, s3, s4, s5, s6;
    649      load_s16_8x7(s, src_stride, &s0, &s1, &s2, &s3, &s4, &s5, &s6);
    650      s += 7 * src_stride;
    651 
    652      // This operation combines a conventional transpose and the sample permute
    653      // required before computing the dot product.
    654      int16x8_t s0123[4], s1234[4], s2345[4], s3456[4];
    655      transpose_concat_elems_s16_8x4(s0, s1, s2, s3, s0123);
    656      transpose_concat_elems_s16_8x4(s1, s2, s3, s4, s1234);
    657      transpose_concat_elems_s16_8x4(s2, s3, s4, s5, s2345);
    658      transpose_concat_elems_s16_8x4(s3, s4, s5, s6, s3456);
    659 
    660      do {
    661        int16x8_t s7, s8, s9, s10;
    662        load_s16_8x4(s, src_stride, &s7, &s8, &s9, &s10);
    663 
    664        int16x8_t s4567[4], s5678[4], s6789[4], s789A[4];
    665        // Transpose and shuffle the 4 lines that were loaded.
    666        transpose_concat_elems_s16_8x4(s7, s8, s9, s10, s789A);
    667 
    668        // Merge new data into block from previous iteration.
    669        aom_tbl2x4_s16(s3456, s789A, merge_block_tbl.val[0], s4567);
    670        aom_tbl2x4_s16(s3456, s789A, merge_block_tbl.val[1], s5678);
    671        aom_tbl2x4_s16(s3456, s789A, merge_block_tbl.val[2], s6789);
    672 
    673        uint16x8_t d0 = highbd_convolve8_8_y(s0123, s4567, y_filter, max);
    674        uint16x8_t d1 = highbd_convolve8_8_y(s1234, s5678, y_filter, max);
    675        uint16x8_t d2 = highbd_convolve8_8_y(s2345, s6789, y_filter, max);
    676        uint16x8_t d3 = highbd_convolve8_8_y(s3456, s789A, y_filter, max);
    677 
    678        store_u16_8x4(d, dst_stride, d0, d1, d2, d3);
    679 
    680        // Prepare block for next iteration - re-using as much as possible.
    681        // Shuffle everything up four rows.
    682        s0123[0] = s4567[0];
    683        s0123[1] = s4567[1];
    684        s0123[2] = s4567[2];
    685        s0123[3] = s4567[3];
    686        s1234[0] = s5678[0];
    687        s1234[1] = s5678[1];
    688        s1234[2] = s5678[2];
    689        s1234[3] = s5678[3];
    690        s2345[0] = s6789[0];
    691        s2345[1] = s6789[1];
    692        s2345[2] = s6789[2];
    693        s2345[3] = s6789[3];
    694        s3456[0] = s789A[0];
    695        s3456[1] = s789A[1];
    696        s3456[2] = s789A[2];
    697        s3456[3] = s789A[3];
    698 
    699        s += 4 * src_stride;
    700        d += 4 * dst_stride;
    701        h -= 4;
    702      } while (h != 0);
    703      src += 8;
    704      dst += 8;
    705      width -= 8;
    706    } while (width != 0);
    707  }
    708 }
    709 
    710 static inline uint16x4_t highbd_convolve4_4_y(int16x8_t samples[2],
    711                                              int16x8_t filter,
    712                                              uint16x4_t max) {
    713  int64x2_t sum01 = aom_svdot_lane_s16(vdupq_n_s64(0), samples[0], filter, 0);
    714  int64x2_t sum23 = aom_svdot_lane_s16(vdupq_n_s64(0), samples[1], filter, 0);
    715 
    716  int32x4_t sum0123 = vcombine_s32(vmovn_s64(sum01), vmovn_s64(sum23));
    717  uint16x4_t res = vqrshrun_n_s32(sum0123, FILTER_BITS);
    718  return vmin_u16(res, max);
    719 }
    720 
    721 static inline uint16x8_t highbd_convolve4_8_y(int16x8_t samples[4],
    722                                              int16x8_t filter,
    723                                              uint16x8_t max) {
    724  int64x2_t sum01 = aom_svdot_lane_s16(vdupq_n_s64(0), samples[0], filter, 0);
    725  int64x2_t sum23 = aom_svdot_lane_s16(vdupq_n_s64(0), samples[1], filter, 0);
    726  int64x2_t sum45 = aom_svdot_lane_s16(vdupq_n_s64(0), samples[2], filter, 0);
    727  int64x2_t sum67 = aom_svdot_lane_s16(vdupq_n_s64(0), samples[3], filter, 0);
    728 
    729  int32x4_t sum0123 = vcombine_s32(vmovn_s64(sum01), vmovn_s64(sum23));
    730  int32x4_t sum4567 = vcombine_s32(vmovn_s64(sum45), vmovn_s64(sum67));
    731  uint16x8_t res = vcombine_u16(vqrshrun_n_s32(sum0123, FILTER_BITS),
    732                                vqrshrun_n_s32(sum4567, FILTER_BITS));
    733  return vminq_u16(res, max);
    734 }
    735 
    736 static void highbd_convolve_y_sr_4tap_sve2(const uint16_t *src,
    737                                           ptrdiff_t src_stride, uint16_t *dst,
    738                                           ptrdiff_t dst_stride, int width,
    739                                           int height, const int16_t *filter_y,
    740                                           int bd) {
    741  assert(width >= 4 && height >= 4);
    742 
    743  const int16x8_t y_filter =
    744      vcombine_s16(vld1_s16(filter_y + 2), vdup_n_s16(0));
    745 
    746  if (width == 4) {
    747    const uint16x4_t max = vdup_n_u16((1 << bd) - 1);
    748    int16_t *s = (int16_t *)src;
    749 
    750    int16x4_t s0, s1, s2;
    751    load_s16_4x3(s, src_stride, &s0, &s1, &s2);
    752    s += 3 * src_stride;
    753 
    754    do {
    755      int16x4_t s3, s4, s5, s6;
    756      load_s16_4x4(s, src_stride, &s3, &s4, &s5, &s6);
    757 
    758      // This operation combines a conventional transpose and the sample permute
    759      // required before computing the dot product.
    760      int16x8_t s0123[2], s1234[2], s2345[2], s3456[2];
    761      transpose_concat_elems_s16_4x4(s0, s1, s2, s3, s0123);
    762      transpose_concat_elems_s16_4x4(s1, s2, s3, s4, s1234);
    763      transpose_concat_elems_s16_4x4(s2, s3, s4, s5, s2345);
    764      transpose_concat_elems_s16_4x4(s3, s4, s5, s6, s3456);
    765 
    766      uint16x4_t d0 = highbd_convolve4_4_y(s0123, y_filter, max);
    767      uint16x4_t d1 = highbd_convolve4_4_y(s1234, y_filter, max);
    768      uint16x4_t d2 = highbd_convolve4_4_y(s2345, y_filter, max);
    769      uint16x4_t d3 = highbd_convolve4_4_y(s3456, y_filter, max);
    770 
    771      store_u16_4x4(dst, dst_stride, d0, d1, d2, d3);
    772 
    773      // Shuffle everything up four rows.
    774      s0 = s4;
    775      s1 = s5;
    776      s2 = s6;
    777 
    778      s += 4 * src_stride;
    779      dst += 4 * dst_stride;
    780      height -= 4;
    781    } while (height != 0);
    782  } else {
    783    const uint16x8_t max = vdupq_n_u16((1 << bd) - 1);
    784 
    785    do {
    786      int h = height;
    787      int16_t *s = (int16_t *)src;
    788      uint16_t *d = dst;
    789 
    790      int16x8_t s0, s1, s2;
    791      load_s16_8x3(s, src_stride, &s0, &s1, &s2);
    792      s += 3 * src_stride;
    793 
    794      do {
    795        int16x8_t s3, s4, s5, s6;
    796        load_s16_8x4(s, src_stride, &s3, &s4, &s5, &s6);
    797 
    798        // This operation combines a conventional transpose and the sample
    799        // permute required before computing the dot product.
    800        int16x8_t s0123[4], s1234[4], s2345[4], s3456[4];
    801        transpose_concat_elems_s16_8x4(s0, s1, s2, s3, s0123);
    802        transpose_concat_elems_s16_8x4(s1, s2, s3, s4, s1234);
    803        transpose_concat_elems_s16_8x4(s2, s3, s4, s5, s2345);
    804        transpose_concat_elems_s16_8x4(s3, s4, s5, s6, s3456);
    805 
    806        uint16x8_t d0 = highbd_convolve4_8_y(s0123, y_filter, max);
    807        uint16x8_t d1 = highbd_convolve4_8_y(s1234, y_filter, max);
    808        uint16x8_t d2 = highbd_convolve4_8_y(s2345, y_filter, max);
    809        uint16x8_t d3 = highbd_convolve4_8_y(s3456, y_filter, max);
    810 
    811        store_u16_8x4(d, dst_stride, d0, d1, d2, d3);
    812 
    813        // Shuffle everything up four rows.
    814        s0 = s4;
    815        s1 = s5;
    816        s2 = s6;
    817 
    818        s += 4 * src_stride;
    819        d += 4 * dst_stride;
    820        h -= 4;
    821      } while (h != 0);
    822      src += 8;
    823      dst += 8;
    824      width -= 8;
    825    } while (width != 0);
    826  }
    827 }
    828 
    829 void av1_highbd_convolve_y_sr_sve2(const uint16_t *src, int src_stride,
    830                                   uint16_t *dst, int dst_stride, int w, int h,
    831                                   const InterpFilterParams *filter_params_y,
    832                                   const int subpel_y_qn, int bd) {
    833  if (w == 2 || h == 2) {
    834    av1_highbd_convolve_y_sr_c(src, src_stride, dst, dst_stride, w, h,
    835                               filter_params_y, subpel_y_qn, bd);
    836    return;
    837  }
    838  const int y_filter_taps = get_filter_tap(filter_params_y, subpel_y_qn);
    839 
    840  if (y_filter_taps == 6) {
    841    av1_highbd_convolve_y_sr_neon(src, src_stride, dst, dst_stride, w, h,
    842                                  filter_params_y, subpel_y_qn, bd);
    843    return;
    844  }
    845 
    846  const int vert_offset = filter_params_y->taps / 2 - 1;
    847  const int16_t *y_filter_ptr = av1_get_interp_filter_subpel_kernel(
    848      filter_params_y, subpel_y_qn & SUBPEL_MASK);
    849 
    850  src -= vert_offset * src_stride;
    851 
    852  if (y_filter_taps > 8) {
    853    highbd_convolve_y_sr_12tap_sve2(src, src_stride, dst, dst_stride, w, h,
    854                                    y_filter_ptr, bd);
    855    return;
    856  }
    857 
    858  if (y_filter_taps == 4) {
    859    highbd_convolve_y_sr_4tap_sve2(src + 2 * src_stride, src_stride, dst,
    860                                   dst_stride, w, h, y_filter_ptr, bd);
    861    return;
    862  }
    863 
    864  highbd_convolve_y_sr_8tap_sve2(src, src_stride, dst, dst_stride, w, h,
    865                                 y_filter_ptr, bd);
    866 }
    867 
    868 static inline uint16x4_t convolve12_4_2d_h(
    869    int16x8_t s0, int16x8_t s1, int16x8_t filter_0_7, int16x8_t filter_4_11,
    870    const int64x2_t offset, int32x4_t shift, uint16x8x4_t permute_tbl) {
    871  int16x8_t permuted_samples[6];
    872  permuted_samples[0] = aom_tbl_s16(s0, permute_tbl.val[0]);
    873  permuted_samples[1] = aom_tbl_s16(s0, permute_tbl.val[1]);
    874  permuted_samples[2] = aom_tbl2_s16(s0, s1, permute_tbl.val[2]);
    875  permuted_samples[3] = aom_tbl2_s16(s0, s1, permute_tbl.val[3]);
    876  permuted_samples[4] = aom_tbl_s16(s1, permute_tbl.val[0]);
    877  permuted_samples[5] = aom_tbl_s16(s1, permute_tbl.val[1]);
    878 
    879  int64x2_t sum01 =
    880      aom_svdot_lane_s16(offset, permuted_samples[0], filter_0_7, 0);
    881  sum01 = aom_svdot_lane_s16(sum01, permuted_samples[2], filter_0_7, 1);
    882  sum01 = aom_svdot_lane_s16(sum01, permuted_samples[4], filter_4_11, 1);
    883 
    884  int64x2_t sum23 =
    885      aom_svdot_lane_s16(offset, permuted_samples[1], filter_0_7, 0);
    886  sum23 = aom_svdot_lane_s16(sum23, permuted_samples[3], filter_0_7, 1);
    887  sum23 = aom_svdot_lane_s16(sum23, permuted_samples[5], filter_4_11, 1);
    888 
    889  int32x4_t sum0123 = vcombine_s32(vmovn_s64(sum01), vmovn_s64(sum23));
    890  sum0123 = vqrshlq_s32(sum0123, shift);
    891  return vqmovun_s32(sum0123);
    892 }
    893 
    894 static inline uint16x8_t convolve12_8_2d_h(int16x8_t s0, int16x8_t s1,
    895                                           int16x8_t s2, int16x8_t filter_0_7,
    896                                           int16x8_t filter_4_11,
    897                                           int64x2_t offset, int32x4_t shift,
    898                                           uint16x8x4_t permute_tbl) {
    899  int16x8_t permuted_samples[8];
    900  permuted_samples[0] = aom_tbl_s16(s0, permute_tbl.val[0]);
    901  permuted_samples[1] = aom_tbl_s16(s0, permute_tbl.val[1]);
    902  permuted_samples[2] = aom_tbl2_s16(s0, s1, permute_tbl.val[2]);
    903  permuted_samples[3] = aom_tbl2_s16(s0, s1, permute_tbl.val[3]);
    904  permuted_samples[4] = aom_tbl_s16(s1, permute_tbl.val[0]);
    905  permuted_samples[5] = aom_tbl_s16(s1, permute_tbl.val[1]);
    906  permuted_samples[6] = aom_tbl2_s16(s1, s2, permute_tbl.val[2]);
    907  permuted_samples[7] = aom_tbl2_s16(s1, s2, permute_tbl.val[3]);
    908 
    909  int64x2_t sum01 =
    910      aom_svdot_lane_s16(offset, permuted_samples[0], filter_0_7, 0);
    911  sum01 = aom_svdot_lane_s16(sum01, permuted_samples[2], filter_0_7, 1);
    912  sum01 = aom_svdot_lane_s16(sum01, permuted_samples[4], filter_4_11, 1);
    913 
    914  int64x2_t sum23 =
    915      aom_svdot_lane_s16(offset, permuted_samples[1], filter_0_7, 0);
    916  sum23 = aom_svdot_lane_s16(sum23, permuted_samples[3], filter_0_7, 1);
    917  sum23 = aom_svdot_lane_s16(sum23, permuted_samples[5], filter_4_11, 1);
    918 
    919  int64x2_t sum45 =
    920      aom_svdot_lane_s16(offset, permuted_samples[2], filter_0_7, 0);
    921  sum45 = aom_svdot_lane_s16(sum45, permuted_samples[4], filter_0_7, 1);
    922  sum45 = aom_svdot_lane_s16(sum45, permuted_samples[6], filter_4_11, 1);
    923 
    924  int64x2_t sum67 =
    925      aom_svdot_lane_s16(offset, permuted_samples[3], filter_0_7, 0);
    926  sum67 = aom_svdot_lane_s16(sum67, permuted_samples[5], filter_0_7, 1);
    927  sum67 = aom_svdot_lane_s16(sum67, permuted_samples[7], filter_4_11, 1);
    928 
    929  int32x4_t sum0123 = vcombine_s32(vmovn_s64(sum01), vmovn_s64(sum23));
    930  int32x4_t sum4567 = vcombine_s32(vmovn_s64(sum45), vmovn_s64(sum67));
    931 
    932  sum0123 = vqrshlq_s32(sum0123, shift);
    933  sum4567 = vqrshlq_s32(sum4567, shift);
    934 
    935  return vcombine_u16(vqmovun_s32(sum0123), vqmovun_s32(sum4567));
    936 }
    937 
    938 static inline void highbd_convolve_2d_sr_horiz_12tap_sve2(
    939    const uint16_t *src, int src_stride, uint16_t *dst, int dst_stride,
    940    int width, int height, const int16_t *y_filter_ptr,
    941    ConvolveParams *conv_params, const int x_offset) {
    942  const int64x2_t offset = vdupq_n_s64(x_offset);
    943  const int32x4_t shift = vdupq_n_s32(-conv_params->round_0);
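          // The helpers shift with vqrshlq_s32, which for a negative shift amount is
          // a saturating rounding right shift; passing -round_0 therefore applies the
          // first-stage rounding of the 2-D filter to this horizontal intermediate.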
    944 
    945  const int16x8_t y_filter_0_7 = vld1q_s16(y_filter_ptr);
    946  const int16x8_t y_filter_4_11 = vld1q_s16(y_filter_ptr + 4);
    947 
    948  uint16x8x4_t permute_tbl = vld1q_u16_x4(kDotProdTbl);
     949  // Scale indices by the true vector length to avoid reading from an
    950  // 'undefined' portion of a vector on a system with SVE vectors > 128-bit.
    951  uint16x8_t correction0 = vreinterpretq_u16_u64(vcombine_u64(
    952      vdup_n_u64(0), vdup_n_u64(svcnth() * 0x0001000000000000ULL)));
    953  permute_tbl.val[2] = vaddq_u16(permute_tbl.val[2], correction0);
    954 
    955  uint16x8_t correction1 = vreinterpretq_u16_u64(
    956      vcombine_u64(vdup_n_u64(svcnth() * 0x0001000100000000ULL),
    957                   vdup_n_u64(svcnth() * 0x0001000100010000ULL)));
    958  permute_tbl.val[3] = vaddq_u16(permute_tbl.val[3], correction1);
    959 
    960  if (width == 4) {
    961    const int16_t *s = (const int16_t *)src;
    962 
    963    do {
    964      int16x8_t s0, s1, s2, s3, s4, s5, s6, s7;
    965      load_s16_8x4(s, src_stride, &s0, &s2, &s4, &s6);
    966      load_s16_8x4(s + 8, src_stride, &s1, &s3, &s5, &s7);
    967 
    968      uint16x4_t d0 = convolve12_4_2d_h(s0, s1, y_filter_0_7, y_filter_4_11,
    969                                        offset, shift, permute_tbl);
    970      uint16x4_t d1 = convolve12_4_2d_h(s2, s3, y_filter_0_7, y_filter_4_11,
    971                                        offset, shift, permute_tbl);
    972      uint16x4_t d2 = convolve12_4_2d_h(s4, s5, y_filter_0_7, y_filter_4_11,
    973                                        offset, shift, permute_tbl);
    974      uint16x4_t d3 = convolve12_4_2d_h(s6, s7, y_filter_0_7, y_filter_4_11,
    975                                        offset, shift, permute_tbl);
    976 
    977      store_u16_4x4(dst, dst_stride, d0, d1, d2, d3);
    978 
    979      dst += 4 * dst_stride;
    980      s += 4 * src_stride;
    981      height -= 4;
    982    } while (height > 0);
    983  } else {
    984    do {
    985      const int16_t *s = (const int16_t *)src;
    986      uint16_t *d = dst;
    987      int w = width;
    988 
    989      do {
    990        int16x8_t s0, s1, s2, s3, s4, s5, s6, s7, s8, s9, s10, s11;
    991        load_s16_8x4(s, src_stride, &s0, &s3, &s6, &s9);
    992        load_s16_8x4(s + 8, src_stride, &s1, &s4, &s7, &s10);
    993        load_s16_8x4(s + 16, src_stride, &s2, &s5, &s8, &s11);
    994 
    995        uint16x8_t d0 =
    996            convolve12_8_2d_h(s0, s1, s2, y_filter_0_7, y_filter_4_11, offset,
    997                              shift, permute_tbl);
    998        uint16x8_t d1 =
    999            convolve12_8_2d_h(s3, s4, s5, y_filter_0_7, y_filter_4_11, offset,
   1000                              shift, permute_tbl);
   1001        uint16x8_t d2 =
   1002            convolve12_8_2d_h(s6, s7, s8, y_filter_0_7, y_filter_4_11, offset,
   1003                              shift, permute_tbl);
   1004        uint16x8_t d3 =
   1005            convolve12_8_2d_h(s9, s10, s11, y_filter_0_7, y_filter_4_11, offset,
   1006                              shift, permute_tbl);
   1007 
   1008        store_u16_8x4(d, dst_stride, d0, d1, d2, d3);
   1009 
   1010        s += 8;
   1011        d += 8;
   1012        w -= 8;
   1013      } while (w != 0);
   1014      src += 4 * src_stride;
   1015      dst += 4 * dst_stride;
   1016      height -= 4;
   1017    } while (height > 0);
   1018  }
   1019 }
   1020 
   1021 static inline uint16x8_t convolve8_8_2d_h(int16x8_t s0[8], int16x8_t filter,
   1022                                          int64x2_t offset, int32x4_t shift) {
   1023  int64x2_t sum[8];
   1024  sum[0] = aom_sdotq_s16(offset, s0[0], filter);
   1025  sum[1] = aom_sdotq_s16(offset, s0[1], filter);
   1026  sum[2] = aom_sdotq_s16(offset, s0[2], filter);
   1027  sum[3] = aom_sdotq_s16(offset, s0[3], filter);
   1028  sum[4] = aom_sdotq_s16(offset, s0[4], filter);
   1029  sum[5] = aom_sdotq_s16(offset, s0[5], filter);
   1030  sum[6] = aom_sdotq_s16(offset, s0[6], filter);
   1031  sum[7] = aom_sdotq_s16(offset, s0[7], filter);
   1032 
   1033  sum[0] = vpaddq_s64(sum[0], sum[1]);
   1034  sum[2] = vpaddq_s64(sum[2], sum[3]);
   1035  sum[4] = vpaddq_s64(sum[4], sum[5]);
   1036  sum[6] = vpaddq_s64(sum[6], sum[7]);
   1037 
   1038  int32x4_t sum0123 = vcombine_s32(vmovn_s64(sum[0]), vmovn_s64(sum[2]));
   1039  int32x4_t sum4567 = vcombine_s32(vmovn_s64(sum[4]), vmovn_s64(sum[6]));
   1040 
   1041  sum0123 = vqrshlq_s32(sum0123, shift);
   1042  sum4567 = vqrshlq_s32(sum4567, shift);
   1043 
   1044  return vcombine_u16(vqmovun_s32(sum0123), vqmovun_s32(sum4567));
   1045 }
   1046 
   1047 static inline void highbd_convolve_2d_sr_horiz_8tap_sve2(
   1048    const uint16_t *src, int src_stride, uint16_t *dst, int dst_stride,
   1049    int width, int height, const int16_t *y_filter_ptr,
   1050    ConvolveParams *conv_params, const int x_offset) {
   1051  const int64x2_t offset = vdupq_n_s64(x_offset);
   1052  const int64x2_t offset_lo = vcombine_s64(vget_low_s64(offset), vdup_n_s64(0));
   1053  const int32x4_t shift = vdupq_n_s32(-conv_params->round_0);
   1054 
   1055  const int16x8_t filter = vld1q_s16(y_filter_ptr);
   1056 
   1057  do {
   1058    const int16_t *s = (const int16_t *)src;
   1059    uint16_t *d = dst;
   1060    int w = width;
   1061 
   1062    do {
   1063      int16x8_t s0[8], s1[8], s2[8], s3[8];
   1064      load_s16_8x8(s + 0 * src_stride, 1, &s0[0], &s0[1], &s0[2], &s0[3],
   1065                   &s0[4], &s0[5], &s0[6], &s0[7]);
   1066      load_s16_8x8(s + 1 * src_stride, 1, &s1[0], &s1[1], &s1[2], &s1[3],
   1067                   &s1[4], &s1[5], &s1[6], &s1[7]);
   1068      load_s16_8x8(s + 2 * src_stride, 1, &s2[0], &s2[1], &s2[2], &s2[3],
   1069                   &s2[4], &s2[5], &s2[6], &s2[7]);
   1070      load_s16_8x8(s + 3 * src_stride, 1, &s3[0], &s3[1], &s3[2], &s3[3],
   1071                   &s3[4], &s3[5], &s3[6], &s3[7]);
   1072 
   1073      uint16x8_t d0 = convolve8_8_2d_h(s0, filter, offset_lo, shift);
   1074      uint16x8_t d1 = convolve8_8_2d_h(s1, filter, offset_lo, shift);
   1075      uint16x8_t d2 = convolve8_8_2d_h(s2, filter, offset_lo, shift);
   1076      uint16x8_t d3 = convolve8_8_2d_h(s3, filter, offset_lo, shift);
   1077 
   1078      store_u16_8x4(d, dst_stride, d0, d1, d2, d3);
   1079 
   1080      s += 8;
   1081      d += 8;
   1082      w -= 8;
   1083    } while (w != 0);
   1084    src += 4 * src_stride;
   1085    dst += 4 * dst_stride;
   1086    height -= 4;
   1087  } while (height > 0);
   1088 }
   1089 
   1090 static inline uint16x4_t convolve4_4_2d_h(int16x8_t s0, int16x8_t filter,
   1091                                          int64x2_t offset, int32x4_t shift,
   1092                                          uint16x8x2_t permute_tbl) {
   1093  int16x8_t permuted_samples0 = aom_tbl_s16(s0, permute_tbl.val[0]);
   1094  int16x8_t permuted_samples1 = aom_tbl_s16(s0, permute_tbl.val[1]);
   1095 
   1096  int64x2_t sum01 = aom_svdot_lane_s16(offset, permuted_samples0, filter, 0);
   1097  int64x2_t sum23 = aom_svdot_lane_s16(offset, permuted_samples1, filter, 0);
   1098 
   1099  int32x4_t sum0123 = vcombine_s32(vmovn_s64(sum01), vmovn_s64(sum23));
   1100  sum0123 = vqrshlq_s32(sum0123, shift);
   1101  return vqmovun_s32(sum0123);
   1102 }
   1103 
   1104 static inline uint16x8_t convolve4_8_2d_h(int16x8_t s0[8], int16x8_t filter,
   1105                                          int64x2_t offset, int32x4_t shift,
   1106                                          uint16x8_t tbl) {
   1107  int64x2_t sum04 = aom_svdot_lane_s16(offset, s0[0], filter, 0);
   1108  int64x2_t sum15 = aom_svdot_lane_s16(offset, s0[1], filter, 0);
   1109  int64x2_t sum26 = aom_svdot_lane_s16(offset, s0[2], filter, 0);
   1110  int64x2_t sum37 = aom_svdot_lane_s16(offset, s0[3], filter, 0);
   1111 
    1112  int32x4_t sum0415 = vcombine_s32(vmovn_s64(sum04), vmovn_s64(sum15));
    1113  int32x4_t sum2637 = vcombine_s32(vmovn_s64(sum26), vmovn_s64(sum37));
    1114 
    1115  sum0415 = vqrshlq_s32(sum0415, shift);
    1116  sum2637 = vqrshlq_s32(sum2637, shift);
    1117 
    1118  uint16x8_t res = vcombine_u16(vqmovun_s32(sum0415), vqmovun_s32(sum2637));
   1119  return aom_tbl_u16(res, tbl);
   1120 }
   1121 
   1122 static inline void highbd_convolve_2d_sr_horiz_4tap_sve2(
   1123    const uint16_t *src, int src_stride, uint16_t *dst, int dst_stride,
   1124    int width, int height, const int16_t *x_filter_ptr,
   1125    ConvolveParams *conv_params, const int x_offset) {
   1126  const int64x2_t offset = vdupq_n_s64(x_offset);
   1127  const int32x4_t shift = vdupq_n_s32(-conv_params->round_0);
   1128 
   1129  const int16x4_t x_filter = vld1_s16(x_filter_ptr + 2);
   1130  const int16x8_t filter = vcombine_s16(x_filter, vdup_n_s16(0));
   1131 
   1132  if (width == 4) {
   1133    const int16_t *s = (const int16_t *)(src);
   1134 
   1135    uint16x8x2_t permute_tbl = vld1q_u16_x2(kDotProdTbl);
   1136 
   1137    do {
   1138      int16x8_t s0, s1, s2, s3;
   1139      load_s16_8x4(s, src_stride, &s0, &s1, &s2, &s3);
   1140 
   1141      uint16x4_t d0 = convolve4_4_2d_h(s0, filter, offset, shift, permute_tbl);
   1142      uint16x4_t d1 = convolve4_4_2d_h(s1, filter, offset, shift, permute_tbl);
   1143      uint16x4_t d2 = convolve4_4_2d_h(s2, filter, offset, shift, permute_tbl);
   1144      uint16x4_t d3 = convolve4_4_2d_h(s3, filter, offset, shift, permute_tbl);
   1145 
   1146      store_u16_4x4(dst, dst_stride, d0, d1, d2, d3);
   1147 
   1148      s += 4 * src_stride;
   1149      dst += 4 * dst_stride;
   1150      height -= 4;
   1151    } while (height > 0);
   1152  } else {
   1153    uint16x8_t idx = vld1q_u16(kDeinterleaveTbl);
   1154 
   1155    do {
   1156      const int16_t *s = (const int16_t *)(src);
   1157      uint16_t *d = dst;
   1158      int w = width;
   1159 
   1160      do {
   1161        int16x8_t s0[8], s1[8], s2[8], s3[8];
   1162        load_s16_8x8(s + 0 * src_stride, 1, &s0[0], &s0[1], &s0[2], &s0[3],
   1163                     &s0[4], &s0[5], &s0[6], &s0[7]);
   1164        load_s16_8x8(s + 1 * src_stride, 1, &s1[0], &s1[1], &s1[2], &s1[3],
   1165                     &s1[4], &s1[5], &s1[6], &s1[7]);
   1166        load_s16_8x8(s + 2 * src_stride, 1, &s2[0], &s2[1], &s2[2], &s2[3],
   1167                     &s2[4], &s2[5], &s2[6], &s2[7]);
   1168        load_s16_8x8(s + 3 * src_stride, 1, &s3[0], &s3[1], &s3[2], &s3[3],
   1169                     &s3[4], &s3[5], &s3[6], &s3[7]);
   1170 
   1171        uint16x8_t d0 = convolve4_8_2d_h(s0, filter, offset, shift, idx);
   1172        uint16x8_t d1 = convolve4_8_2d_h(s1, filter, offset, shift, idx);
   1173        uint16x8_t d2 = convolve4_8_2d_h(s2, filter, offset, shift, idx);
   1174        uint16x8_t d3 = convolve4_8_2d_h(s3, filter, offset, shift, idx);
   1175 
   1176        store_u16_8x4(d, dst_stride, d0, d1, d2, d3);
   1177 
   1178        s += 8;
   1179        d += 8;
   1180        w -= 8;
   1181      } while (w != 0);
   1182      src += 4 * src_stride;
   1183      dst += 4 * dst_stride;
   1184      height -= 4;
   1185    } while (height > 0);
   1186  }
   1187 }
   1188 
   1189 static inline uint16x4_t highbd_convolve12_4_2d_v(
   1190    int16x8_t s0[2], int16x8_t s1[2], int16x8_t s2[2], int16x8_t filter_0_7,
   1191    int16x8_t filter_4_11, int32x4_t shift, int64x2_t offset, uint16x4_t max) {
   1192  int64x2_t sum01 = aom_svdot_lane_s16(offset, s0[0], filter_0_7, 0);
   1193  sum01 = aom_svdot_lane_s16(sum01, s1[0], filter_0_7, 1);
   1194  sum01 = aom_svdot_lane_s16(sum01, s2[0], filter_4_11, 1);
   1195 
   1196  int64x2_t sum23 = aom_svdot_lane_s16(offset, s0[1], filter_0_7, 0);
   1197  sum23 = aom_svdot_lane_s16(sum23, s1[1], filter_0_7, 1);
   1198  sum23 = aom_svdot_lane_s16(sum23, s2[1], filter_4_11, 1);
   1199 
   1200  int32x4_t sum0123 = vcombine_s32(vmovn_s64(sum01), vmovn_s64(sum23));
   1201  sum0123 = vshlq_s32(sum0123, shift);
   1202 
   1203  uint16x4_t res = vqmovun_s32(sum0123);
   1204 
   1205  return vmin_u16(res, max);
   1206 }
   1207 
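// Vertical pass of the 2D convolution for 12-tap filters. The image is
// processed in strips four pixels wide: eleven rows are loaded and transposed
// up front, then each iteration loads four more rows, merges them into the
// existing data with the TBL-based shuffles and produces four output rows.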
   1208 static inline void highbd_convolve_2d_sr_vert_12tap_sve2(
   1209    const uint16_t *src, int src_stride, uint16_t *dst, int dst_stride,
   1210    int width, int height, const int16_t *y_filter_ptr,
   1211    ConvolveParams *conv_params, int bd, const int y_offset) {
   1212  const int64x2_t offset = vdupq_n_s64(y_offset);
   1213  const int32x4_t shift = vdupq_n_s32(-conv_params->round_1);
   1214 
   1215  const int16x8_t y_filter_0_7 = vld1q_s16(y_filter_ptr);
   1216  const int16x8_t y_filter_4_11 = vld1q_s16(y_filter_ptr + 4);
   1217 
   1218  uint16x8x3_t merge_block_tbl = vld1q_u16_x3(kDotProdMergeBlockTbl);
   1219  // Scale the indices by the true vector length to avoid reading from an
   1220  // 'undefined' portion of a vector on a system with SVE vectors > 128-bit.
   1221  uint16x8_t correction0 =
   1222      vreinterpretq_u16_u64(vdupq_n_u64(svcnth() * 0x0001000000000000ULL));
   1223  merge_block_tbl.val[0] = vaddq_u16(merge_block_tbl.val[0], correction0);
   1224 
   1225  uint16x8_t correction1 =
   1226      vreinterpretq_u16_u64(vdupq_n_u64(svcnth() * 0x0001000100000000ULL));
   1227  merge_block_tbl.val[1] = vaddq_u16(merge_block_tbl.val[1], correction1);
   1228 
   1229  uint16x8_t correction2 =
   1230      vreinterpretq_u16_u64(vdupq_n_u64(svcnth() * 0x0001000100010000ULL));
   1231  merge_block_tbl.val[2] = vaddq_u16(merge_block_tbl.val[2], correction2);
   1232 
   1233  const uint16x4_t max = vdup_n_u16((1 << bd) - 1);
   1234 
   1235  do {
   1236    int16_t *s = (int16_t *)src;
   1237    uint16_t *d = (uint16_t *)dst;
   1238    int h = height;
   1239 
   1240    int16x4_t s0, s1, s2, s3, s4, s5, s6, s7, s8, s9, sA;
   1241    load_s16_4x11(s, src_stride, &s0, &s1, &s2, &s3, &s4, &s5, &s6, &s7, &s8,
   1242                  &s9, &sA);
   1243    s += 11 * src_stride;
   1244 
   1245    int16x8_t s0123[2], s1234[2], s2345[2], s3456[2], s4567[2], s5678[2],
   1246        s6789[2], s789A[2];
   1247    // This operation combines a conventional transpose and the sample permute
   1248    // required before computing the dot product.
   1249    transpose_concat_elems_s16_4x4(s0, s1, s2, s3, s0123);
   1250    transpose_concat_elems_s16_4x4(s1, s2, s3, s4, s1234);
   1251    transpose_concat_elems_s16_4x4(s2, s3, s4, s5, s2345);
   1252    transpose_concat_elems_s16_4x4(s3, s4, s5, s6, s3456);
   1253    transpose_concat_elems_s16_4x4(s4, s5, s6, s7, s4567);
   1254    transpose_concat_elems_s16_4x4(s5, s6, s7, s8, s5678);
   1255    transpose_concat_elems_s16_4x4(s6, s7, s8, s9, s6789);
   1256    transpose_concat_elems_s16_4x4(s7, s8, s9, sA, s789A);
   1257 
   1258    do {
   1259      int16x4_t sB, sC, sD, sE;
   1260      load_s16_4x4(s, src_stride, &sB, &sC, &sD, &sE);
   1261 
   1262      int16x8_t s89AB[2], s9ABC[2], sABCD[2], sBCDE[2];
   1263      transpose_concat_elems_s16_4x4(sB, sC, sD, sE, sBCDE);
   1264 
   1265      // Use the above transpose and reuse data from the previous loop to get
   1266      // the rest.
   1267      aom_tbl2x2_s16(s789A, sBCDE, merge_block_tbl.val[0], s89AB);
   1268      aom_tbl2x2_s16(s789A, sBCDE, merge_block_tbl.val[1], s9ABC);
   1269      aom_tbl2x2_s16(s789A, sBCDE, merge_block_tbl.val[2], sABCD);
   1270 
   1271      uint16x4_t d0 = highbd_convolve12_4_2d_v(
   1272          s0123, s4567, s89AB, y_filter_0_7, y_filter_4_11, shift, offset, max);
   1273      uint16x4_t d1 = highbd_convolve12_4_2d_v(
   1274          s1234, s5678, s9ABC, y_filter_0_7, y_filter_4_11, shift, offset, max);
   1275      uint16x4_t d2 = highbd_convolve12_4_2d_v(
   1276          s2345, s6789, sABCD, y_filter_0_7, y_filter_4_11, shift, offset, max);
   1277      uint16x4_t d3 = highbd_convolve12_4_2d_v(
   1278          s3456, s789A, sBCDE, y_filter_0_7, y_filter_4_11, shift, offset, max);
   1279 
   1280      store_u16_4x4(d, dst_stride, d0, d1, d2, d3);
   1281 
   1282      // Prepare block for next iteration - re-using as much as possible.
   1283      // Shuffle everything up four rows.
   1284      s0123[0] = s4567[0];
   1285      s0123[1] = s4567[1];
   1286      s1234[0] = s5678[0];
   1287      s1234[1] = s5678[1];
   1288      s2345[0] = s6789[0];
   1289      s2345[1] = s6789[1];
   1290      s3456[0] = s789A[0];
   1291      s3456[1] = s789A[1];
   1292      s4567[0] = s89AB[0];
   1293      s4567[1] = s89AB[1];
   1294      s5678[0] = s9ABC[0];
   1295      s5678[1] = s9ABC[1];
   1296      s6789[0] = sABCD[0];
   1297      s6789[1] = sABCD[1];
   1298      s789A[0] = sBCDE[0];
   1299      s789A[1] = sBCDE[1];
   1300 
   1301      s += 4 * src_stride;
   1302      d += 4 * dst_stride;
   1303      h -= 4;
   1304    } while (h != 0);
   1305    src += 4;
   1306    dst += 4;
   1307    width -= 4;
   1308  } while (width != 0);
   1309 }
   1310 
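// Compute four output pixels of the vertical 8-tap pass: two dot products per
// pair of rows cover the eight taps before narrowing, shifting and clamping.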
   1311 static inline uint16x4_t highbd_convolve8_4_2d_v(
   1312    int16x8_t samples_lo[2], int16x8_t samples_hi[2], int16x8_t filter,
   1313    int32x4_t shift, int64x2_t offset, uint16x4_t max) {
   1314  int64x2_t sum01 = aom_svdot_lane_s16(offset, samples_lo[0], filter, 0);
   1315  sum01 = aom_svdot_lane_s16(sum01, samples_hi[0], filter, 1);
   1316 
   1317  int64x2_t sum23 = aom_svdot_lane_s16(offset, samples_lo[1], filter, 0);
   1318  sum23 = aom_svdot_lane_s16(sum23, samples_hi[1], filter, 1);
   1319 
   1320  int32x4_t sum0123 = vcombine_s32(vmovn_s64(sum01), vmovn_s64(sum23));
   1321  sum0123 = vshlq_s32(sum0123, shift);
   1322 
   1323  uint16x4_t res = vqmovun_s32(sum0123);
   1324  return vmin_u16(res, max);
   1325 }
   1326 
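// Eight-pixel variant of the vertical 8-tap kernel above.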
   1327 static inline uint16x8_t highbd_convolve8_8_2d_v(
   1328    int16x8_t samples_lo[4], int16x8_t samples_hi[4], int16x8_t filter,
   1329    int32x4_t shift, int64x2_t offset, uint16x8_t max) {
   1330  int64x2_t sum01 = aom_svdot_lane_s16(offset, samples_lo[0], filter, 0);
   1331  sum01 = aom_svdot_lane_s16(sum01, samples_hi[0], filter, 1);
   1332 
   1333  int64x2_t sum23 = aom_svdot_lane_s16(offset, samples_lo[1], filter, 0);
   1334  sum23 = aom_svdot_lane_s16(sum23, samples_hi[1], filter, 1);
   1335 
   1336  int64x2_t sum45 = aom_svdot_lane_s16(offset, samples_lo[2], filter, 0);
   1337  sum45 = aom_svdot_lane_s16(sum45, samples_hi[2], filter, 1);
   1338 
   1339  int64x2_t sum67 = aom_svdot_lane_s16(offset, samples_lo[3], filter, 0);
   1340  sum67 = aom_svdot_lane_s16(sum67, samples_hi[3], filter, 1);
   1341 
   1342  int32x4_t sum0123 = vcombine_s32(vmovn_s64(sum01), vmovn_s64(sum23));
   1343  int32x4_t sum4567 = vcombine_s32(vmovn_s64(sum45), vmovn_s64(sum67));
   1344 
   1345  sum0123 = vshlq_s32(sum0123, shift);
   1346  sum4567 = vshlq_s32(sum4567, shift);
   1347 
   1348  uint16x8_t res = vcombine_u16(vqmovun_s32(sum0123), vqmovun_s32(sum4567));
   1349  return vminq_u16(res, max);
   1350 }
   1351 
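// Vertical pass of the 2D convolution for 8-tap filters. Blocks four pixels
// wide use the four-pixel kernel; wider blocks are split into strips eight
// pixels wide. Seven rows are transposed up front, and each iteration loads
// four new rows and merges them into the running block via the TBL shuffles.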
   1352 static void highbd_convolve_2d_sr_vert_8tap_sve2(
   1353    const uint16_t *src, ptrdiff_t src_stride, uint16_t *dst,
   1354    ptrdiff_t dst_stride, int width, int height, const int16_t *filter_y,
   1355    ConvolveParams *conv_params, int bd, const int y_offset) {
   1356  assert(width >= 4 && height >= 4);
   1357  const int64x2_t offset = vdupq_n_s64(y_offset);
   1358  const int32x4_t shift = vdupq_n_s32(-conv_params->round_1);
   1359  const int16x8_t y_filter = vld1q_s16(filter_y);
   1360 
   1361  uint16x8x3_t merge_block_tbl = vld1q_u16_x3(kDotProdMergeBlockTbl);
   1362  // Scale the indices by the true vector length to avoid reading from an
   1363  // 'undefined' portion of a vector on a system with SVE vectors > 128-bit.
   1364  uint16x8_t correction0 =
   1365      vreinterpretq_u16_u64(vdupq_n_u64(svcnth() * 0x0001000000000000ULL));
   1366  merge_block_tbl.val[0] = vaddq_u16(merge_block_tbl.val[0], correction0);
   1367 
   1368  uint16x8_t correction1 =
   1369      vreinterpretq_u16_u64(vdupq_n_u64(svcnth() * 0x0001000100000000ULL));
   1370  merge_block_tbl.val[1] = vaddq_u16(merge_block_tbl.val[1], correction1);
   1371 
   1372  uint16x8_t correction2 =
   1373      vreinterpretq_u16_u64(vdupq_n_u64(svcnth() * 0x0001000100010000ULL));
   1374  merge_block_tbl.val[2] = vaddq_u16(merge_block_tbl.val[2], correction2);
   1375 
   1376  if (width == 4) {
   1377    const uint16x4_t max = vdup_n_u16((1 << bd) - 1);
   1378    int16_t *s = (int16_t *)src;
   1379 
   1380    int16x4_t s0, s1, s2, s3, s4, s5, s6;
   1381    load_s16_4x7(s, src_stride, &s0, &s1, &s2, &s3, &s4, &s5, &s6);
   1382    s += 7 * src_stride;
   1383 
   1384    // This operation combines a conventional transpose and the sample permute
   1385    // required before computing the dot product.
   1386    int16x8_t s0123[2], s1234[2], s2345[2], s3456[2];
   1387    transpose_concat_elems_s16_4x4(s0, s1, s2, s3, s0123);
   1388    transpose_concat_elems_s16_4x4(s1, s2, s3, s4, s1234);
   1389    transpose_concat_elems_s16_4x4(s2, s3, s4, s5, s2345);
   1390    transpose_concat_elems_s16_4x4(s3, s4, s5, s6, s3456);
   1391 
   1392    do {
   1393      int16x4_t s7, s8, s9, s10;
   1394      load_s16_4x4(s, src_stride, &s7, &s8, &s9, &s10);
   1395 
   1396      int16x8_t s4567[2], s5678[2], s6789[2], s789A[2];
   1397      // Transpose and shuffle the 4 lines that were loaded.
   1398      transpose_concat_elems_s16_4x4(s7, s8, s9, s10, s789A);
   1399 
   1400      // Merge new data into block from previous iteration.
   1401      aom_tbl2x2_s16(s3456, s789A, merge_block_tbl.val[0], s4567);
   1402      aom_tbl2x2_s16(s3456, s789A, merge_block_tbl.val[1], s5678);
   1403      aom_tbl2x2_s16(s3456, s789A, merge_block_tbl.val[2], s6789);
   1404 
   1405      uint16x4_t d0 =
   1406          highbd_convolve8_4_2d_v(s0123, s4567, y_filter, shift, offset, max);
   1407      uint16x4_t d1 =
   1408          highbd_convolve8_4_2d_v(s1234, s5678, y_filter, shift, offset, max);
   1409      uint16x4_t d2 =
   1410          highbd_convolve8_4_2d_v(s2345, s6789, y_filter, shift, offset, max);
   1411      uint16x4_t d3 =
   1412          highbd_convolve8_4_2d_v(s3456, s789A, y_filter, shift, offset, max);
   1413 
   1414      store_u16_4x4(dst, dst_stride, d0, d1, d2, d3);
   1415 
   1416      // Prepare block for next iteration - re-using as much as possible.
   1417      // Shuffle everything up four rows.
   1418      s0123[0] = s4567[0];
   1419      s0123[1] = s4567[1];
   1420      s1234[0] = s5678[0];
   1421      s1234[1] = s5678[1];
   1422      s2345[0] = s6789[0];
   1423      s2345[1] = s6789[1];
   1424      s3456[0] = s789A[0];
   1425      s3456[1] = s789A[1];
   1426 
   1427      s += 4 * src_stride;
   1428      dst += 4 * dst_stride;
   1429      height -= 4;
   1430    } while (height != 0);
   1431  } else {
   1432    const uint16x8_t max = vdupq_n_u16((1 << bd) - 1);
   1433 
   1434    do {
   1435      int h = height;
   1436      int16_t *s = (int16_t *)src;
   1437      uint16_t *d = dst;
   1438 
   1439      int16x8_t s0, s1, s2, s3, s4, s5, s6;
   1440      load_s16_8x7(s, src_stride, &s0, &s1, &s2, &s3, &s4, &s5, &s6);
   1441      s += 7 * src_stride;
   1442 
   1443      // This operation combines a conventional transpose and the sample permute
   1444      // required before computing the dot product.
   1445      int16x8_t s0123[4], s1234[4], s2345[4], s3456[4];
   1446      transpose_concat_elems_s16_8x4(s0, s1, s2, s3, s0123);
   1447      transpose_concat_elems_s16_8x4(s1, s2, s3, s4, s1234);
   1448      transpose_concat_elems_s16_8x4(s2, s3, s4, s5, s2345);
   1449      transpose_concat_elems_s16_8x4(s3, s4, s5, s6, s3456);
   1450 
   1451      do {
   1452        int16x8_t s7, s8, s9, s10;
   1453        load_s16_8x4(s, src_stride, &s7, &s8, &s9, &s10);
   1454 
   1455        int16x8_t s4567[4], s5678[4], s6789[4], s789A[4];
   1456        // Transpose and shuffle the 4 lines that were loaded.
   1457        transpose_concat_elems_s16_8x4(s7, s8, s9, s10, s789A);
   1458 
   1459        // Merge new data into block from previous iteration.
   1460        aom_tbl2x4_s16(s3456, s789A, merge_block_tbl.val[0], s4567);
   1461        aom_tbl2x4_s16(s3456, s789A, merge_block_tbl.val[1], s5678);
   1462        aom_tbl2x4_s16(s3456, s789A, merge_block_tbl.val[2], s6789);
   1463 
   1464        uint16x8_t d0 =
   1465            highbd_convolve8_8_2d_v(s0123, s4567, y_filter, shift, offset, max);
   1466        uint16x8_t d1 =
   1467            highbd_convolve8_8_2d_v(s1234, s5678, y_filter, shift, offset, max);
   1468        uint16x8_t d2 =
   1469            highbd_convolve8_8_2d_v(s2345, s6789, y_filter, shift, offset, max);
   1470        uint16x8_t d3 =
   1471            highbd_convolve8_8_2d_v(s3456, s789A, y_filter, shift, offset, max);
   1472 
   1473        store_u16_8x4(d, dst_stride, d0, d1, d2, d3);
   1474 
   1475        // Prepare block for next iteration - re-using as much as possible.
   1476        // Shuffle everything up four rows.
   1477        s0123[0] = s4567[0];
   1478        s0123[1] = s4567[1];
   1479        s0123[2] = s4567[2];
   1480        s0123[3] = s4567[3];
   1481        s1234[0] = s5678[0];
   1482        s1234[1] = s5678[1];
   1483        s1234[2] = s5678[2];
   1484        s1234[3] = s5678[3];
   1485        s2345[0] = s6789[0];
   1486        s2345[1] = s6789[1];
   1487        s2345[2] = s6789[2];
   1488        s2345[3] = s6789[3];
   1489        s3456[0] = s789A[0];
   1490        s3456[1] = s789A[1];
   1491        s3456[2] = s789A[2];
   1492        s3456[3] = s789A[3];
   1493 
   1494        s += 4 * src_stride;
   1495        d += 4 * dst_stride;
   1496        h -= 4;
   1497      } while (h != 0);
   1498      src += 8;
   1499      dst += 8;
   1500      width -= 8;
   1501    } while (width != 0);
   1502  }
   1503 }
   1504 
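// Compute four output pixels of the vertical 4-tap pass with a single dot
// product per pair of rows.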
   1505 static inline uint16x4_t highbd_convolve4_4_2d_v(int16x8_t samples[2],
   1506                                                 int16x8_t filter,
   1507                                                 int32x4_t shift,
   1508                                                 int64x2_t offset,
   1509                                                 uint16x4_t max) {
   1510  int64x2_t sum01 = aom_svdot_lane_s16(offset, samples[0], filter, 0);
   1511  int64x2_t sum23 = aom_svdot_lane_s16(offset, samples[1], filter, 0);
   1512 
   1513  int32x4_t sum0123 = vcombine_s32(vmovn_s64(sum01), vmovn_s64(sum23));
   1514  sum0123 = vshlq_s32(sum0123, shift);
   1515 
   1516  uint16x4_t res = vqmovun_s32(sum0123);
   1517  return vmin_u16(res, max);
   1518 }
   1519 
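// Eight-pixel variant of the vertical 4-tap kernel above.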
   1520 static inline uint16x8_t highbd_convolve4_8_2d_v(int16x8_t samples[4],
   1521                                                 int16x8_t filter,
   1522                                                 int32x4_t shift,
   1523                                                 int64x2_t offset,
   1524                                                 uint16x8_t max) {
   1525  int64x2_t sum01 = aom_svdot_lane_s16(offset, samples[0], filter, 0);
   1526  int64x2_t sum23 = aom_svdot_lane_s16(offset, samples[1], filter, 0);
   1527  int64x2_t sum45 = aom_svdot_lane_s16(offset, samples[2], filter, 0);
   1528  int64x2_t sum67 = aom_svdot_lane_s16(offset, samples[3], filter, 0);
   1529 
   1530  int32x4_t sum0123 = vcombine_s32(vmovn_s64(sum01), vmovn_s64(sum23));
   1531  int32x4_t sum4567 = vcombine_s32(vmovn_s64(sum45), vmovn_s64(sum67));
   1532 
   1533  sum0123 = vshlq_s32(sum0123, shift);
   1534  sum4567 = vshlq_s32(sum4567, shift);
   1535 
   1536  uint16x8_t res = vcombine_u16(vqmovun_s32(sum0123), vqmovun_s32(sum4567));
   1537  return vminq_u16(res, max);
   1538 }
   1539 
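// Vertical pass of the 2D convolution for 4-tap filters. Only three rows of
// history are carried between iterations; each iteration loads four new rows
// and redoes the transpose/permute before computing four output rows.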
   1540 static void highbd_convolve_2d_sr_vert_4tap_sve2(
   1541    const uint16_t *src, ptrdiff_t src_stride, uint16_t *dst,
   1542    ptrdiff_t dst_stride, int width, int height, const int16_t *filter_y,
   1543    ConvolveParams *conv_params, int bd, const int y_offset) {
   1544  assert(width >= 4 && height >= 4);
   1545  const int64x2_t offset = vdupq_n_s64(y_offset);
   1546  const int32x4_t shift = vdupq_n_s32(-conv_params->round_1);
   1547 
   1548  const int16x8_t y_filter =
   1549      vcombine_s16(vld1_s16(filter_y + 2), vdup_n_s16(0));
   1550 
   1551  if (width == 4) {
   1552    const uint16x4_t max = vdup_n_u16((1 << bd) - 1);
   1553    int16_t *s = (int16_t *)(src);
   1554 
   1555    int16x4_t s0, s1, s2;
   1556    load_s16_4x3(s, src_stride, &s0, &s1, &s2);
   1557    s += 3 * src_stride;
   1558 
   1559    do {
   1560      int16x4_t s3, s4, s5, s6;
   1561      load_s16_4x4(s, src_stride, &s3, &s4, &s5, &s6);
   1562 
   1563      // This operation combines a conventional transpose and the sample permute
   1564      // required before computing the dot product.
   1565      int16x8_t s0123[2], s1234[2], s2345[2], s3456[2];
   1566      transpose_concat_elems_s16_4x4(s0, s1, s2, s3, s0123);
   1567      transpose_concat_elems_s16_4x4(s1, s2, s3, s4, s1234);
   1568      transpose_concat_elems_s16_4x4(s2, s3, s4, s5, s2345);
   1569      transpose_concat_elems_s16_4x4(s3, s4, s5, s6, s3456);
   1570 
   1571      uint16x4_t d0 =
   1572          highbd_convolve4_4_2d_v(s0123, y_filter, shift, offset, max);
   1573      uint16x4_t d1 =
   1574          highbd_convolve4_4_2d_v(s1234, y_filter, shift, offset, max);
   1575      uint16x4_t d2 =
   1576          highbd_convolve4_4_2d_v(s2345, y_filter, shift, offset, max);
   1577      uint16x4_t d3 =
   1578          highbd_convolve4_4_2d_v(s3456, y_filter, shift, offset, max);
   1579 
   1580      store_u16_4x4(dst, dst_stride, d0, d1, d2, d3);
   1581 
   1582      // Shuffle everything up four rows.
   1583      s0 = s4;
   1584      s1 = s5;
   1585      s2 = s6;
   1586 
   1587      s += 4 * src_stride;
   1588      dst += 4 * dst_stride;
   1589      height -= 4;
   1590    } while (height != 0);
   1591  } else {
   1592    const uint16x8_t max = vdupq_n_u16((1 << bd) - 1);
   1593 
   1594    do {
   1595      int h = height;
   1596      int16_t *s = (int16_t *)(src);
   1597      uint16_t *d = dst;
   1598 
   1599      int16x8_t s0, s1, s2;
   1600      load_s16_8x3(s, src_stride, &s0, &s1, &s2);
   1601      s += 3 * src_stride;
   1602 
   1603      do {
   1604        int16x8_t s3, s4, s5, s6;
   1605        load_s16_8x4(s, src_stride, &s3, &s4, &s5, &s6);
   1606 
   1607        // This operation combines a conventional transpose and the sample
   1608        // permute required before computing the dot product.
   1609        int16x8_t s0123[4], s1234[4], s2345[4], s3456[4];
   1610        transpose_concat_elems_s16_8x4(s0, s1, s2, s3, s0123);
   1611        transpose_concat_elems_s16_8x4(s1, s2, s3, s4, s1234);
   1612        transpose_concat_elems_s16_8x4(s2, s3, s4, s5, s2345);
   1613        transpose_concat_elems_s16_8x4(s3, s4, s5, s6, s3456);
   1614 
   1615        uint16x8_t d0 =
   1616            highbd_convolve4_8_2d_v(s0123, y_filter, shift, offset, max);
   1617        uint16x8_t d1 =
   1618            highbd_convolve4_8_2d_v(s1234, y_filter, shift, offset, max);
   1619        uint16x8_t d2 =
   1620            highbd_convolve4_8_2d_v(s2345, y_filter, shift, offset, max);
   1621        uint16x8_t d3 =
   1622            highbd_convolve4_8_2d_v(s3456, y_filter, shift, offset, max);
   1623 
   1624        store_u16_8x4(d, dst_stride, d0, d1, d2, d3);
   1625 
   1626        // Shuffle everything up four rows.
   1627        s0 = s4;
   1628        s1 = s5;
   1629        s2 = s6;
   1630 
   1631        s += 4 * src_stride;
   1632        d += 4 * dst_stride;
   1633        h -= 4;
   1634      } while (h != 0);
   1635      src += 8;
   1636      dst += 8;
   1637      width -= 8;
   1638    } while (width != 0);
   1639  }
   1640 }
   1641 
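// Top-level high-bitdepth 2D convolution: a horizontal pass into im_block
// followed by a vertical pass into dst. 2-wide/2-high blocks fall back to the
// C implementation and 6-tap filters to the Neon implementation; otherwise
// the filter taps select the 4-, 8- or 12-tap SVE2 kernels above.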
   1642 void av1_highbd_convolve_2d_sr_sve2(const uint16_t *src, int src_stride,
   1643                                    uint16_t *dst, int dst_stride, int w, int h,
   1644                                    const InterpFilterParams *filter_params_x,
   1645                                    const InterpFilterParams *filter_params_y,
   1646                                    const int subpel_x_qn,
   1647                                    const int subpel_y_qn,
   1648                                    ConvolveParams *conv_params, int bd) {
   1649  if (w == 2 || h == 2) {
   1650    av1_highbd_convolve_2d_sr_c(src, src_stride, dst, dst_stride, w, h,
   1651                                filter_params_x, filter_params_y, subpel_x_qn,
   1652                                subpel_y_qn, conv_params, bd);
   1653    return;
   1654  }
   1655 
   1656  DECLARE_ALIGNED(16, uint16_t,
   1657                  im_block[(MAX_SB_SIZE + MAX_FILTER_TAP) * MAX_SB_SIZE]);
   1658  const int x_filter_taps = get_filter_tap(filter_params_x, subpel_x_qn);
   1659  const int y_filter_taps = get_filter_tap(filter_params_y, subpel_y_qn);
   1660 
   1661  if (x_filter_taps == 6 || y_filter_taps == 6) {
   1662    av1_highbd_convolve_2d_sr_neon(src, src_stride, dst, dst_stride, w, h,
   1663                                   filter_params_x, filter_params_y,
   1664                                   subpel_x_qn, subpel_y_qn, conv_params, bd);
   1665    return;
   1666  }
   1667 
   1668  const int clamped_x_taps = x_filter_taps < 4 ? 4 : x_filter_taps;
   1669  const int clamped_y_taps = y_filter_taps < 4 ? 4 : y_filter_taps;
   1670 
   1671  const int im_stride = MAX_SB_SIZE;
   1672  const int vert_offset = clamped_y_taps / 2 - 1;
   1673  const int horiz_offset = clamped_x_taps / 2 - 1;
   1674  const int x_offset = (1 << (bd + FILTER_BITS - 1));
   1675  const int y_offset_bits = bd + 2 * FILTER_BITS - conv_params->round_0;
   1676  // The extra shim of (1 << (conv_params->round_1 - 1)) allows us to do a
   1677  // simple shift left instead of a rounding saturating shift left.
   1678  const int y_offset =
   1679      (1 << (conv_params->round_1 - 1)) - (1 << (y_offset_bits - 1));
   1680 
   1681  const uint16_t *src_ptr = src - vert_offset * src_stride - horiz_offset;
   1682 
   1683  const int16_t *x_filter_ptr = av1_get_interp_filter_subpel_kernel(
   1684      filter_params_x, subpel_x_qn & SUBPEL_MASK);
   1685  const int16_t *y_filter_ptr = av1_get_interp_filter_subpel_kernel(
   1686      filter_params_y, subpel_y_qn & SUBPEL_MASK);
   1687  const int im_h = h + clamped_y_taps - 1;
   1688 
   1689  if (x_filter_taps > 8) {
   1690    highbd_convolve_2d_sr_horiz_12tap_sve2(src_ptr, src_stride, im_block,
   1691                                           im_stride, w, im_h, x_filter_ptr,
   1692                                           conv_params, x_offset);
   1693 
   1694    highbd_convolve_2d_sr_vert_12tap_sve2(im_block, im_stride, dst, dst_stride,
   1695                                          w, h, y_filter_ptr, conv_params, bd,
   1696                                          y_offset);
   1697    return;
   1698  }
   1699 
   1700  if (x_filter_taps <= 4) {
   1701    highbd_convolve_2d_sr_horiz_4tap_sve2(src_ptr, src_stride, im_block,
   1702                                          im_stride, w, im_h, x_filter_ptr,
   1703                                          conv_params, x_offset);
   1704  } else {
   1705    highbd_convolve_2d_sr_horiz_8tap_sve2(src_ptr, src_stride, im_block,
   1706                                          im_stride, w, im_h, x_filter_ptr,
   1707                                          conv_params, x_offset);
   1708  }
   1709 
   1710  if (y_filter_taps <= 4) {
   1711    highbd_convolve_2d_sr_vert_4tap_sve2(im_block, im_stride, dst, dst_stride,
   1712                                         w, h, y_filter_ptr, conv_params, bd,
   1713                                         y_offset);
   1714  } else {
   1715    highbd_convolve_2d_sr_vert_8tap_sve2(im_block, im_stride, dst, dst_stride,
   1716                                         w, h, y_filter_ptr, conv_params, bd,
   1717                                         y_offset);
   1718  }
   1719 }