[ tor-browser ].git.dasho

highbd_convolve_rvv.c (71814B)
      1 /*
      2 * Copyright (c) 2025, Alliance for Open Media. All rights reserved.
      3 *
      4 * This source code is subject to the terms of the BSD 2 Clause License and
      5 * the Alliance for Open Media Patent License 1.0. If the BSD 2 Clause License
      6 * was not distributed with this source code in the LICENSE file, you can
      7 * obtain it at www.aomedia.org/license/software. If the Alliance for Open
      8 * Media Patent License 1.0 was not distributed with this source code in the
      9 * PATENTS file, you can obtain it at www.aomedia.org/license/patent.
     10 */
     11 
     12 #include <assert.h>
     13 
     14 #include "config/aom_config.h"
     15 #include "config/av1_rtcd.h"
     16 
     17 #include "aom_dsp/riscv/mem_rvv.h"
     18 #include "aom_ports/mem.h"
     19 #include "av1/common/filter.h"
     20 #include "av1/common/riscv/convolve_rvv.h"
     21 
     22 static inline vuint16mf2_t highbd_convolve6_4_y_rvv(
     23    const vint16mf2_t s0, const vint16mf2_t s1, const vint16mf2_t s2,
     24    const vint16mf2_t s3, const vint16mf2_t s4, const vint16mf2_t s5,
     25    const int16_t *filter, const uint16_t max, size_t vl) {
     26  // Values at indices 0 and 7 of y_filter are zero.
     27  vint32m1_t sum = __riscv_vwmul_vx_i32m1(s0, filter[1], vl);
     28  sum = __riscv_vwmacc_vx_i32m1(sum, filter[2], s1, vl);
     29  sum = __riscv_vwmacc_vx_i32m1(sum, filter[3], s2, vl);
     30  sum = __riscv_vwmacc_vx_i32m1(sum, filter[4], s3, vl);
     31  sum = __riscv_vwmacc_vx_i32m1(sum, filter[5], s4, vl);
     32  sum = __riscv_vwmacc_vx_i32m1(sum, filter[6], s5, vl);
     33 
     34  // Add rounding constant and shift
     35  sum = __riscv_vadd_vx_i32m1(sum, 1 << (COMPOUND_ROUND1_BITS - 1), vl);
     36 
     37  // Narrow result to 16-bit with rounding and saturation
     38  vint16mf2_t res = __riscv_vnsra_wx_i16mf2(sum, COMPOUND_ROUND1_BITS, vl);
     39 
     40  // Clamp result to max value
     41  vuint16mf2_t d0 =
     42      __riscv_vreinterpret_v_i16mf2_u16mf2(__riscv_vmax_vx_i16mf2(res, 0, vl));
     43  return __riscv_vminu_vx_u16mf2(d0, max, vl);
     44 }
     45 
     46 static inline vuint16m1_t highbd_convolve6_8_y_rvv(
     47    const vint16m1_t s0, const vint16m1_t s1, const vint16m1_t s2,
     48    const vint16m1_t s3, const vint16m1_t s4, const vint16m1_t s5,
     49    const int16_t *filter, const uint16_t max, size_t vl) {
     50  // Values at indices 0 and 7 of y_filter are zero.
     51  vint32m2_t sum = __riscv_vwmul_vx_i32m2(s0, filter[1], vl);
     52  sum = __riscv_vwmacc_vx_i32m2(sum, filter[2], s1, vl);
     53  sum = __riscv_vwmacc_vx_i32m2(sum, filter[3], s2, vl);
     54  sum = __riscv_vwmacc_vx_i32m2(sum, filter[4], s3, vl);
     55  sum = __riscv_vwmacc_vx_i32m2(sum, filter[5], s4, vl);
     56  sum = __riscv_vwmacc_vx_i32m2(sum, filter[6], s5, vl);
     57 
     58  // Add rounding constant and shift
     59  sum = __riscv_vadd_vx_i32m2(sum, 1 << (COMPOUND_ROUND1_BITS - 1), vl);
     60 
     61  // Narrow result to 16-bit with rounding and saturation
     62  vint16m1_t res = __riscv_vnsra_wx_i16m1(sum, COMPOUND_ROUND1_BITS, vl);
     63 
     64  // Clamp result to max value
     65  vuint16m1_t d0 =
     66      __riscv_vreinterpret_v_i16m1_u16m1(__riscv_vmax_vx_i16m1(res, 0, vl));
     67  return __riscv_vminu_vx_u16m1(d0, max, vl);
     68 }
     69 
     70 static inline void highbd_convolve_y_sr_6tap_rvv(
     71    const uint16_t *src_ptr, int src_stride, uint16_t *dst_ptr, int dst_stride,
     72    int w, int h, const int16_t *y_filter, int bd) {
     73  const uint16_t max = (1 << bd) - 1;
     74  size_t vl = __riscv_vsetvl_e16m1(w);
     75 
     76  if (w == 4) {
     77    const int16_t *s = (const int16_t *)(src_ptr + src_stride);
     78    uint16_t *d = dst_ptr;
     79 
     80    // Load initial 5 rows of data
     81    vint16mf2_t s0, s1, s2, s3, s4;
     82    load_s16_4x5(s, src_stride, &s0, &s1, &s2, &s3, &s4, vl);
     83    s += 5 * src_stride;
     84 
     85    do {
     86      // Load next 4 rows of data
     87      vint16mf2_t s5, s6, s7, s8;
     88      load_s16_4x4(s, src_stride, &s5, &s6, &s7, &s8, vl);
     89 
     90      // Perform 6-tap convolution for 4 rows
     91      vuint16mf2_t d0 =
     92          highbd_convolve6_4_y_rvv(s0, s1, s2, s3, s4, s5, y_filter, max, vl);
     93      vuint16mf2_t d1 =
     94          highbd_convolve6_4_y_rvv(s1, s2, s3, s4, s5, s6, y_filter, max, vl);
     95      vuint16mf2_t d2 =
     96          highbd_convolve6_4_y_rvv(s2, s3, s4, s5, s6, s7, y_filter, max, vl);
     97      vuint16mf2_t d3 =
     98          highbd_convolve6_4_y_rvv(s3, s4, s5, s6, s7, s8, y_filter, max, vl);
     99 
    100      // Store results
    101      store_u16_4x4(d, dst_stride, d0, d1, d2, d3, vl);
    102 
    103      // Update source pointers for next iteration
    104      s0 = __riscv_vmv_v_v_i16mf2(s4, vl);
    105      s1 = __riscv_vmv_v_v_i16mf2(s5, vl);
    106      s2 = __riscv_vmv_v_v_i16mf2(s6, vl);
    107      s3 = __riscv_vmv_v_v_i16mf2(s7, vl);
    108      s4 = __riscv_vmv_v_v_i16mf2(s8, vl);
    109 
    110      s += 4 * src_stride;
    111      d += 4 * dst_stride;
    112      h -= 4;
    113    } while (h != 0);
    114  } else {
    115    do {
    116      int height = h;
    117      const int16_t *s = (const int16_t *)(src_ptr + src_stride);
    118      uint16_t *d = dst_ptr;
    119 
    120      // Load initial 5 rows of data
    121      vint16m1_t s0, s1, s2, s3, s4;
    122      load_s16_8x5(s, src_stride, &s0, &s1, &s2, &s3, &s4, vl);
    123      s += 5 * src_stride;
    124 
    125      do {
    126        // Load next 4 rows of data
    127        vint16m1_t s5, s6, s7, s8;
    128        load_s16_8x4(s, src_stride, &s5, &s6, &s7, &s8, vl);
    129 
    130        // Perform 6-tap convolution for 4 rows
    131        vuint16m1_t d0 =
    132            highbd_convolve6_8_y_rvv(s0, s1, s2, s3, s4, s5, y_filter, max, vl);
    133        vuint16m1_t d1 =
    134            highbd_convolve6_8_y_rvv(s1, s2, s3, s4, s5, s6, y_filter, max, vl);
    135        vuint16m1_t d2 =
    136            highbd_convolve6_8_y_rvv(s2, s3, s4, s5, s6, s7, y_filter, max, vl);
    137        vuint16m1_t d3 =
    138            highbd_convolve6_8_y_rvv(s3, s4, s5, s6, s7, s8, y_filter, max, vl);
    139 
    140        // Store results
    141        store_u16_8x4(d, dst_stride, d0, d1, d2, d3, vl);
    142 
    143        // Update source pointers for next iteration
    144        s0 = __riscv_vmv_v_v_i16m1(s4, vl);
    145        s1 = __riscv_vmv_v_v_i16m1(s5, vl);
    146        s2 = __riscv_vmv_v_v_i16m1(s6, vl);
    147        s3 = __riscv_vmv_v_v_i16m1(s7, vl);
    148        s4 = __riscv_vmv_v_v_i16m1(s8, vl);
    149 
    150        s += 4 * src_stride;
    151        d += 4 * dst_stride;
    152        height -= 4;
    153      } while (height != 0);
    154 
    155      src_ptr += vl;
    156      dst_ptr += vl;
    157      w -= vl;
    158    } while (w > 0);
    159  }
    160 }
    161 
    162 static inline vuint16mf2_t highbd_convolve8_4_y_rvv(
    163    const vint16mf2_t s0, const vint16mf2_t s1, const vint16mf2_t s2,
    164    const vint16mf2_t s3, const vint16mf2_t s4, const vint16mf2_t s5,
    165    const vint16mf2_t s6, const vint16mf2_t s7, const int16_t *filter,
    166    const uint16_t max, size_t vl) {
    167  vint32m1_t sum = __riscv_vwmul_vx_i32m1(s0, filter[0], vl);
    168  sum = __riscv_vwmacc_vx_i32m1(sum, filter[1], s1, vl);
    169  sum = __riscv_vwmacc_vx_i32m1(sum, filter[2], s2, vl);
    170  sum = __riscv_vwmacc_vx_i32m1(sum, filter[3], s3, vl);
    171  sum = __riscv_vwmacc_vx_i32m1(sum, filter[4], s4, vl);
    172  sum = __riscv_vwmacc_vx_i32m1(sum, filter[5], s5, vl);
    173  sum = __riscv_vwmacc_vx_i32m1(sum, filter[6], s6, vl);
    174  sum = __riscv_vwmacc_vx_i32m1(sum, filter[7], s7, vl);
    175 
    176  // Convert to unsigned 16-bit with saturation
    177  vuint32m1_t d0 =
    178      __riscv_vreinterpret_v_i32m1_u32m1(__riscv_vmax_vx_i32m1(sum, 0, vl));
    179  vuint16mf2_t res =
    180      __riscv_vnclipu_wx_u16mf2(d0, COMPOUND_ROUND1_BITS, __RISCV_VXRM_RNU, vl);
    181 
    182  // Clamp to max
    183  return __riscv_vminu_vx_u16mf2(res, max, vl);
    184 }
    185 
    186 static inline vuint16m1_t highbd_convolve8_8_y_rvv(
    187    const vint16m1_t s0, const vint16m1_t s1, const vint16m1_t s2,
    188    const vint16m1_t s3, const vint16m1_t s4, const vint16m1_t s5,
    189    const vint16m1_t s6, const vint16m1_t s7, const int16_t *filter,
    190    const uint16_t max, size_t vl) {
    191  vint32m2_t sum = __riscv_vwmul_vx_i32m2(s0, filter[0], vl);
    192  sum = __riscv_vwmacc_vx_i32m2(sum, filter[1], s1, vl);
    193  sum = __riscv_vwmacc_vx_i32m2(sum, filter[2], s2, vl);
    194  sum = __riscv_vwmacc_vx_i32m2(sum, filter[3], s3, vl);
    195  sum = __riscv_vwmacc_vx_i32m2(sum, filter[4], s4, vl);
    196  sum = __riscv_vwmacc_vx_i32m2(sum, filter[5], s5, vl);
    197  sum = __riscv_vwmacc_vx_i32m2(sum, filter[6], s6, vl);
    198  sum = __riscv_vwmacc_vx_i32m2(sum, filter[7], s7, vl);
    199 
    200  // Convert to unsigned 16-bit with saturation
    201  vuint32m2_t d0 =
    202      __riscv_vreinterpret_v_i32m2_u32m2(__riscv_vmax_vx_i32m2(sum, 0, vl));
    203  vuint16m1_t res =
    204      __riscv_vnclipu_wx_u16m1(d0, COMPOUND_ROUND1_BITS, __RISCV_VXRM_RNU, vl);
    205 
    206  // Clamp to max
    207  return __riscv_vminu_vx_u16m1(res, max, vl);
    208 }
    209 
    210 static inline void highbd_convolve_y_sr_8tap_rvv(
    211    const uint16_t *src_ptr, int src_stride, uint16_t *dst_ptr, int dst_stride,
    212    int w, int h, const int16_t *y_filter, int bd) {
    213  const uint16_t max = (1 << bd) - 1;
    214  size_t vl = __riscv_vsetvl_e16m1(w);
    215 
    216  if (w == 4) {
    217    const int16_t *s = (const int16_t *)src_ptr;
    218    uint16_t *d = dst_ptr;
    219 
    220    // Load initial 7 rows of data
    221    vint16mf2_t s0, s1, s2, s3, s4, s5, s6;
    222    load_s16_4x7(s, src_stride, &s0, &s1, &s2, &s3, &s4, &s5, &s6, vl);
    223    s += 7 * src_stride;
    224 
    225    do {
    226      // Load next 4 rows of data
    227      vint16mf2_t s7, s8, s9, s10;
    228      load_s16_4x4(s, src_stride, &s7, &s8, &s9, &s10, vl);
    229 
    230      // Perform 8-tap convolution for 4 rows
    231      vuint16mf2_t d0 = highbd_convolve8_4_y_rvv(s0, s1, s2, s3, s4, s5, s6, s7,
    232                                                 y_filter, max, vl);
    233      vuint16mf2_t d1 = highbd_convolve8_4_y_rvv(s1, s2, s3, s4, s5, s6, s7, s8,
    234                                                 y_filter, max, vl);
    235      vuint16mf2_t d2 = highbd_convolve8_4_y_rvv(s2, s3, s4, s5, s6, s7, s8, s9,
    236                                                 y_filter, max, vl);
    237      vuint16mf2_t d3 = highbd_convolve8_4_y_rvv(s3, s4, s5, s6, s7, s8, s9,
    238                                                 s10, y_filter, max, vl);
    239 
    240      // Store results
    241      store_u16_4x4(d, dst_stride, d0, d1, d2, d3, vl);
    242 
    243      // Update source pointers for next iteration
    244      s0 = __riscv_vmv_v_v_i16mf2(s4, vl);
    245      s1 = __riscv_vmv_v_v_i16mf2(s5, vl);
    246      s2 = __riscv_vmv_v_v_i16mf2(s6, vl);
    247      s3 = __riscv_vmv_v_v_i16mf2(s7, vl);
    248      s4 = __riscv_vmv_v_v_i16mf2(s8, vl);
    249      s5 = __riscv_vmv_v_v_i16mf2(s9, vl);
    250      s6 = __riscv_vmv_v_v_i16mf2(s10, vl);
    251 
    252      s += 4 * src_stride;
    253      d += 4 * dst_stride;
    254      h -= 4;
    255    } while (h != 0);
    256  } else {
    257    do {
    258      int height = h;
    259      const int16_t *s = (const int16_t *)src_ptr;
    260      uint16_t *d = dst_ptr;
    261 
    262      // Load initial 7 rows of data
    263      vint16m1_t s0, s1, s2, s3, s4, s5, s6;
    264      load_s16_8x7(s, src_stride, &s0, &s1, &s2, &s3, &s4, &s5, &s6, vl);
    265      s += 7 * src_stride;
    266 
    267      do {
    268        // Load next 4 rows of data
    269        vint16m1_t s7, s8, s9, s10;
    270        load_s16_8x4(s, src_stride, &s7, &s8, &s9, &s10, vl);
    271 
    272        // Perform 8-tap convolution for 4 rows
    273        vuint16m1_t d0 = highbd_convolve8_8_y_rvv(s0, s1, s2, s3, s4, s5, s6,
    274                                                  s7, y_filter, max, vl);
    275        vuint16m1_t d1 = highbd_convolve8_8_y_rvv(s1, s2, s3, s4, s5, s6, s7,
    276                                                  s8, y_filter, max, vl);
    277        vuint16m1_t d2 = highbd_convolve8_8_y_rvv(s2, s3, s4, s5, s6, s7, s8,
    278                                                  s9, y_filter, max, vl);
    279        vuint16m1_t d3 = highbd_convolve8_8_y_rvv(s3, s4, s5, s6, s7, s8, s9,
    280                                                  s10, y_filter, max, vl);
    281 
    282        // Store results
    283        store_u16_8x4(d, dst_stride, d0, d1, d2, d3, vl);
    284 
    285        // Update source pointers for next iteration
    286        s0 = __riscv_vmv_v_v_i16m1(s4, vl);
    287        s1 = __riscv_vmv_v_v_i16m1(s5, vl);
    288        s2 = __riscv_vmv_v_v_i16m1(s6, vl);
    289        s3 = __riscv_vmv_v_v_i16m1(s7, vl);
    290        s4 = __riscv_vmv_v_v_i16m1(s8, vl);
    291        s5 = __riscv_vmv_v_v_i16m1(s9, vl);
    292        s6 = __riscv_vmv_v_v_i16m1(s10, vl);
    293 
    294        s += 4 * src_stride;
    295        d += 4 * dst_stride;
    296        height -= 4;
    297      } while (height != 0);
    298 
    299      src_ptr += vl;
    300      dst_ptr += vl;
    301      w -= vl;
    302    } while (w > 0);
    303  }
    304 }
    305 
    306 static inline vuint16mf2_t highbd_convolve12_4_y_rvv(
    307    const vint16mf2_t s0, const vint16mf2_t s1, const vint16mf2_t s2,
    308    const vint16mf2_t s3, const vint16mf2_t s4, const vint16mf2_t s5,
    309    const vint16mf2_t s6, const vint16mf2_t s7, const vint16mf2_t s8,
    310    const vint16mf2_t s9, const vint16mf2_t s10, const vint16mf2_t s11,
    311    const int16_t *filter, const uint16_t max, size_t vl) {
    312  vint32m1_t sum = __riscv_vwmul_vx_i32m1(s0, filter[0], vl);
    313  sum = __riscv_vwmacc_vx_i32m1(sum, filter[1], s1, vl);
    314  sum = __riscv_vwmacc_vx_i32m1(sum, filter[2], s2, vl);
    315  sum = __riscv_vwmacc_vx_i32m1(sum, filter[3], s3, vl);
    316  sum = __riscv_vwmacc_vx_i32m1(sum, filter[4], s4, vl);
    317  sum = __riscv_vwmacc_vx_i32m1(sum, filter[5], s5, vl);
    318  sum = __riscv_vwmacc_vx_i32m1(sum, filter[6], s6, vl);
    319  sum = __riscv_vwmacc_vx_i32m1(sum, filter[7], s7, vl);
    320  sum = __riscv_vwmacc_vx_i32m1(sum, filter[8], s8, vl);
    321  sum = __riscv_vwmacc_vx_i32m1(sum, filter[9], s9, vl);
    322  sum = __riscv_vwmacc_vx_i32m1(sum, filter[10], s10, vl);
    323  sum = __riscv_vwmacc_vx_i32m1(sum, filter[11], s11, vl);
    324 
    325  // Convert to unsigned 16-bit with saturation
    326  vuint32m1_t d0 =
    327      __riscv_vreinterpret_v_i32m1_u32m1(__riscv_vmax_vx_i32m1(sum, 0, vl));
    328  vuint16mf2_t res =
    329      __riscv_vnclipu_wx_u16mf2(d0, COMPOUND_ROUND1_BITS, __RISCV_VXRM_RNU, vl);
    330 
    331  // Clamp to max
    332  return __riscv_vminu_vx_u16mf2(res, max, vl);
    333 }
    334 
    335 static inline vuint16m1_t highbd_convolve12_8_y_rvv(
    336    const vint16m1_t s0, const vint16m1_t s1, const vint16m1_t s2,
    337    const vint16m1_t s3, const vint16m1_t s4, const vint16m1_t s5,
    338    const vint16m1_t s6, const vint16m1_t s7, const vint16m1_t s8,
    339    const vint16m1_t s9, const vint16m1_t s10, const vint16m1_t s11,
    340    const int16_t *filter, const uint16_t max, size_t vl) {
    341  vint32m2_t sum = __riscv_vwmul_vx_i32m2(s0, filter[0], vl);
    342  sum = __riscv_vwmacc_vx_i32m2(sum, filter[1], s1, vl);
    343  sum = __riscv_vwmacc_vx_i32m2(sum, filter[2], s2, vl);
    344  sum = __riscv_vwmacc_vx_i32m2(sum, filter[3], s3, vl);
    345  sum = __riscv_vwmacc_vx_i32m2(sum, filter[4], s4, vl);
    346  sum = __riscv_vwmacc_vx_i32m2(sum, filter[5], s5, vl);
    347  sum = __riscv_vwmacc_vx_i32m2(sum, filter[6], s6, vl);
    348  sum = __riscv_vwmacc_vx_i32m2(sum, filter[7], s7, vl);
    349  sum = __riscv_vwmacc_vx_i32m2(sum, filter[8], s8, vl);
    350  sum = __riscv_vwmacc_vx_i32m2(sum, filter[9], s9, vl);
    351  sum = __riscv_vwmacc_vx_i32m2(sum, filter[10], s10, vl);
    352  sum = __riscv_vwmacc_vx_i32m2(sum, filter[11], s11, vl);
    353 
    354  // Convert to unsigned 16-bit with saturation
    355  vuint32m2_t d0 =
    356      __riscv_vreinterpret_v_i32m2_u32m2(__riscv_vmax_vx_i32m2(sum, 0, vl));
    357  vuint16m1_t res =
    358      __riscv_vnclipu_wx_u16m1(d0, COMPOUND_ROUND1_BITS, __RISCV_VXRM_RNU, vl);
    359 
    360  // Clamp to max
    361  return __riscv_vminu_vx_u16m1(res, max, vl);
    362 }
    363 
    364 static inline void highbd_convolve_y_sr_12tap_rvv(
    365    const uint16_t *src_ptr, int src_stride, uint16_t *dst_ptr, int dst_stride,
    366    int w, int h, const int16_t *y_filter, int bd) {
    367  const uint16_t max = (1 << bd) - 1;
    368  size_t vl = __riscv_vsetvl_e16m1(w);
    369 
    370  if (w == 4) {
    371    const int16_t *s = (const int16_t *)src_ptr;
    372    uint16_t *d = dst_ptr;
    373 
    374    // Load initial 11 rows of data
    375    vint16mf2_t s0, s1, s2, s3, s4, s5, s6, s7, s8, s9, s10;
    376    load_s16_4x11(s, src_stride, &s0, &s1, &s2, &s3, &s4, &s5, &s6, &s7, &s8,
    377                  &s9, &s10, vl);
    378    s += 11 * src_stride;
    379 
    380    do {
    381      // Load next 4 rows of data
    382      vint16mf2_t s11, s12, s13, s14;
    383      load_s16_4x4(s, src_stride, &s11, &s12, &s13, &s14, vl);
    384 
    385      // Perform 12-tap convolution for 4 rows
    386      vuint16mf2_t d0 = highbd_convolve12_4_y_rvv(
    387          s0, s1, s2, s3, s4, s5, s6, s7, s8, s9, s10, s11, y_filter, max, vl);
    388      vuint16mf2_t d1 = highbd_convolve12_4_y_rvv(
    389          s1, s2, s3, s4, s5, s6, s7, s8, s9, s10, s11, s12, y_filter, max, vl);
    390      vuint16mf2_t d2 =
    391          highbd_convolve12_4_y_rvv(s2, s3, s4, s5, s6, s7, s8, s9, s10, s11,
    392                                    s12, s13, y_filter, max, vl);
    393      vuint16mf2_t d3 =
    394          highbd_convolve12_4_y_rvv(s3, s4, s5, s6, s7, s8, s9, s10, s11, s12,
    395                                    s13, s14, y_filter, max, vl);
    396 
    397      // Store results
    398      store_u16_4x4(d, dst_stride, d0, d1, d2, d3, vl);
    399 
    400      // Update source pointers for next iteration
    401      s0 = __riscv_vmv_v_v_i16mf2(s4, vl);
    402      s1 = __riscv_vmv_v_v_i16mf2(s5, vl);
    403      s2 = __riscv_vmv_v_v_i16mf2(s6, vl);
    404      s3 = __riscv_vmv_v_v_i16mf2(s7, vl);
    405      s4 = __riscv_vmv_v_v_i16mf2(s8, vl);
    406      s5 = __riscv_vmv_v_v_i16mf2(s9, vl);
    407      s6 = __riscv_vmv_v_v_i16mf2(s10, vl);
    408      s7 = __riscv_vmv_v_v_i16mf2(s11, vl);
    409      s8 = __riscv_vmv_v_v_i16mf2(s12, vl);
    410      s9 = __riscv_vmv_v_v_i16mf2(s13, vl);
    411      s10 = __riscv_vmv_v_v_i16mf2(s14, vl);
    412 
    413      s += 4 * src_stride;
    414      d += 4 * dst_stride;
    415      h -= 4;
    416    } while (h != 0);
    417  } else {
    418    do {
    419      int height = h;
    420      const int16_t *s = (const int16_t *)src_ptr;
    421      uint16_t *d = dst_ptr;
    422 
    423      // Load initial 11 rows of data
    424      vint16m1_t s0, s1, s2, s3, s4, s5, s6, s7, s8, s9, s10;
    425      load_s16_8x11(s, src_stride, &s0, &s1, &s2, &s3, &s4, &s5, &s6, &s7, &s8,
    426                    &s9, &s10, vl);
    427      s += 11 * src_stride;
    428 
    429      do {
    430        // Load next 4 rows of data
    431        vint16m1_t s11, s12, s13, s14;
    432        load_s16_8x4(s, src_stride, &s11, &s12, &s13, &s14, vl);
    433 
    434        // Perform 12-tap convolution for 4 rows
    435        vuint16m1_t d0 =
    436            highbd_convolve12_8_y_rvv(s0, s1, s2, s3, s4, s5, s6, s7, s8, s9,
    437                                      s10, s11, y_filter, max, vl);
    438        vuint16m1_t d1 =
    439            highbd_convolve12_8_y_rvv(s1, s2, s3, s4, s5, s6, s7, s8, s9, s10,
    440                                      s11, s12, y_filter, max, vl);
    441        vuint16m1_t d2 =
    442            highbd_convolve12_8_y_rvv(s2, s3, s4, s5, s6, s7, s8, s9, s10, s11,
    443                                      s12, s13, y_filter, max, vl);
    444        vuint16m1_t d3 =
    445            highbd_convolve12_8_y_rvv(s3, s4, s5, s6, s7, s8, s9, s10, s11, s12,
    446                                      s13, s14, y_filter, max, vl);
    447 
    448        // Store results
    449        store_u16_8x4(d, dst_stride, d0, d1, d2, d3, vl);
    450 
    451        // Update source pointers for next iteration
    452        s0 = __riscv_vmv_v_v_i16m1(s4, vl);
    453        s1 = __riscv_vmv_v_v_i16m1(s5, vl);
    454        s2 = __riscv_vmv_v_v_i16m1(s6, vl);
    455        s3 = __riscv_vmv_v_v_i16m1(s7, vl);
    456        s4 = __riscv_vmv_v_v_i16m1(s8, vl);
    457        s5 = __riscv_vmv_v_v_i16m1(s9, vl);
    458        s6 = __riscv_vmv_v_v_i16m1(s10, vl);
    459        s7 = __riscv_vmv_v_v_i16m1(s11, vl);
    460        s8 = __riscv_vmv_v_v_i16m1(s12, vl);
    461        s9 = __riscv_vmv_v_v_i16m1(s13, vl);
    462        s10 = __riscv_vmv_v_v_i16m1(s14, vl);
    463 
    464        s += 4 * src_stride;
    465        d += 4 * dst_stride;
    466        height -= 4;
    467      } while (height != 0);
    468 
    469      src_ptr += vl;
    470      dst_ptr += vl;
    471      w -= vl;
    472    } while (w > 0);
    473  }
    474 }
    475 
    476 void av1_highbd_convolve_y_sr_rvv(const uint16_t *src, int src_stride,
    477                                  uint16_t *dst, int dst_stride, int w, int h,
    478                                  const InterpFilterParams *filter_params_y,
    479                                  const int subpel_y_qn, int bd) {
    480  if (w == 2 || h == 2) {
    481    av1_highbd_convolve_y_sr_c(src, src_stride, dst, dst_stride, w, h,
    482                               filter_params_y, subpel_y_qn, bd);
    483    return;
    484  }
    485 
    486  const int y_filter_taps = get_filter_tap(filter_params_y, subpel_y_qn);
    487  const int vert_offset = filter_params_y->taps / 2 - 1;
    488  const int16_t *y_filter_ptr = av1_get_interp_filter_subpel_kernel(
    489      filter_params_y, subpel_y_qn & SUBPEL_MASK);
    490 
    491  src -= vert_offset * src_stride;
    492 
    493  if (y_filter_taps > 8) {
    494    highbd_convolve_y_sr_12tap_rvv(src, src_stride, dst, dst_stride, w, h,
    495                                   y_filter_ptr, bd);
    496    return;
    497  }
    498  if (y_filter_taps < 8) {
    499    highbd_convolve_y_sr_6tap_rvv(src, src_stride, dst, dst_stride, w, h,
    500                                  y_filter_ptr, bd);
    501    return;
    502  }
    503 
    504  highbd_convolve_y_sr_8tap_rvv(src, src_stride, dst, dst_stride, w, h,
    505                                y_filter_ptr, bd);
    506 }
    507 
    508 static inline vuint16m1_t highbd_convolve6_8_x_rvv(
    509    const vint16m1_t s0, const vint16m1_t s1, const vint16m1_t s2,
    510    const vint16m1_t s3, const vint16m1_t s4, const vint16m1_t s5,
    511    const int16_t *filter, const int32_t offset, const uint16_t max,
    512    size_t vl) {
    513  // Values at indices 0 and 7 of y_filter are zero.
    514  vint32m2_t sum = __riscv_vwmul_vx_i32m2(s0, filter[1], vl);
    515  sum = __riscv_vwmacc_vx_i32m2(sum, filter[2], s1, vl);
    516  sum = __riscv_vwmacc_vx_i32m2(sum, filter[3], s2, vl);
    517  sum = __riscv_vwmacc_vx_i32m2(sum, filter[4], s3, vl);
    518  sum = __riscv_vwmacc_vx_i32m2(sum, filter[5], s4, vl);
    519  sum = __riscv_vwmacc_vx_i32m2(sum, filter[6], s5, vl);
    520 
    521  // Add rounding constant and offset
    522  sum = __riscv_vadd_vx_i32m2(sum, (1 << (FILTER_BITS - 1)) + offset, vl);
    523 
    524  // Narrow result to 16-bit with rounding and saturation
    525  vint16m1_t res = __riscv_vnsra_wx_i16m1(sum, FILTER_BITS, vl);
    526 
    527  // Clamp result to max value
    528  vuint16m1_t d0 =
    529      __riscv_vreinterpret_v_i16m1_u16m1(__riscv_vmax_vx_i16m1(res, 0, vl));
    530  return __riscv_vminu_vx_u16m1(d0, max, vl);
    531 }
    532 
    533 static inline void highbd_convolve_x_sr_6tap_rvv(
    534    const uint16_t *src_ptr, int src_stride, uint16_t *dst_ptr, int dst_stride,
    535    int w, int h, const int16_t *x_filter, ConvolveParams *conv_params,
    536    int bd) {
    537  const uint16_t max = (1 << bd) - 1;
    538  // This shim allows to do only one rounding shift instead of two.
    539  const int32_t offset = 1 << (conv_params->round_0 - 1);
    540 
    541  int height = h;
    542  size_t vl = __riscv_vsetvl_e16m1(w);
    543 
    544  do {
    545    int width = w;
    546    const int16_t *s = (const int16_t *)src_ptr;
    547    uint16_t *d = dst_ptr;
    548 
    549    do {
    550      vint16m1_t s00, s01, s02, s03, s04, s05;
    551      vint16m1_t s10, s11, s12, s13, s14, s15;
    552      vint16m1_t s20, s21, s22, s23, s24, s25;
    553      vint16m1_t s30, s31, s32, s33, s34, s35;
    554 
    555      // Load 6 elements for each of 4 rows
    556      load_s16_8x6(s + 0 * src_stride, 1, &s00, &s01, &s02, &s03, &s04, &s05,
    557                   vl);
    558      load_s16_8x6(s + 1 * src_stride, 1, &s10, &s11, &s12, &s13, &s14, &s15,
    559                   vl);
    560      load_s16_8x6(s + 2 * src_stride, 1, &s20, &s21, &s22, &s23, &s24, &s25,
    561                   vl);
    562      load_s16_8x6(s + 3 * src_stride, 1, &s30, &s31, &s32, &s33, &s34, &s35,
    563                   vl);
    564 
    565      // Perform convolution
    566      vuint16m1_t d0 = highbd_convolve6_8_x_rvv(s00, s01, s02, s03, s04, s05,
    567                                                x_filter, offset, max, vl);
    568      vuint16m1_t d1 = highbd_convolve6_8_x_rvv(s10, s11, s12, s13, s14, s15,
    569                                                x_filter, offset, max, vl);
    570      vuint16m1_t d2 = highbd_convolve6_8_x_rvv(s20, s21, s22, s23, s24, s25,
    571                                                x_filter, offset, max, vl);
    572      vuint16m1_t d3 = highbd_convolve6_8_x_rvv(s30, s31, s32, s33, s34, s35,
    573                                                x_filter, offset, max, vl);
    574 
    575      // Store results
    576      store_u16_8x4(d, dst_stride, d0, d1, d2, d3, vl);
    577 
    578      s += vl;
    579      d += vl;
    580      width -= vl;
    581    } while (width > 0);
    582 
    583    src_ptr += 4 * src_stride;
    584    dst_ptr += 4 * dst_stride;
    585    height -= 4;
    586  } while (height != 0);
    587 }
    588 
    589 static inline vuint16mf2_t highbd_convolve4_4_x_rvv(
    590    const vint16mf2_t s0, const vint16mf2_t s1, const vint16mf2_t s2,
    591    const vint16mf2_t s3, const int16_t *filter, const int32_t offset,
    592    const uint16_t max, size_t vl) {
    593  vint32m1_t sum = __riscv_vwmul_vx_i32m1(s0, filter[0], vl);
    594  sum = __riscv_vwmacc_vx_i32m1(sum, filter[1], s1, vl);
    595  sum = __riscv_vwmacc_vx_i32m1(sum, filter[2], s2, vl);
    596  sum = __riscv_vwmacc_vx_i32m1(sum, filter[3], s3, vl);
    597 
    598  // Add rounding constant and offset
    599  sum = __riscv_vadd_vx_i32m1(sum, (1 << (FILTER_BITS - 1)) + offset, vl);
    600 
    601  // Narrow result to 16-bit with rounding and saturation
    602  vint16mf2_t res = __riscv_vnsra_wx_i16mf2(sum, FILTER_BITS, vl);
    603 
    604  // Clamp result to max value
    605  vuint16mf2_t d0 =
    606      __riscv_vreinterpret_v_i16mf2_u16mf2(__riscv_vmax_vx_i16mf2(res, 0, vl));
    607  return __riscv_vminu_vx_u16mf2(d0, max, vl);
    608 }
    609 
    610 static inline vuint16m1_t highbd_convolve8_8_x_rvv(
    611    const vint16m1_t s0, const vint16m1_t s1, const vint16m1_t s2,
    612    const vint16m1_t s3, const vint16m1_t s4, const vint16m1_t s5,
    613    const vint16m1_t s6, const vint16m1_t s7, const int16_t *filter,
    614    const int32_t offset, const uint16_t max, size_t vl) {
    615  vint32m2_t sum = __riscv_vwmul_vx_i32m2(s0, filter[0], vl);
    616  sum = __riscv_vwmacc_vx_i32m2(sum, filter[1], s1, vl);
    617  sum = __riscv_vwmacc_vx_i32m2(sum, filter[2], s2, vl);
    618  sum = __riscv_vwmacc_vx_i32m2(sum, filter[3], s3, vl);
    619  sum = __riscv_vwmacc_vx_i32m2(sum, filter[4], s4, vl);
    620  sum = __riscv_vwmacc_vx_i32m2(sum, filter[5], s5, vl);
    621  sum = __riscv_vwmacc_vx_i32m2(sum, filter[6], s6, vl);
    622  sum = __riscv_vwmacc_vx_i32m2(sum, filter[7], s7, vl);
    623 
    624  sum = __riscv_vwadd_wx_i32m2(sum, offset, vl);
    625 
    626  // Convert to unsigned 16-bit with saturation
    627  vuint32m2_t d0 =
    628      __riscv_vreinterpret_v_i32m2_u32m2(__riscv_vmax_vx_i32m2(sum, 0, vl));
    629  vuint16m1_t res =
    630      __riscv_vnclipu_wx_u16m1(d0, FILTER_BITS, __RISCV_VXRM_RNU, vl);
    631 
    632  // Clamp to max
    633  return __riscv_vminu_vx_u16m1(res, max, vl);
    634 }
    635 
    636 static inline void highbd_convolve_x_sr_rvv(const uint16_t *src_ptr,
    637                                            int src_stride, uint16_t *dst_ptr,
    638                                            int dst_stride, int w, int h,
    639                                            const int16_t *x_filter,
    640                                            ConvolveParams *conv_params,
    641                                            int bd) {
    642  // This shim allows to do only one rounding shift instead of two.
    643  const int32_t offset = 1 << (conv_params->round_0 - 1);
    644  const uint16_t max = (1 << bd) - 1;
    645  size_t vl = __riscv_vsetvl_e16m1(w);
    646 
    647  if (w == 4) {
    648    // 4-tap filters are used for blocks having width == 4.
    649    const int16_t *s = (const int16_t *)(src_ptr + 2);
    650    uint16_t *d = dst_ptr;
    651    const int16_t *x_filter_ptr = x_filter + 2;
    652 
    653    do {
    654      vint16mf2_t s00, s01, s02, s03;
    655      vint16mf2_t s10, s11, s12, s13;
    656      vint16mf2_t s20, s21, s22, s23;
    657      vint16mf2_t s30, s31, s32, s33;
    658 
    659      // Load pixels from each of 4 rows
    660      load_s16_4x4(s + 0 * src_stride, 1, &s00, &s01, &s02, &s03, vl);
    661      load_s16_4x4(s + 1 * src_stride, 1, &s10, &s11, &s12, &s13, vl);
    662      load_s16_4x4(s + 2 * src_stride, 1, &s20, &s21, &s22, &s23, vl);
    663      load_s16_4x4(s + 3 * src_stride, 1, &s30, &s31, &s32, &s33, vl);
    664 
    665      // Perform convolution for 4 rows
    666      vuint16mf2_t d0 = highbd_convolve4_4_x_rvv(s00, s01, s02, s03,
    667                                                 x_filter_ptr, offset, max, vl);
    668      vuint16mf2_t d1 = highbd_convolve4_4_x_rvv(s10, s11, s12, s13,
    669                                                 x_filter_ptr, offset, max, vl);
    670      vuint16mf2_t d2 = highbd_convolve4_4_x_rvv(s20, s21, s22, s23,
    671                                                 x_filter_ptr, offset, max, vl);
    672      vuint16mf2_t d3 = highbd_convolve4_4_x_rvv(s30, s31, s32, s33,
    673                                                 x_filter_ptr, offset, max, vl);
    674 
    675      // Store results
    676      store_u16_4x4(d, dst_stride, d0, d1, d2, d3, vl);
    677 
    678      s += 4 * src_stride;
    679      d += 4 * dst_stride;
    680      h -= 4;
    681    } while (h != 0);
    682  } else {
    683    int height = h;
    684    do {
    685      int width = w;
    686      const int16_t *s = (const int16_t *)src_ptr;
    687      uint16_t *d = dst_ptr;
    688 
    689      do {
    690        vint16m1_t s00, s01, s02, s03, s04, s05, s06, s07;
    691        vint16m1_t s10, s11, s12, s13, s14, s15, s16, s17;
    692        vint16m1_t s20, s21, s22, s23, s24, s25, s26, s27;
    693        vint16m1_t s30, s31, s32, s33, s34, s35, s36, s37;
    694 
    695        // Load elements for each of 4 rows
    696        load_s16_8x8(s + 0 * src_stride, 1, &s00, &s01, &s02, &s03, &s04, &s05,
    697                     &s06, &s07, vl);
    698        load_s16_8x8(s + 1 * src_stride, 1, &s10, &s11, &s12, &s13, &s14, &s15,
    699                     &s16, &s17, vl);
    700        load_s16_8x8(s + 2 * src_stride, 1, &s20, &s21, &s22, &s23, &s24, &s25,
    701                     &s26, &s27, vl);
    702        load_s16_8x8(s + 3 * src_stride, 1, &s30, &s31, &s32, &s33, &s34, &s35,
    703                     &s36, &s37, vl);
    704 
    705        // Perform convolution
    706        vuint16m1_t d0 = highbd_convolve8_8_x_rvv(
    707            s00, s01, s02, s03, s04, s05, s06, s07, x_filter, offset, max, vl);
    708        vuint16m1_t d1 = highbd_convolve8_8_x_rvv(
    709            s10, s11, s12, s13, s14, s15, s16, s17, x_filter, offset, max, vl);
    710        vuint16m1_t d2 = highbd_convolve8_8_x_rvv(
    711            s20, s21, s22, s23, s24, s25, s26, s27, x_filter, offset, max, vl);
    712        vuint16m1_t d3 = highbd_convolve8_8_x_rvv(
    713            s30, s31, s32, s33, s34, s35, s36, s37, x_filter, offset, max, vl);
    714 
    715        // Store results
    716        store_u16_8x4(d, dst_stride, d0, d1, d2, d3, vl);
    717 
    718        s += vl;
    719        d += vl;
    720        width -= vl;
    721      } while (width > 0);
    722 
    723      src_ptr += 4 * src_stride;
    724      dst_ptr += 4 * dst_stride;
    725      height -= 4;
    726    } while (height != 0);
    727  }
    728 }
    729 
    730 static inline vuint16mf2_t highbd_convolve12_4_x_rvv(
    731    const vint16mf2_t s0, const vint16mf2_t s1, const vint16mf2_t s2,
    732    const vint16mf2_t s3, const vint16mf2_t s4, const vint16mf2_t s5,
    733    const vint16mf2_t s6, const vint16mf2_t s7, const vint16mf2_t s8,
    734    const vint16mf2_t s9, const vint16mf2_t s10, const vint16mf2_t s11,
    735    const int16_t *filter, const int32_t offset, const uint16_t max,
    736    size_t vl) {
    737  vint32m1_t sum = __riscv_vwmul_vx_i32m1(s0, filter[0], vl);
    738  sum = __riscv_vwmacc_vx_i32m1(sum, filter[1], s1, vl);
    739  sum = __riscv_vwmacc_vx_i32m1(sum, filter[2], s2, vl);
    740  sum = __riscv_vwmacc_vx_i32m1(sum, filter[3], s3, vl);
    741  sum = __riscv_vwmacc_vx_i32m1(sum, filter[4], s4, vl);
    742  sum = __riscv_vwmacc_vx_i32m1(sum, filter[5], s5, vl);
    743  sum = __riscv_vwmacc_vx_i32m1(sum, filter[6], s6, vl);
    744  sum = __riscv_vwmacc_vx_i32m1(sum, filter[7], s7, vl);
    745  sum = __riscv_vwmacc_vx_i32m1(sum, filter[8], s8, vl);
    746  sum = __riscv_vwmacc_vx_i32m1(sum, filter[9], s9, vl);
    747  sum = __riscv_vwmacc_vx_i32m1(sum, filter[10], s10, vl);
    748  sum = __riscv_vwmacc_vx_i32m1(sum, filter[11], s11, vl);
    749  sum = __riscv_vwadd_wx_i32m1(sum, offset, vl);
    750 
    751  // Convert to unsigned 16-bit with saturation
    752  vuint32m1_t d0 =
    753      __riscv_vreinterpret_v_i32m1_u32m1(__riscv_vmax_vx_i32m1(sum, 0, vl));
    754  vuint16mf2_t res =
    755      __riscv_vnclipu_wx_u16mf2(d0, FILTER_BITS, __RISCV_VXRM_RNU, vl);
    756 
    757  // Clamp to max
    758  return __riscv_vminu_vx_u16mf2(res, max, vl);
    759 }
    760 
    761 static inline vuint16m1_t highbd_convolve12_8_x_rvv(
    762    const vint16m1_t s0, const vint16m1_t s1, const vint16m1_t s2,
    763    const vint16m1_t s3, const vint16m1_t s4, const vint16m1_t s5,
    764    const vint16m1_t s6, const vint16m1_t s7, const vint16m1_t s8,
    765    const vint16m1_t s9, const vint16m1_t s10, const vint16m1_t s11,
    766    const int16_t *filter, const int32_t offset, const uint16_t max,
    767    size_t vl) {
    768  vint32m2_t sum = __riscv_vwmul_vx_i32m2(s0, filter[0], vl);
    769  sum = __riscv_vwmacc_vx_i32m2(sum, filter[1], s1, vl);
    770  sum = __riscv_vwmacc_vx_i32m2(sum, filter[2], s2, vl);
    771  sum = __riscv_vwmacc_vx_i32m2(sum, filter[3], s3, vl);
    772  sum = __riscv_vwmacc_vx_i32m2(sum, filter[4], s4, vl);
    773  sum = __riscv_vwmacc_vx_i32m2(sum, filter[5], s5, vl);
    774  sum = __riscv_vwmacc_vx_i32m2(sum, filter[6], s6, vl);
    775  sum = __riscv_vwmacc_vx_i32m2(sum, filter[7], s7, vl);
    776  sum = __riscv_vwmacc_vx_i32m2(sum, filter[8], s8, vl);
    777  sum = __riscv_vwmacc_vx_i32m2(sum, filter[9], s9, vl);
    778  sum = __riscv_vwmacc_vx_i32m2(sum, filter[10], s10, vl);
    779  sum = __riscv_vwmacc_vx_i32m2(sum, filter[11], s11, vl);
    780  sum = __riscv_vwadd_wx_i32m2(sum, offset, vl);
    781 
    782  // Convert to unsigned 16-bit with saturation
    783  vuint32m2_t d0 =
    784      __riscv_vreinterpret_v_i32m2_u32m2(__riscv_vmax_vx_i32m2(sum, 0, vl));
    785  vuint16m1_t res =
    786      __riscv_vnclipu_wx_u16m1(d0, FILTER_BITS, __RISCV_VXRM_RNU, vl);
    787 
    788  // Clamp to max
    789  return __riscv_vminu_vx_u16m1(res, max, vl);
    790 }
    791 
    792 static inline void highbd_convolve_x_sr_12tap_rvv(
    793    const uint16_t *src_ptr, int src_stride, uint16_t *dst_ptr, int dst_stride,
    794    int w, int h, const int16_t *x_filter, ConvolveParams *conv_params,
    795    int bd) {
    796  // This shim allows to do only one rounding shift instead of two.
    797  const int32_t offset = 1 << (conv_params->round_0 - 1);
    798  const uint16_t max = (1 << bd) - 1;
    799  size_t vl = __riscv_vsetvl_e16m1(w);
    800 
    801  if (w == 4) {
    802    const int16_t *s = (const int16_t *)src_ptr;
    803    uint16_t *d = dst_ptr;
    804 
    805    do {
    806      vint16mf2_t s00, s01, s02, s03, s04, s05, s06, s07, s08, s09, s010, s011;
    807      vint16mf2_t s10, s11, s12, s13, s14, s15, s16, s17, s18, s19, s110, s111;
    808      vint16mf2_t s20, s21, s22, s23, s24, s25, s26, s27, s28, s29, s210, s211;
    809      vint16mf2_t s30, s31, s32, s33, s34, s35, s36, s37, s38, s39, s310, s311;
    810 
    811      // Load elements for each of 4 rows
    812      load_s16_4x12(s + 0 * src_stride, 1, &s00, &s01, &s02, &s03, &s04, &s05,
    813                    &s06, &s07, &s08, &s09, &s010, &s011, vl);
    814      load_s16_4x12(s + 1 * src_stride, 1, &s10, &s11, &s12, &s13, &s14, &s15,
    815                    &s16, &s17, &s18, &s19, &s110, &s111, vl);
    816      load_s16_4x12(s + 2 * src_stride, 1, &s20, &s21, &s22, &s23, &s24, &s25,
    817                    &s26, &s27, &s28, &s29, &s210, &s211, vl);
    818      load_s16_4x12(s + 3 * src_stride, 1, &s30, &s31, &s32, &s33, &s34, &s35,
    819                    &s36, &s37, &s38, &s39, &s310, &s311, vl);
    820 
    821      // Perform convolution
    822      vuint16mf2_t d0 =
    823          highbd_convolve12_4_x_rvv(s00, s01, s02, s03, s04, s05, s06, s07, s08,
    824                                    s09, s010, s011, x_filter, offset, max, vl);
    825      vuint16mf2_t d1 =
    826          highbd_convolve12_4_x_rvv(s10, s11, s12, s13, s14, s15, s16, s17, s18,
    827                                    s19, s110, s111, x_filter, offset, max, vl);
    828      vuint16mf2_t d2 =
    829          highbd_convolve12_4_x_rvv(s20, s21, s22, s23, s24, s25, s26, s27, s28,
    830                                    s29, s210, s211, x_filter, offset, max, vl);
    831      vuint16mf2_t d3 =
    832          highbd_convolve12_4_x_rvv(s30, s31, s32, s33, s34, s35, s36, s37, s38,
    833                                    s39, s310, s311, x_filter, offset, max, vl);
    834 
    835      // Store results
    836      store_u16_4x4(d, dst_stride, d0, d1, d2, d3, vl);
    837 
    838      s += 4 * src_stride;
    839      d += 4 * dst_stride;
    840      h -= 4;
    841    } while (h != 0);
    842  } else {
    843    int height = h;
    844    do {
    845      const int16_t *s = (const int16_t *)src_ptr;
    846      uint16_t *d = dst_ptr;
    847      int width = w;
    848 
    849      do {
    850        vint16m1_t s00, s01, s02, s03, s04, s05, s06, s07, s08, s09, s010, s011;
    851        vint16m1_t s10, s11, s12, s13, s14, s15, s16, s17, s18, s19, s110, s111;
    852        vint16m1_t s20, s21, s22, s23, s24, s25, s26, s27, s28, s29, s210, s211;
    853        vint16m1_t s30, s31, s32, s33, s34, s35, s36, s37, s38, s39, s310, s311;
    854 
    855        // Load elements for each of 4 rows
    856        load_s16_8x12(s + 0 * src_stride, 1, &s00, &s01, &s02, &s03, &s04, &s05,
    857                      &s06, &s07, &s08, &s09, &s010, &s011, vl);
    858        load_s16_8x12(s + 1 * src_stride, 1, &s10, &s11, &s12, &s13, &s14, &s15,
    859                      &s16, &s17, &s18, &s19, &s110, &s111, vl);
    860        load_s16_8x12(s + 2 * src_stride, 1, &s20, &s21, &s22, &s23, &s24, &s25,
    861                      &s26, &s27, &s28, &s29, &s210, &s211, vl);
    862        load_s16_8x12(s + 3 * src_stride, 1, &s30, &s31, &s32, &s33, &s34, &s35,
    863                      &s36, &s37, &s38, &s39, &s310, &s311, vl);
    864 
    865        // Perform convolution
    866        vuint16m1_t d0 = highbd_convolve12_8_x_rvv(
    867            s00, s01, s02, s03, s04, s05, s06, s07, s08, s09, s010, s011,
    868            x_filter, offset, max, vl);
    869        vuint16m1_t d1 = highbd_convolve12_8_x_rvv(
    870            s10, s11, s12, s13, s14, s15, s16, s17, s18, s19, s110, s111,
    871            x_filter, offset, max, vl);
    872        vuint16m1_t d2 = highbd_convolve12_8_x_rvv(
    873            s20, s21, s22, s23, s24, s25, s26, s27, s28, s29, s210, s211,
    874            x_filter, offset, max, vl);
    875        vuint16m1_t d3 = highbd_convolve12_8_x_rvv(
    876            s30, s31, s32, s33, s34, s35, s36, s37, s38, s39, s310, s311,
    877            x_filter, offset, max, vl);
    878 
    879        // Store results
    880        store_u16_8x4(d, dst_stride, d0, d1, d2, d3, vl);
    881 
    882        s += vl;
    883        d += vl;
    884        width -= vl;
    885      } while (width > 0);
    886 
    887      src_ptr += 4 * src_stride;
    888      dst_ptr += 4 * dst_stride;
    889      height -= 4;
    890    } while (height != 0);
    891  }
    892 }
    893 
    894 void av1_highbd_convolve_x_sr_rvv(const uint16_t *src, int src_stride,
    895                                  uint16_t *dst, int dst_stride, int w, int h,
    896                                  const InterpFilterParams *filter_params_x,
    897                                  const int subpel_x_qn,
    898                                  ConvolveParams *conv_params, int bd) {
    899  if (w == 2 || h == 2) {
    900    av1_highbd_convolve_x_sr_c(src, src_stride, dst, dst_stride, w, h,
    901                               filter_params_x, subpel_x_qn, conv_params, bd);
    902    return;
    903  }
    904  const int x_filter_taps = get_filter_tap(filter_params_x, subpel_x_qn);
    905  const int horiz_offset = filter_params_x->taps / 2 - 1;
    906  const int16_t *x_filter_ptr = av1_get_interp_filter_subpel_kernel(
    907      filter_params_x, subpel_x_qn & SUBPEL_MASK);
    908 
    909  src -= horiz_offset;
    910 
    911  if (x_filter_taps > 8) {
    912    highbd_convolve_x_sr_12tap_rvv(src, src_stride, dst, dst_stride, w, h,
    913                                   x_filter_ptr, conv_params, bd);
    914    return;
    915  }
    916  if (x_filter_taps <= 6 && w != 4) {
    917    highbd_convolve_x_sr_6tap_rvv(src + 1, src_stride, dst, dst_stride, w, h,
    918                                  x_filter_ptr, conv_params, bd);
    919    return;
    920  }
    921 
    922  highbd_convolve_x_sr_rvv(src, src_stride, dst, dst_stride, w, h, x_filter_ptr,
    923                           conv_params, bd);
    924 }
    925 
    926 // store_strided_u16_4xN
    927 static inline void store_strided_u16_4xN(uint16_t *addr, vuint16m1_t vdst,
    928                                         ptrdiff_t stride, size_t vl) {
    929  __riscv_vse16_v_u16m1(addr, vdst, vl >> 1);
    930  vdst = __riscv_vslidedown_vx_u16m1(vdst, vl >> 1, vl);
    931  __riscv_vse16_v_u16m1(addr + stride, vdst, vl >> 1);
    932 }
    933 
    934 static inline vuint16m1_t highbd_convolve12_2d_v_rvv(
    935    const vint16m1_t s0, const vint16m1_t s1, const vint16m1_t s2,
    936    const vint16m1_t s3, const vint16m1_t s4, const vint16m1_t s5,
    937    const vint16m1_t s6, const vint16m1_t s7, const vint16m1_t s8,
    938    const vint16m1_t s9, const vint16m1_t s10, const vint16m1_t s11,
    939    const int16_t *y_filter, const int32_t offset, const int32_t shift,
    940    const uint16_t max, size_t vl) {
    941  vint32m2_t sum = __riscv_vwmul_vx_i32m2(s0, y_filter[0], vl);
    942  sum = __riscv_vwmacc_vx_i32m2(sum, y_filter[1], s1, vl);
    943  sum = __riscv_vwmacc_vx_i32m2(sum, y_filter[2], s2, vl);
    944  sum = __riscv_vwmacc_vx_i32m2(sum, y_filter[3], s3, vl);
    945  sum = __riscv_vwmacc_vx_i32m2(sum, y_filter[4], s4, vl);
    946  sum = __riscv_vwmacc_vx_i32m2(sum, y_filter[5], s5, vl);
    947  sum = __riscv_vwmacc_vx_i32m2(sum, y_filter[6], s6, vl);
    948  sum = __riscv_vwmacc_vx_i32m2(sum, y_filter[7], s7, vl);
    949  sum = __riscv_vwmacc_vx_i32m2(sum, y_filter[8], s8, vl);
    950  sum = __riscv_vwmacc_vx_i32m2(sum, y_filter[9], s9, vl);
    951  sum = __riscv_vwmacc_vx_i32m2(sum, y_filter[10], s10, vl);
    952  sum = __riscv_vwmacc_vx_i32m2(sum, y_filter[11], s11, vl);
    953  sum = __riscv_vadd_vx_i32m2(sum, offset, vl);
    954 
    955  vint16m1_t i16_sum = __riscv_vnsra_wx_i16m1(sum, shift, vl);
    956  vint16m1_t iclip_sum =
    957      __riscv_vmin_vx_i16m1(__riscv_vmax_vx_i16m1(i16_sum, 0, vl), max, vl);
    958  return __riscv_vreinterpret_v_i16m1_u16m1(iclip_sum);
    959 }
    960 
    961 static inline void highbd_convolve_2d_sr_vert_12tap_rvv(
    962    const uint16_t *src_ptr, int src_stride, uint16_t *dst_ptr, int dst_stride,
    963    int w, int h, const int16_t *y_filter_ptr, ConvolveParams *conv_params,
    964    const int bd, const int offset, size_t vl) {
    965  const int32_t shift_s32 = conv_params->round_1;
    966  const int32_t offset_s32 = offset;
    967  const uint16_t max_u16 = (1 << bd) - 1;
    968 
    969  if (w == 4) {
    970    int16_t *s = (int16_t *)src_ptr;
    971    vl = vl << 1;
    972 
    973    vint16m1_t s0 = load_strided_i16_4xN(s, src_stride, vl);
    974    s += src_stride;
    975    vint16m1_t s1 = load_strided_i16_4xN(s, src_stride, vl);
    976    s += src_stride;
    977    vint16m1_t s2 = load_strided_i16_4xN(s, src_stride, vl);
    978    s += src_stride;
    979    vint16m1_t s3 = load_strided_i16_4xN(s, src_stride, vl);
    980    s += src_stride;
    981    vint16m1_t s4 = load_strided_i16_4xN(s, src_stride, vl);
    982    s += src_stride;
    983    vint16m1_t s5 = load_strided_i16_4xN(s, src_stride, vl);
    984    s += src_stride;
    985    vint16m1_t s6 = load_strided_i16_4xN(s, src_stride, vl);
    986    s += src_stride;
    987    vint16m1_t s7 = load_strided_i16_4xN(s, src_stride, vl);
    988    s += src_stride;
    989    vint16m1_t s8 = load_strided_i16_4xN(s, src_stride, vl);
    990    s += src_stride;
    991    vint16m1_t s9 = load_strided_i16_4xN(s, src_stride, vl);
    992    s += src_stride;
    993 
    994    do {
    995      vint16m1_t s10 = load_strided_i16_4xN(s, src_stride, vl);
    996      s += src_stride;
    997      vint16m1_t s11 = load_strided_i16_4xN(s, src_stride, vl);
    998      s += src_stride;
    999      vint16m1_t s12 = load_strided_i16_4xN(s, src_stride, vl);
   1000      s += src_stride;
   1001      vint16m1_t s13 = load_strided_i16_4xN(s, src_stride, vl);
   1002      s += src_stride;
   1003 
   1004      vuint16m1_t d0 = highbd_convolve12_2d_v_rvv(
   1005          s0, s1, s2, s3, s4, s5, s6, s7, s8, s9, s10, s11, y_filter_ptr,
   1006          offset_s32, shift_s32, max_u16, vl);
   1007      vuint16m1_t d1 = highbd_convolve12_2d_v_rvv(
   1008          s2, s3, s4, s5, s6, s7, s8, s9, s10, s11, s12, s13, y_filter_ptr,
   1009          offset_s32, shift_s32, max_u16, vl);
   1010 
   1011      store_strided_u16_4xN(dst_ptr, d0, dst_stride, vl);
   1012      dst_ptr += dst_stride << 1;
   1013      store_strided_u16_4xN(dst_ptr, d1, dst_stride, vl);
   1014      dst_ptr += dst_stride << 1;
   1015 
   1016      s0 = s4;
   1017      s1 = s5;
   1018      s2 = s6;
   1019      s3 = s7;
   1020      s4 = s8;
   1021      s5 = s9;
   1022      s6 = s10;
   1023      s7 = s11;
   1024      s8 = s12;
   1025      s9 = s13;
   1026 
   1027      h -= 4;
   1028    } while (h != 0);
   1029  } else {
   1030    do {
   1031      int height = h;
   1032      int16_t *s = (int16_t *)src_ptr;
   1033      uint16_t *d = dst_ptr;
   1034 
   1035      vint16m1_t s0 = __riscv_vle16_v_i16m1(s, vl);
   1036      s += src_stride;
   1037      vint16m1_t s1 = __riscv_vle16_v_i16m1(s, vl);
   1038      s += src_stride;
   1039      vint16m1_t s2 = __riscv_vle16_v_i16m1(s, vl);
   1040      s += src_stride;
   1041      vint16m1_t s3 = __riscv_vle16_v_i16m1(s, vl);
   1042      s += src_stride;
   1043      vint16m1_t s4 = __riscv_vle16_v_i16m1(s, vl);
   1044      s += src_stride;
   1045      vint16m1_t s5 = __riscv_vle16_v_i16m1(s, vl);
   1046      s += src_stride;
   1047      vint16m1_t s6 = __riscv_vle16_v_i16m1(s, vl);
   1048      s += src_stride;
   1049      vint16m1_t s7 = __riscv_vle16_v_i16m1(s, vl);
   1050      s += src_stride;
   1051      vint16m1_t s8 = __riscv_vle16_v_i16m1(s, vl);
   1052      s += src_stride;
   1053      vint16m1_t s9 = __riscv_vle16_v_i16m1(s, vl);
   1054      s += src_stride;
   1055      vint16m1_t s10 = __riscv_vle16_v_i16m1(s, vl);
   1056      s += src_stride;
   1057 
   1058      do {
   1059        vint16m1_t s11 = __riscv_vle16_v_i16m1(s, vl);
   1060        s += src_stride;
   1061        vint16m1_t s12 = __riscv_vle16_v_i16m1(s, vl);
   1062        s += src_stride;
   1063        vint16m1_t s13 = __riscv_vle16_v_i16m1(s, vl);
   1064        s += src_stride;
   1065        vint16m1_t s14 = __riscv_vle16_v_i16m1(s, vl);
   1066        s += src_stride;
   1067 
   1068        vuint16m1_t d0 = highbd_convolve12_2d_v_rvv(
   1069            s0, s1, s2, s3, s4, s5, s6, s7, s8, s9, s10, s11, y_filter_ptr,
   1070            offset_s32, shift_s32, max_u16, vl);
   1071        vuint16m1_t d1 = highbd_convolve12_2d_v_rvv(
   1072            s1, s2, s3, s4, s5, s6, s7, s8, s9, s10, s11, s12, y_filter_ptr,
   1073            offset_s32, shift_s32, max_u16, vl);
   1074        vuint16m1_t d2 = highbd_convolve12_2d_v_rvv(
   1075            s2, s3, s4, s5, s6, s7, s8, s9, s10, s11, s12, s13, y_filter_ptr,
   1076            offset_s32, shift_s32, max_u16, vl);
   1077        vuint16m1_t d3 = highbd_convolve12_2d_v_rvv(
   1078            s3, s4, s5, s6, s7, s8, s9, s10, s11, s12, s13, s14, y_filter_ptr,
   1079            offset_s32, shift_s32, max_u16, vl);
   1080 
   1081        __riscv_vse16_v_u16m1(d, d0, vl);
   1082        d += dst_stride;
   1083        __riscv_vse16_v_u16m1(d, d1, vl);
   1084        d += dst_stride;
   1085        __riscv_vse16_v_u16m1(d, d2, vl);
   1086        d += dst_stride;
   1087        __riscv_vse16_v_u16m1(d, d3, vl);
   1088        d += dst_stride;
   1089 
   1090        s0 = s4;
   1091        s1 = s5;
   1092        s2 = s6;
   1093        s3 = s7;
   1094        s4 = s8;
   1095        s5 = s9;
   1096        s6 = s10;
   1097        s7 = s11;
   1098        s8 = s12;
   1099        s9 = s13;
   1100        s10 = s14;
   1101 
   1102        height -= 4;
   1103      } while (height != 0);
   1104 
   1105      src_ptr += vl;
   1106      dst_ptr += vl;
   1107      w -= vl;
   1108    } while (w != 0);
   1109  }
   1110 }
   1111 
   1112 static inline vuint16m1_t highbd_convolve8_2d_v_rvv(
   1113    const vint16m1_t s0, const vint16m1_t s1, const vint16m1_t s2,
   1114    const vint16m1_t s3, const vint16m1_t s4, const vint16m1_t s5,
   1115    const vint16m1_t s6, const vint16m1_t s7, const int16_t *y_filter,
   1116    const int32_t offset, const int32_t shift, const uint16_t max, size_t vl) {
   1117  vint32m2_t sum = __riscv_vwmul_vx_i32m2(s0, y_filter[0], vl);
   1118  sum = __riscv_vwmacc_vx_i32m2(sum, y_filter[1], s1, vl);
   1119  sum = __riscv_vwmacc_vx_i32m2(sum, y_filter[2], s2, vl);
   1120  sum = __riscv_vwmacc_vx_i32m2(sum, y_filter[3], s3, vl);
   1121  sum = __riscv_vwmacc_vx_i32m2(sum, y_filter[4], s4, vl);
   1122  sum = __riscv_vwmacc_vx_i32m2(sum, y_filter[5], s5, vl);
   1123  sum = __riscv_vwmacc_vx_i32m2(sum, y_filter[6], s6, vl);
   1124  sum = __riscv_vwmacc_vx_i32m2(sum, y_filter[7], s7, vl);
   1125  sum = __riscv_vadd_vx_i32m2(sum, offset, vl);
   1126 
   1127  vint16m1_t i16_sum = __riscv_vnsra_wx_i16m1(sum, shift, vl);
   1128  vint16m1_t iclip_sum =
   1129      __riscv_vmin_vx_i16m1(__riscv_vmax_vx_i16m1(i16_sum, 0, vl), max, vl);
   1130  return __riscv_vreinterpret_v_i16m1_u16m1(iclip_sum);
   1131 }
   1132 
   1133 static inline void highbd_convolve_2d_sr_vert_8tap_rvv(
   1134    const uint16_t *src_ptr, int src_stride, uint16_t *dst_ptr, int dst_stride,
   1135    int w, int h, const int16_t *y_filter_ptr, ConvolveParams *conv_params,
   1136    int bd, const int offset, size_t vl) {
   1137  const int32_t shift_s32 = conv_params->round_1;
   1138  const int32_t offset_s32 = offset;
   1139  const uint16_t max_u16 = (1 << bd) - 1;
   1140 
   1141  if (w <= 4) {
   1142    int16_t *s = (int16_t *)src_ptr;
   1143    vl = vl << 1;
   1144 
   1145    vint16m1_t s0 = load_strided_i16_4xN(s, src_stride, vl);
   1146    s += src_stride;
   1147    vint16m1_t s1 = load_strided_i16_4xN(s, src_stride, vl);
   1148    s += src_stride;
   1149    vint16m1_t s2 = load_strided_i16_4xN(s, src_stride, vl);
   1150    s += src_stride;
   1151    vint16m1_t s3 = load_strided_i16_4xN(s, src_stride, vl);
   1152    s += src_stride;
   1153    vint16m1_t s4 = load_strided_i16_4xN(s, src_stride, vl);
   1154    s += src_stride;
   1155    vint16m1_t s5 = load_strided_i16_4xN(s, src_stride, vl);
   1156    s += src_stride;
   1157 
   1158    do {
   1159      vint16m1_t s6 = load_strided_i16_4xN(s, src_stride, vl);
   1160      s += src_stride;
   1161      vint16m1_t s7 = load_strided_i16_4xN(s, src_stride, vl);
   1162      s += src_stride;
   1163 
   1164      vuint16m1_t d0 = highbd_convolve8_2d_v_rvv(s0, s1, s2, s3, s4, s5, s6, s7,
   1165                                                 y_filter_ptr, offset_s32,
   1166                                                 shift_s32, max_u16, vl);
   1167 
   1168      store_strided_u16_4xN(dst_ptr, d0, dst_stride, vl);
   1169      dst_ptr += dst_stride << 1;
   1170 
   1171      s0 = s2;
   1172      s1 = s3;
   1173      s2 = s4;
   1174      s3 = s5;
   1175      s4 = s6;
   1176      s5 = s7;
   1177 
   1178      h -= 2;
   1179    } while (h != 0);
   1180  } else {
   1181    do {
   1182      int height = h;
   1183      int16_t *s = (int16_t *)src_ptr;
   1184      uint16_t *d = dst_ptr;
   1185 
   1186      vint16m1_t s0 = __riscv_vle16_v_i16m1(s, vl);
   1187      s += src_stride;
   1188      vint16m1_t s1 = __riscv_vle16_v_i16m1(s, vl);
   1189      s += src_stride;
   1190      vint16m1_t s2 = __riscv_vle16_v_i16m1(s, vl);
   1191      s += src_stride;
   1192      vint16m1_t s3 = __riscv_vle16_v_i16m1(s, vl);
   1193      s += src_stride;
   1194      vint16m1_t s4 = __riscv_vle16_v_i16m1(s, vl);
   1195      s += src_stride;
   1196      vint16m1_t s5 = __riscv_vle16_v_i16m1(s, vl);
   1197      s += src_stride;
   1198      vint16m1_t s6 = __riscv_vle16_v_i16m1(s, vl);
   1199      s += src_stride;
   1200 
   1201      do {
   1202        vint16m1_t s7 = __riscv_vle16_v_i16m1(s, vl);
   1203        vuint16m1_t d0 = highbd_convolve8_2d_v_rvv(s0, s1, s2, s3, s4, s5, s6,
   1204                                                   s7, y_filter_ptr, offset_s32,
   1205                                                   shift_s32, max_u16, vl);
   1206        __riscv_vse16_v_u16m1(d, d0, vl);
   1207 
   1208        s0 = s1;
   1209        s1 = s2;
   1210        s2 = s3;
   1211        s3 = s4;
   1212        s4 = s5;
   1213        s5 = s6;
   1214        s6 = s7;
   1215        s += src_stride;
   1216        d += dst_stride;
   1217        height--;
   1218      } while (height != 0);
   1219 
   1220      src_ptr += vl;
   1221      dst_ptr += vl;
   1222      w -= vl;
   1223    } while (w != 0);
   1224  }
   1225 }
   1226 
   1227 static inline vuint16m1_t highbd_convolve6_2d_v_rvv(
   1228    const vint16m1_t s0, const vint16m1_t s1, const vint16m1_t s2,
   1229    const vint16m1_t s3, const vint16m1_t s4, const vint16m1_t s5,
   1230    const int16_t *y_filter, const int32_t offset, const int32_t shift,
   1231    const uint16_t max, size_t vl) {
   1232  vint32m2_t sum = __riscv_vwmul_vx_i32m2(s0, y_filter[0], vl);
   1233  sum = __riscv_vwmacc_vx_i32m2(sum, y_filter[1], s1, vl);
   1234  sum = __riscv_vwmacc_vx_i32m2(sum, y_filter[2], s2, vl);
   1235  sum = __riscv_vwmacc_vx_i32m2(sum, y_filter[3], s3, vl);
   1236  sum = __riscv_vwmacc_vx_i32m2(sum, y_filter[4], s4, vl);
   1237  sum = __riscv_vwmacc_vx_i32m2(sum, y_filter[5], s5, vl);
   1238  sum = __riscv_vadd_vx_i32m2(sum, offset, vl);
   1239 
   1240  vint16m1_t i16_sum = __riscv_vnsra_wx_i16m1(sum, shift, vl);
   1241  vint16m1_t iclip_sum =
   1242      __riscv_vmin_vx_i16m1(__riscv_vmax_vx_i16m1(i16_sum, 0, vl), max, vl);
   1243  return __riscv_vreinterpret_v_i16m1_u16m1(iclip_sum);
   1244 }
   1245 
   1246 static inline void highbd_convolve_2d_sr_vert_6tap_rvv(
   1247    const uint16_t *src_ptr, int src_stride, uint16_t *dst_ptr, int dst_stride,
   1248    int w, int h, const int16_t *y_filter_ptr, ConvolveParams *conv_params,
   1249    int bd, const int offset, size_t vl) {
   1250  const int32_t shift_s32 = conv_params->round_1;
   1251  const int32_t offset_s32 = offset;
   1252  const uint16_t max_u16 = (1 << bd) - 1;
   1253  const int16_t *yfilter_6tap = y_filter_ptr + 1;
   1254 
   1255  if (w == 4) {
   1256    int16_t *s = (int16_t *)src_ptr;
   1257    vl = vl << 1;
   1258 
   1259    vint16m1_t s0 = load_strided_i16_4xN(s, src_stride, vl);
   1260    s += src_stride;
   1261    vint16m1_t s1 = load_strided_i16_4xN(s, src_stride, vl);
   1262    s += src_stride;
   1263    vint16m1_t s2 = load_strided_i16_4xN(s, src_stride, vl);
   1264    s += src_stride;
   1265    vint16m1_t s3 = load_strided_i16_4xN(s, src_stride, vl);
   1266    s += src_stride;
   1267 
   1268    do {
   1269      vint16m1_t s4 = load_strided_i16_4xN(s, src_stride, vl);
   1270      s += src_stride;
   1271      vint16m1_t s5 = load_strided_i16_4xN(s, src_stride, vl);
   1272      s += src_stride;
   1273      vint16m1_t s6 = load_strided_i16_4xN(s, src_stride, vl);
   1274      s += src_stride;
   1275      vint16m1_t s7 = load_strided_i16_4xN(s, src_stride, vl);
   1276      s += src_stride;
   1277 
   1278      vuint16m1_t d0 =
   1279          highbd_convolve6_2d_v_rvv(s0, s1, s2, s3, s4, s5, yfilter_6tap,
   1280                                    offset_s32, shift_s32, max_u16, vl);
   1281      vuint16m1_t d1 =
   1282          highbd_convolve6_2d_v_rvv(s2, s3, s4, s5, s6, s7, yfilter_6tap,
   1283                                    offset_s32, shift_s32, max_u16, vl);
   1284 
   1285      store_strided_u16_4xN(dst_ptr, d0, dst_stride, vl);
   1286      dst_ptr += dst_stride << 1;
   1287      store_strided_u16_4xN(dst_ptr, d1, dst_stride, vl);
   1288      dst_ptr += dst_stride << 1;
   1289 
   1290      s0 = s4;
   1291      s1 = s5;
   1292      s2 = s6;
   1293      s3 = s7;
   1294 
   1295      h -= 4;
   1296    } while (h != 0);
   1297  } else {
   1298    do {
   1299      int height = h;
   1300      int16_t *s = (int16_t *)src_ptr;
   1301      uint16_t *d = dst_ptr;
   1302 
   1303      vint16m1_t s0 = __riscv_vle16_v_i16m1(s, vl);
   1304      s += src_stride;
   1305      vint16m1_t s1 = __riscv_vle16_v_i16m1(s, vl);
   1306      s += src_stride;
   1307      vint16m1_t s2 = __riscv_vle16_v_i16m1(s, vl);
   1308      s += src_stride;
   1309      vint16m1_t s3 = __riscv_vle16_v_i16m1(s, vl);
   1310      s += src_stride;
   1311      vint16m1_t s4 = __riscv_vle16_v_i16m1(s, vl);
   1312      s += src_stride;
   1313 
   1314      do {
   1315        vint16m1_t s5 = __riscv_vle16_v_i16m1(s, vl);
   1316        s += src_stride;
   1317        vint16m1_t s6 = __riscv_vle16_v_i16m1(s, vl);
   1318        s += src_stride;
   1319        vint16m1_t s7 = __riscv_vle16_v_i16m1(s, vl);
   1320        s += src_stride;
   1321        vint16m1_t s8 = __riscv_vle16_v_i16m1(s, vl);
   1322        s += src_stride;
   1323 
   1324        vuint16m1_t d0 =
   1325            highbd_convolve6_2d_v_rvv(s0, s1, s2, s3, s4, s5, yfilter_6tap,
   1326                                      offset_s32, shift_s32, max_u16, vl);
   1327        vuint16m1_t d1 =
   1328            highbd_convolve6_2d_v_rvv(s1, s2, s3, s4, s5, s6, yfilter_6tap,
   1329                                      offset_s32, shift_s32, max_u16, vl);
   1330        vuint16m1_t d2 =
   1331            highbd_convolve6_2d_v_rvv(s2, s3, s4, s5, s6, s7, yfilter_6tap,
   1332                                      offset_s32, shift_s32, max_u16, vl);
   1333        vuint16m1_t d3 =
   1334            highbd_convolve6_2d_v_rvv(s3, s4, s5, s6, s7, s8, yfilter_6tap,
   1335                                      offset_s32, shift_s32, max_u16, vl);
   1336 
   1337        __riscv_vse16_v_u16m1(d, d0, vl);
   1338        d += dst_stride;
   1339        __riscv_vse16_v_u16m1(d, d1, vl);
   1340        d += dst_stride;
   1341        __riscv_vse16_v_u16m1(d, d2, vl);
   1342        d += dst_stride;
   1343        __riscv_vse16_v_u16m1(d, d3, vl);
   1344        d += dst_stride;
   1345 
   1346        s0 = s4;
   1347        s1 = s5;
   1348        s2 = s6;
   1349        s3 = s7;
   1350        s4 = s8;
   1351 
   1352        height -= 4;
   1353      } while (height != 0);
   1354 
   1355      src_ptr += vl;
   1356      dst_ptr += vl;
   1357      w -= vl;
   1358    } while (w != 0);
   1359  }
   1360 }
   1361 
   1362 static inline vint16m1_t highbd_convolve12_8_2d_h_rvv(
   1363    const vint16m1_t s0, const vint16m1_t s1, const vint16m1_t s2,
   1364    const vint16m1_t s3, const vint16m1_t s4, const vint16m1_t s5,
   1365    const vint16m1_t s6, const vint16m1_t s7, const vint16m1_t s8,
   1366    const vint16m1_t s9, const vint16m1_t s10, const vint16m1_t s11,
   1367    const int16_t *x_filter, const int32_t offset, const int32_t shift,
   1368    size_t vl) {
   1369  vint32m2_t sum = __riscv_vwmul_vx_i32m2(s0, x_filter[0], vl);
   1370  sum = __riscv_vwmacc_vx_i32m2(sum, x_filter[1], s1, vl);
   1371  sum = __riscv_vwmacc_vx_i32m2(sum, x_filter[2], s2, vl);
   1372  sum = __riscv_vwmacc_vx_i32m2(sum, x_filter[3], s3, vl);
   1373  sum = __riscv_vwmacc_vx_i32m2(sum, x_filter[4], s4, vl);
   1374  sum = __riscv_vwmacc_vx_i32m2(sum, x_filter[5], s5, vl);
   1375  sum = __riscv_vwmacc_vx_i32m2(sum, x_filter[6], s6, vl);
   1376  sum = __riscv_vwmacc_vx_i32m2(sum, x_filter[7], s7, vl);
   1377  sum = __riscv_vwmacc_vx_i32m2(sum, x_filter[8], s8, vl);
   1378  sum = __riscv_vwmacc_vx_i32m2(sum, x_filter[9], s9, vl);
   1379  sum = __riscv_vwmacc_vx_i32m2(sum, x_filter[10], s10, vl);
   1380  sum = __riscv_vwmacc_vx_i32m2(sum, x_filter[11], s11, vl);
   1381 
   1382  sum = __riscv_vadd_vx_i32m2(sum, offset, vl);
   1383 
   1384  return __riscv_vnclip_wx_i16m1(sum, shift, __RISCV_VXRM_RNU, vl);
   1385 }
   1386 
   1387 static inline void highbd_convolve_2d_sr_horiz_12tap_rvv(
   1388    const uint16_t *src_ptr, int src_stride, uint16_t *dst_ptr, int dst_stride,
   1389    int w, int h, const int16_t *x_filter_ptr, ConvolveParams *conv_params,
   1390    const int offset, size_t vl) {
   1391  assert(h >= 5);
   1392  const int32_t shift_s32 = conv_params->round_0;
   1393  const int32_t offset_s32 = offset;
   1394 
   1395  if (w == 4) {
   1396    const int16_t *s = (int16_t *)src_ptr;
   1397    int16_t *d = (int16_t *)dst_ptr;
   1398 
   1399    do {
   1400      vint16m1_t t0, t1, t2, t3, t4, t5, t6, t7, t8, t9, t10, t11;
   1401 
   1402      load_s16_8x12(s, 1, &t0, &t1, &t2, &t3, &t4, &t5, &t6, &t7, &t8, &t9,
   1403                    &t10, &t11, vl);
   1404 
   1405      vint16m1_t d0 = highbd_convolve12_8_2d_h_rvv(
   1406          t0, t1, t2, t3, t4, t5, t6, t7, t8, t9, t10, t11, x_filter_ptr,
   1407          offset_s32, shift_s32, vl);
   1408 
   1409      __riscv_vse16_v_i16m1(d, d0, vl);
   1410 
   1411      s += src_stride;
   1412      d += dst_stride;
   1413 
   1414    } while (--h != 0);
   1415  } else {
   1416    int height = h;
   1417 
   1418    do {
   1419      const int16_t *s = (int16_t *)src_ptr;
   1420      int16_t *d = (int16_t *)dst_ptr;
   1421      int width = w;
   1422 
   1423      do {
   1424        vint16m1_t t0, t1, t2, t3, t4, t5, t6, t7, t8, t9, t10, t11;
   1425 
   1426        load_s16_8x12(s, 1, &t0, &t1, &t2, &t3, &t4, &t5, &t6, &t7, &t8, &t9,
   1427                      &t10, &t11, vl);
   1428 
   1429        vint16m1_t d0 = highbd_convolve12_8_2d_h_rvv(
   1430            t0, t1, t2, t3, t4, t5, t6, t7, t8, t9, t10, t11, x_filter_ptr,
   1431            offset_s32, shift_s32, vl);
   1432 
   1433        __riscv_vse16_v_i16m1(d, d0, vl);
   1434 
   1435        s += vl;
   1436        d += vl;
   1437        width -= vl;
   1438      } while (width != 0);
   1439      src_ptr += src_stride;
   1440      dst_ptr += dst_stride;
   1441    } while (--height != 0);
   1442  }
   1443 }
   1444 
   1445 static inline vint16m1_t highbd_convolve8_4_2d_h_rvv(
   1446    const vint16m1_t s0, const vint16m1_t s1, const vint16m1_t s2,
   1447    const vint16m1_t s3, const int16_t *x_filter, const int32_t offset,
   1448    const int32_t shift, size_t vl) {
   1449  vint32m2_t sum = __riscv_vwmul_vx_i32m2(s0, x_filter[0], vl);
   1450  sum = __riscv_vwmacc_vx_i32m2(sum, x_filter[1], s1, vl);
   1451  sum = __riscv_vwmacc_vx_i32m2(sum, x_filter[2], s2, vl);
   1452  sum = __riscv_vwmacc_vx_i32m2(sum, x_filter[3], s3, vl);
   1453 
   1454  sum = __riscv_vadd_vx_i32m2(sum, offset, vl);
   1455 
   1456  return __riscv_vnclip_wx_i16m1(sum, shift, __RISCV_VXRM_RNU, vl);
   1457 }
   1458 
   1459 static inline vint16m1_t highbd_convolve8_8_2d_h_rvv(
   1460    const vint16m1_t s0, const vint16m1_t s1, const vint16m1_t s2,
   1461    const vint16m1_t s3, const vint16m1_t s4, const vint16m1_t s5,
   1462    const vint16m1_t s6, const vint16m1_t s7, const int16_t *x_filter,
   1463    const int32_t offset, const int32_t shift, size_t vl) {
   1464  vint32m2_t sum = __riscv_vwmul_vx_i32m2(s0, x_filter[0], vl);
   1465  sum = __riscv_vwmacc_vx_i32m2(sum, x_filter[1], s1, vl);
   1466  sum = __riscv_vwmacc_vx_i32m2(sum, x_filter[2], s2, vl);
   1467  sum = __riscv_vwmacc_vx_i32m2(sum, x_filter[3], s3, vl);
   1468  sum = __riscv_vwmacc_vx_i32m2(sum, x_filter[4], s4, vl);
   1469  sum = __riscv_vwmacc_vx_i32m2(sum, x_filter[5], s5, vl);
   1470  sum = __riscv_vwmacc_vx_i32m2(sum, x_filter[6], s6, vl);
   1471  sum = __riscv_vwmacc_vx_i32m2(sum, x_filter[7], s7, vl);
   1472 
   1473  sum = __riscv_vadd_vx_i32m2(sum, offset, vl);
   1474 
   1475  return __riscv_vnclip_wx_i16m1(sum, shift, __RISCV_VXRM_RNU, vl);
   1476 }
   1477 
   1478 static inline void highbd_convolve_2d_sr_horiz_rvv(
   1479    const uint16_t *src_ptr, int src_stride, uint16_t *dst_ptr, int dst_stride,
   1480    int w, int h, const int16_t *x_filter_ptr, ConvolveParams *conv_params,
   1481    const int offset, size_t vl) {
   1482  assert(h >= 5);
   1483  const int32_t shift_s32 = conv_params->round_0;
   1484  const int32_t offset_s32 = offset;
   1485 
   1486  if (w == 4) {
   1487    const int16_t *x_filter = (x_filter_ptr + 2);
   1488    const int16_t *s = (int16_t *)(src_ptr + 1);
   1489    int16_t *d = (int16_t *)dst_ptr;
   1490 
   1491    do {
   1492      vint16m1_t t0, t1, t2, t3;
   1493 
   1494      load_s16_8x4(s, 1, &t0, &t1, &t2, &t3, vl);
   1495 
   1496      vint16m1_t d0 = highbd_convolve8_4_2d_h_rvv(t0, t1, t2, t3, x_filter,
   1497                                                  offset_s32, shift_s32, vl);
   1498 
   1499      __riscv_vse16_v_i16m1(d, d0, vl);
   1500 
   1501      s += src_stride;
   1502      d += dst_stride;
   1503    } while (--h != 0);
   1504  } else {
   1505    do {
   1506      const int16_t *s = (int16_t *)src_ptr;
   1507      int16_t *d = (int16_t *)dst_ptr;
   1508      int width = w;
   1509 
   1510      do {
   1511        vint16m1_t t0, t1, t2, t3, t4, t5, t6, t7;
   1512 
   1513        load_s16_8x8(s, 1, &t0, &t1, &t2, &t3, &t4, &t5, &t6, &t7, vl);
   1514 
   1515        vint16m1_t d0 = highbd_convolve8_8_2d_h_rvv(t0, t1, t2, t3, t4, t5, t6,
   1516                                                    t7, x_filter_ptr,
   1517                                                    offset_s32, shift_s32, vl);
   1518        __riscv_vse16_v_i16m1(d, d0, vl);
   1519 
   1520        s += vl;
   1521        d += vl;
   1522        width -= vl;
   1523      } while (width != 0);
   1524      src_ptr += src_stride;
   1525      dst_ptr += dst_stride;
   1526    } while (--h != 0);
   1527  }
   1528 }
   1529 
   1530 static inline vint16m1_t highbd_convolve6_8_2d_h_rvv(
   1531    const vint16m1_t s0, const vint16m1_t s1, const vint16m1_t s2,
   1532    const vint16m1_t s3, const vint16m1_t s4, const vint16m1_t s5,
   1533    const int16_t *x_filter, const int32_t offset, const int32_t shift,
   1534    size_t vl) {
   1535  vint32m2_t sum = __riscv_vwmul_vx_i32m2(s0, x_filter[0], vl);
   1536  sum = __riscv_vwmacc_vx_i32m2(sum, x_filter[1], s1, vl);
   1537  sum = __riscv_vwmacc_vx_i32m2(sum, x_filter[2], s2, vl);
   1538  sum = __riscv_vwmacc_vx_i32m2(sum, x_filter[3], s3, vl);
   1539  sum = __riscv_vwmacc_vx_i32m2(sum, x_filter[4], s4, vl);
   1540  sum = __riscv_vwmacc_vx_i32m2(sum, x_filter[5], s5, vl);
   1541 
   1542  sum = __riscv_vadd_vx_i32m2(sum, offset, vl);
   1543 
   1544  return __riscv_vnclip_wx_i16m1(sum, shift, __RISCV_VXRM_RNU, vl);
   1545 }
   1546 
   1547 static inline void highbd_convolve_2d_sr_horiz_6tap_rvv(
   1548    const uint16_t *src_ptr, int src_stride, uint16_t *dst_ptr, int dst_stride,
   1549    int w, int h, const int16_t *x_filter_ptr, ConvolveParams *conv_params,
   1550    const int offset, size_t vl) {
   1551  assert(h >= 5);
   1552  const int32_t shift_s32 = conv_params->round_0;
   1553  const int32_t offset_s32 = offset;
   1554  const int16_t *x_filter = (x_filter_ptr + 1);
   1555 
   1556  do {
   1557    const int16_t *s = (int16_t *)src_ptr;
   1558    int16_t *d = (int16_t *)dst_ptr;
   1559    int width = w;
   1560 
   1561    do {
   1562      vint16m1_t t0, t1, t2, t3, t4, t5;
   1563 
   1564      load_s16_8x6(s, 1, &t0, &t1, &t2, &t3, &t4, &t5, vl);
   1565 
   1566      vint16m1_t d0 = highbd_convolve6_8_2d_h_rvv(
   1567          t0, t1, t2, t3, t4, t5, x_filter, offset_s32, shift_s32, vl);
   1568 
   1569      __riscv_vse16_v_i16m1(d, d0, vl);
   1570 
   1571      s += vl;
   1572      d += vl;
   1573      width -= vl;
   1574    } while (width != 0);
   1575    src_ptr += src_stride;
   1576    dst_ptr += dst_stride;
   1577  } while (--h != 0);
   1578 }
   1579 
   1580 void av1_highbd_convolve_2d_sr_rvv(const uint16_t *src, int src_stride,
   1581                                   uint16_t *dst, int dst_stride, int w, int h,
   1582                                   const InterpFilterParams *filter_params_x,
   1583                                   const InterpFilterParams *filter_params_y,
   1584                                   const int subpel_x_qn, const int subpel_y_qn,
   1585                                   ConvolveParams *conv_params, int bd) {
   1586  if (w == 2 || h == 2) {
   1587    av1_highbd_convolve_2d_sr_c(src, src_stride, dst, dst_stride, w, h,
   1588                                filter_params_x, filter_params_y, subpel_x_qn,
   1589                                subpel_y_qn, conv_params, bd);
   1590    return;
   1591  }
   1592  DECLARE_ALIGNED(16, uint16_t,
   1593                  im_block[(MAX_SB_SIZE + MAX_FILTER_TAP) * MAX_SB_SIZE]);
   1594  const int x_filter_taps = get_filter_tap(filter_params_x, subpel_x_qn);
   1595  const int clamped_x_taps = x_filter_taps < 6 ? 6 : x_filter_taps;
   1596 
   1597  const int y_filter_taps = get_filter_tap(filter_params_y, subpel_y_qn);
   1598  const int clamped_y_taps = y_filter_taps < 6 ? 6 : y_filter_taps;
   1599  const int im_h = h + clamped_y_taps - 1;
   1600  const int im_stride = MAX_SB_SIZE;
   1601  const int vert_offset = clamped_y_taps / 2 - 1;
   1602  const int horiz_offset = clamped_x_taps / 2 - 1;
   1603  const int x_offset_initial = (1 << (bd + FILTER_BITS - 1));
   1604  const int y_offset_bits = bd + 2 * FILTER_BITS - conv_params->round_0;
   1605  // The extra shim of (1 << (conv_params->round_1 - 1)) allows us to do a
   1606  // simple shift left instead of a rounding saturating shift left.
   1607  const int y_offset =
   1608      (1 << (conv_params->round_1 - 1)) - (1 << (y_offset_bits - 1));
   1609 
   1610  const uint16_t *src_ptr = src - vert_offset * src_stride - horiz_offset;
   1611 
   1612  const int16_t *x_filter_ptr = av1_get_interp_filter_subpel_kernel(
   1613      filter_params_x, subpel_x_qn & SUBPEL_MASK);
   1614  const int16_t *y_filter_ptr = av1_get_interp_filter_subpel_kernel(
   1615      filter_params_y, subpel_y_qn & SUBPEL_MASK);
   1616 
   1617  size_t vl = __riscv_vsetvl_e16m1(w);
   1618 
   1619  if (x_filter_taps > 8) {
   1620    highbd_convolve_2d_sr_horiz_12tap_rvv(src_ptr, src_stride, im_block,
   1621                                          im_stride, w, im_h, x_filter_ptr,
   1622                                          conv_params, x_offset_initial, vl);
   1623 
   1624    highbd_convolve_2d_sr_vert_12tap_rvv(im_block, im_stride, dst, dst_stride,
   1625                                         w, h, y_filter_ptr, conv_params, bd,
   1626                                         y_offset, vl);
   1627    return;
   1628  }
   1629  if (x_filter_taps <= 6 && w != 4) {
   1630    highbd_convolve_2d_sr_horiz_6tap_rvv(src_ptr, src_stride, im_block,
   1631                                         im_stride, w, im_h, x_filter_ptr,
   1632                                         conv_params, x_offset_initial, vl);
   1633  } else {
   1634    highbd_convolve_2d_sr_horiz_rvv(src_ptr, src_stride, im_block, im_stride, w,
   1635                                    im_h, x_filter_ptr, conv_params,
   1636                                    x_offset_initial, vl);
   1637  }
   1638 
   1639  if (y_filter_taps <= 6) {
   1640    highbd_convolve_2d_sr_vert_6tap_rvv(im_block, im_stride, dst, dst_stride, w,
   1641                                        h, y_filter_ptr, conv_params, bd,
   1642                                        y_offset, vl);
   1643  } else {
   1644    highbd_convolve_2d_sr_vert_8tap_rvv(im_block, im_stride, dst, dst_stride, w,
   1645                                        h, y_filter_ptr, conv_params, bd,
   1646                                        y_offset, vl);
   1647  }
   1648 }
   1649 
   1650 // Filter used is [64, 64].
   1651 void av1_highbd_convolve_x_sr_intrabc_rvv(
   1652    const uint16_t *src, int src_stride, uint16_t *dst, int dst_stride, int w,
   1653    int h, const InterpFilterParams *filter_params_x, const int subpel_x_qn,
   1654    ConvolveParams *conv_params, int bd) {
   1655  assert(subpel_x_qn == 8);
   1656  assert(filter_params_x->taps == 2);
   1657  assert((conv_params->round_0 + conv_params->round_1) == 2 * FILTER_BITS);
   1658  (void)filter_params_x;
   1659  (void)subpel_x_qn;
   1660  (void)conv_params;
   1661  (void)bd;
   1662 
   1663  size_t vl = __riscv_vsetvl_e16m1(w);
   1664  if (w <= 4) {
   1665    do {
   1666      // Load
   1667      vuint16mf2_t s0_0 = __riscv_vle16_v_u16mf2(src, vl);
   1668      vuint16mf2_t s0_1 = __riscv_vle16_v_u16mf2(src + 1, vl);
   1669      vuint16mf2_t s1_0 = __riscv_vle16_v_u16mf2(src + src_stride, vl);
   1670      vuint16mf2_t s1_1 = __riscv_vle16_v_u16mf2(src + src_stride + 1, vl);
   1671 
   1672      // Average the values
   1673      vuint16mf2_t d0 =
   1674          __riscv_vaaddu_vv_u16mf2(s0_0, s0_1, __RISCV_VXRM_RNU, vl);
   1675      vuint16mf2_t d1 =
   1676          __riscv_vaaddu_vv_u16mf2(s1_0, s1_1, __RISCV_VXRM_RNU, vl);
   1677 
   1678      // Store
   1679      __riscv_vse16_v_u16mf2(dst, d0, vl);
   1680      __riscv_vse16_v_u16mf2(dst + dst_stride, d1, vl);
   1681 
   1682      src += src_stride << 1;
   1683      dst += dst_stride << 1;
   1684      h -= 2;
   1685    } while (h > 0);
   1686  } else {
   1687    do {
   1688      const uint16_t *src_ptr = src;
   1689      uint16_t *dst_ptr = dst;
   1690      int width = w;
   1691 
   1692      do {
   1693        // Load
   1694        vuint16m1_t s0 = __riscv_vle16_v_u16m1(src_ptr, vl);
   1695        vuint16m1_t s1 = __riscv_vle16_v_u16m1(src_ptr + 1, vl);
   1696        vuint16m1_t s2 = __riscv_vle16_v_u16m1(src_ptr + src_stride, vl);
   1697        vuint16m1_t s3 = __riscv_vle16_v_u16m1(src_ptr + src_stride + 1, vl);
   1698 
   1699        // Average the values
   1700        vuint16m1_t d0 = __riscv_vaaddu_vv_u16m1(s0, s1, __RISCV_VXRM_RNU, vl);
   1701        vuint16m1_t d1 = __riscv_vaaddu_vv_u16m1(s2, s3, __RISCV_VXRM_RNU, vl);
   1702 
   1703        // Store
   1704        __riscv_vse16_v_u16m1(dst_ptr, d0, vl);
   1705        __riscv_vse16_v_u16m1(dst_ptr + dst_stride, d1, vl);
   1706 
   1707        src_ptr += vl;
   1708        dst_ptr += vl;
   1709        width -= vl;
   1710      } while (width > 0);
   1711      src += src_stride << 1;
   1712      dst += dst_stride << 1;
   1713      h -= 2;
   1714    } while (h > 0);
   1715  }
   1716 }
   1717 
   1718 // Filter used is [64, 64].
   1719 void av1_highbd_convolve_y_sr_intrabc_rvv(
   1720    const uint16_t *src, int src_stride, uint16_t *dst, int dst_stride, int w,
   1721    int h, const InterpFilterParams *filter_params_y, const int subpel_y_qn,
   1722    int bd) {
   1723  assert(subpel_y_qn == 8);
   1724  assert(filter_params_y->taps == 2);
   1725  (void)filter_params_y;
   1726  (void)subpel_y_qn;
   1727  (void)bd;
   1728 
   1729  size_t vl = __riscv_vsetvl_e16m1(w);
   1730  if (w <= 4) {
   1731    vuint16mf2_t s0 = __riscv_vle16_v_u16mf2(src, vl);
   1732 
   1733    do {
   1734      vuint16mf2_t s1 = __riscv_vle16_v_u16mf2(src + src_stride, vl);
   1735      vuint16mf2_t s2 = __riscv_vle16_v_u16mf2(src + 2 * src_stride, vl);
   1736 
   1737      // Average the values
   1738      vuint16mf2_t d0 = __riscv_vaaddu_vv_u16mf2(s0, s1, __RISCV_VXRM_RNU, vl);
   1739      vuint16mf2_t d1 = __riscv_vaaddu_vv_u16mf2(s1, s2, __RISCV_VXRM_RNU, vl);
   1740 
   1741      // Store
   1742      __riscv_vse16_v_u16mf2(dst, d0, vl);
   1743      __riscv_vse16_v_u16mf2(dst + dst_stride, d1, vl);
   1744 
   1745      s0 = s2;
   1746      src += src_stride << 1;
   1747      dst += dst_stride << 1;
   1748      h -= 2;
   1749    } while (h > 0);
   1750  } else {
   1751    do {
   1752      const uint16_t *src_ptr = src;
   1753      uint16_t *dst_ptr = dst;
   1754      int height = h;
   1755 
   1756      vuint16m1_t s0 = __riscv_vle16_v_u16m1(src_ptr, vl);
   1757 
   1758      do {
   1759        vuint16m1_t s1 = __riscv_vle16_v_u16m1(src_ptr + src_stride, vl);
   1760        vuint16m1_t s2 = __riscv_vle16_v_u16m1(src_ptr + 2 * src_stride, vl);
   1761 
   1762        // Average the values
   1763        vuint16m1_t d0 = __riscv_vaaddu_vv_u16m1(s0, s1, __RISCV_VXRM_RNU, vl);
   1764        vuint16m1_t d1 = __riscv_vaaddu_vv_u16m1(s1, s2, __RISCV_VXRM_RNU, vl);
   1765 
   1766        // Store
   1767        __riscv_vse16_v_u16m1(dst_ptr, d0, vl);
   1768        __riscv_vse16_v_u16m1(dst_ptr + dst_stride, d1, vl);
   1769 
   1770        s0 = s2;
   1771        src_ptr += src_stride << 1;
   1772        dst_ptr += dst_stride << 1;
   1773        height -= 2;
   1774      } while (height > 0);
   1775      src += vl;
   1776      dst += vl;
   1777      w -= vl;
   1778    } while (w > 0);
   1779  }
   1780 }
   1781 
   1782 // Both horizontal and vertical passes use the same 2-tap filter: [64, 64].
   1783 void av1_highbd_convolve_2d_sr_intrabc_rvv(
   1784    const uint16_t *src, int src_stride, uint16_t *dst, int dst_stride, int w,
   1785    int h, const InterpFilterParams *filter_params_x,
   1786    const InterpFilterParams *filter_params_y, const int subpel_x_qn,
   1787    const int subpel_y_qn, ConvolveParams *conv_params, int bd) {
   1788  assert(subpel_x_qn == 8);
   1789  assert(subpel_y_qn == 8);
   1790  assert(filter_params_x->taps == 2 && filter_params_y->taps == 2);
   1791  assert((conv_params->round_0 + conv_params->round_1) == 2 * FILTER_BITS);
   1792  assert(w <= MAX_SB_SIZE && h <= MAX_SB_SIZE);
   1793  (void)filter_params_x;
   1794  (void)subpel_x_qn;
   1795  (void)filter_params_y;
   1796  (void)subpel_y_qn;
   1797  (void)conv_params;
   1798  (void)bd;
   1799 
   1800  size_t vl = __riscv_vsetvl_e16m1(w);
   1801 
   1802  if (w <= 8) {
   1803    // Horizontal filter.
   1804    vuint16m1_t s0 = __riscv_vle16_v_u16m1(src, vl);
   1805    vuint16m1_t s1 = __riscv_vle16_v_u16m1(src + 1, vl);
   1806    src += src_stride;
   1807 
   1808    vuint16m1_t sum0 = __riscv_vadd_vv_u16m1(s0, s1, vl);
   1809 
   1810    do {
   1811      vuint16m1_t s2 = __riscv_vle16_v_u16m1(src, vl);
   1812      vuint16m1_t s3 = __riscv_vle16_v_u16m1(src + 1, vl);
   1813      src += src_stride;
   1814      vuint16m1_t s4 = __riscv_vle16_v_u16m1(src, vl);
   1815      vuint16m1_t s5 = __riscv_vle16_v_u16m1(src + 1, vl);
   1816      src += src_stride;
   1817 
   1818      vuint16m1_t sum1 = __riscv_vadd_vv_u16m1(s2, s3, vl);
   1819      vuint16m1_t sum2 = __riscv_vadd_vv_u16m1(s4, s5, vl);
   1820 
   1821      // Vertical filter.
   1822      vuint16m1_t d0 =
   1823          __riscv_vadd_vx_u16m1(__riscv_vadd_vv_u16m1(sum0, sum1, vl), 2, vl);
   1824      vuint16m1_t d1 =
   1825          __riscv_vadd_vx_u16m1(__riscv_vadd_vv_u16m1(sum1, sum2, vl), 2, vl);
   1826 
   1827      d0 = __riscv_vsrl_vx_u16m1(d0, 2, vl);
   1828      d1 = __riscv_vsrl_vx_u16m1(d1, 2, vl);
   1829 
   1830      __riscv_vse16_v_u16m1(dst, d0, vl);
   1831      dst += dst_stride;
   1832      __riscv_vse16_v_u16m1(dst, d1, vl);
   1833      dst += dst_stride;
   1834 
   1835      sum0 = sum2;
   1836      h -= 2;
   1837    } while (h != 0);
   1838  } else {
   1839    do {
   1840      uint16_t *src_ptr = (uint16_t *)src;
   1841      uint16_t *dst_ptr = dst;
   1842      int height = h;
   1843 
   1844      // Horizontal filter.
   1845      vuint16m1_t s0 = __riscv_vle16_v_u16m1(src_ptr, vl);
   1846      vuint16m1_t s1 = __riscv_vle16_v_u16m1(src_ptr + 1, vl);
   1847      src_ptr += src_stride;
   1848 
   1849      vuint16m1_t sum0 = __riscv_vadd_vv_u16m1(s0, s1, vl);
   1850 
   1851      do {
   1852        vuint16m1_t s2 = __riscv_vle16_v_u16m1(src_ptr, vl);
   1853        vuint16m1_t s3 = __riscv_vle16_v_u16m1(src_ptr + 1, vl);
   1854        src_ptr += src_stride;
   1855        vuint16m1_t s4 = __riscv_vle16_v_u16m1(src_ptr, vl);
   1856        vuint16m1_t s5 = __riscv_vle16_v_u16m1(src_ptr + 1, vl);
   1857        src_ptr += src_stride;
   1858 
   1859        vuint16m1_t sum1 = __riscv_vadd_vv_u16m1(s2, s3, vl);
   1860        vuint16m1_t sum2 = __riscv_vadd_vv_u16m1(s4, s5, vl);
   1861 
   1862        // Vertical filter.
   1863        vuint16m1_t d0 =
   1864            __riscv_vadd_vx_u16m1(__riscv_vadd_vv_u16m1(sum0, sum1, vl), 2, vl);
   1865        vuint16m1_t d1 =
   1866            __riscv_vadd_vx_u16m1(__riscv_vadd_vv_u16m1(sum1, sum2, vl), 2, vl);
   1867 
   1868        d0 = __riscv_vsrl_vx_u16m1(d0, 2, vl);
   1869        d1 = __riscv_vsrl_vx_u16m1(d1, 2, vl);
   1870 
   1871        __riscv_vse16_v_u16m1(dst_ptr, d0, vl);
   1872        dst_ptr += dst_stride;
   1873        __riscv_vse16_v_u16m1(dst_ptr, d1, vl);
   1874        dst_ptr += dst_stride;
   1875 
   1876        sum0 = __riscv_vmv_v_v_u16m1(sum2, vl);
   1877        height -= 2;
   1878      } while (height != 0);
   1879 
   1880      src += vl;
   1881      dst += vl;
   1882      w -= vl;
   1883    } while (w != 0);
   1884  }
   1885 }
	tor-browser The Tor Browser
	git clone https://git.dasho.dev/tor-browser.git
	Log \| Files \| Refs \| README \| LICENSE