tor-browser

The Tor Browser
git clone https://git.dasho.dev/tor-browser.git
Log | Files | Refs | README | LICENSE

convolve_rvv.c (67649B)


      1 /*
      2 * Copyright (c) 2025, Alliance for Open Media. All rights reserved.
      3 *
      4 * This source code is subject to the terms of the BSD 2 Clause License and
      5 * the Alliance for Open Media Patent License 1.0. If the BSD 2 Clause License
      6 * was not distributed with this source code in the LICENSE file, you can
      7 * obtain it at www.aomedia.org/license/software. If the Alliance for Open
      8 * Media Patent License 1.0 was not distributed with this source code in the
      9 * PATENTS file, you can obtain it at www.aomedia.org/license/patent.
     10 */
     11 
     12 #include <assert.h>
     13 #include <riscv_vector.h>
     14 
     15 #include "config/aom_config.h"
     16 #include "config/av1_rtcd.h"
     17 
     18 #include "aom_dsp/aom_dsp_common.h"
     19 #include "aom_ports/mem.h"
     20 #include "av1/common/convolve.h"
     21 #include "av1/common/filter.h"
     22 #include "av1/common/riscv/convolve_rvv.h"
     23 
     24 static inline vuint8mf2_t convolve12_4_x_rvv(
     25    const vint16m1_t s0, const vint16m1_t s1, const vint16m1_t s2,
     26    const vint16m1_t s3, const vint16m1_t s4, const vint16m1_t s5,
     27    const vint16m1_t s6, const vint16m1_t s7, const vint16m1_t s8,
     28    const vint16m1_t s9, const vint16m1_t s10, const vint16m1_t s11,
     29    const int16_t *filter, const int32_t horiz_const, size_t vl) {
     30  vint32m2_t sum = __riscv_vwmul_vx_i32m2(s0, filter[0], vl);
     31  sum = __riscv_vwmacc_vx_i32m2(sum, filter[1], s1, vl);
     32  sum = __riscv_vwmacc_vx_i32m2(sum, filter[2], s2, vl);
     33  sum = __riscv_vwmacc_vx_i32m2(sum, filter[3], s3, vl);
     34  sum = __riscv_vwmacc_vx_i32m2(sum, filter[4], s4, vl);
     35  sum = __riscv_vwmacc_vx_i32m2(sum, filter[5], s5, vl);
     36  sum = __riscv_vwmacc_vx_i32m2(sum, filter[6], s6, vl);
     37  sum = __riscv_vwmacc_vx_i32m2(sum, filter[7], s7, vl);
     38  sum = __riscv_vwmacc_vx_i32m2(sum, filter[8], s8, vl);
     39  sum = __riscv_vwmacc_vx_i32m2(sum, filter[9], s9, vl);
     40  sum = __riscv_vwmacc_vx_i32m2(sum, filter[10], s10, vl);
     41  sum = __riscv_vwmacc_vx_i32m2(sum, filter[11], s11, vl);
     42  sum = __riscv_vwadd_wx_i32m2(sum, horiz_const + (1 << (FILTER_BITS - 1)), vl);
     43 
     44  // Round and shift
     45  vint16m1_t i16_sum = __riscv_vnsra_wx_i16m1(sum, FILTER_BITS, vl);
     46  vint16m1_t iclip_sum =
     47      __riscv_vmin_vx_i16m1(__riscv_vmax_vx_i16m1(i16_sum, 0, vl), 255, vl);
     48 
     49  // Convert to 8-bit
     50  return __riscv_vncvt_x_x_w_u8mf2(
     51      __riscv_vreinterpret_v_i16m1_u16m1(iclip_sum), vl);
     52 }
     53 
     54 static inline void convolve_x_sr_12tap_rvv(const uint8_t *src_ptr,
     55                                           int src_stride, uint8_t *dst_ptr,
     56                                           const int dst_stride, int w, int h,
     57                                           const int16_t *x_filter_ptr) {
     58  const int32_t horiz_const = (1 << (ROUND0_BITS - 1));
     59  size_t vl = __riscv_vsetvl_e16m1(w);
     60 
     61  do {
     62    const uint8_t *s = src_ptr;
     63    uint8_t *d = dst_ptr;
     64    int width = w;
     65 
     66    do {
     67      // Load
     68      vuint8mf2_t t0 = __riscv_vle8_v_u8mf2(s + 0, vl);
     69      vuint8mf2_t t1 = __riscv_vle8_v_u8mf2(s + 1, vl);
     70      vuint8mf2_t t2 = __riscv_vle8_v_u8mf2(s + 2, vl);
     71      vuint8mf2_t t3 = __riscv_vle8_v_u8mf2(s + 3, vl);
     72      vuint8mf2_t t4 = __riscv_vle8_v_u8mf2(s + 4, vl);
     73      vuint8mf2_t t5 = __riscv_vle8_v_u8mf2(s + 5, vl);
     74      vuint8mf2_t t6 = __riscv_vle8_v_u8mf2(s + 6, vl);
     75      vuint8mf2_t t7 = __riscv_vle8_v_u8mf2(s + 7, vl);
     76      vuint8mf2_t t8 = __riscv_vle8_v_u8mf2(s + 8, vl);
     77      vuint8mf2_t t9 = __riscv_vle8_v_u8mf2(s + 9, vl);
     78      vuint8mf2_t t10 = __riscv_vle8_v_u8mf2(s + 10, vl);
     79      vuint8mf2_t t11 = __riscv_vle8_v_u8mf2(s + 11, vl);
     80 
     81      // Convert to 16-bit integers
     82      vint16m1_t s0 =
     83          __riscv_vreinterpret_v_u16m1_i16m1(__riscv_vzext_vf2_u16m1(t0, vl));
     84      vint16m1_t s1 =
     85          __riscv_vreinterpret_v_u16m1_i16m1(__riscv_vzext_vf2_u16m1(t1, vl));
     86      vint16m1_t s2 =
     87          __riscv_vreinterpret_v_u16m1_i16m1(__riscv_vzext_vf2_u16m1(t2, vl));
     88      vint16m1_t s3 =
     89          __riscv_vreinterpret_v_u16m1_i16m1(__riscv_vzext_vf2_u16m1(t3, vl));
     90      vint16m1_t s4 =
     91          __riscv_vreinterpret_v_u16m1_i16m1(__riscv_vzext_vf2_u16m1(t4, vl));
     92      vint16m1_t s5 =
     93          __riscv_vreinterpret_v_u16m1_i16m1(__riscv_vzext_vf2_u16m1(t5, vl));
     94      vint16m1_t s6 =
     95          __riscv_vreinterpret_v_u16m1_i16m1(__riscv_vzext_vf2_u16m1(t6, vl));
     96      vint16m1_t s7 =
     97          __riscv_vreinterpret_v_u16m1_i16m1(__riscv_vzext_vf2_u16m1(t7, vl));
     98      vint16m1_t s8 =
     99          __riscv_vreinterpret_v_u16m1_i16m1(__riscv_vzext_vf2_u16m1(t8, vl));
    100      vint16m1_t s9 =
    101          __riscv_vreinterpret_v_u16m1_i16m1(__riscv_vzext_vf2_u16m1(t9, vl));
    102      vint16m1_t s10 =
    103          __riscv_vreinterpret_v_u16m1_i16m1(__riscv_vzext_vf2_u16m1(t10, vl));
    104      vint16m1_t s11 =
    105          __riscv_vreinterpret_v_u16m1_i16m1(__riscv_vzext_vf2_u16m1(t11, vl));
    106 
    107      // Perform convolution
    108      vuint8mf2_t d0 =
    109          convolve12_4_x_rvv(s0, s1, s2, s3, s4, s5, s6, s7, s8, s9, s10, s11,
    110                             x_filter_ptr, horiz_const, vl);
    111 
    112      // Store result
    113      __riscv_vse8_v_u8mf2(d, d0, vl);
    114 
    115      s += vl;
    116      d += vl;
    117      width -= vl;
    118    } while (width != 0);
    119    src_ptr += src_stride;
    120    dst_ptr += dst_stride;
    121  } while (--h != 0);
    122 }
    123 
    124 static inline vuint8mf2_t convolve4_8_x_rvv(
    125    const vint16m1_t s0, const vint16m1_t s1, const vint16m1_t s2,
    126    const vint16m1_t s3, const int16_t *filter, const int16_t horiz_const,
    127    size_t vl) {
    128  vint16m1_t sum = __riscv_vmul_vx_i16m1(s0, filter[0], vl);
    129  sum = __riscv_vmacc_vx_i16m1(sum, filter[1], s1, vl);
    130  sum = __riscv_vmacc_vx_i16m1(sum, filter[2], s2, vl);
    131  sum = __riscv_vmacc_vx_i16m1(sum, filter[3], s3, vl);
    132  sum = __riscv_vadd_vx_i16m1(sum, horiz_const, vl);
    133 
    134  // Round and shift
    135  // We halved the filter values so -1 from right shift
    136  vuint16m1_t d0 =
    137      __riscv_vreinterpret_v_i16m1_u16m1(__riscv_vmax_vx_i16m1(sum, 0, vl));
    138 
    139  return __riscv_vnclipu_wx_u8mf2(d0, FILTER_BITS - 1, __RISCV_VXRM_RNU, vl);
    140 }
    141 
    142 static inline void load_u8_8x4(const uint8_t *s, const ptrdiff_t p,
    143                               vuint8mf2_t *const s0, vuint8mf2_t *const s1,
    144                               vuint8mf2_t *const s2, vuint8mf2_t *const s3,
    145                               size_t vl) {
    146  *s0 = __riscv_vle8_v_u8mf2(s, vl);
    147  s += p;
    148  *s1 = __riscv_vle8_v_u8mf2(s, vl);
    149  s += p;
    150  *s2 = __riscv_vle8_v_u8mf2(s, vl);
    151  s += p;
    152  *s3 = __riscv_vle8_v_u8mf2(s, vl);
    153 }
    154 
    155 static inline void store_u8_8x2(uint8_t *s, ptrdiff_t p, const vuint8mf2_t s0,
    156                                const vuint8mf2_t s1, size_t vl) {
    157  __riscv_vse8_v_u8mf2(s, s0, vl);
    158  s += p;
    159  __riscv_vse8_v_u8mf2(s, s1, vl);
    160 }
    161 
// Horizontal 4-tap single-reference convolution.
// For w == 4, two rows are packed into one 8-lane vector per iteration
// (via load_strided_u8_4xN — defined in convolve_rvv.h; assumed to pack two
// 4-wide rows, TODO confirm against that header). For wider blocks, two rows
// are filtered per iteration with vl pixels per step.
// NOTE: both paths assume h is a multiple of 2.
static inline void convolve_x_sr_4tap_rvv(const uint8_t *src_ptr,
                                         int src_stride, uint8_t *dst_ptr,
                                         const int dst_stride, int w, int h,
                                         const int16_t *x_filter_ptr) {
  size_t vl;
  // Offset enabling a single rounding shift by FILTER_BITS - 1 in the
  // kernel; the extra -1 accounts for the halved filter taps below.
  const int16_t horiz_const = (1 << ((ROUND0_BITS - 1) - 1));

  // All filter values are even, halve to reduce intermediate precision
  // requirements. The 4 active taps sit at indices 2..5 of the 8-tap array.
  int16_t filter[4];
  for (int i = 0; i < 4; i++) filter[i] = x_filter_ptr[2 + i] >> 1;

  if (w == 4) {
    vl = 8;
    do {
      // Load 8 pixels for each row (two 4-pixel rows per vector, one vector
      // per tap offset 0..3).
      vuint8mf2_t t00, t01, t02, t03;
      t00 = load_strided_u8_4xN((uint8_t *)src_ptr + 0, src_stride, vl);
      t01 = load_strided_u8_4xN((uint8_t *)src_ptr + 1, src_stride, vl);
      t02 = load_strided_u8_4xN((uint8_t *)src_ptr + 2, src_stride, vl);
      t03 = load_strided_u8_4xN((uint8_t *)src_ptr + 3, src_stride, vl);

      // Convert to 16-bit integers
      vint16m1_t s00, s01, s02, s03;
      s00 =
          __riscv_vreinterpret_v_u16m1_i16m1(__riscv_vzext_vf2_u16m1(t00, vl));
      s01 =
          __riscv_vreinterpret_v_u16m1_i16m1(__riscv_vzext_vf2_u16m1(t01, vl));
      s02 =
          __riscv_vreinterpret_v_u16m1_i16m1(__riscv_vzext_vf2_u16m1(t02, vl));
      s03 =
          __riscv_vreinterpret_v_u16m1_i16m1(__riscv_vzext_vf2_u16m1(t03, vl));

      // Perform convolution (covers both packed rows at once)
      vuint8mf2_t d01 =
          convolve4_8_x_rvv(s00, s01, s02, s03, filter, horiz_const, vl);

      // Store result
      store_strided_u8_4xN(dst_ptr + 0 * dst_stride, d01, dst_stride, vl);

      src_ptr += 2 * src_stride;
      dst_ptr += 2 * dst_stride;
      h -= 2;
    } while (h != 0);
  } else {
    vl = __riscv_vsetvl_e16m1(w);
    do {
      int width = w;
      const uint8_t *s = src_ptr;
      uint8_t *d = dst_ptr;

      do {
        // Load tap-shifted views of two consecutive rows (stride 1 = the
        // four horizontal tap offsets of one row).
        vuint8mf2_t t00, t01, t02, t03;
        vuint8mf2_t t10, t11, t12, t13;
        load_u8_8x4(s + 0 * src_stride, 1, &t00, &t01, &t02, &t03, vl);
        load_u8_8x4(s + 1 * src_stride, 1, &t10, &t11, &t12, &t13, vl);

        // Convert to 16-bit integers
        vint16m1_t s00, s01, s02, s03;
        s00 = __riscv_vreinterpret_v_u16m1_i16m1(
            __riscv_vzext_vf2_u16m1(t00, vl));
        s01 = __riscv_vreinterpret_v_u16m1_i16m1(
            __riscv_vzext_vf2_u16m1(t01, vl));
        s02 = __riscv_vreinterpret_v_u16m1_i16m1(
            __riscv_vzext_vf2_u16m1(t02, vl));
        s03 = __riscv_vreinterpret_v_u16m1_i16m1(
            __riscv_vzext_vf2_u16m1(t03, vl));

        vint16m1_t s10, s11, s12, s13;
        s10 = __riscv_vreinterpret_v_u16m1_i16m1(
            __riscv_vzext_vf2_u16m1(t10, vl));
        s11 = __riscv_vreinterpret_v_u16m1_i16m1(
            __riscv_vzext_vf2_u16m1(t11, vl));
        s12 = __riscv_vreinterpret_v_u16m1_i16m1(
            __riscv_vzext_vf2_u16m1(t12, vl));
        s13 = __riscv_vreinterpret_v_u16m1_i16m1(
            __riscv_vzext_vf2_u16m1(t13, vl));

        // Perform convolution, one output vector per row
        vuint8mf2_t d0 =
            convolve4_8_x_rvv(s00, s01, s02, s03, filter, horiz_const, vl);
        vuint8mf2_t d1 =
            convolve4_8_x_rvv(s10, s11, s12, s13, filter, horiz_const, vl);

        // Store result
        store_u8_8x2(d, dst_stride, d0, d1, vl);

        s += vl;
        d += vl;
        width -= vl;
      } while (width > 0);
      src_ptr += 2 * src_stride;
      dst_ptr += 2 * dst_stride;
      h -= 2;
    } while (h != 0);
  }
}
    259 
    260 static inline vuint8mf2_t convolve8_8_x_rvv(
    261    const vint16m1_t s0, const vint16m1_t s1, const vint16m1_t s2,
    262    const vint16m1_t s3, const vint16m1_t s4, const vint16m1_t s5,
    263    const vint16m1_t s6, const vint16m1_t s7, const int16_t *filter,
    264    const int16_t horiz_const, size_t vl) {
    265  vint16m1_t sum = __riscv_vmul_vx_i16m1(s0, filter[0], vl);
    266  sum = __riscv_vmacc_vx_i16m1(sum, filter[1], s1, vl);
    267  sum = __riscv_vmacc_vx_i16m1(sum, filter[2], s2, vl);
    268  sum = __riscv_vmacc_vx_i16m1(sum, filter[3], s3, vl);
    269  sum = __riscv_vmacc_vx_i16m1(sum, filter[4], s4, vl);
    270  sum = __riscv_vmacc_vx_i16m1(sum, filter[5], s5, vl);
    271  sum = __riscv_vmacc_vx_i16m1(sum, filter[6], s6, vl);
    272  sum = __riscv_vmacc_vx_i16m1(sum, filter[7], s7, vl);
    273  sum = __riscv_vadd_vx_i16m1(sum, horiz_const, vl);
    274 
    275  // Round and shift
    276  // We halved the filter values so -1 from right shift
    277  vuint16m1_t d0 =
    278      __riscv_vreinterpret_v_i16m1_u16m1(__riscv_vmax_vx_i16m1(sum, 0, vl));
    279 
    280  return __riscv_vnclipu_wx_u8mf2(d0, FILTER_BITS - 1, __RISCV_VXRM_RNU, vl);
    281 }
    282 
    283 static inline void load_u8_8x8(const uint8_t *s, int p, vuint8mf2_t *const s0,
    284                               vuint8mf2_t *const s1, vuint8mf2_t *const s2,
    285                               vuint8mf2_t *const s3, vuint8mf2_t *const s4,
    286                               vuint8mf2_t *const s5, vuint8mf2_t *const s6,
    287                               vuint8mf2_t *const s7, size_t vl) {
    288  *s0 = __riscv_vle8_v_u8mf2(s, vl);
    289  s += p;
    290  *s1 = __riscv_vle8_v_u8mf2(s, vl);
    291  s += p;
    292  *s2 = __riscv_vle8_v_u8mf2(s, vl);
    293  s += p;
    294  *s3 = __riscv_vle8_v_u8mf2(s, vl);
    295  s += p;
    296  *s4 = __riscv_vle8_v_u8mf2(s, vl);
    297  s += p;
    298  *s5 = __riscv_vle8_v_u8mf2(s, vl);
    299  s += p;
    300  *s6 = __riscv_vle8_v_u8mf2(s, vl);
    301  s += p;
    302  *s7 = __riscv_vle8_v_u8mf2(s, vl);
    303 }
    304 
    305 static inline void convolve_x_sr_8tap_rvv(const uint8_t *src_ptr,
    306                                          int src_stride, uint8_t *dst_ptr,
    307                                          const int dst_stride, int w, int h,
    308                                          const int16_t *x_filter_ptr) {
    309  // This shim of 1 << ((ROUND0_BITS - 1) - 1) enables us to use a single
    310  // rounding right shift by FILTER_BITS - instead of a first rounding right
    311  // shift by ROUND0_BITS, followed by second rounding right shift by
    312  // FILTER_BITS - ROUND0_BITS.
    313  // The outermost -1 is needed because we will halve the filter values.
    314  const int32_t horiz_const = 1 << ((ROUND0_BITS - 1) - 1);
    315 
    316  // Filter values are even so halve to reduce precision requirements.
    317  int16_t filter[8];
    318  for (int i = 0; i < 8; i++) filter[i] = x_filter_ptr[i] >> 1;
    319 
    320  size_t vl = __riscv_vsetvl_e16m1(w);
    321  while (h-- != 0) {
    322    int width = w;
    323    const uint8_t *s = src_ptr;
    324    uint8_t *d = dst_ptr;
    325 
    326    do {
    327      // Load
    328      vuint8mf2_t t0, t1, t2, t3, t4, t5, t6, t7;
    329      load_u8_8x8(s, 1, &t0, &t1, &t2, &t3, &t4, &t5, &t6, &t7, vl);
    330 
    331      // Convert to 16-bit integers
    332      vint16m1_t s0 =
    333          __riscv_vreinterpret_v_u16m1_i16m1(__riscv_vzext_vf2_u16m1(t0, vl));
    334      vint16m1_t s1 =
    335          __riscv_vreinterpret_v_u16m1_i16m1(__riscv_vzext_vf2_u16m1(t1, vl));
    336      vint16m1_t s2 =
    337          __riscv_vreinterpret_v_u16m1_i16m1(__riscv_vzext_vf2_u16m1(t2, vl));
    338      vint16m1_t s3 =
    339          __riscv_vreinterpret_v_u16m1_i16m1(__riscv_vzext_vf2_u16m1(t3, vl));
    340      vint16m1_t s4 =
    341          __riscv_vreinterpret_v_u16m1_i16m1(__riscv_vzext_vf2_u16m1(t4, vl));
    342      vint16m1_t s5 =
    343          __riscv_vreinterpret_v_u16m1_i16m1(__riscv_vzext_vf2_u16m1(t5, vl));
    344      vint16m1_t s6 =
    345          __riscv_vreinterpret_v_u16m1_i16m1(__riscv_vzext_vf2_u16m1(t6, vl));
    346      vint16m1_t s7 =
    347          __riscv_vreinterpret_v_u16m1_i16m1(__riscv_vzext_vf2_u16m1(t7, vl));
    348 
    349      // Perform convolution
    350      vuint8mf2_t d0 = convolve8_8_x_rvv(s0, s1, s2, s3, s4, s5, s6, s7, filter,
    351                                         horiz_const, vl);
    352 
    353      // Store result
    354      __riscv_vse8_v_u8mf2(d, d0, vl);
    355 
    356      s += vl;
    357      d += vl;
    358      width -= vl;
    359    } while (width > 0);
    360    src_ptr += src_stride;
    361    dst_ptr += dst_stride;
    362  }
    363 }
    364 
    365 void av1_convolve_x_sr_rvv(const uint8_t *src, int src_stride, uint8_t *dst,
    366                           int dst_stride, int w, int h,
    367                           const InterpFilterParams *filter_params_x,
    368                           const int subpel_x_qn, ConvolveParams *conv_params) {
    369  if (w == 2 || h == 2) {
    370    av1_convolve_x_sr_c(src, src_stride, dst, dst_stride, w, h, filter_params_x,
    371                        subpel_x_qn, conv_params);
    372    return;
    373  }
    374 
    375  int filter_taps = get_filter_tap(filter_params_x, subpel_x_qn & SUBPEL_MASK);
    376  const uint8_t horiz_offset = filter_params_x->taps / 2 - 1;
    377  const uint8_t *src_rvv = src - horiz_offset;
    378 
    379  const int16_t *x_filter_ptr = av1_get_interp_filter_subpel_kernel(
    380      filter_params_x, subpel_x_qn & SUBPEL_MASK);
    381 
    382  if (filter_taps > 8) {
    383    convolve_x_sr_12tap_rvv(src_rvv, src_stride, dst, dst_stride, w, h,
    384                            x_filter_ptr);
    385    return;
    386  }
    387 
    388  if (filter_taps <= 4) {
    389    convolve_x_sr_4tap_rvv(src_rvv + 2, src_stride, dst, dst_stride, w, h,
    390                           x_filter_ptr);
    391    return;
    392  }
    393 
    394  convolve_x_sr_8tap_rvv(src_rvv, src_stride, dst, dst_stride, w, h,
    395                         x_filter_ptr);
    396  return;
    397 }
    398 
    399 static inline void store_u8_8x4(uint8_t *s, int p, const vuint8mf2_t s0,
    400                                const vuint8mf2_t s1, const vuint8mf2_t s2,
    401                                const vuint8mf2_t s3, size_t vl) {
    402  __riscv_vse8_v_u8mf2(s, s0, vl);
    403  s += p;
    404  __riscv_vse8_v_u8mf2(s, s1, vl);
    405  s += p;
    406  __riscv_vse8_v_u8mf2(s, s2, vl);
    407  s += p;
    408  __riscv_vse8_v_u8mf2(s, s3, vl);
    409 }
    410 
    411 static inline vuint8mf2_t convolve4_8_y_rvv(const vint16m1_t s0,
    412                                            const vint16m1_t s1,
    413                                            const vint16m1_t s2,
    414                                            const vint16m1_t s3,
    415                                            const int16_t *filter, size_t vl) {
    416  vint16m1_t sum = __riscv_vmul_vx_i16m1(s0, filter[0], vl);
    417  sum = __riscv_vmacc_vx_i16m1(sum, filter[1], s1, vl);
    418  sum = __riscv_vmacc_vx_i16m1(sum, filter[2], s2, vl);
    419  sum = __riscv_vmacc_vx_i16m1(sum, filter[3], s3, vl);
    420 
    421  // Round and shift
    422  // We halved the filter values so -1 from right shift
    423  vuint16m1_t d0 =
    424      __riscv_vreinterpret_v_i16m1_u16m1(__riscv_vmax_vx_i16m1(sum, 0, vl));
    425 
    426  return __riscv_vnclipu_wx_u8mf2(d0, FILTER_BITS - 1, __RISCV_VXRM_RNU, vl);
    427 }
    428 
// Vertical 4-tap single-reference convolution with a sliding row window.
// The 4 active taps sit at indices 2..5 of the 8-tap filter array.
// For w == 4, pairs of rows are packed into 8-lane vectors via
// load_strided_u8_4xN (defined in convolve_rvv.h; assumed to pack two
// 4-wide rows — TODO confirm). Both paths assume h is a multiple of 4.
static inline void convolve_y_sr_4tap_rvv(const uint8_t *src,
                                         const int src_stride, uint8_t *dst,
                                         const int dst_stride, int w, int h,
                                         const int16_t *filter_y) {
  const int16_t *filter = filter_y + 2;

  if (w == 4) {
    size_t vl = 8;

    // Load initial data: sXY holds rows X and Y packed in one vector.
    vuint8mf2_t t01 =
        load_strided_u8_4xN((uint8_t *)src + 0 * src_stride, src_stride, vl);
    vuint8mf2_t t12 =
        load_strided_u8_4xN((uint8_t *)src + 1 * src_stride, src_stride, vl);

    // Convert to 16-bit
    vint16m1_t s01 =
        __riscv_vreinterpret_v_u16m1_i16m1(__riscv_vzext_vf2_u16m1(t01, vl));
    vint16m1_t s12 =
        __riscv_vreinterpret_v_u16m1_i16m1(__riscv_vzext_vf2_u16m1(t12, vl));

    src += 2 * src_stride;

    do {
      // Load next set of data (four more packed row pairs)
      vuint8mf2_t t23 =
          load_strided_u8_4xN((uint8_t *)src + 0 * src_stride, src_stride, vl);
      vuint8mf2_t t34 =
          load_strided_u8_4xN((uint8_t *)src + 1 * src_stride, src_stride, vl);
      vuint8mf2_t t45 =
          load_strided_u8_4xN((uint8_t *)src + 2 * src_stride, src_stride, vl);
      vuint8mf2_t t56 =
          load_strided_u8_4xN((uint8_t *)src + 3 * src_stride, src_stride, vl);

      // Convert to 16-bit
      vint16m1_t s23 =
          __riscv_vreinterpret_v_u16m1_i16m1(__riscv_vzext_vf2_u16m1(t23, vl));
      vint16m1_t s34 =
          __riscv_vreinterpret_v_u16m1_i16m1(__riscv_vzext_vf2_u16m1(t34, vl));
      vint16m1_t s45 =
          __riscv_vreinterpret_v_u16m1_i16m1(__riscv_vzext_vf2_u16m1(t45, vl));
      vint16m1_t s56 =
          __riscv_vreinterpret_v_u16m1_i16m1(__riscv_vzext_vf2_u16m1(t56, vl));

      // Perform convolution: each call produces two packed output rows.
      vuint8mf2_t d01 = convolve4_8_y_rvv(s01, s12, s23, s34, filter, vl);
      vuint8mf2_t d23 = convolve4_8_y_rvv(s23, s34, s45, s56, filter, vl);

      // Store results
      store_strided_u8_4xN(dst + 0 * dst_stride, d01, dst_stride, vl);
      store_strided_u8_4xN(dst + 2 * dst_stride, d23, dst_stride, vl);

      // Slide the window: next iteration starts 4 rows down, so rows 4,5
      // and 5,6 become the new leading pairs.
      s01 = __riscv_vmv_v_v_i16m1(s45, vl);
      s12 = __riscv_vmv_v_v_i16m1(s56, vl);

      src += 4 * src_stride;
      dst += 4 * dst_stride;
      h -= 4;
    } while (h != 0);
  } else {
    // Handle width > 4 case: process vl-wide columns, 4 output rows per
    // inner iteration.
    size_t vl = __riscv_vsetvl_e16m1(w);
    do {
      // Load initial 3 rows of data
      vuint8mf2_t t0 = __riscv_vle8_v_u8mf2(src + 0 * src_stride, vl);
      vuint8mf2_t t1 = __riscv_vle8_v_u8mf2(src + 1 * src_stride, vl);
      vuint8mf2_t t2 = __riscv_vle8_v_u8mf2(src + 2 * src_stride, vl);

      // Convert to 16-bit
      vint16m1_t s0 =
          __riscv_vreinterpret_v_u16m1_i16m1(__riscv_vzext_vf2_u16m1(t0, vl));
      vint16m1_t s1 =
          __riscv_vreinterpret_v_u16m1_i16m1(__riscv_vzext_vf2_u16m1(t1, vl));
      vint16m1_t s2 =
          __riscv_vreinterpret_v_u16m1_i16m1(__riscv_vzext_vf2_u16m1(t2, vl));

      int height = h;
      const uint8_t *s = src + 3 * src_stride;
      uint8_t *d = dst;

      do {
        // Load next 4 rows of data (t0..t2 are reused as scratch here)
        vuint8mf2_t t3;
        load_u8_8x4(s, src_stride, &t0, &t1, &t2, &t3, vl);

        // Convert to 16-bit
        vint16m1_t s3 =
            __riscv_vreinterpret_v_u16m1_i16m1(__riscv_vzext_vf2_u16m1(t0, vl));
        vint16m1_t s4 =
            __riscv_vreinterpret_v_u16m1_i16m1(__riscv_vzext_vf2_u16m1(t1, vl));
        vint16m1_t s5 =
            __riscv_vreinterpret_v_u16m1_i16m1(__riscv_vzext_vf2_u16m1(t2, vl));
        vint16m1_t s6 =
            __riscv_vreinterpret_v_u16m1_i16m1(__riscv_vzext_vf2_u16m1(t3, vl));

        // Perform convolution, one output row per call
        vuint8mf2_t d0 = convolve4_8_y_rvv(s0, s1, s2, s3, filter, vl);
        vuint8mf2_t d1 = convolve4_8_y_rvv(s1, s2, s3, s4, filter, vl);
        vuint8mf2_t d2 = convolve4_8_y_rvv(s2, s3, s4, s5, filter, vl);
        vuint8mf2_t d3 = convolve4_8_y_rvv(s3, s4, s5, s6, filter, vl);

        // Store results
        store_u8_8x4(d, dst_stride, d0, d1, d2, d3, vl);

        // Slide the 3-row context window down by 4 rows.
        s0 = __riscv_vmv_v_v_i16m1(s4, vl);
        s1 = __riscv_vmv_v_v_i16m1(s5, vl);
        s2 = __riscv_vmv_v_v_i16m1(s6, vl);

        s += 4 * src_stride;
        d += 4 * dst_stride;
        height -= 4;
      } while (height != 0);
      src += vl;
      dst += vl;
      w -= vl;
    } while (w > 0);
  }
}
    547 
    548 static inline void load_u8_8x5(const uint8_t *s, int p, vuint8mf2_t *const s0,
    549                               vuint8mf2_t *const s1, vuint8mf2_t *const s2,
    550                               vuint8mf2_t *const s3, vuint8mf2_t *const s4,
    551                               size_t vl) {
    552  *s0 = __riscv_vle8_v_u8mf2(s, vl);
    553  s += p;
    554  *s1 = __riscv_vle8_v_u8mf2(s, vl);
    555  s += p;
    556  *s2 = __riscv_vle8_v_u8mf2(s, vl);
    557  s += p;
    558  *s3 = __riscv_vle8_v_u8mf2(s, vl);
    559  s += p;
    560  *s4 = __riscv_vle8_v_u8mf2(s, vl);
    561 }
    562 
    563 static inline vuint8mf2_t convolve6_8_y_rvv(
    564    const vint16m1_t s0, const vint16m1_t s1, const vint16m1_t s2,
    565    const vint16m1_t s3, const vint16m1_t s4, const vint16m1_t s5,
    566    const int16_t *filter, size_t vl) {
    567  // Filter values at indices 0 and 7 are 0, so we start from index 1
    568  vint16m1_t sum = __riscv_vmul_vx_i16m1(s0, filter[1], vl);
    569  sum = __riscv_vmacc_vx_i16m1(sum, filter[2], s1, vl);
    570  sum = __riscv_vmacc_vx_i16m1(sum, filter[3], s2, vl);
    571  sum = __riscv_vmacc_vx_i16m1(sum, filter[4], s3, vl);
    572  sum = __riscv_vmacc_vx_i16m1(sum, filter[5], s4, vl);
    573  sum = __riscv_vmacc_vx_i16m1(sum, filter[6], s5, vl);
    574 
    575  // Round and shift
    576  // We halved the filter values so -1 from right shift
    577  vuint16m1_t d0 =
    578      __riscv_vreinterpret_v_i16m1_u16m1(__riscv_vmax_vx_i16m1(sum, 0, vl));
    579 
    580  return __riscv_vnclipu_wx_u8mf2(d0, FILTER_BITS - 1, __RISCV_VXRM_RNU, vl);
    581 }
    582 
// Vertical 6-tap single-reference convolution. Processes vl-wide columns;
// within a column, keeps a 5-row sliding context window and produces 4
// output rows per inner iteration. Assumes h is a multiple of 4.
static inline void convolve_y_sr_6tap_rvv(const uint8_t *src_ptr,
                                         int src_stride, uint8_t *dst_ptr,
                                         const int dst_stride, int w, int h,
                                         const int16_t *y_filter) {
  size_t vl = __riscv_vsetvl_e16m1(w);
  do {
    const uint8_t *s = src_ptr;
    uint8_t *d = dst_ptr;
    int height = h;

    // Load initial 5 rows of data
    vuint8mf2_t t0, t1, t2, t3, t4;
    load_u8_8x5(s, src_stride, &t0, &t1, &t2, &t3, &t4, vl);

    // Convert to 16-bit
    vint16m1_t s0 =
        __riscv_vreinterpret_v_u16m1_i16m1(__riscv_vzext_vf2_u16m1(t0, vl));
    vint16m1_t s1 =
        __riscv_vreinterpret_v_u16m1_i16m1(__riscv_vzext_vf2_u16m1(t1, vl));
    vint16m1_t s2 =
        __riscv_vreinterpret_v_u16m1_i16m1(__riscv_vzext_vf2_u16m1(t2, vl));
    vint16m1_t s3 =
        __riscv_vreinterpret_v_u16m1_i16m1(__riscv_vzext_vf2_u16m1(t3, vl));
    vint16m1_t s4 =
        __riscv_vreinterpret_v_u16m1_i16m1(__riscv_vzext_vf2_u16m1(t4, vl));

    s += 5 * src_stride;

    do {
      // Load next 4 rows of data
      vuint8mf2_t t5 = __riscv_vle8_v_u8mf2(s + 0 * src_stride, vl);
      vuint8mf2_t t6 = __riscv_vle8_v_u8mf2(s + 1 * src_stride, vl);
      vuint8mf2_t t7 = __riscv_vle8_v_u8mf2(s + 2 * src_stride, vl);
      vuint8mf2_t t8 = __riscv_vle8_v_u8mf2(s + 3 * src_stride, vl);

      // Convert to 16-bit
      vint16m1_t s5 =
          __riscv_vreinterpret_v_u16m1_i16m1(__riscv_vzext_vf2_u16m1(t5, vl));
      vint16m1_t s6 =
          __riscv_vreinterpret_v_u16m1_i16m1(__riscv_vzext_vf2_u16m1(t6, vl));
      vint16m1_t s7 =
          __riscv_vreinterpret_v_u16m1_i16m1(__riscv_vzext_vf2_u16m1(t7, vl));
      vint16m1_t s8 =
          __riscv_vreinterpret_v_u16m1_i16m1(__riscv_vzext_vf2_u16m1(t8, vl));

      // Perform convolution, one output row per call
      vuint8mf2_t d0 = convolve6_8_y_rvv(s0, s1, s2, s3, s4, s5, y_filter, vl);
      vuint8mf2_t d1 = convolve6_8_y_rvv(s1, s2, s3, s4, s5, s6, y_filter, vl);
      vuint8mf2_t d2 = convolve6_8_y_rvv(s2, s3, s4, s5, s6, s7, y_filter, vl);
      vuint8mf2_t d3 = convolve6_8_y_rvv(s3, s4, s5, s6, s7, s8, y_filter, vl);

      // Store result
      store_u8_8x4(d, dst_stride, d0, d1, d2, d3, vl);

      // Update sliding window: shift the 5-row context down by 4 rows.
      s0 = __riscv_vmv_v_v_i16m1(s4, vl);
      s1 = __riscv_vmv_v_v_i16m1(s5, vl);
      s2 = __riscv_vmv_v_v_i16m1(s6, vl);
      s3 = __riscv_vmv_v_v_i16m1(s7, vl);
      s4 = __riscv_vmv_v_v_i16m1(s8, vl);
      s += 4 * src_stride;
      d += 4 * dst_stride;
      height -= 4;
    } while (height != 0);
    src_ptr += vl;
    dst_ptr += vl;
    w -= vl;
  } while (w > 0);
}
    652 
    653 static inline void load_u8_8x7(const uint8_t *s, int p, vuint8mf2_t *const s0,
    654                               vuint8mf2_t *const s1, vuint8mf2_t *const s2,
    655                               vuint8mf2_t *const s3, vuint8mf2_t *const s4,
    656                               vuint8mf2_t *const s5, vuint8mf2_t *const s6,
    657                               size_t vl) {
    658  *s0 = __riscv_vle8_v_u8mf2(s, vl);
    659  s += p;
    660  *s1 = __riscv_vle8_v_u8mf2(s, vl);
    661  s += p;
    662  *s2 = __riscv_vle8_v_u8mf2(s, vl);
    663  s += p;
    664  *s3 = __riscv_vle8_v_u8mf2(s, vl);
    665  s += p;
    666  *s4 = __riscv_vle8_v_u8mf2(s, vl);
    667  s += p;
    668  *s5 = __riscv_vle8_v_u8mf2(s, vl);
    669  s += p;
    670  *s6 = __riscv_vle8_v_u8mf2(s, vl);
    671 }
    672 
    673 static inline vuint8mf2_t convolve8_8_y_rvv(
    674    const vint16m1_t s0, const vint16m1_t s1, const vint16m1_t s2,
    675    const vint16m1_t s3, const vint16m1_t s4, const vint16m1_t s5,
    676    const vint16m1_t s6, const vint16m1_t s7, const int16_t *filter,
    677    size_t vl) {
    678  vint16m1_t sum = __riscv_vmul_vx_i16m1(s0, filter[0], vl);
    679  sum = __riscv_vmacc_vx_i16m1(sum, filter[1], s1, vl);
    680  sum = __riscv_vmacc_vx_i16m1(sum, filter[2], s2, vl);
    681  sum = __riscv_vmacc_vx_i16m1(sum, filter[3], s3, vl);
    682  sum = __riscv_vmacc_vx_i16m1(sum, filter[4], s4, vl);
    683  sum = __riscv_vmacc_vx_i16m1(sum, filter[5], s5, vl);
    684  sum = __riscv_vmacc_vx_i16m1(sum, filter[6], s6, vl);
    685  sum = __riscv_vmacc_vx_i16m1(sum, filter[7], s7, vl);
    686 
    687  // Round and shift
    688  // We halved the filter values so -1 from right shift
    689  vuint16m1_t d0 =
    690      __riscv_vreinterpret_v_i16m1_u16m1(__riscv_vmax_vx_i16m1(sum, 0, vl));
    691 
    692  return __riscv_vnclipu_wx_u8mf2(d0, FILTER_BITS - 1, __RISCV_VXRM_RNU, vl);
    693 }
    694 
// Vertical 8-tap single-reference convolution over a w x h block. The block
// is processed in column strips `vl` samples wide; inside a strip a sliding
// window of eight 16-bit rows (s0..s7) is rotated while four output rows are
// produced per inner-loop iteration. `y_filter` holds halved coefficients
// (the caller shifts them right by 1), matching the FILTER_BITS - 1 shift in
// convolve8_8_y_rvv.
// NOTE(review): vl is computed once from the initial w; this assumes w is a
// power of two so every strip (including the last) is exactly vl wide, and
// that h is a multiple of 4 -- confirm against av1_convolve_y_sr_rvv, which
// rejects w == 2 / h == 2.
static inline void convolve_y_sr_8tap_rvv(const uint8_t *src_ptr,
                                          int src_stride, uint8_t *dst_ptr,
                                          const int dst_stride, int w, int h,
                                          const int16_t *y_filter) {
  size_t vl = __riscv_vsetvl_e16m1(w);
  do {
    const uint8_t *s = src_ptr;
    uint8_t *d = dst_ptr;
    int height = h;

    // Load initial 7 rows of data
    vuint8mf2_t t0, t1, t2, t3, t4, t5, t6;
    load_u8_8x7(s, src_stride, &t0, &t1, &t2, &t3, &t4, &t5, &t6, vl);

    // Convert to 16-bit: zero-extend, then reinterpret as signed for the
    // multiply-accumulate helpers (values stay in [0, 255], so the
    // reinterpret is lossless).
    vint16m1_t s0 =
        __riscv_vreinterpret_v_u16m1_i16m1(__riscv_vzext_vf2_u16m1(t0, vl));
    vint16m1_t s1 =
        __riscv_vreinterpret_v_u16m1_i16m1(__riscv_vzext_vf2_u16m1(t1, vl));
    vint16m1_t s2 =
        __riscv_vreinterpret_v_u16m1_i16m1(__riscv_vzext_vf2_u16m1(t2, vl));
    vint16m1_t s3 =
        __riscv_vreinterpret_v_u16m1_i16m1(__riscv_vzext_vf2_u16m1(t3, vl));
    vint16m1_t s4 =
        __riscv_vreinterpret_v_u16m1_i16m1(__riscv_vzext_vf2_u16m1(t4, vl));
    vint16m1_t s5 =
        __riscv_vreinterpret_v_u16m1_i16m1(__riscv_vzext_vf2_u16m1(t5, vl));
    vint16m1_t s6 =
        __riscv_vreinterpret_v_u16m1_i16m1(__riscv_vzext_vf2_u16m1(t6, vl));

    s += 7 * src_stride;

    do {
      // Load the next four source rows.
      vuint8mf2_t t7 = __riscv_vle8_v_u8mf2(s + 0 * src_stride, vl);
      vuint8mf2_t t8 = __riscv_vle8_v_u8mf2(s + 1 * src_stride, vl);
      vuint8mf2_t t9 = __riscv_vle8_v_u8mf2(s + 2 * src_stride, vl);
      vuint8mf2_t t10 = __riscv_vle8_v_u8mf2(s + 3 * src_stride, vl);

      // Convert to 16-bit
      vint16m1_t s7 =
          __riscv_vreinterpret_v_u16m1_i16m1(__riscv_vzext_vf2_u16m1(t7, vl));
      vint16m1_t s8 =
          __riscv_vreinterpret_v_u16m1_i16m1(__riscv_vzext_vf2_u16m1(t8, vl));
      vint16m1_t s9 =
          __riscv_vreinterpret_v_u16m1_i16m1(__riscv_vzext_vf2_u16m1(t9, vl));
      vint16m1_t s10 =
          __riscv_vreinterpret_v_u16m1_i16m1(__riscv_vzext_vf2_u16m1(t10, vl));

      // Perform 8-tap vertical convolution: each output row uses a window
      // shifted down by one source row.
      vuint8mf2_t d0 =
          convolve8_8_y_rvv(s0, s1, s2, s3, s4, s5, s6, s7, y_filter, vl);
      vuint8mf2_t d1 =
          convolve8_8_y_rvv(s1, s2, s3, s4, s5, s6, s7, s8, y_filter, vl);
      vuint8mf2_t d2 =
          convolve8_8_y_rvv(s2, s3, s4, s5, s6, s7, s8, s9, y_filter, vl);
      vuint8mf2_t d3 =
          convolve8_8_y_rvv(s3, s4, s5, s6, s7, s8, s9, s10, y_filter, vl);

      // Store result
      store_u8_8x4(d, dst_stride, d0, d1, d2, d3, vl);

      // Update sliding window: shift the row registers down by the four rows
      // just consumed.
      s0 = __riscv_vmv_v_v_i16m1(s4, vl);
      s1 = __riscv_vmv_v_v_i16m1(s5, vl);
      s2 = __riscv_vmv_v_v_i16m1(s6, vl);
      s3 = __riscv_vmv_v_v_i16m1(s7, vl);
      s4 = __riscv_vmv_v_v_i16m1(s8, vl);
      s5 = __riscv_vmv_v_v_i16m1(s9, vl);
      s6 = __riscv_vmv_v_v_i16m1(s10, vl);
      s += 4 * src_stride;
      d += 4 * dst_stride;
      height -= 4;
    } while (height > 0);
    // Advance to the next column strip.
    src_ptr += vl;
    dst_ptr += vl;
    w -= vl;
  } while (w > 0);
}
    774 
    775 static inline void load_u8_8x11(const uint8_t *s, int p, vuint8mf2_t *const s0,
    776                                vuint8mf2_t *const s1, vuint8mf2_t *const s2,
    777                                vuint8mf2_t *const s3, vuint8mf2_t *const s4,
    778                                vuint8mf2_t *const s5, vuint8mf2_t *const s6,
    779                                vuint8mf2_t *const s7, vuint8mf2_t *const s8,
    780                                vuint8mf2_t *const s9, vuint8mf2_t *const s10,
    781                                size_t vl) {
    782  *s0 = __riscv_vle8_v_u8mf2(s, vl);
    783  s += p;
    784  *s1 = __riscv_vle8_v_u8mf2(s, vl);
    785  s += p;
    786  *s2 = __riscv_vle8_v_u8mf2(s, vl);
    787  s += p;
    788  *s3 = __riscv_vle8_v_u8mf2(s, vl);
    789  s += p;
    790  *s4 = __riscv_vle8_v_u8mf2(s, vl);
    791  s += p;
    792  *s5 = __riscv_vle8_v_u8mf2(s, vl);
    793  s += p;
    794  *s6 = __riscv_vle8_v_u8mf2(s, vl);
    795  s += p;
    796  *s7 = __riscv_vle8_v_u8mf2(s, vl);
    797  s += p;
    798  *s8 = __riscv_vle8_v_u8mf2(s, vl);
    799  s += p;
    800  *s9 = __riscv_vle8_v_u8mf2(s, vl);
    801  s += p;
    802  *s10 = __riscv_vle8_v_u8mf2(s, vl);
    803 }
    804 
    805 static inline vuint8mf2_t convolve12_8_y_rvv(
    806    const vint16m1_t s0, const vint16m1_t s1, const vint16m1_t s2,
    807    const vint16m1_t s3, const vint16m1_t s4, const vint16m1_t s5,
    808    const vint16m1_t s6, const vint16m1_t s7, const vint16m1_t s8,
    809    const vint16m1_t s9, const vint16m1_t s10, const vint16m1_t s11,
    810    const int16_t *y_filter, size_t vl) {
    811  // Initialize sum with first multiplication
    812  vint32m2_t sum = __riscv_vwmul_vx_i32m2(s0, y_filter[0], vl);
    813  sum = __riscv_vwmacc_vx_i32m2(sum, y_filter[1], s1, vl);
    814  sum = __riscv_vwmacc_vx_i32m2(sum, y_filter[2], s2, vl);
    815  sum = __riscv_vwmacc_vx_i32m2(sum, y_filter[3], s3, vl);
    816  sum = __riscv_vwmacc_vx_i32m2(sum, y_filter[4], s4, vl);
    817  sum = __riscv_vwmacc_vx_i32m2(sum, y_filter[5], s5, vl);
    818  sum = __riscv_vwmacc_vx_i32m2(sum, y_filter[6], s6, vl);
    819  sum = __riscv_vwmacc_vx_i32m2(sum, y_filter[7], s7, vl);
    820  sum = __riscv_vwmacc_vx_i32m2(sum, y_filter[8], s8, vl);
    821  sum = __riscv_vwmacc_vx_i32m2(sum, y_filter[9], s9, vl);
    822  sum = __riscv_vwmacc_vx_i32m2(sum, y_filter[10], s10, vl);
    823  sum = __riscv_vwmacc_vx_i32m2(sum, y_filter[11], s11, vl);
    824 
    825  // Round and shift
    826  sum = __riscv_vadd_vx_i32m2(sum, 1 << (FILTER_BITS - 1), vl);
    827  vint16m1_t i16_sum = __riscv_vnsra_wx_i16m1(sum, FILTER_BITS, vl);
    828  vint16m1_t iclip_sum =
    829      __riscv_vmin_vx_i16m1(__riscv_vmax_vx_i16m1(i16_sum, 0, vl), 255, vl);
    830 
    831  // Convert to 8-bit
    832  return __riscv_vncvt_x_x_w_u8mf2(
    833      __riscv_vreinterpret_v_i16m1_u16m1(iclip_sum), vl);
    834 }
    835 
// Vertical 12-tap single-reference convolution over a w x h block. Same
// strip/sliding-window structure as convolve_y_sr_8tap_rvv, but with an
// 11-row warm-up window (s0..s10) and full-precision coefficients (handled
// in 32 bits by convolve12_8_y_rvv).
// NOTE(review): assumes w is a power of two (so every strip is exactly vl
// wide) and h is a multiple of 4 -- confirm against the caller.
static inline void convolve_y_sr_12tap_rvv(const uint8_t *src_ptr,
                                           int src_stride, uint8_t *dst_ptr,
                                           const int dst_stride, int w, int h,
                                           const int16_t *y_filter) {
  size_t vl = __riscv_vsetvl_e16m1(w);
  do {
    const uint8_t *s = src_ptr;
    uint8_t *d = dst_ptr;
    int height = h;

    // Load initial 11 rows of data
    vuint8mf2_t t0, t1, t2, t3, t4, t5, t6, t7, t8, t9, t10;
    load_u8_8x11(s, src_stride, &t0, &t1, &t2, &t3, &t4, &t5, &t6, &t7, &t8,
                 &t9, &t10, vl);

    // Convert to 16-bit: zero-extend, then reinterpret as signed (values
    // stay in [0, 255], so this is lossless).
    vint16m1_t s0 =
        __riscv_vreinterpret_v_u16m1_i16m1(__riscv_vzext_vf2_u16m1(t0, vl));
    vint16m1_t s1 =
        __riscv_vreinterpret_v_u16m1_i16m1(__riscv_vzext_vf2_u16m1(t1, vl));
    vint16m1_t s2 =
        __riscv_vreinterpret_v_u16m1_i16m1(__riscv_vzext_vf2_u16m1(t2, vl));
    vint16m1_t s3 =
        __riscv_vreinterpret_v_u16m1_i16m1(__riscv_vzext_vf2_u16m1(t3, vl));
    vint16m1_t s4 =
        __riscv_vreinterpret_v_u16m1_i16m1(__riscv_vzext_vf2_u16m1(t4, vl));
    vint16m1_t s5 =
        __riscv_vreinterpret_v_u16m1_i16m1(__riscv_vzext_vf2_u16m1(t5, vl));
    vint16m1_t s6 =
        __riscv_vreinterpret_v_u16m1_i16m1(__riscv_vzext_vf2_u16m1(t6, vl));
    vint16m1_t s7 =
        __riscv_vreinterpret_v_u16m1_i16m1(__riscv_vzext_vf2_u16m1(t7, vl));
    vint16m1_t s8 =
        __riscv_vreinterpret_v_u16m1_i16m1(__riscv_vzext_vf2_u16m1(t8, vl));
    vint16m1_t s9 =
        __riscv_vreinterpret_v_u16m1_i16m1(__riscv_vzext_vf2_u16m1(t9, vl));
    vint16m1_t s10 =
        __riscv_vreinterpret_v_u16m1_i16m1(__riscv_vzext_vf2_u16m1(t10, vl));

    s += 11 * src_stride;

    do {
      // Load next 4 rows
      vuint8mf2_t t11 = __riscv_vle8_v_u8mf2(s + 0 * src_stride, vl);
      vuint8mf2_t t12 = __riscv_vle8_v_u8mf2(s + 1 * src_stride, vl);
      vuint8mf2_t t13 = __riscv_vle8_v_u8mf2(s + 2 * src_stride, vl);
      vuint8mf2_t t14 = __riscv_vle8_v_u8mf2(s + 3 * src_stride, vl);

      // Convert to 16-bit
      vint16m1_t s11 =
          __riscv_vreinterpret_v_u16m1_i16m1(__riscv_vzext_vf2_u16m1(t11, vl));
      vint16m1_t s12 =
          __riscv_vreinterpret_v_u16m1_i16m1(__riscv_vzext_vf2_u16m1(t12, vl));
      vint16m1_t s13 =
          __riscv_vreinterpret_v_u16m1_i16m1(__riscv_vzext_vf2_u16m1(t13, vl));
      vint16m1_t s14 =
          __riscv_vreinterpret_v_u16m1_i16m1(__riscv_vzext_vf2_u16m1(t14, vl));

      // Perform 12-tap convolution: each output row uses a window shifted
      // down by one source row.
      vuint8mf2_t d0 = convolve12_8_y_rvv(s0, s1, s2, s3, s4, s5, s6, s7, s8,
                                          s9, s10, s11, y_filter, vl);
      vuint8mf2_t d1 = convolve12_8_y_rvv(s1, s2, s3, s4, s5, s6, s7, s8, s9,
                                          s10, s11, s12, y_filter, vl);
      vuint8mf2_t d2 = convolve12_8_y_rvv(s2, s3, s4, s5, s6, s7, s8, s9, s10,
                                          s11, s12, s13, y_filter, vl);
      vuint8mf2_t d3 = convolve12_8_y_rvv(s3, s4, s5, s6, s7, s8, s9, s10, s11,
                                          s12, s13, s14, y_filter, vl);

      // Store results
      store_u8_8x4(d, dst_stride, d0, d1, d2, d3, vl);

      // Update the sliding window: shift the row registers down by the four
      // rows just consumed.
      s0 = __riscv_vmv_v_v_i16m1(s4, vl);
      s1 = __riscv_vmv_v_v_i16m1(s5, vl);
      s2 = __riscv_vmv_v_v_i16m1(s6, vl);
      s3 = __riscv_vmv_v_v_i16m1(s7, vl);
      s4 = __riscv_vmv_v_v_i16m1(s8, vl);
      s5 = __riscv_vmv_v_v_i16m1(s9, vl);
      s6 = __riscv_vmv_v_v_i16m1(s10, vl);
      s7 = __riscv_vmv_v_v_i16m1(s11, vl);
      s8 = __riscv_vmv_v_v_i16m1(s12, vl);
      s9 = __riscv_vmv_v_v_i16m1(s13, vl);
      s10 = __riscv_vmv_v_v_i16m1(s14, vl);
      s += 4 * src_stride;
      d += 4 * dst_stride;
      height -= 4;
    } while (height != 0);
    // Advance to the next column strip.
    src_ptr += vl;
    dst_ptr += vl;
    w -= vl;
  } while (w > 0);
}
    928 
    929 void av1_convolve_y_sr_rvv(const uint8_t *src, int src_stride, uint8_t *dst,
    930                           int dst_stride, int w, int h,
    931                           const InterpFilterParams *filter_params_y,
    932                           const int subpel_y_qn) {
    933  if (w == 2 || h == 2) {
    934    av1_convolve_y_sr_c(src, src_stride, dst, dst_stride, w, h, filter_params_y,
    935                        subpel_y_qn);
    936    return;
    937  }
    938 
    939  const int y_filter_taps = get_filter_tap(filter_params_y, subpel_y_qn);
    940  const int clamped_y_taps = y_filter_taps < 4 ? 4 : y_filter_taps;
    941  const int vert_offset = clamped_y_taps / 2 - 1;
    942  const uint8_t *src_rvv = src - vert_offset * src_stride;
    943  const int16_t *y_filter_ptr = av1_get_interp_filter_subpel_kernel(
    944      filter_params_y, subpel_y_qn & SUBPEL_MASK);
    945 
    946  if (y_filter_taps > 8) {
    947    convolve_y_sr_12tap_rvv(src_rvv, src_stride, dst, dst_stride, w, h,
    948                            y_filter_ptr);
    949    return;
    950  }
    951 
    952  // Filter values are even so halve to reduce precision requirements.
    953  // In RVV, we need to create a temporary array for the halved filter values
    954  int16_t halved_filter[8];
    955  for (int i = 0; i < 8; i++) {
    956    halved_filter[i] = y_filter_ptr[i] >> 1;
    957  }
    958 
    959  if (y_filter_taps <= 4) {
    960    convolve_y_sr_4tap_rvv(src_rvv, src_stride, dst, dst_stride, w, h,
    961                           halved_filter);
    962  } else if (y_filter_taps == 6) {
    963    convolve_y_sr_6tap_rvv(src_rvv, src_stride, dst, dst_stride, w, h,
    964                           halved_filter);
    965  } else {
    966    convolve_y_sr_8tap_rvv(src_rvv, src_stride, dst, dst_stride, w, h,
    967                           halved_filter);
    968  }
    969 }
    970 
    971 static inline vint16m1_t convolve12_4_2d_h_rvv(
    972    const vint16m1_t s0, const vint16m1_t s1, const vint16m1_t s2,
    973    const vint16m1_t s3, const vint16m1_t s4, const vint16m1_t s5,
    974    const vint16m1_t filter0, const vint16m1_t filter1,
    975    const vint16m1_t filter2, const vint16m1_t filter3,
    976    const vint16m1_t filter4, const vint16m1_t filter5,
    977    const int16_t horiz_const, size_t vl) {
    978  vint32m2_t sum = __riscv_vwmul_vv_i32m2(s0, filter0, vl);
    979  sum = __riscv_vwmacc_vv_i32m2(sum, filter1, s1, vl);
    980  sum = __riscv_vwmacc_vv_i32m2(sum, filter2, s2, vl);
    981  sum = __riscv_vwmacc_vv_i32m2(sum, filter3, s3, vl);
    982  sum = __riscv_vwmacc_vv_i32m2(sum, filter4, s4, vl);
    983  sum = __riscv_vwmacc_vv_i32m2(sum, filter5, s5, vl);
    984 
    985  sum = __riscv_vadd_vv_i32m2(
    986      sum, __riscv_vslidedown_vx_i32m2(sum, vl >> 1, vl), vl >> 1);
    987  sum = __riscv_vadd_vx_i32m2(sum, horiz_const, vl >> 1);
    988 
    989  return __riscv_vnsra_wx_i16m1(sum, ROUND0_BITS, vl >> 1);
    990 }
    991 
    992 static inline vint16m1_t convolve12_8_2d_h_rvv(
    993    const vint16m1_t s0, const vint16m1_t s1, const vint16m1_t s2,
    994    const vint16m1_t s3, const vint16m1_t s4, const vint16m1_t s5,
    995    const vint16m1_t s6, const vint16m1_t s7, const vint16m1_t s8,
    996    const vint16m1_t s9, const vint16m1_t s10, const vint16m1_t s11,
    997    const int16_t *x_filter, const int16_t horiz_const, size_t vl) {
    998  vint32m2_t sum = __riscv_vwmul_vx_i32m2(s0, x_filter[0], vl);
    999  sum = __riscv_vwmacc_vx_i32m2(sum, x_filter[1], s1, vl);
   1000  sum = __riscv_vwmacc_vx_i32m2(sum, x_filter[2], s2, vl);
   1001  sum = __riscv_vwmacc_vx_i32m2(sum, x_filter[3], s3, vl);
   1002  sum = __riscv_vwmacc_vx_i32m2(sum, x_filter[4], s4, vl);
   1003  sum = __riscv_vwmacc_vx_i32m2(sum, x_filter[5], s5, vl);
   1004  sum = __riscv_vwmacc_vx_i32m2(sum, x_filter[6], s6, vl);
   1005  sum = __riscv_vwmacc_vx_i32m2(sum, x_filter[7], s7, vl);
   1006  sum = __riscv_vwmacc_vx_i32m2(sum, x_filter[8], s8, vl);
   1007  sum = __riscv_vwmacc_vx_i32m2(sum, x_filter[9], s9, vl);
   1008  sum = __riscv_vwmacc_vx_i32m2(sum, x_filter[10], s10, vl);
   1009  sum = __riscv_vwmacc_vx_i32m2(sum, x_filter[11], s11, vl);
   1010 
   1011  sum = __riscv_vadd_vx_i32m2(sum, horiz_const, vl);
   1012 
   1013  return __riscv_vnsra_wx_i16m1(sum, ROUND0_BITS, vl);
   1014 }
   1015 
// Horizontal (first) pass of the 2-D single-reference convolution with a
// full-precision 12-tap filter. Writes 16-bit intermediate rows to `dst` for
// the vertical pass to consume.
// NOTE(review): `h` here is the intermediate height (output height plus
// vertical taps minus 1) -- confirm against the caller.
static inline void convolve_2d_sr_horiz_12tap_rvv(
    const uint8_t *src, int src_stride, int16_t *dst, const int dst_stride,
    int w, int h, const int16_t *x_filter_ptr, size_t vl) {
  const int bd = 8;
  // DC offset to keep the intermediate values in a convenient range plus the
  // rounding bias for the ROUND0_BITS shift.
  const int16_t horiz_const =
      (1 << (bd + FILTER_BITS - 1)) + (1 << ((ROUND0_BITS - 1)));

  const int16_t xf0 = x_filter_ptr[0];
  const int16_t xf1 = x_filter_ptr[1];
  const int16_t xf2 = x_filter_ptr[2];
  const int16_t xf3 = x_filter_ptr[3];
  const int16_t xf4 = x_filter_ptr[4];
  const int16_t xf5 = x_filter_ptr[5];
  const int16_t xf6 = x_filter_ptr[6];
  const int16_t xf7 = x_filter_ptr[7];
  const int16_t xf8 = x_filter_ptr[8];
  const int16_t xf9 = x_filter_ptr[9];
  const int16_t xf10 = x_filter_ptr[10];
  const int16_t xf11 = x_filter_ptr[11];

  if (w == 4) {
    uint8_t *s = (uint8_t *)src;
    int16_t *d = dst;

    // Double vl so one vector holds two 4-lane tap groups that
    // convolve12_4_2d_h_rvv folds together.
    vl = vl << 1;

    // Each vector coefficient pairs two taps: lanes 0-3 carry one tap and
    // lanes 4-7 another, so six vector MACs cover all twelve taps.
    const int16_t filter0[8] = { xf0, xf0, xf0, xf0, xf4, xf4, xf4, xf4 };
    const int16_t filter1[8] = { xf1, xf1, xf1, xf1, xf5, xf5, xf5, xf5 };
    const int16_t filter2[8] = { xf2, xf2, xf2, xf2, xf6, xf6, xf6, xf6 };
    const int16_t filter3[8] = { xf3, xf3, xf3, xf3, xf7, xf7, xf7, xf7 };
    const int16_t filter4[8] = { xf8, xf8, xf8, xf8, xf9, xf9, xf9, xf9 };
    const int16_t filter5[8] = {
      xf10, xf10, xf10, xf10, xf11, xf11, xf11, xf11
    };

    const vint16m1_t vfilter0 = __riscv_vle16_v_i16m1(filter0, vl);
    const vint16m1_t vfilter1 = __riscv_vle16_v_i16m1(filter1, vl);
    const vint16m1_t vfilter2 = __riscv_vle16_v_i16m1(filter2, vl);
    const vint16m1_t vfilter3 = __riscv_vle16_v_i16m1(filter3, vl);
    const vint16m1_t vfilter4 = __riscv_vle16_v_i16m1(filter4, vl);
    const vint16m1_t vfilter5 = __riscv_vle16_v_i16m1(filter5, vl);

    do {
      // Row 0: shifted contiguous loads feed taps 0-7; the strided helper
      // gathers the samples for taps 8-11.
      vuint8mf2_t t0 = __riscv_vle8_v_u8mf2(s, vl);
      vuint8mf2_t t1 = __riscv_vle8_v_u8mf2(s + 1, vl);
      vuint8mf2_t t2 = __riscv_vle8_v_u8mf2(s + 2, vl);
      vuint8mf2_t t3 = __riscv_vle8_v_u8mf2(s + 3, vl);
      vuint8mf2_t t4 = load_strided_u8_4xN(s + 8, 1, vl);
      vuint8mf2_t t5 = load_strided_u8_4xN(s + 10, 1, vl);

      // Row 1: same layout one source row down.
      vuint8mf2_t t6 = __riscv_vle8_v_u8mf2(s + src_stride, vl);
      vuint8mf2_t t7 = __riscv_vle8_v_u8mf2(s + src_stride + 1, vl);
      vuint8mf2_t t8 = __riscv_vle8_v_u8mf2(s + src_stride + 2, vl);
      vuint8mf2_t t9 = __riscv_vle8_v_u8mf2(s + src_stride + 3, vl);
      vuint8mf2_t t10 = load_strided_u8_4xN(s + src_stride + 8, 1, vl);
      vuint8mf2_t t11 = load_strided_u8_4xN(s + src_stride + 10, 1, vl);

      // Zero-extend to 16 bits; reinterpret as signed is lossless here.
      vint16m1_t s0 =
          __riscv_vreinterpret_v_u16m1_i16m1(__riscv_vzext_vf2_u16m1(t0, vl));
      vint16m1_t s1 =
          __riscv_vreinterpret_v_u16m1_i16m1(__riscv_vzext_vf2_u16m1(t1, vl));
      vint16m1_t s2 =
          __riscv_vreinterpret_v_u16m1_i16m1(__riscv_vzext_vf2_u16m1(t2, vl));
      vint16m1_t s3 =
          __riscv_vreinterpret_v_u16m1_i16m1(__riscv_vzext_vf2_u16m1(t3, vl));
      vint16m1_t s4 =
          __riscv_vreinterpret_v_u16m1_i16m1(__riscv_vzext_vf2_u16m1(t4, vl));
      vint16m1_t s5 =
          __riscv_vreinterpret_v_u16m1_i16m1(__riscv_vzext_vf2_u16m1(t5, vl));
      vint16m1_t s6 =
          __riscv_vreinterpret_v_u16m1_i16m1(__riscv_vzext_vf2_u16m1(t6, vl));
      vint16m1_t s7 =
          __riscv_vreinterpret_v_u16m1_i16m1(__riscv_vzext_vf2_u16m1(t7, vl));
      vint16m1_t s8 =
          __riscv_vreinterpret_v_u16m1_i16m1(__riscv_vzext_vf2_u16m1(t8, vl));
      vint16m1_t s9 =
          __riscv_vreinterpret_v_u16m1_i16m1(__riscv_vzext_vf2_u16m1(t9, vl));
      vint16m1_t s10 =
          __riscv_vreinterpret_v_u16m1_i16m1(__riscv_vzext_vf2_u16m1(t10, vl));
      vint16m1_t s11 =
          __riscv_vreinterpret_v_u16m1_i16m1(__riscv_vzext_vf2_u16m1(t11, vl));

      vint16m1_t d0 = convolve12_4_2d_h_rvv(
          s0, s1, s2, s3, s4, s5, vfilter0, vfilter1, vfilter2, vfilter3,
          vfilter4, vfilter5, horiz_const, vl);
      vint16m1_t d1 = convolve12_4_2d_h_rvv(
          s6, s7, s8, s9, s10, s11, vfilter0, vfilter1, vfilter2, vfilter3,
          vfilter4, vfilter5, horiz_const, vl);

      // Only the folded lower half (4 samples) is stored per row.
      __riscv_vse16_v_i16m1(d, d0, vl >> 1);
      __riscv_vse16_v_i16m1(d + dst_stride, d1, vl >> 1);

      // NOTE(review): rows are processed in pairs; with an odd intermediate
      // height the last iteration computes one extra row -- presumably
      // absorbed by the intermediate buffer / source padding; confirm.
      s += src_stride << 1;
      d += dst_stride << 1;
      h -= 2;
    } while (h > 0);
  } else {
    // Wide path: one row at a time, vl output samples per inner iteration,
    // using twelve shifted loads for the twelve taps.
    do {
      const uint8_t *s = src;
      int16_t *d = dst;
      int width = w;

      do {
        vuint8mf2_t t0 = __riscv_vle8_v_u8mf2(s, vl);
        vuint8mf2_t t1 = __riscv_vle8_v_u8mf2(s + 1, vl);
        vuint8mf2_t t2 = __riscv_vle8_v_u8mf2(s + 2, vl);
        vuint8mf2_t t3 = __riscv_vle8_v_u8mf2(s + 3, vl);
        vuint8mf2_t t4 = __riscv_vle8_v_u8mf2(s + 4, vl);
        vuint8mf2_t t5 = __riscv_vle8_v_u8mf2(s + 5, vl);
        vuint8mf2_t t6 = __riscv_vle8_v_u8mf2(s + 6, vl);
        vuint8mf2_t t7 = __riscv_vle8_v_u8mf2(s + 7, vl);
        vuint8mf2_t t8 = __riscv_vle8_v_u8mf2(s + 8, vl);
        vuint8mf2_t t9 = __riscv_vle8_v_u8mf2(s + 9, vl);
        vuint8mf2_t t10 = __riscv_vle8_v_u8mf2(s + 10, vl);
        vuint8mf2_t t11 = __riscv_vle8_v_u8mf2(s + 11, vl);

        vint16m1_t s0 =
            __riscv_vreinterpret_v_u16m1_i16m1(__riscv_vzext_vf2_u16m1(t0, vl));
        vint16m1_t s1 =
            __riscv_vreinterpret_v_u16m1_i16m1(__riscv_vzext_vf2_u16m1(t1, vl));
        vint16m1_t s2 =
            __riscv_vreinterpret_v_u16m1_i16m1(__riscv_vzext_vf2_u16m1(t2, vl));
        vint16m1_t s3 =
            __riscv_vreinterpret_v_u16m1_i16m1(__riscv_vzext_vf2_u16m1(t3, vl));
        vint16m1_t s4 =
            __riscv_vreinterpret_v_u16m1_i16m1(__riscv_vzext_vf2_u16m1(t4, vl));
        vint16m1_t s5 =
            __riscv_vreinterpret_v_u16m1_i16m1(__riscv_vzext_vf2_u16m1(t5, vl));
        vint16m1_t s6 =
            __riscv_vreinterpret_v_u16m1_i16m1(__riscv_vzext_vf2_u16m1(t6, vl));
        vint16m1_t s7 =
            __riscv_vreinterpret_v_u16m1_i16m1(__riscv_vzext_vf2_u16m1(t7, vl));
        vint16m1_t s8 =
            __riscv_vreinterpret_v_u16m1_i16m1(__riscv_vzext_vf2_u16m1(t8, vl));
        vint16m1_t s9 =
            __riscv_vreinterpret_v_u16m1_i16m1(__riscv_vzext_vf2_u16m1(t9, vl));
        vint16m1_t s10 = __riscv_vreinterpret_v_u16m1_i16m1(
            __riscv_vzext_vf2_u16m1(t10, vl));
        vint16m1_t s11 = __riscv_vreinterpret_v_u16m1_i16m1(
            __riscv_vzext_vf2_u16m1(t11, vl));

        vint16m1_t d0 =
            convolve12_8_2d_h_rvv(s0, s1, s2, s3, s4, s5, s6, s7, s8, s9, s10,
                                  s11, x_filter_ptr, horiz_const, vl);

        __riscv_vse16_v_i16m1(d, d0, vl);

        s += vl;
        d += vl;
        width -= vl;
      } while (width != 0);
      src += src_stride;
      dst += dst_stride;
    } while (--h != 0);
  }
}
   1172 
   1173 static inline vint16m1_t convolve4_2d_h_rvv(
   1174    const vint16m1_t s0, const vint16m1_t s1, const vint16m1_t s2,
   1175    const vint16m1_t s3, const int16_t *x_filter, const int16_t horiz_const,
   1176    size_t vl) {
   1177  vint16m1_t sum = __riscv_vmul_vx_i16m1(s0, x_filter[0], vl);
   1178  sum = __riscv_vmacc_vx_i16m1(sum, x_filter[1], s1, vl);
   1179  sum = __riscv_vmacc_vx_i16m1(sum, x_filter[2], s2, vl);
   1180  sum = __riscv_vmacc_vx_i16m1(sum, x_filter[3], s3, vl);
   1181 
   1182  sum = __riscv_vadd_vx_i16m1(sum, horiz_const, vl);
   1183 
   1184  return __riscv_vsra_vx_i16m1(sum, ROUND0_BITS - 1, vl);
   1185 }
   1186 
// Horizontal (first) pass of the 2-D single-reference convolution with a
// 4-tap filter. The 4-tap kernel occupies the middle of the 8-tap
// coefficient array, hence the +2 offset. Coefficients are halved so the
// whole computation fits in 16 bits (shift is ROUND0_BITS - 1 in
// convolve4_2d_h_rvv).
// NOTE(review): `h` is the intermediate height (output height plus vertical
// taps minus 1) -- confirm against the caller.
static inline void convolve_2d_sr_horiz_4tap_rvv(
    const uint8_t *src, ptrdiff_t src_stride, int16_t *dst,
    ptrdiff_t dst_stride, int w, int h, const int16_t *filter_x, size_t vl) {
  const int bd = 8;
  const int16_t *filter = filter_x + 2;
  // DC offset and rounding bias, both reduced by one bit to match the halved
  // coefficients.
  const int16_t horiz_const =
      (1 << (bd + FILTER_BITS - 2)) + (1 << ((ROUND0_BITS - 1) - 1));

  const int16_t xf0 = filter[0] >> 1;
  const int16_t xf1 = filter[1] >> 1;
  const int16_t xf2 = filter[2] >> 1;
  const int16_t xf3 = filter[3] >> 1;
  const int16_t xfilter[4] = { xf0, xf1, xf2, xf3 };

  if (w <= 4) {
    // Narrow path: double vl and use strided loads so one vector carries two
    // source rows' worth of 4-sample groups.
    vl = vl << 1;

    do {
      vuint8mf2_t t0 = load_strided_u8_4xN((uint8_t *)src + 0, src_stride, vl);
      vuint8mf2_t t1 = load_strided_u8_4xN((uint8_t *)src + 1, src_stride, vl);
      vuint8mf2_t t2 = load_strided_u8_4xN((uint8_t *)src + 2, src_stride, vl);
      vuint8mf2_t t3 = load_strided_u8_4xN((uint8_t *)src + 3, src_stride, vl);

      // Zero-extend to 16 bits; reinterpret as signed is lossless here.
      vint16m1_t s0 =
          __riscv_vreinterpret_v_u16m1_i16m1(__riscv_vzext_vf2_u16m1(t0, vl));
      vint16m1_t s1 =
          __riscv_vreinterpret_v_u16m1_i16m1(__riscv_vzext_vf2_u16m1(t1, vl));
      vint16m1_t s2 =
          __riscv_vreinterpret_v_u16m1_i16m1(__riscv_vzext_vf2_u16m1(t2, vl));
      vint16m1_t s3 =
          __riscv_vreinterpret_v_u16m1_i16m1(__riscv_vzext_vf2_u16m1(t3, vl));

      vint16m1_t d0 =
          convolve4_2d_h_rvv(s0, s1, s2, s3, xfilter, horiz_const, vl);

      // Scatter the two packed rows back out with the destination stride.
      store_strided_i16_4xN(dst, d0, dst_stride, vl);

      // NOTE(review): rows are processed in pairs; with an odd intermediate
      // height the last iteration computes one extra row -- presumably
      // absorbed by the intermediate buffer / source padding; confirm.
      src += src_stride << 1;
      dst += dst_stride << 1;
      h -= 2;
    } while (h > 0);
  } else {
    // Wide path: two rows per iteration, vl output samples per inner step,
    // using four shifted loads per row for the four taps.
    do {
      int width = w;
      const uint8_t *s = src;
      int16_t *d = dst;

      do {
        vuint8mf2_t t0 = __riscv_vle8_v_u8mf2(s + 0, vl);
        vuint8mf2_t t1 = __riscv_vle8_v_u8mf2(s + 1, vl);
        vuint8mf2_t t2 = __riscv_vle8_v_u8mf2(s + 2, vl);
        vuint8mf2_t t3 = __riscv_vle8_v_u8mf2(s + 3, vl);

        vuint8mf2_t t4 = __riscv_vle8_v_u8mf2(s + src_stride, vl);
        vuint8mf2_t t5 = __riscv_vle8_v_u8mf2(s + src_stride + 1, vl);
        vuint8mf2_t t6 = __riscv_vle8_v_u8mf2(s + src_stride + 2, vl);
        vuint8mf2_t t7 = __riscv_vle8_v_u8mf2(s + src_stride + 3, vl);

        vint16m1_t s0 =
            __riscv_vreinterpret_v_u16m1_i16m1(__riscv_vzext_vf2_u16m1(t0, vl));
        vint16m1_t s1 =
            __riscv_vreinterpret_v_u16m1_i16m1(__riscv_vzext_vf2_u16m1(t1, vl));
        vint16m1_t s2 =
            __riscv_vreinterpret_v_u16m1_i16m1(__riscv_vzext_vf2_u16m1(t2, vl));
        vint16m1_t s3 =
            __riscv_vreinterpret_v_u16m1_i16m1(__riscv_vzext_vf2_u16m1(t3, vl));

        vint16m1_t s4 =
            __riscv_vreinterpret_v_u16m1_i16m1(__riscv_vzext_vf2_u16m1(t4, vl));
        vint16m1_t s5 =
            __riscv_vreinterpret_v_u16m1_i16m1(__riscv_vzext_vf2_u16m1(t5, vl));
        vint16m1_t s6 =
            __riscv_vreinterpret_v_u16m1_i16m1(__riscv_vzext_vf2_u16m1(t6, vl));
        vint16m1_t s7 =
            __riscv_vreinterpret_v_u16m1_i16m1(__riscv_vzext_vf2_u16m1(t7, vl));

        vint16m1_t d0 =
            convolve4_2d_h_rvv(s0, s1, s2, s3, xfilter, horiz_const, vl);
        vint16m1_t d1 =
            convolve4_2d_h_rvv(s4, s5, s6, s7, xfilter, horiz_const, vl);

        __riscv_vse16_v_i16m1(d, d0, vl);
        __riscv_vse16_v_i16m1(d + dst_stride, d1, vl);

        s += vl;
        d += vl;
        width -= vl;
      } while (width != 0);
      src += src_stride << 1;
      dst += dst_stride << 1;
      h -= 2;
    } while (h > 0);
  }
}
   1281 
   1282 static inline vint16m1_t convolve8_4_2d_h_rvv(
   1283    const vint16m1_t s0, const vint16m1_t s1, const vint16m1_t s2,
   1284    const vint16m1_t s3, const vint16m1_t x_filter0, const vint16m1_t x_filter1,
   1285    const vint16m1_t x_filter2, const vint16m1_t x_filter3,
   1286    const int16_t horiz_const, size_t vl) {
   1287  vint16m1_t sum = __riscv_vmul_vv_i16m1(s0, x_filter0, vl);
   1288  sum = __riscv_vmacc_vv_i16m1(sum, x_filter1, s1, vl);
   1289  sum = __riscv_vmacc_vv_i16m1(sum, x_filter2, s2, vl);
   1290  sum = __riscv_vmacc_vv_i16m1(sum, x_filter3, s3, vl);
   1291 
   1292  sum = __riscv_vadd_vv_i16m1(
   1293      sum, __riscv_vslidedown_vx_i16m1(sum, vl >> 1, vl), vl >> 1);
   1294  sum = __riscv_vadd_vx_i16m1(sum, horiz_const, vl >> 1);
   1295 
   1296  return __riscv_vsra_vx_i16m1(sum, ROUND0_BITS - 1, vl >> 1);
   1297 }
   1298 
   1299 static inline vint16m1_t convolve8_8_2d_h_rvv(
   1300    const vint16m1_t s0, const vint16m1_t s1, const vint16m1_t s2,
   1301    const vint16m1_t s3, const vint16m1_t s4, const vint16m1_t s5,
   1302    const vint16m1_t s6, const vint16m1_t s7, const int16_t *x_filter,
   1303    const int16_t horiz_const, size_t vl) {
   1304  vint16m1_t sum = __riscv_vmul_vx_i16m1(s0, x_filter[0], vl);
   1305  sum = __riscv_vmacc_vx_i16m1(sum, x_filter[1], s1, vl);
   1306  sum = __riscv_vmacc_vx_i16m1(sum, x_filter[2], s2, vl);
   1307  sum = __riscv_vmacc_vx_i16m1(sum, x_filter[3], s3, vl);
   1308  sum = __riscv_vmacc_vx_i16m1(sum, x_filter[4], s4, vl);
   1309  sum = __riscv_vmacc_vx_i16m1(sum, x_filter[5], s5, vl);
   1310  sum = __riscv_vmacc_vx_i16m1(sum, x_filter[6], s6, vl);
   1311  sum = __riscv_vmacc_vx_i16m1(sum, x_filter[7], s7, vl);
   1312 
   1313  sum = __riscv_vadd_vx_i16m1(sum, horiz_const, vl);
   1314 
   1315  return __riscv_vsra_vx_i16m1(sum, ROUND0_BITS - 1, vl);
   1316 }
   1317 
   1318 static inline void convolve_2d_sr_horiz_8tap_rvv(
   1319    const uint8_t *src, ptrdiff_t src_stride, int16_t *dst,
   1320    ptrdiff_t dst_stride, int w, int im_h, const int16_t *x_filter_ptr,
   1321    size_t vl) {
   1322  const int bd = 8;
   1323  const int16_t horiz_const =
   1324      (1 << (bd + FILTER_BITS - 2)) + (1 << ((ROUND0_BITS - 1) - 1));
   1325 
   1326  int height = im_h;
   1327 
   1328  const int16_t xf0 = x_filter_ptr[0] >> 1;
   1329  const int16_t xf1 = x_filter_ptr[1] >> 1;
   1330  const int16_t xf2 = x_filter_ptr[2] >> 1;
   1331  const int16_t xf3 = x_filter_ptr[3] >> 1;
   1332  const int16_t xf4 = x_filter_ptr[4] >> 1;
   1333  const int16_t xf5 = x_filter_ptr[5] >> 1;
   1334  const int16_t xf6 = x_filter_ptr[6] >> 1;
   1335  const int16_t xf7 = x_filter_ptr[7] >> 1;
   1336 
   1337  if (w <= 4) {
   1338    vl = vl << 1;
   1339 
   1340    const int16_t filter0[8] = { xf0, xf0, xf0, xf0, xf4, xf4, xf4, xf4 };
   1341    const int16_t filter1[8] = { xf1, xf1, xf1, xf1, xf5, xf5, xf5, xf5 };
   1342    const int16_t filter2[8] = { xf2, xf2, xf2, xf2, xf6, xf6, xf6, xf6 };
   1343    const int16_t filter3[8] = { xf3, xf3, xf3, xf3, xf7, xf7, xf7, xf7 };
   1344 
   1345    const vint16m1_t vfilter0 = __riscv_vle16_v_i16m1(filter0, vl);
   1346    const vint16m1_t vfilter1 = __riscv_vle16_v_i16m1(filter1, vl);
   1347    const vint16m1_t vfilter2 = __riscv_vle16_v_i16m1(filter2, vl);
   1348    const vint16m1_t vfilter3 = __riscv_vle16_v_i16m1(filter3, vl);
   1349 
   1350    do {
   1351      vuint8mf2_t t0 = __riscv_vle8_v_u8mf2(src, vl);
   1352      vuint8mf2_t t1 = __riscv_vle8_v_u8mf2(src + 1, vl);
   1353      vuint8mf2_t t2 = __riscv_vle8_v_u8mf2(src + 2, vl);
   1354      vuint8mf2_t t3 = __riscv_vle8_v_u8mf2(src + 3, vl);
   1355 
   1356      vuint8mf2_t t4 = __riscv_vle8_v_u8mf2(src + src_stride, vl);
   1357      vuint8mf2_t t5 = __riscv_vle8_v_u8mf2(src + src_stride + 1, vl);
   1358      vuint8mf2_t t6 = __riscv_vle8_v_u8mf2(src + src_stride + 2, vl);
   1359      vuint8mf2_t t7 = __riscv_vle8_v_u8mf2(src + src_stride + 3, vl);
   1360 
   1361      vint16m1_t s0 =
   1362          __riscv_vreinterpret_v_u16m1_i16m1(__riscv_vzext_vf2_u16m1(t0, vl));
   1363      vint16m1_t s1 =
   1364          __riscv_vreinterpret_v_u16m1_i16m1(__riscv_vzext_vf2_u16m1(t1, vl));
   1365      vint16m1_t s2 =
   1366          __riscv_vreinterpret_v_u16m1_i16m1(__riscv_vzext_vf2_u16m1(t2, vl));
   1367      vint16m1_t s3 =
   1368          __riscv_vreinterpret_v_u16m1_i16m1(__riscv_vzext_vf2_u16m1(t3, vl));
   1369      vint16m1_t s4 =
   1370          __riscv_vreinterpret_v_u16m1_i16m1(__riscv_vzext_vf2_u16m1(t4, vl));
   1371      vint16m1_t s5 =
   1372          __riscv_vreinterpret_v_u16m1_i16m1(__riscv_vzext_vf2_u16m1(t5, vl));
   1373      vint16m1_t s6 =
   1374          __riscv_vreinterpret_v_u16m1_i16m1(__riscv_vzext_vf2_u16m1(t6, vl));
   1375      vint16m1_t s7 =
   1376          __riscv_vreinterpret_v_u16m1_i16m1(__riscv_vzext_vf2_u16m1(t7, vl));
   1377 
   1378      vint16m1_t d0 = convolve8_4_2d_h_rvv(s0, s1, s2, s3, vfilter0, vfilter1,
   1379                                           vfilter2, vfilter3, horiz_const, vl);
   1380      vint16m1_t d1 = convolve8_4_2d_h_rvv(s4, s5, s6, s7, vfilter0, vfilter1,
   1381                                           vfilter2, vfilter3, horiz_const, vl);
   1382 
   1383      __riscv_vse16_v_i16m1(dst, d0, vl >> 1);
   1384      __riscv_vse16_v_i16m1(dst + dst_stride, d1, vl >> 1);
   1385 
   1386      src += src_stride << 1;
   1387      dst += dst_stride << 1;
   1388      height -= 2;
   1389    } while (height > 0);
   1390  } else {
   1391    const int16_t xfilter[8] = { xf0, xf1, xf2, xf3, xf4, xf5, xf6, xf7 };
   1392 
   1393    do {
   1394      const uint8_t *s = src;
   1395      int16_t *d = dst;
   1396      int width = w;
   1397 
   1398      do {
   1399        vuint8mf2_t t0 = __riscv_vle8_v_u8mf2(s, vl);
   1400        vuint8mf2_t t1 = __riscv_vle8_v_u8mf2(s + 1, vl);
   1401        vuint8mf2_t t2 = __riscv_vle8_v_u8mf2(s + 2, vl);
   1402        vuint8mf2_t t3 = __riscv_vle8_v_u8mf2(s + 3, vl);
   1403        vuint8mf2_t t4 = __riscv_vle8_v_u8mf2(s + 4, vl);
   1404        vuint8mf2_t t5 = __riscv_vle8_v_u8mf2(s + 5, vl);
   1405        vuint8mf2_t t6 = __riscv_vle8_v_u8mf2(s + 6, vl);
   1406        vuint8mf2_t t7 = __riscv_vle8_v_u8mf2(s + 7, vl);
   1407 
   1408        vint16m1_t s0 =
   1409            __riscv_vreinterpret_v_u16m1_i16m1(__riscv_vzext_vf2_u16m1(t0, vl));
   1410        vint16m1_t s1 =
   1411            __riscv_vreinterpret_v_u16m1_i16m1(__riscv_vzext_vf2_u16m1(t1, vl));
   1412        vint16m1_t s2 =
   1413            __riscv_vreinterpret_v_u16m1_i16m1(__riscv_vzext_vf2_u16m1(t2, vl));
   1414        vint16m1_t s3 =
   1415            __riscv_vreinterpret_v_u16m1_i16m1(__riscv_vzext_vf2_u16m1(t3, vl));
   1416        vint16m1_t s4 =
   1417            __riscv_vreinterpret_v_u16m1_i16m1(__riscv_vzext_vf2_u16m1(t4, vl));
   1418        vint16m1_t s5 =
   1419            __riscv_vreinterpret_v_u16m1_i16m1(__riscv_vzext_vf2_u16m1(t5, vl));
   1420        vint16m1_t s6 =
   1421            __riscv_vreinterpret_v_u16m1_i16m1(__riscv_vzext_vf2_u16m1(t6, vl));
   1422        vint16m1_t s7 =
   1423            __riscv_vreinterpret_v_u16m1_i16m1(__riscv_vzext_vf2_u16m1(t7, vl));
   1424 
   1425        vint16m1_t d0 = convolve8_8_2d_h_rvv(s0, s1, s2, s3, s4, s5, s6, s7,
   1426                                             xfilter, horiz_const, vl);
   1427 
   1428        __riscv_vse16_v_i16m1(d, d0, vl);
   1429 
   1430        s += vl;
   1431        d += vl;
   1432        width -= vl;
   1433      } while (width != 0);
   1434      src += src_stride;
   1435      dst += dst_stride;
   1436    } while (--height != 0);
   1437  }
   1438 }
   1439 
   1440 void av1_convolve_2d_sr_rvv(const uint8_t *src, int src_stride, uint8_t *dst,
   1441                            int dst_stride, int w, int h,
   1442                            const InterpFilterParams *filter_params_x,
   1443                            const InterpFilterParams *filter_params_y,
   1444                            const int subpel_x_qn, const int subpel_y_qn,
   1445                            ConvolveParams *conv_params) {
   1446  if (w == 2 || h == 2) {
   1447    av1_convolve_2d_sr_c(src, src_stride, dst, dst_stride, w, h,
   1448                         filter_params_x, filter_params_y, subpel_x_qn,
   1449                         subpel_y_qn, conv_params);
   1450    return;
   1451  }
   1452 
   1453  const int y_filter_taps = get_filter_tap(filter_params_y, subpel_y_qn);
   1454  const int x_filter_taps = get_filter_tap(filter_params_x, subpel_x_qn);
   1455  const int clamped_y_taps = y_filter_taps < 4 ? 4 : y_filter_taps;
   1456  const int im_h = h + clamped_y_taps - 1;
   1457  const int im_stride = MAX_SB_SIZE;
   1458  const int vert_offset = clamped_y_taps / 2 - 1;
   1459  const int horiz_offset = filter_params_x->taps / 2 - 1;
   1460  const uint8_t *src_ptr = src - vert_offset * src_stride - horiz_offset;
   1461 
   1462  const int16_t *x_filter_ptr = av1_get_interp_filter_subpel_kernel(
   1463      filter_params_x, subpel_x_qn & SUBPEL_MASK);
   1464  const int16_t *y_filter_ptr = av1_get_interp_filter_subpel_kernel(
   1465      filter_params_y, subpel_y_qn & SUBPEL_MASK);
   1466 
   1467  size_t vl = __riscv_vsetvl_e16m1(w);
   1468 
   1469  if (filter_params_x->taps > 8) {
   1470    DECLARE_ALIGNED(16, int16_t,
   1471                    im_block[(MAX_SB_SIZE + MAX_FILTER_TAP - 1) * MAX_SB_SIZE]);
   1472 
   1473    convolve_2d_sr_horiz_12tap_rvv(src_ptr, src_stride, im_block, im_stride, w,
   1474                                   im_h, x_filter_ptr, vl);
   1475    convolve_2d_sr_vert_12tap_rvv(im_block, im_stride, dst, dst_stride, w, h,
   1476                                  y_filter_ptr, vl);
   1477  } else {
   1478    DECLARE_ALIGNED(16, int16_t,
   1479                    im_block[(MAX_SB_SIZE + MAX_FILTER_TAP - 1) * MAX_SB_SIZE]);
   1480 
   1481    // horizontal filter
   1482    if (x_filter_taps <= 4) {
   1483      convolve_2d_sr_horiz_4tap_rvv(src_ptr + 2, src_stride, im_block,
   1484                                    im_stride, w, im_h, x_filter_ptr, vl);
   1485    } else {
   1486      convolve_2d_sr_horiz_8tap_rvv(src_ptr, src_stride, im_block, im_stride, w,
   1487                                    im_h, x_filter_ptr, vl);
   1488    }
   1489 
   1490    // vertical filter
   1491    if (clamped_y_taps <= 4) {
   1492      convolve_2d_sr_vert_4tap_rvv(im_block, im_stride, dst, dst_stride, w, h,
   1493                                   y_filter_ptr, vl);
   1494    } else if (clamped_y_taps == 6) {
   1495      convolve_2d_sr_vert_6tap_rvv(im_block, im_stride, dst, dst_stride, w, h,
   1496                                   y_filter_ptr, vl);
   1497    } else {
   1498      convolve_2d_sr_vert_8tap_rvv(im_block, im_stride, dst, dst_stride, w, h,
   1499                                   y_filter_ptr, vl);
   1500    }
   1501  }
   1502 }
   1503 
   1504 void av1_convolve_x_sr_intrabc_rvv(const uint8_t *src, int src_stride,
   1505                                   uint8_t *dst, int dst_stride, int w, int h,
   1506                                   const InterpFilterParams *filter_params_x,
   1507                                   const int subpel_x_qn,
   1508                                   ConvolveParams *conv_params) {
   1509  assert(subpel_x_qn == 8);
   1510  assert(filter_params_x->taps == 2);
   1511  assert((conv_params->round_0 + conv_params->round_1) == 2 * FILTER_BITS);
   1512  (void)filter_params_x;
   1513  (void)subpel_x_qn;
   1514  (void)conv_params;
   1515 
   1516  size_t vl = __riscv_vsetvl_e8m1(w);
   1517  if (w <= 8) {
   1518    do {
   1519      // Load
   1520      vuint8mf2_t s0_0 = __riscv_vle8_v_u8mf2(src, vl);
   1521      vuint8mf2_t s0_1 = __riscv_vle8_v_u8mf2(src + 1, vl);
   1522      vuint8mf2_t s1_0 = __riscv_vle8_v_u8mf2(src + src_stride, vl);
   1523      vuint8mf2_t s1_1 = __riscv_vle8_v_u8mf2(src + src_stride + 1, vl);
   1524 
   1525      // Average the values
   1526      vuint8mf2_t d0 =
   1527          __riscv_vaaddu_vv_u8mf2(s0_0, s0_1, __RISCV_VXRM_RNU, vl);
   1528      vuint8mf2_t d1 =
   1529          __riscv_vaaddu_vv_u8mf2(s1_0, s1_1, __RISCV_VXRM_RNU, vl);
   1530 
   1531      __riscv_vse8_v_u8mf2(dst, d0, vl);
   1532      __riscv_vse8_v_u8mf2(dst + dst_stride, d1, vl);
   1533 
   1534      src += src_stride << 1;
   1535      dst += dst_stride << 1;
   1536      h -= 2;
   1537    } while (h > 0);
   1538  } else {
   1539    do {
   1540      const uint8_t *src_ptr = src;
   1541      uint8_t *dst_ptr = dst;
   1542      int width = w;
   1543 
   1544      do {
   1545        // Load
   1546        vuint8m1_t s0 = __riscv_vle8_v_u8m1(src_ptr, vl);
   1547        vuint8m1_t s1 = __riscv_vle8_v_u8m1(src_ptr + 1, vl);
   1548        vuint8m1_t s2 = __riscv_vle8_v_u8m1(src_ptr + src_stride, vl);
   1549        vuint8m1_t s3 = __riscv_vle8_v_u8m1(src_ptr + src_stride + 1, vl);
   1550 
   1551        // Average the values
   1552        vuint8m1_t d0 = __riscv_vaaddu_vv_u8m1(s0, s1, __RISCV_VXRM_RNU, vl);
   1553        vuint8m1_t d1 = __riscv_vaaddu_vv_u8m1(s2, s3, __RISCV_VXRM_RNU, vl);
   1554 
   1555        // Store
   1556        __riscv_vse8_v_u8m1(dst_ptr, d0, vl);
   1557        __riscv_vse8_v_u8m1(dst_ptr + dst_stride, d1, vl);
   1558 
   1559        src_ptr += vl;
   1560        dst_ptr += vl;
   1561        width -= vl;
   1562      } while (width > 0);
   1563      src += src_stride << 1;
   1564      dst += dst_stride << 1;
   1565      h -= 2;
   1566    } while (h > 0);
   1567  }
   1568 }
   1569 
   1570 void av1_convolve_y_sr_intrabc_rvv(const uint8_t *src, int src_stride,
   1571                                   uint8_t *dst, int dst_stride, int w, int h,
   1572                                   const InterpFilterParams *filter_params_y,
   1573                                   const int subpel_y_qn) {
   1574  assert(subpel_y_qn == 8);
   1575  assert(filter_params_y->taps == 2);
   1576  (void)filter_params_y;
   1577  (void)subpel_y_qn;
   1578 
   1579  size_t vl = __riscv_vsetvl_e8m1(w);
   1580  if (w <= 8) {
   1581    vuint8mf2_t s0 = __riscv_vle8_v_u8mf2(src, vl);
   1582 
   1583    do {
   1584      vuint8mf2_t s1 = __riscv_vle8_v_u8mf2(src + src_stride, vl);
   1585      vuint8mf2_t s2 = __riscv_vle8_v_u8mf2(src + 2 * src_stride, vl);
   1586 
   1587      // Average the values
   1588      vuint8mf2_t d0 = __riscv_vaaddu_vv_u8mf2(s0, s1, __RISCV_VXRM_RNU, vl);
   1589      vuint8mf2_t d1 = __riscv_vaaddu_vv_u8mf2(s1, s2, __RISCV_VXRM_RNU, vl);
   1590 
   1591      __riscv_vse8_v_u8mf2(dst, d0, vl);
   1592      __riscv_vse8_v_u8mf2(dst + dst_stride, d1, vl);
   1593 
   1594      s0 = s2;
   1595      src += src_stride << 1;
   1596      dst += dst_stride << 1;
   1597      h -= 2;
   1598    } while (h > 0);
   1599  } else {
   1600    do {
   1601      const uint8_t *src_ptr = src;
   1602      uint8_t *dst_ptr = dst;
   1603      int height = h;
   1604 
   1605      vuint8m1_t s0 = __riscv_vle8_v_u8m1(src_ptr, vl);
   1606 
   1607      do {
   1608        vuint8m1_t s1 = __riscv_vle8_v_u8m1(src_ptr + src_stride, vl);
   1609        vuint8m1_t s2 = __riscv_vle8_v_u8m1(src_ptr + 2 * src_stride, vl);
   1610 
   1611        // Average the values
   1612        vuint8m1_t d0 = __riscv_vaaddu_vv_u8m1(s0, s1, __RISCV_VXRM_RNU, vl);
   1613        vuint8m1_t d1 = __riscv_vaaddu_vv_u8m1(s1, s2, __RISCV_VXRM_RNU, vl);
   1614 
   1615        // Store
   1616        __riscv_vse8_v_u8m1(dst_ptr, d0, vl);
   1617        __riscv_vse8_v_u8m1(dst_ptr + dst_stride, d1, vl);
   1618 
   1619        s0 = s2;
   1620        src_ptr += src_stride << 1;
   1621        dst_ptr += dst_stride << 1;
   1622        height -= 2;
   1623      } while (height > 0);
   1624      src += vl;
   1625      dst += vl;
   1626      w -= vl;
   1627    } while (w > 0);
   1628  }
   1629 }
   1630 
   1631 void av1_convolve_2d_sr_intrabc_rvv(const uint8_t *src, int src_stride,
   1632                                    uint8_t *dst, int dst_stride, int w, int h,
   1633                                    const InterpFilterParams *filter_params_x,
   1634                                    const InterpFilterParams *filter_params_y,
   1635                                    const int subpel_x_qn,
   1636                                    const int subpel_y_qn,
   1637                                    ConvolveParams *conv_params) {
   1638  assert(subpel_x_qn == 8);
   1639  assert(subpel_y_qn == 8);
   1640  assert(filter_params_x->taps == 2 && filter_params_y->taps == 2);
   1641  assert((conv_params->round_0 + conv_params->round_1) == 2 * FILTER_BITS);
   1642  (void)filter_params_x;
   1643  (void)subpel_x_qn;
   1644  (void)filter_params_y;
   1645  (void)subpel_y_qn;
   1646  (void)conv_params;
   1647 
   1648  size_t vl = __riscv_vsetvl_e16m1(w);
   1649 
   1650  if (w <= 8) {
   1651    // Horizontal filter.
   1652    vuint8mf2_t s0 = __riscv_vle8_v_u8mf2(src, vl);
   1653    vuint8mf2_t s1 = __riscv_vle8_v_u8mf2(src + 1, vl);
   1654    src += src_stride;
   1655 
   1656    vuint16m1_t sum0 = __riscv_vwaddu_vv_u16m1(s0, s1, vl);
   1657 
   1658    do {
   1659      vuint8mf2_t s2 = __riscv_vle8_v_u8mf2(src, vl);
   1660      vuint8mf2_t s3 = __riscv_vle8_v_u8mf2(src + 1, vl);
   1661      src += src_stride;
   1662      vuint8mf2_t s4 = __riscv_vle8_v_u8mf2(src, vl);
   1663      vuint8mf2_t s5 = __riscv_vle8_v_u8mf2(src + 1, vl);
   1664      src += src_stride;
   1665 
   1666      vuint16m1_t sum1 = __riscv_vwaddu_vv_u16m1(s2, s3, vl);
   1667      vuint16m1_t sum2 = __riscv_vwaddu_vv_u16m1(s4, s5, vl);
   1668 
   1669      // Vertical filter.
   1670      vuint8mf2_t d0 = __riscv_vnclipu_wx_u8mf2(
   1671          __riscv_vadd_vv_u16m1(sum0, sum1, vl), 2, __RISCV_VXRM_RNU, vl);
   1672      vuint8mf2_t d1 = __riscv_vnclipu_wx_u8mf2(
   1673          __riscv_vadd_vv_u16m1(sum1, sum2, vl), 2, __RISCV_VXRM_RNU, vl);
   1674 
   1675      __riscv_vse8_v_u8mf2(dst, d0, vl);
   1676      dst += dst_stride;
   1677      __riscv_vse8_v_u8mf2(dst, d1, vl);
   1678      dst += dst_stride;
   1679 
   1680      sum0 = sum2;
   1681      h -= 2;
   1682    } while (h != 0);
   1683  } else {
   1684    do {
   1685      uint8_t *src_ptr = (uint8_t *)src;
   1686      uint8_t *dst_ptr = dst;
   1687      int height = h;
   1688 
   1689      // Horizontal filter.
   1690      vuint8mf2_t s0 = __riscv_vle8_v_u8mf2(src_ptr, vl);
   1691      vuint8mf2_t s1 = __riscv_vle8_v_u8mf2(src_ptr + 1, vl);
   1692      src_ptr += src_stride;
   1693 
   1694      vuint16m1_t sum0 = __riscv_vwaddu_vv_u16m1(s0, s1, vl);
   1695 
   1696      do {
   1697        vuint8mf2_t s2 = __riscv_vle8_v_u8mf2(src_ptr, vl);
   1698        vuint8mf2_t s3 = __riscv_vle8_v_u8mf2(src_ptr + 1, vl);
   1699        src_ptr += src_stride;
   1700        vuint8mf2_t s4 = __riscv_vle8_v_u8mf2(src_ptr, vl);
   1701        vuint8mf2_t s5 = __riscv_vle8_v_u8mf2(src_ptr + 1, vl);
   1702        src_ptr += src_stride;
   1703 
   1704        vuint16m1_t sum1 = __riscv_vwaddu_vv_u16m1(s2, s3, vl);
   1705        vuint16m1_t sum2 = __riscv_vwaddu_vv_u16m1(s4, s5, vl);
   1706 
   1707        // Vertical filter.
   1708        vuint8mf2_t d0 = __riscv_vnclipu_wx_u8mf2(
   1709            __riscv_vadd_vv_u16m1(sum0, sum1, vl), 2, __RISCV_VXRM_RNU, vl);
   1710        vuint8mf2_t d1 = __riscv_vnclipu_wx_u8mf2(
   1711            __riscv_vadd_vv_u16m1(sum1, sum2, vl), 2, __RISCV_VXRM_RNU, vl);
   1712 
   1713        __riscv_vse8_v_u8mf2(dst_ptr, d0, vl);
   1714        dst_ptr += dst_stride;
   1715        __riscv_vse8_v_u8mf2(dst_ptr, d1, vl);
   1716        dst_ptr += dst_stride;
   1717 
   1718        sum0 = sum2;
   1719        height -= 2;
   1720      } while (height != 0);
   1721 
   1722      src += vl;
   1723      dst += vl;
   1724      w -= vl;
   1725    } while (w != 0);
   1726  }
   1727 }