tor-browser

The Tor Browser
git clone https://git.dasho.dev/tor-browser.git
Log | Files | Refs | README | LICENSE

convolve_rvv.h (21227B)


      1 /*
      2 * Copyright (c) 2025, Alliance for Open Media. All rights reserved.
      3 *
      4 * This source code is subject to the terms of the BSD 2 Clause License and
      5 * the Alliance for Open Media Patent License 1.0. If the BSD 2 Clause License
      6 * was not distributed with this source code in the LICENSE file, you can
      7 * obtain it at www.aomedia.org/license/software. If the Alliance for Open
      8 * Media Patent License 1.0 was not distributed with this source code in the
      9 * PATENTS file, you can obtain it at www.aomedia.org/license/patent.
     10 */
     11 
     12 #ifndef AOM_AV1_COMMON_RISCV_CONVOLVE_RVV_H_
     13 #define AOM_AV1_COMMON_RISCV_CONVOLVE_RVV_H_
     14 
     15 #include "config/aom_config.h"
     16 
     17 #include "av1/common/convolve.h"
     18 #include "av1/common/filter.h"
     19 
     20 // load_strided_u8_4xN
     21 static inline vuint8mf2_t load_strided_u8_4xN(uint8_t *addr, ptrdiff_t stride,
     22                                              size_t vl) {
     23  const vuint8mf2_t px_l1 = __riscv_vle8_v_u8mf2(addr + stride, vl);
     24  const vuint8mf2_t px_l0 = __riscv_vle8_v_u8mf2(addr, vl);
     25  return __riscv_vslideup_vx_u8mf2(px_l0, px_l1, vl >> 1, vl);
     26 }
     27 
     28 // store_strided_u8_4xN
     29 static inline void store_strided_u8_4xN(uint8_t *addr, vuint8mf2_t vdst,
     30                                        ptrdiff_t stride, size_t vl) {
     31  __riscv_vse8_v_u8mf2(addr, vdst, vl >> 1);
     32  vdst = __riscv_vslidedown_vx_u8mf2(vdst, vl >> 1, vl);
     33  __riscv_vse8_v_u8mf2(addr + stride, vdst, vl >> 1);
     34 }
     35 
     36 // load_strided_i16_4xN
     37 static inline vint16m1_t load_strided_i16_4xN(int16_t *addr, ptrdiff_t stride,
     38                                              size_t vl) {
     39  const vint16m1_t px_l1 = __riscv_vle16_v_i16m1(addr + stride, vl >> 1);
     40  const vint16m1_t px_l0 = __riscv_vle16_v_i16m1(addr, vl >> 1);
     41  return __riscv_vslideup_vx_i16m1(px_l0, px_l1, vl >> 1, vl);
     42 }
     43 
     44 // store_strided_i16_4xN
     45 static inline void store_strided_i16_4xN(int16_t *addr, vint16m1_t vdst,
     46                                         ptrdiff_t stride, size_t vl) {
     47  __riscv_vse16_v_i16m1(addr, vdst, vl >> 1);
     48  vdst = __riscv_vslidedown_vx_i16m1(vdst, vl >> 1, vl);
     49  __riscv_vse16_v_i16m1(addr + stride, vdst, vl >> 1);
     50 }
     51 
     52 static inline vuint8mf2_t convolve12_2d_v_rvv(
     53    const vint16m1_t s0, const vint16m1_t s1, const vint16m1_t s2,
     54    const vint16m1_t s3, const vint16m1_t s4, const vint16m1_t s5,
     55    const vint16m1_t s6, const vint16m1_t s7, const vint16m1_t s8,
     56    const vint16m1_t s9, const vint16m1_t s10, const vint16m1_t s11,
     57    const int16_t *y_filter, const int16_t sub_const, const int vert_const,
     58    size_t vl) {
     59  vint32m2_t sum = __riscv_vwmul_vx_i32m2(s0, y_filter[0], vl);
     60  sum = __riscv_vwmacc_vx_i32m2(sum, y_filter[1], s1, vl);
     61  sum = __riscv_vwmacc_vx_i32m2(sum, y_filter[2], s2, vl);
     62  sum = __riscv_vwmacc_vx_i32m2(sum, y_filter[3], s3, vl);
     63  sum = __riscv_vwmacc_vx_i32m2(sum, y_filter[4], s4, vl);
     64  sum = __riscv_vwmacc_vx_i32m2(sum, y_filter[5], s5, vl);
     65  sum = __riscv_vwmacc_vx_i32m2(sum, y_filter[6], s6, vl);
     66  sum = __riscv_vwmacc_vx_i32m2(sum, y_filter[7], s7, vl);
     67  sum = __riscv_vwmacc_vx_i32m2(sum, y_filter[8], s8, vl);
     68  sum = __riscv_vwmacc_vx_i32m2(sum, y_filter[9], s9, vl);
     69  sum = __riscv_vwmacc_vx_i32m2(sum, y_filter[10], s10, vl);
     70  sum = __riscv_vwmacc_vx_i32m2(sum, y_filter[11], s11, vl);
     71  sum = __riscv_vadd_vx_i32m2(sum, vert_const, vl);
     72 
     73  vint16m1_t i16_sum =
     74      __riscv_vnsra_wx_i16m1(sum, ((FILTER_BITS << 1) - ROUND0_BITS), vl);
     75  i16_sum = __riscv_vsub_vx_i16m1(i16_sum, sub_const, vl);
     76  vint16m1_t iclip_sum =
     77      __riscv_vmin_vx_i16m1(__riscv_vmax_vx_i16m1(i16_sum, 0, vl), 255, vl);
     78 
     79  return __riscv_vncvt_x_x_w_u8mf2(
     80      __riscv_vreinterpret_v_i16m1_u16m1(iclip_sum), vl);
     81 }
     82 
// Vertical (second) pass of the 2D single-reference convolution for a
// 12-tap y filter. Reads the int16 intermediate produced by the horizontal
// pass at src_ptr and writes w x h clipped 8-bit pixels to dst_ptr.
// Assumes h is a multiple of 4 (both row loops step by 4) and, in the wide
// path, that w is a multiple of vl.
static inline void convolve_2d_sr_vert_12tap_rvv(
    int16_t *src_ptr, int src_stride, uint8_t *dst_ptr, int dst_stride, int w,
    int h, const int16_t *y_filter_ptr, size_t vl) {
  // Rounding term added before the narrowing shift in convolve12_2d_v_rvv.
  const int vert_const = (1 << ((FILTER_BITS << 1) - ROUND0_BITS)) >> 1;
  // Offset introduced by the horizontal pass, removed after narrowing.
  const int16_t sub_const = 1 << FILTER_BITS;

  if (w == 4) {
    // NOTE(review): the 8/6/4-tap variants test `w <= 4` here — confirm
    // widths smaller than 4 cannot reach the 12-tap path.
    // Each vector packs two 4-wide rows (rows i and i+1), so double the
    // element count.
    vl = vl << 1;

    // Prime the first rows of the 12-row sliding window; s_i holds rows
    // i and i+1 packed low/high.
    vint16m1_t s0 = load_strided_i16_4xN(src_ptr, src_stride, vl);
    src_ptr += src_stride;
    vint16m1_t s1 = load_strided_i16_4xN(src_ptr, src_stride, vl);
    src_ptr += src_stride;
    vint16m1_t s2 = load_strided_i16_4xN(src_ptr, src_stride, vl);
    src_ptr += src_stride;
    vint16m1_t s3 = load_strided_i16_4xN(src_ptr, src_stride, vl);
    src_ptr += src_stride;
    vint16m1_t s4 = load_strided_i16_4xN(src_ptr, src_stride, vl);
    src_ptr += src_stride;
    vint16m1_t s5 = load_strided_i16_4xN(src_ptr, src_stride, vl);
    src_ptr += src_stride;
    vint16m1_t s6 = load_strided_i16_4xN(src_ptr, src_stride, vl);
    src_ptr += src_stride;
    vint16m1_t s7 = load_strided_i16_4xN(src_ptr, src_stride, vl);
    src_ptr += src_stride;
    vint16m1_t s8 = load_strided_i16_4xN(src_ptr, src_stride, vl);
    src_ptr += src_stride;
    vint16m1_t s9 = load_strided_i16_4xN(src_ptr, src_stride, vl);
    src_ptr += src_stride;

    do {
      // Fetch four more rows to complete two 12-row windows.
      vint16m1_t s10 = load_strided_i16_4xN(src_ptr, src_stride, vl);
      src_ptr += src_stride;
      vint16m1_t s11 = load_strided_i16_4xN(src_ptr, src_stride, vl);
      src_ptr += src_stride;
      vint16m1_t s12 = load_strided_i16_4xN(src_ptr, src_stride, vl);
      src_ptr += src_stride;
      vint16m1_t s13 = load_strided_i16_4xN(src_ptr, src_stride, vl);
      src_ptr += src_stride;

      // d0 yields output rows 0-1, d1 rows 2-3 (two rows per vector).
      vuint8mf2_t d0 =
          convolve12_2d_v_rvv(s0, s1, s2, s3, s4, s5, s6, s7, s8, s9, s10, s11,
                              y_filter_ptr, sub_const, vert_const, vl);
      vuint8mf2_t d1 =
          convolve12_2d_v_rvv(s2, s3, s4, s5, s6, s7, s8, s9, s10, s11, s12,
                              s13, y_filter_ptr, sub_const, vert_const, vl);

      store_strided_u8_4xN(dst_ptr, d0, dst_stride, vl);
      dst_ptr += dst_stride << 1;
      store_strided_u8_4xN(dst_ptr, d1, dst_stride, vl);
      dst_ptr += dst_stride << 1;

      // Slide the filter window down four rows.
      s0 = s4;
      s1 = s5;
      s2 = s6;
      s3 = s7;
      s4 = s8;
      s5 = s9;
      s6 = s10;
      s7 = s11;
      s8 = s12;
      s9 = s13;

      h -= 4;
    } while (h != 0);
  } else {
    // Wide path: process one vl-wide column strip at a time.
    do {
      int height = h;
      int16_t *s = src_ptr;
      uint8_t *d = dst_ptr;

      // Prime the first 11 rows of the 12-row sliding window.
      vint16m1_t s0 = __riscv_vle16_v_i16m1(s, vl);
      s += src_stride;
      vint16m1_t s1 = __riscv_vle16_v_i16m1(s, vl);
      s += src_stride;
      vint16m1_t s2 = __riscv_vle16_v_i16m1(s, vl);
      s += src_stride;
      vint16m1_t s3 = __riscv_vle16_v_i16m1(s, vl);
      s += src_stride;
      vint16m1_t s4 = __riscv_vle16_v_i16m1(s, vl);
      s += src_stride;
      vint16m1_t s5 = __riscv_vle16_v_i16m1(s, vl);
      s += src_stride;
      vint16m1_t s6 = __riscv_vle16_v_i16m1(s, vl);
      s += src_stride;
      vint16m1_t s7 = __riscv_vle16_v_i16m1(s, vl);
      s += src_stride;
      vint16m1_t s8 = __riscv_vle16_v_i16m1(s, vl);
      s += src_stride;
      vint16m1_t s9 = __riscv_vle16_v_i16m1(s, vl);
      s += src_stride;
      vint16m1_t s10 = __riscv_vle16_v_i16m1(s, vl);
      s += src_stride;

      do {
        // Four new rows produce four output rows per iteration.
        vint16m1_t s11 = __riscv_vle16_v_i16m1(s, vl);
        s += src_stride;
        vint16m1_t s12 = __riscv_vle16_v_i16m1(s, vl);
        s += src_stride;
        vint16m1_t s13 = __riscv_vle16_v_i16m1(s, vl);
        s += src_stride;
        vint16m1_t s14 = __riscv_vle16_v_i16m1(s, vl);
        s += src_stride;

        vuint8mf2_t d0 =
            convolve12_2d_v_rvv(s0, s1, s2, s3, s4, s5, s6, s7, s8, s9, s10,
                                s11, y_filter_ptr, sub_const, vert_const, vl);
        vuint8mf2_t d1 =
            convolve12_2d_v_rvv(s1, s2, s3, s4, s5, s6, s7, s8, s9, s10, s11,
                                s12, y_filter_ptr, sub_const, vert_const, vl);
        vuint8mf2_t d2 =
            convolve12_2d_v_rvv(s2, s3, s4, s5, s6, s7, s8, s9, s10, s11, s12,
                                s13, y_filter_ptr, sub_const, vert_const, vl);
        vuint8mf2_t d3 =
            convolve12_2d_v_rvv(s3, s4, s5, s6, s7, s8, s9, s10, s11, s12, s13,
                                s14, y_filter_ptr, sub_const, vert_const, vl);

        __riscv_vse8_v_u8mf2(d, d0, vl);
        d += dst_stride;
        __riscv_vse8_v_u8mf2(d, d1, vl);
        d += dst_stride;
        __riscv_vse8_v_u8mf2(d, d2, vl);
        d += dst_stride;
        __riscv_vse8_v_u8mf2(d, d3, vl);
        d += dst_stride;

        // Slide the filter window down four rows.
        s0 = s4;
        s1 = s5;
        s2 = s6;
        s3 = s7;
        s4 = s8;
        s5 = s9;
        s6 = s10;
        s7 = s11;
        s8 = s12;
        s9 = s13;
        s10 = s14;

        height -= 4;
      } while (height != 0);

      // Advance to the next vl-wide strip.
      src_ptr += vl;
      dst_ptr += vl;
      w -= vl;
    } while (w != 0);
  }
}
    230 
    231 static inline vuint8mf2_t convolve8_2d_v_rvv(
    232    const vint16m1_t s0, const vint16m1_t s1, const vint16m1_t s2,
    233    const vint16m1_t s3, const vint16m1_t s4, const vint16m1_t s5,
    234    const vint16m1_t s6, const vint16m1_t s7, const int16_t *y_filter,
    235    const int16_t sub_const, const int vert_const, size_t vl) {
    236  vint32m2_t sum = __riscv_vwmul_vx_i32m2(s0, y_filter[0], vl);
    237  sum = __riscv_vwmacc_vx_i32m2(sum, y_filter[1], s1, vl);
    238  sum = __riscv_vwmacc_vx_i32m2(sum, y_filter[2], s2, vl);
    239  sum = __riscv_vwmacc_vx_i32m2(sum, y_filter[3], s3, vl);
    240  sum = __riscv_vwmacc_vx_i32m2(sum, y_filter[4], s4, vl);
    241  sum = __riscv_vwmacc_vx_i32m2(sum, y_filter[5], s5, vl);
    242  sum = __riscv_vwmacc_vx_i32m2(sum, y_filter[6], s6, vl);
    243  sum = __riscv_vwmacc_vx_i32m2(sum, y_filter[7], s7, vl);
    244  sum = __riscv_vadd_vx_i32m2(sum, vert_const, vl);
    245 
    246  vint16m1_t i16_sum =
    247      __riscv_vnsra_wx_i16m1(sum, ((FILTER_BITS << 1) - ROUND0_BITS), vl);
    248  i16_sum = __riscv_vsub_vx_i16m1(i16_sum, sub_const, vl);
    249  vint16m1_t iclip_sum =
    250      __riscv_vmin_vx_i16m1(__riscv_vmax_vx_i16m1(i16_sum, 0, vl), 255, vl);
    251 
    252  return __riscv_vncvt_x_x_w_u8mf2(
    253      __riscv_vreinterpret_v_i16m1_u16m1(iclip_sum), vl);
    254 }
    255 
// Vertical (second) pass of the 2D single-reference convolution for an
// 8-tap y filter. Reads the int16 intermediate produced by the horizontal
// pass at src_ptr and writes w x h clipped 8-bit pixels to dst_ptr.
// Assumes h is a multiple of 2 in the narrow path (row loop steps by 2)
// and, in the wide path, that w is a multiple of vl.
static inline void convolve_2d_sr_vert_8tap_rvv(
    int16_t *src_ptr, int src_stride, uint8_t *dst_ptr, int dst_stride, int w,
    int h, const int16_t *y_filter_ptr, size_t vl) {
  // Rounding term added before the narrowing shift in convolve8_2d_v_rvv.
  const int vert_const = (1 << ((FILTER_BITS << 1) - ROUND0_BITS)) >> 1;
  // Offset introduced by the horizontal pass, removed after narrowing.
  const int16_t sub_const = 1 << FILTER_BITS;

  if (w <= 4) {
    // Each vector packs two 4-wide rows (rows i and i+1), so double the
    // element count.
    vl = vl << 1;

    // Prime the first rows of the 8-row sliding window; s_i holds rows
    // i and i+1 packed low/high.
    vint16m1_t s0 = load_strided_i16_4xN(src_ptr, src_stride, vl);
    src_ptr += src_stride;
    vint16m1_t s1 = load_strided_i16_4xN(src_ptr, src_stride, vl);
    src_ptr += src_stride;
    vint16m1_t s2 = load_strided_i16_4xN(src_ptr, src_stride, vl);
    src_ptr += src_stride;
    vint16m1_t s3 = load_strided_i16_4xN(src_ptr, src_stride, vl);
    src_ptr += src_stride;
    vint16m1_t s4 = load_strided_i16_4xN(src_ptr, src_stride, vl);
    src_ptr += src_stride;
    vint16m1_t s5 = load_strided_i16_4xN(src_ptr, src_stride, vl);
    src_ptr += src_stride;

    do {
      // Fetch two more rows to complete one 8-row window.
      vint16m1_t s6 = load_strided_i16_4xN(src_ptr, src_stride, vl);
      src_ptr += src_stride;
      vint16m1_t s7 = load_strided_i16_4xN(src_ptr, src_stride, vl);
      src_ptr += src_stride;

      // d0 yields output rows 0-1 (two rows per vector).
      vuint8mf2_t d0 =
          convolve8_2d_v_rvv(s0, s1, s2, s3, s4, s5, s6, s7, y_filter_ptr,
                             sub_const, vert_const, vl);

      store_strided_u8_4xN(dst_ptr, d0, dst_stride, vl);
      dst_ptr += dst_stride << 1;

      // Slide the filter window down two rows.
      s0 = s2;
      s1 = s3;
      s2 = s4;
      s3 = s5;
      s4 = s6;
      s5 = s7;

      h -= 2;
    } while (h != 0);
  } else {
    // Wide path: process one vl-wide column strip at a time.
    do {
      int height = h;
      int16_t *s = src_ptr;
      uint8_t *d = dst_ptr;

      // Prime the first 7 rows of the 8-row sliding window.
      vint16m1_t s0 = __riscv_vle16_v_i16m1(s, vl);
      s += src_stride;
      vint16m1_t s1 = __riscv_vle16_v_i16m1(s, vl);
      s += src_stride;
      vint16m1_t s2 = __riscv_vle16_v_i16m1(s, vl);
      s += src_stride;
      vint16m1_t s3 = __riscv_vle16_v_i16m1(s, vl);
      s += src_stride;
      vint16m1_t s4 = __riscv_vle16_v_i16m1(s, vl);
      s += src_stride;
      vint16m1_t s5 = __riscv_vle16_v_i16m1(s, vl);
      s += src_stride;
      vint16m1_t s6 = __riscv_vle16_v_i16m1(s, vl);
      s += src_stride;

      do {
        // One new row produces one output row per iteration.
        vint16m1_t s7 = __riscv_vle16_v_i16m1(s, vl);
        vuint8mf2_t d0 =
            convolve8_2d_v_rvv(s0, s1, s2, s3, s4, s5, s6, s7, y_filter_ptr,
                               sub_const, vert_const, vl);
        __riscv_vse8_v_u8mf2(d, d0, vl);

        // Slide the filter window down one row.
        s0 = s1;
        s1 = s2;
        s2 = s3;
        s3 = s4;
        s4 = s5;
        s5 = s6;
        s6 = s7;
        s += src_stride;
        d += dst_stride;
        height--;
      } while (height != 0);

      // Advance to the next vl-wide strip.
      src_ptr += vl;
      dst_ptr += vl;
      w -= vl;
    } while (w != 0);
  }
}
    346 
    347 static inline vuint8mf2_t convolve6_2d_v_rvv(
    348    const vint16m1_t s0, const vint16m1_t s1, const vint16m1_t s2,
    349    const vint16m1_t s3, const vint16m1_t s4, const vint16m1_t s5,
    350    const int16_t *y_filter, const int16_t sub_const, const int vert_const,
    351    size_t vl) {
    352  vint32m2_t sum = __riscv_vwmul_vx_i32m2(s0, y_filter[0], vl);
    353  sum = __riscv_vwmacc_vx_i32m2(sum, y_filter[1], s1, vl);
    354  sum = __riscv_vwmacc_vx_i32m2(sum, y_filter[2], s2, vl);
    355  sum = __riscv_vwmacc_vx_i32m2(sum, y_filter[3], s3, vl);
    356  sum = __riscv_vwmacc_vx_i32m2(sum, y_filter[4], s4, vl);
    357  sum = __riscv_vwmacc_vx_i32m2(sum, y_filter[5], s5, vl);
    358  sum = __riscv_vadd_vx_i32m2(sum, vert_const, vl);
    359 
    360  vint16m1_t i16_sum =
    361      __riscv_vnsra_wx_i16m1(sum, ((FILTER_BITS << 1) - ROUND0_BITS), vl);
    362  i16_sum = __riscv_vsub_vx_i16m1(i16_sum, sub_const, vl);
    363  vint16m1_t iclip_sum =
    364      __riscv_vmin_vx_i16m1(__riscv_vmax_vx_i16m1(i16_sum, 0, vl), 255, vl);
    365 
    366  return __riscv_vncvt_x_x_w_u8mf2(
    367      __riscv_vreinterpret_v_i16m1_u16m1(iclip_sum), vl);
    368 }
    369 
// Vertical (second) pass of the 2D single-reference convolution for a
// 6-tap y filter. Reads the int16 intermediate produced by the horizontal
// pass at src_ptr and writes w x h clipped 8-bit pixels to dst_ptr.
// Assumes h is a multiple of 4 (both row loops step by 4) and, in the wide
// path, that w is a multiple of vl.
static inline void convolve_2d_sr_vert_6tap_rvv(
    int16_t *src_ptr, int src_stride, uint8_t *dst_ptr, int dst_stride, int w,
    int h, const int16_t *y_filter_ptr, size_t vl) {
  // Rounding term added before the narrowing shift in convolve6_2d_v_rvv.
  const int vert_const = (1 << ((FILTER_BITS << 1) - ROUND0_BITS)) >> 1;
  // Offset introduced by the horizontal pass, removed after narrowing.
  const int16_t sub_const = 1 << FILTER_BITS;

  // The 6 non-zero taps start at offset 1 of the 8-tap coefficient array
  // (cf. the offset-2 comment in the 4-tap variant).
  const int16_t *filter = y_filter_ptr + 1;

  if (w <= 4) {
    // Each vector packs two 4-wide rows (rows i and i+1), so double the
    // element count.
    vl = vl << 1;

    // Prime the first rows of the 6-row sliding window; s_i holds rows
    // i and i+1 packed low/high.
    vint16m1_t s0 = load_strided_i16_4xN(src_ptr, src_stride, vl);
    src_ptr += src_stride;
    vint16m1_t s1 = load_strided_i16_4xN(src_ptr, src_stride, vl);
    src_ptr += src_stride;
    vint16m1_t s2 = load_strided_i16_4xN(src_ptr, src_stride, vl);
    src_ptr += src_stride;
    vint16m1_t s3 = load_strided_i16_4xN(src_ptr, src_stride, vl);
    src_ptr += src_stride;

    do {
      // Fetch four more rows to complete two 6-row windows.
      vint16m1_t s4 = load_strided_i16_4xN(src_ptr, src_stride, vl);
      src_ptr += src_stride;
      vint16m1_t s5 = load_strided_i16_4xN(src_ptr, src_stride, vl);
      src_ptr += src_stride;
      vint16m1_t s6 = load_strided_i16_4xN(src_ptr, src_stride, vl);
      src_ptr += src_stride;
      vint16m1_t s7 = load_strided_i16_4xN(src_ptr, src_stride, vl);
      src_ptr += src_stride;

      // d0 yields output rows 0-1, d1 rows 2-3 (two rows per vector).
      vuint8mf2_t d0 = convolve6_2d_v_rvv(s0, s1, s2, s3, s4, s5, filter,
                                          sub_const, vert_const, vl);
      vuint8mf2_t d1 = convolve6_2d_v_rvv(s2, s3, s4, s5, s6, s7, filter,
                                          sub_const, vert_const, vl);

      store_strided_u8_4xN(dst_ptr, d0, dst_stride, vl);
      dst_ptr += dst_stride << 1;
      store_strided_u8_4xN(dst_ptr, d1, dst_stride, vl);
      dst_ptr += dst_stride << 1;

      // Slide the filter window down four rows.
      s0 = s4;
      s1 = s5;
      s2 = s6;
      s3 = s7;

      h -= 4;
    } while (h != 0);
  } else {
    // Wide path: process one vl-wide column strip at a time.
    do {
      int height = h;
      int16_t *s = src_ptr;
      uint8_t *d = dst_ptr;

      // Prime the first 5 rows of the 6-row sliding window.
      vint16m1_t s0 = __riscv_vle16_v_i16m1(s, vl);
      s += src_stride;
      vint16m1_t s1 = __riscv_vle16_v_i16m1(s, vl);
      s += src_stride;
      vint16m1_t s2 = __riscv_vle16_v_i16m1(s, vl);
      s += src_stride;
      vint16m1_t s3 = __riscv_vle16_v_i16m1(s, vl);
      s += src_stride;
      vint16m1_t s4 = __riscv_vle16_v_i16m1(s, vl);
      s += src_stride;

      do {
        // Four new rows produce four output rows per iteration.
        vint16m1_t s5 = __riscv_vle16_v_i16m1(s, vl);
        s += src_stride;
        vint16m1_t s6 = __riscv_vle16_v_i16m1(s, vl);
        s += src_stride;
        vint16m1_t s7 = __riscv_vle16_v_i16m1(s, vl);
        s += src_stride;
        vint16m1_t s8 = __riscv_vle16_v_i16m1(s, vl);
        s += src_stride;

        vuint8mf2_t d0 = convolve6_2d_v_rvv(s0, s1, s2, s3, s4, s5, filter,
                                            sub_const, vert_const, vl);
        vuint8mf2_t d1 = convolve6_2d_v_rvv(s1, s2, s3, s4, s5, s6, filter,
                                            sub_const, vert_const, vl);
        vuint8mf2_t d2 = convolve6_2d_v_rvv(s2, s3, s4, s5, s6, s7, filter,
                                            sub_const, vert_const, vl);
        vuint8mf2_t d3 = convolve6_2d_v_rvv(s3, s4, s5, s6, s7, s8, filter,
                                            sub_const, vert_const, vl);

        __riscv_vse8_v_u8mf2(d, d0, vl);
        d += dst_stride;
        __riscv_vse8_v_u8mf2(d, d1, vl);
        d += dst_stride;
        __riscv_vse8_v_u8mf2(d, d2, vl);
        d += dst_stride;
        __riscv_vse8_v_u8mf2(d, d3, vl);
        d += dst_stride;

        // Slide the filter window down four rows.
        s0 = s4;
        s1 = s5;
        s2 = s6;
        s3 = s7;
        s4 = s8;

        height -= 4;
      } while (height != 0);

      // Advance to the next vl-wide strip.
      src_ptr += vl;
      dst_ptr += vl;
      w -= vl;
    } while (w != 0);
  }
}
    477 
    478 static inline vuint8mf2_t convolve4_2d_v_rvv(
    479    const vint16m1_t s0, const vint16m1_t s1, const vint16m1_t s2,
    480    const vint16m1_t s3, const int16_t *y_filter, const int16_t sub_const,
    481    const int vert_const, size_t vl) {
    482  vint32m2_t sum = __riscv_vwmul_vx_i32m2(s0, y_filter[0], vl);
    483  sum = __riscv_vwmacc_vx_i32m2(sum, y_filter[1], s1, vl);
    484  sum = __riscv_vwmacc_vx_i32m2(sum, y_filter[2], s2, vl);
    485  sum = __riscv_vwmacc_vx_i32m2(sum, y_filter[3], s3, vl);
    486  sum = __riscv_vadd_vx_i32m2(sum, vert_const, vl);
    487 
    488  vint16m1_t i16_sum =
    489      __riscv_vnsra_wx_i16m1(sum, ((FILTER_BITS << 1) - ROUND0_BITS), vl);
    490  i16_sum = __riscv_vsub_vx_i16m1(i16_sum, sub_const, vl);
    491  vint16m1_t iclip_sum =
    492      __riscv_vmin_vx_i16m1(__riscv_vmax_vx_i16m1(i16_sum, 0, vl), 255, vl);
    493 
    494  return __riscv_vncvt_x_x_w_u8mf2(
    495      __riscv_vreinterpret_v_i16m1_u16m1(iclip_sum), vl);
    496 }
    497 
// Vertical (second) pass of the 2D single-reference convolution for a
// 4-tap y filter. Reads the int16 intermediate produced by the horizontal
// pass at src_ptr and writes w x h clipped 8-bit pixels to dst_ptr.
// Assumes h is a multiple of 4 (both row loops step by 4) and, in the wide
// path, that w is a multiple of vl.
static inline void convolve_2d_sr_vert_4tap_rvv(
    int16_t *src_ptr, int src_stride, uint8_t *dst_ptr, int dst_stride, int w,
    int h, const int16_t *y_filter_ptr, size_t vl) {
  // Rounding term added before the narrowing shift in convolve4_2d_v_rvv.
  const int vert_const = (1 << ((FILTER_BITS << 1) - ROUND0_BITS)) >> 1;
  // Offset introduced by the horizontal pass, removed after narrowing.
  const int16_t sub_const = 1 << FILTER_BITS;
  // Filter values are at offset 2
  const int16_t *filter = y_filter_ptr + 2;

  if (w <= 4) {
    // Each vector packs two 4-wide rows (rows i and i+1), so double the
    // element count.
    vl = vl << 1;

    // Prime the first rows of the 4-row sliding window; s_i holds rows
    // i and i+1 packed low/high.
    vint16m1_t s0 = load_strided_i16_4xN(src_ptr, src_stride, vl);
    src_ptr += src_stride;
    vint16m1_t s1 = load_strided_i16_4xN(src_ptr, src_stride, vl);
    src_ptr += src_stride;

    do {
      // Fetch four more rows to complete two 4-row windows.
      vint16m1_t s2 = load_strided_i16_4xN(src_ptr, src_stride, vl);
      src_ptr += src_stride;
      vint16m1_t s3 = load_strided_i16_4xN(src_ptr, src_stride, vl);
      src_ptr += src_stride;
      vint16m1_t s4 = load_strided_i16_4xN(src_ptr, src_stride, vl);
      src_ptr += src_stride;
      vint16m1_t s5 = load_strided_i16_4xN(src_ptr, src_stride, vl);
      src_ptr += src_stride;

      // d0 yields output rows 0-1, d1 rows 2-3 (two rows per vector).
      vuint8mf2_t d0 =
          convolve4_2d_v_rvv(s0, s1, s2, s3, filter, sub_const, vert_const, vl);
      vuint8mf2_t d1 =
          convolve4_2d_v_rvv(s2, s3, s4, s5, filter, sub_const, vert_const, vl);

      store_strided_u8_4xN(dst_ptr, d0, dst_stride, vl);
      dst_ptr += dst_stride << 1;
      store_strided_u8_4xN(dst_ptr, d1, dst_stride, vl);
      dst_ptr += dst_stride << 1;

      // Slide the filter window down four rows.
      s0 = s4;
      s1 = s5;

      h -= 4;
    } while (h != 0);
  } else {
    // Wide path: process one vl-wide column strip at a time.
    do {
      int height = h;
      int16_t *s = src_ptr;
      uint8_t *d = dst_ptr;

      // Prime the first 3 rows of the 4-row sliding window.
      vint16m1_t s0 = __riscv_vle16_v_i16m1(s, vl);
      s += src_stride;
      vint16m1_t s1 = __riscv_vle16_v_i16m1(s, vl);
      s += src_stride;
      vint16m1_t s2 = __riscv_vle16_v_i16m1(s, vl);
      s += src_stride;

      do {
        // Four new rows produce four output rows per iteration.
        vint16m1_t s3 = __riscv_vle16_v_i16m1(s, vl);
        s += src_stride;
        vint16m1_t s4 = __riscv_vle16_v_i16m1(s, vl);
        s += src_stride;
        vint16m1_t s5 = __riscv_vle16_v_i16m1(s, vl);
        s += src_stride;
        vint16m1_t s6 = __riscv_vle16_v_i16m1(s, vl);
        s += src_stride;

        vuint8mf2_t d0 = convolve4_2d_v_rvv(s0, s1, s2, s3, filter, sub_const,
                                            vert_const, vl);
        vuint8mf2_t d1 = convolve4_2d_v_rvv(s1, s2, s3, s4, filter, sub_const,
                                            vert_const, vl);
        vuint8mf2_t d2 = convolve4_2d_v_rvv(s2, s3, s4, s5, filter, sub_const,
                                            vert_const, vl);
        vuint8mf2_t d3 = convolve4_2d_v_rvv(s3, s4, s5, s6, filter, sub_const,
                                            vert_const, vl);

        __riscv_vse8_v_u8mf2(d, d0, vl);
        d += dst_stride;
        __riscv_vse8_v_u8mf2(d, d1, vl);
        d += dst_stride;
        __riscv_vse8_v_u8mf2(d, d2, vl);
        d += dst_stride;
        __riscv_vse8_v_u8mf2(d, d3, vl);
        d += dst_stride;

        // Slide the filter window down four rows.
        s0 = s4;
        s1 = s5;
        s2 = s6;

        height -= 4;
      } while (height != 0);

      // Advance to the next vl-wide strip.
      src_ptr += vl;
      dst_ptr += vl;
      w -= vl;
    } while (w != 0);
  }
}
    593 
    594 #endif  // AOM_AV1_COMMON_RISCV_CONVOLVE_RVV_H_