tor-browser

The Tor Browser
git clone https://git.dasho.dev/tor-browser.git
Log | Files | Refs | README | LICENSE

cdef_block_rvv.c (51725B)


      1 /*
      2 * Copyright (c) 2025, Alliance for Open Media. All rights reserved.
      3 *
      4 * This source code is subject to the terms of the BSD 2 Clause License and
      5 * the Alliance for Open Media Patent License 1.0. If the BSD 2 Clause License
      6 * was not distributed with this source code in the LICENSE file, you can
      7 * obtain it at www.aomedia.org/license/software. If the Alliance for Open
      8 * Media Patent License 1.0 was not distributed with this source code in the
      9 * PATENTS file, you can obtain it at www.aomedia.org/license/patent.
     10 */
     11 
     12 #include <riscv_vector.h>
     13 
     14 #include "config/aom_config.h"
     15 #include "config/av1_rtcd.h"
     16 #include "av1/common/cdef_block.h"
     17 
// partial A is a 16-bit vector of the form:
// [x8 x7 x6 x5 x4 x3 x2 x1] and partial B has the form:
// [0  y1 y2 y3 y4 y5 y6 y7].
// This function computes (x1^2+y1^2)*C1 + (x2^2+y2^2)*C2 + ...
// (x7^2+y7^2)*C7 + (x8^2+0^2)*C8 where the C1..C8 constants are in const1
// and const2.
static inline vuint32m1_t fold_mul_and_sum_rvv(vint16m1_t partiala,
                                              vint16m1_t partialb,
                                              vuint32m1_t const1,
                                              vuint32m1_t const2) {
  // Square and add the corresponding x and y values into 8 widened i32 lanes.
  vint32m2_t cost = __riscv_vwmul_vv_i32m2(partiala, partiala, 8);
  cost = __riscv_vwmacc_vv_i32m2(cost, partialb, partialb, 8);

  // Multiply by constant: the low 4 lanes are scaled by const1, the high 4
  // lanes (slid down into the low half) by const2, accumulated into one
  // 4-lane result. The lanes are sums of squares, hence non-negative, so the
  // signed->unsigned reinterpret is value-preserving.
  vuint32m2_t tmp1_u32m2 = __riscv_vreinterpret_v_i32m2_u32m2(cost);
  vuint32m1_t cost_u32m1 = __riscv_vmul_vv_u32m1(
      __riscv_vlmul_trunc_v_u32m2_u32m1(tmp1_u32m2), const1, 4);
  tmp1_u32m2 = __riscv_vslidedown_vx_u32m2(tmp1_u32m2, 4, 8);
  vuint32m1_t ret = __riscv_vmacc_vv_u32m1(
      cost_u32m1, __riscv_vlmul_trunc_v_u32m2_u32m1(tmp1_u32m2), const2, 4);
  return ret;
}
     41 
     42 // This function computes the cost along directions 4, 5, 6, 7. (4 is diagonal
     43 // down-right, 6 is vertical).
     44 //
     45 // For each direction the lines are shifted so that we can perform a
     46 // basic sum on each vector element. For example, direction 5 is "south by
     47 // southeast", so we need to add the pixels along each line i below:
     48 //
     49 // 0  1 2 3 4 5 6 7
     50 // 0  1 2 3 4 5 6 7
     51 // 8  0 1 2 3 4 5 6
     52 // 8  0 1 2 3 4 5 6
     53 // 9  8 0 1 2 3 4 5
     54 // 9  8 0 1 2 3 4 5
     55 // 10 9 8 0 1 2 3 4
     56 // 10 9 8 0 1 2 3 4
     57 //
     58 // For this to fit nicely in vectors, the lines need to be shifted like so:
     59 //        0 1 2 3 4 5 6 7
     60 //        0 1 2 3 4 5 6 7
     61 //      8 0 1 2 3 4 5 6
     62 //      8 0 1 2 3 4 5 6
     63 //    9 8 0 1 2 3 4 5
     64 //    9 8 0 1 2 3 4 5
     65 // 10 9 8 0 1 2 3 4
     66 // 10 9 8 0 1 2 3 4
     67 //
     68 // In this configuration we can now perform SIMD additions to get the cost
     69 // along direction 5. Since this won't fit into a single 128-bit vector, we use
     70 // two of them to compute each half of the new configuration, and pad the empty
     71 // spaces with zeros. Similar shifting is done for other directions, except
     72 // direction 6 which is straightforward as it's the vertical direction.
// Accumulates the shifted per-line partial sums for directions 4..7 (see the
// diagram above), squares and weights them, writes the four direction costs
// to cost[0..3] (the caller passes the second half of its 8-entry array) and
// also returns them packed into lanes 0..3 of a vector.
static vuint32m1_t compute_vert_directions_rvv(
   vint16m1_t lines_0, vint16m1_t lines_1, vint16m1_t lines_2,
   vint16m1_t lines_3, vint16m1_t lines_4, vint16m1_t lines_5,
   vint16m1_t lines_6, vint16m1_t lines_7, uint32_t cost[4], size_t vl) {
  // Slide-downs use a wider vl so lanes shifted in from beyond index 8 are
  // zero. NOTE(review): on VLEN==128 hardware vsetvl_e16m1(16) clamps to 8,
  // and correctness then relies on lanes 8..15 of the inputs already being
  // zero — cdef_find_dir_rvv arranges this via tail-undisturbed loads into a
  // zeroed register; confirm for any other caller.
  size_t VL_SLIDE_DOWN = __riscv_vsetvl_e16m1(16);
  vint16m1_t vec_zero_i16m1 = __riscv_vmv_v_x_i16m1(0, vl);

  // Partial sums for lines 0 and 1.
  vint16m1_t partial4a =
      __riscv_vslideup_vx_i16m1(vec_zero_i16m1, lines_0, (8 - 1), vl);
  vint16m1_t tmp1_i16m1 =
      __riscv_vslideup_vx_i16m1(vec_zero_i16m1, lines_1, (8 - 2), vl);
  partial4a = __riscv_vadd_vv_i16m1(partial4a, tmp1_i16m1, vl);
  vint16m1_t partial4b = __riscv_vslide1down_vx_i16m1(lines_0, 0, vl);
  tmp1_i16m1 = __riscv_vslidedown_vx_i16m1(lines_1, 2, VL_SLIDE_DOWN);
  partial4b = __riscv_vadd_vv_i16m1(partial4b, tmp1_i16m1, vl);
  // Directions 5/6/7 shift the two lines of a pair by the same amount, so
  // each line pair is combined once up front.
  tmp1_i16m1 = __riscv_vadd_vv_i16m1(lines_0, lines_1, VL_SLIDE_DOWN);
  vint16m1_t partial5a =
      __riscv_vslideup_vx_i16m1(vec_zero_i16m1, tmp1_i16m1, (8 - 3), vl);
  vint16m1_t partial5b =
      __riscv_vslidedown_vx_i16m1(tmp1_i16m1, 3, VL_SLIDE_DOWN);
  vint16m1_t partial7a =
      __riscv_vslideup_vx_i16m1(vec_zero_i16m1, tmp1_i16m1, (8 - 6), vl);
  vint16m1_t partial7b =
      __riscv_vslidedown_vx_i16m1(tmp1_i16m1, 6, VL_SLIDE_DOWN);
  vint16m1_t partial6 = __riscv_vmv_v_v_i16m1(tmp1_i16m1, vl);

  // Partial sums for lines 2 and 3.
  tmp1_i16m1 = __riscv_vslideup_vx_i16m1(vec_zero_i16m1, lines_2, (8 - 3), vl);
  partial4a = __riscv_vadd_vv_i16m1(partial4a, tmp1_i16m1, vl);
  tmp1_i16m1 = __riscv_vslideup_vx_i16m1(vec_zero_i16m1, lines_3, (8 - 4), vl);
  partial4a = __riscv_vadd_vv_i16m1(partial4a, tmp1_i16m1, vl);
  tmp1_i16m1 = __riscv_vslidedown_vx_i16m1(lines_2, 3, VL_SLIDE_DOWN);
  partial4b = __riscv_vadd_vv_i16m1(partial4b, tmp1_i16m1, vl);
  tmp1_i16m1 = __riscv_vslidedown_vx_i16m1(lines_3, 4, VL_SLIDE_DOWN);
  partial4b = __riscv_vadd_vv_i16m1(partial4b, tmp1_i16m1, vl);
  tmp1_i16m1 = __riscv_vadd_vv_i16m1(lines_2, lines_3, VL_SLIDE_DOWN);
  partial5a = __riscv_vadd_vv_i16m1(
      partial5a,
      __riscv_vslideup_vx_i16m1(vec_zero_i16m1, tmp1_i16m1, (8 - 4), vl), vl);
  partial5b = __riscv_vadd_vv_i16m1(
      partial5b, __riscv_vslidedown_vx_i16m1(tmp1_i16m1, 4, VL_SLIDE_DOWN), vl);
  partial7a = __riscv_vadd_vv_i16m1(
      partial7a,
      __riscv_vslideup_vx_i16m1(vec_zero_i16m1, tmp1_i16m1, (8 - 5), vl), vl);
  partial7b = __riscv_vadd_vv_i16m1(
      partial7b, __riscv_vslidedown_vx_i16m1(tmp1_i16m1, 5, VL_SLIDE_DOWN), vl);
  partial6 = __riscv_vadd_vv_i16m1(partial6, tmp1_i16m1, vl);

  // Partial sums for lines 4 and 5.
  partial4a = __riscv_vadd_vv_i16m1(
      partial4a,
      __riscv_vslideup_vx_i16m1(vec_zero_i16m1, lines_4, (8 - 5), vl), vl);
  partial4a = __riscv_vadd_vv_i16m1(
      partial4a,
      __riscv_vslideup_vx_i16m1(vec_zero_i16m1, lines_5, (8 - 6), vl), vl);
  partial4b = __riscv_vadd_vv_i16m1(
      partial4b, __riscv_vslidedown_vx_i16m1(lines_4, 5, VL_SLIDE_DOWN), vl);
  partial4b = __riscv_vadd_vv_i16m1(
      partial4b, __riscv_vslidedown_vx_i16m1(lines_5, 6, VL_SLIDE_DOWN), vl);
  tmp1_i16m1 = __riscv_vadd_vv_i16m1(lines_4, lines_5, VL_SLIDE_DOWN);
  partial5a = __riscv_vadd_vv_i16m1(
      partial5a,
      __riscv_vslideup_vx_i16m1(vec_zero_i16m1, tmp1_i16m1, (8 - 5), vl), vl);
  partial5b = __riscv_vadd_vv_i16m1(
      partial5b, __riscv_vslidedown_vx_i16m1(tmp1_i16m1, 5, VL_SLIDE_DOWN), vl);
  partial7a = __riscv_vadd_vv_i16m1(
      partial7a,
      __riscv_vslideup_vx_i16m1(vec_zero_i16m1, tmp1_i16m1, (8 - 4), vl), vl);
  partial7b = __riscv_vadd_vv_i16m1(
      partial7b, __riscv_vslidedown_vx_i16m1(tmp1_i16m1, 4, VL_SLIDE_DOWN), vl);
  partial6 = __riscv_vadd_vv_i16m1(partial6, tmp1_i16m1, vl);

  // Partial sums for lines 6 and 7.
  partial4a = __riscv_vadd_vv_i16m1(
      partial4a,
      __riscv_vslideup_vx_i16m1(vec_zero_i16m1, lines_6, (8 - 7), vl), vl);
  partial4a = __riscv_vadd_vv_i16m1(partial4a, lines_7, vl);
  partial4b = __riscv_vadd_vv_i16m1(
      partial4b, __riscv_vslidedown_vx_i16m1(lines_6, 7, VL_SLIDE_DOWN), vl);
  tmp1_i16m1 = __riscv_vadd_vv_i16m1(lines_6, lines_7, VL_SLIDE_DOWN);
  partial5a = __riscv_vadd_vv_i16m1(
      partial5a,
      __riscv_vslideup_vx_i16m1(vec_zero_i16m1, tmp1_i16m1, (8 - 6), vl), vl);
  partial5b = __riscv_vadd_vv_i16m1(
      partial5b, __riscv_vslidedown_vx_i16m1(tmp1_i16m1, 6, VL_SLIDE_DOWN), vl);
  partial7a = __riscv_vadd_vv_i16m1(
      partial7a,
      __riscv_vslideup_vx_i16m1(vec_zero_i16m1, tmp1_i16m1, (8 - 3), vl), vl);
  partial7b = __riscv_vadd_vv_i16m1(
      partial7b, __riscv_vslidedown_vx_i16m1(tmp1_i16m1, 3, VL_SLIDE_DOWN), vl);
  partial6 = __riscv_vadd_vv_i16m1(partial6, tmp1_i16m1, vl);

  // Per-lane weight vectors, built back-to-front: seed the last element and
  // vslide1up the earlier values in from above.
  // const0 = { 840, 420, 280, 210, }
  vuint32m1_t const0 = __riscv_vmv_s_x_u32m1(210, 4);
  const0 = __riscv_vslide1up_vx_u32m1(const0, 280, 4);
  const0 = __riscv_vslide1up_vx_u32m1(const0, 420, 4);
  const0 = __riscv_vslide1up_vx_u32m1(const0, 840, 4);

  // const1 = { 168, 140, 120, 105, }
  vuint32m1_t const1 = __riscv_vmv_s_x_u32m1(105, 4);
  const1 = __riscv_vslide1up_vx_u32m1(const1, 120, 4);
  const1 = __riscv_vslide1up_vx_u32m1(const1, 140, 4);
  const1 = __riscv_vslide1up_vx_u32m1(const1, 168, 4);

  // const2 = { 0, 0, 420, 210, }
  vuint32m1_t const2 = __riscv_vmv_v_x_u32m1(0, 4);
  const2 = __riscv_vslide1down_vx_u32m1(const2, 420, 4);
  const2 = __riscv_vslide1down_vx_u32m1(const2, 210, 4);

  // const3 = { 140, 105, 105, 105, };
  vuint32m1_t const3 = __riscv_vmv_v_x_u32m1(105, 4);
  const3 = __riscv_vslide1up_vx_u32m1(const3, 140, 4);

  // Compute costs in terms of partial sums.
  // Direction 6 (vertical): square the 8 line sums and fold the two halves,
  // giving 4 lanes of partial6[i]^2 + partial6[i+4]^2.
  vint32m2_t tmp1_i32m2 = __riscv_vwmul_vv_i32m2(partial6, partial6, vl);
  vint32m2_t partial6_s32 = __riscv_vslidedown_vx_i32m2(tmp1_i32m2, 4, vl);
  partial6_s32 = __riscv_vadd_vv_i32m2(partial6_s32, tmp1_i32m2, 4);

  // Reverse partial B.
  // pattern = { 6, 5, 4, 3, 2, 1, 0, 7, }.
  vuint32m1_t costs_0, costs_1, costs_2, costs_3;
  static const uint16_t tab_u16[8] = {
    6, 5, 4, 3, 2, 1, 0, 7,
  };
  vuint16m1_t index_u16m1 = __riscv_vle16_v_u16m1(tab_u16, 8);
  vint16m1_t partial4b_rv =
      __riscv_vrgather_vv_i16m1(partial4b, index_u16m1, 8);
  costs_0 = fold_mul_and_sum_rvv(partial4a, partial4b_rv, const0, const1);
  vuint32m1_t partial6_u32 = __riscv_vreinterpret_v_i32m1_u32m1(
      __riscv_vlmul_trunc_v_i32m2_i32m1(partial6_s32));
  // Direction 6: every term carries the same fixed weight (105).
  costs_2 = __riscv_vmul_vx_u32m1(partial6_u32, 105, 4);
  vint16m1_t partial5b_rv =
      __riscv_vrgather_vv_i16m1(partial5b, index_u16m1, 8);
  costs_1 = fold_mul_and_sum_rvv(partial5a, partial5b_rv, const2, const3);
  vint16m1_t partial7b_rv =
      __riscv_vrgather_vv_i16m1(partial7b, index_u16m1, 8);
  costs_3 = fold_mul_and_sum_rvv(partial7a, partial7b_rv, const2, const3);

  // combine values: horizontally reduce each 4-lane cost vector, pack the
  // four totals into one vector and store it to the caller's cost array.
  vuint32m1_t vec_scalar_u32m1 = __riscv_vmv_s_x_u32m1(0, 1);
  vuint32m1_t cost0_sum =
      __riscv_vredsum_vs_u32m1_u32m1(costs_0, vec_scalar_u32m1, 4);
  vuint32m1_t cost1_sum =
      __riscv_vredsum_vs_u32m1_u32m1(costs_1, vec_scalar_u32m1, 4);
  vuint32m1_t cost2_sum =
      __riscv_vredsum_vs_u32m1_u32m1(costs_2, vec_scalar_u32m1, 4);
  vuint32m1_t cost3_sum =
      __riscv_vredsum_vs_u32m1_u32m1(costs_3, vec_scalar_u32m1, 4);

  vuint32m1_t cost47 = __riscv_vslideup_vx_u32m1(cost0_sum, cost1_sum, 1, 4);
  cost47 = __riscv_vslideup_vx_u32m1(cost47, cost2_sum, 2, 4);
  cost47 = __riscv_vslideup_vx_u32m1(cost47, cost3_sum, 3, 4);
  __riscv_vse32_v_u32m1(&cost[0], cost47, 4);
  return cost47;
}
    229 
// Variant of fold_mul_and_sum_rvv for the direction-1/3 partials that span
// three vectors (see the diagram above compute_horiz_directions_rvv).
// Adjacent lanes of each partial are first summed pairwise — the direction
// pattern duplicates each column — then squared. The result is 4 lanes of
// pair_b[i]^2 * 105 + (pair_a[i]^2 + pair_c[i]^2) * const0[i].
static inline vuint32m1_t fold_mul_and_sum_pairwise_rvv(vint16m1_t partiala,
                                                       vint16m1_t partialb,
                                                       vint16m1_t partialc,
                                                       vuint32m1_t const0) {
  // index = { 0, 2, 4, 6 }: picks the even lanes, which hold the valid
  // pairwise sums after the slide-and-add below.
  vuint16m1_t vid_u16m1 = __riscv_vid_v_u16m1(4);
  vuint16m1_t index_u16m1 = __riscv_vsll_vx_u16m1(vid_u16m1, 1, 4);
  // Widened lane i becomes partialx[i] + partialx[i+1].
  vint16m1_t tmp_i16m1 = __riscv_vslide1down_vx_i16m1(partiala, 0, 8);
  vint32m2_t partiala_i32m2 = __riscv_vwadd_vv_i32m2(partiala, tmp_i16m1, 8);
  tmp_i16m1 = __riscv_vslide1down_vx_i16m1(partialb, 0, 8);
  vint32m2_t partialb_i32m2 = __riscv_vwadd_vv_i32m2(partialb, tmp_i16m1, 8);

  tmp_i16m1 = __riscv_vslide1down_vx_i16m1(partialc, 0, 8);
  vint32m2_t partialc_i32m2 = __riscv_vwadd_vv_i32m2(partialc, tmp_i16m1, 8);
  // Square every lane, then gather lanes {0,2,4,6} down to 4 elements.
  partiala_i32m2 = __riscv_vmul_vv_i32m2(partiala_i32m2, partiala_i32m2, 8);
  partialb_i32m2 = __riscv_vmul_vv_i32m2(partialb_i32m2, partialb_i32m2, 8);
  vint32m1_t partialb_i32m1 = __riscv_vlmul_trunc_v_i32m2_i32m1(
      __riscv_vrgatherei16_vv_i32m2(partialb_i32m2, index_u16m1, 4));
  partialc_i32m2 = __riscv_vmul_vv_i32m2(partialc_i32m2, partialc_i32m2, 8);
  partiala_i32m2 = __riscv_vadd_vv_i32m2(partiala_i32m2, partialc_i32m2, 8);
  vint32m1_t partiala_i32m1 = __riscv_vlmul_trunc_v_i32m2_i32m1(
      __riscv_vrgatherei16_vv_i32m2(partiala_i32m2, index_u16m1, 4));

  // Weighted sum: partialb gets the fixed weight 105, partiala+partialc the
  // per-lane weights in const0. The lanes are squares (non-negative), so the
  // signed->unsigned reinterpret is value-preserving.
  vuint32m1_t cost = __riscv_vmul_vx_u32m1(
      __riscv_vreinterpret_v_i32m1_u32m1(partialb_i32m1), 105, 4);
  cost = __riscv_vmacc_vv_u32m1(
      cost, __riscv_vreinterpret_v_i32m1_u32m1(partiala_i32m1), const0, 4);
  return cost;
}
    258 
    259 static inline vint32m1_t horizontal_add_4d_s16x8(vint16m1_t lines_0,
    260                                                 vint16m1_t lines_1,
    261                                                 vint16m1_t lines_2,
    262                                                 vint16m1_t lines_3) {
    263  vint32m1_t vec_scalar_i32m1 = __riscv_vmv_s_x_i32m1(0, 1);
    264  vint32m1_t lines0_sum =
    265      __riscv_vwredsum_vs_i16m1_i32m1(lines_0, vec_scalar_i32m1, 8);
    266  vint32m1_t lines1_sum =
    267      __riscv_vwredsum_vs_i16m1_i32m1(lines_1, vec_scalar_i32m1, 8);
    268  vint32m1_t lines2_sum =
    269      __riscv_vwredsum_vs_i16m1_i32m1(lines_2, vec_scalar_i32m1, 8);
    270  vint32m1_t lines3_sum =
    271      __riscv_vwredsum_vs_i16m1_i32m1(lines_3, vec_scalar_i32m1, 8);
    272 
    273  vint32m1_t ret = __riscv_vslideup_vx_i32m1(lines0_sum, lines1_sum, 1, 4);
    274  ret = __riscv_vslideup_vx_i32m1(ret, lines2_sum, 2, 4);
    275  ret = __riscv_vslideup_vx_i32m1(ret, lines3_sum, 3, 4);
    276  return ret;
    277 }
    278 
    279 // This function computes the cost along directions 0, 1, 2, 3. (0 means
    280 // 45-degree up-right, 2 is horizontal).
    281 //
    282 // For direction 1 and 3 ("east northeast" and "east southeast") the shifted
    283 // lines need three vectors instead of two. For direction 1 for example, we need
    284 // to compute the sums along the line i below:
    285 // 0 0 1 1 2 2 3  3
    286 // 1 1 2 2 3 3 4  4
    287 // 2 2 3 3 4 4 5  5
    288 // 3 3 4 4 5 5 6  6
    289 // 4 4 5 5 6 6 7  7
    290 // 5 5 6 6 7 7 8  8
    291 // 6 6 7 7 8 8 9  9
    292 // 7 7 8 8 9 9 10 10
    293 //
    294 // Which means we need the following configuration:
    295 // 0 0 1 1 2 2 3 3
    296 //     1 1 2 2 3 3 4 4
    297 //         2 2 3 3 4 4 5 5
    298 //             3 3 4 4 5 5 6 6
    299 //                 4 4 5 5 6 6 7 7
    300 //                     5 5 6 6 7 7 8 8
    301 //                         6 6 7 7 8 8 9 9
    302 //                             7 7 8 8 9 9 10 10
    303 //
    304 // Three vectors are needed to compute this, as well as some extra pairwise
    305 // additions.
// Accumulates the shifted per-line partial sums for directions 0..3 (see the
// diagram above), squares and weights them, writes the four direction costs
// to cost[0..3] and also returns them packed into lanes 0..3 of a vector.
static vuint32m1_t compute_horiz_directions_rvv(
   vint16m1_t lines_0, vint16m1_t lines_1, vint16m1_t lines_2,
   vint16m1_t lines_3, vint16m1_t lines_4, vint16m1_t lines_5,
   vint16m1_t lines_6, vint16m1_t lines_7, uint32_t cost[4], size_t vl) {
  // Compute diagonal directions (1, 2, 3).
  // Partial sums for lines 0 and 1.
  // NOTE(review): slide-downs use vsetvl_e16m1(16) so shifted-in lanes are
  // zero; on VLEN==128 this clamps to 8 and correctness relies on lanes
  // 8..15 of the inputs already being zero (cdef_find_dir_rvv arranges this
  // via tail-undisturbed loads into a zeroed register).
  size_t VL_SLIDE_DOWN = __riscv_vsetvl_e16m1(16);
  vint16m1_t vec_zero_i16m1 = __riscv_vmv_v_x_i16m1(0, vl);
  vint16m1_t partial0a = __riscv_vmv_v_v_i16m1(lines_0, vl);
  partial0a = __riscv_vadd_vv_i16m1(
      partial0a,
      __riscv_vslideup_vx_i16m1(vec_zero_i16m1, lines_1, (8 - 7), vl), vl);
  vint16m1_t partial0b = __riscv_vslidedown_vx_i16m1(lines_1, 7, VL_SLIDE_DOWN);
  vint16m1_t partial1a = __riscv_vadd_vv_i16m1(
      lines_0, __riscv_vslideup_vx_i16m1(vec_zero_i16m1, lines_1, (8 - 6), vl),
      vl);
  vint16m1_t partial1b = __riscv_vslidedown_vx_i16m1(lines_1, 6, VL_SLIDE_DOWN);
  vint16m1_t partial3a = __riscv_vslidedown_vx_i16m1(lines_0, 2, VL_SLIDE_DOWN);
  partial3a = __riscv_vadd_vv_i16m1(
      partial3a, __riscv_vslidedown_vx_i16m1(lines_1, 4, VL_SLIDE_DOWN), vl);
  vint16m1_t partial3b =
      __riscv_vslideup_vx_i16m1(vec_zero_i16m1, lines_0, (8 - 2), vl);
  partial3b = __riscv_vadd_vv_i16m1(
      partial3b, __riscv_vslideup_vx_i16m1(vec_zero_i16m1, lines_1, 4, vl), vl);

  // Partial sums for lines 2 and 3.
  partial0a = __riscv_vadd_vv_i16m1(
      partial0a,
      __riscv_vslideup_vx_i16m1(vec_zero_i16m1, lines_2, (8 - 6), vl), vl);
  partial0a = __riscv_vadd_vv_i16m1(
      partial0a,
      __riscv_vslideup_vx_i16m1(vec_zero_i16m1, lines_3, (8 - 5), vl), vl);
  partial0b = __riscv_vadd_vv_i16m1(
      partial0b, __riscv_vslidedown_vx_i16m1(lines_2, 6, VL_SLIDE_DOWN), vl);
  partial0b = __riscv_vadd_vv_i16m1(
      partial0b, __riscv_vslidedown_vx_i16m1(lines_3, 5, VL_SLIDE_DOWN), vl);
  partial1a = __riscv_vadd_vv_i16m1(
      partial1a,
      __riscv_vslideup_vx_i16m1(vec_zero_i16m1, lines_2, (8 - 4), vl), vl);
  partial1a = __riscv_vadd_vv_i16m1(
      partial1a,
      __riscv_vslideup_vx_i16m1(vec_zero_i16m1, lines_3, (8 - 2), vl), vl);
  partial1b = __riscv_vadd_vv_i16m1(
      partial1b, __riscv_vslidedown_vx_i16m1(lines_2, 4, VL_SLIDE_DOWN), vl);
  partial1b = __riscv_vadd_vv_i16m1(
      partial1b, __riscv_vslidedown_vx_i16m1(lines_3, 2, VL_SLIDE_DOWN), vl);
  partial3a = __riscv_vadd_vv_i16m1(
      partial3a, __riscv_vslidedown_vx_i16m1(lines_2, 6, VL_SLIDE_DOWN), vl);
  partial3b = __riscv_vadd_vv_i16m1(
      partial3b,
      __riscv_vslideup_vx_i16m1(vec_zero_i16m1, lines_2, (8 - 6), vl), vl);
  partial3b = __riscv_vadd_vv_i16m1(partial3b, lines_3, vl);

  // Partial sums for lines 4 and 5.
  partial0a = __riscv_vadd_vv_i16m1(
      partial0a,
      __riscv_vslideup_vx_i16m1(vec_zero_i16m1, lines_4, (8 - 4), vl), vl);
  partial0a = __riscv_vadd_vv_i16m1(
      partial0a,
      __riscv_vslideup_vx_i16m1(vec_zero_i16m1, lines_5, (8 - 3), vl), vl);
  partial0b = __riscv_vadd_vv_i16m1(
      partial0b, __riscv_vslidedown_vx_i16m1(lines_4, 4, VL_SLIDE_DOWN), vl);
  partial0b = __riscv_vadd_vv_i16m1(
      partial0b, __riscv_vslidedown_vx_i16m1(lines_5, 3, VL_SLIDE_DOWN), vl);
  partial1b = __riscv_vadd_vv_i16m1(partial1b, lines_4, vl);
  partial1b = __riscv_vadd_vv_i16m1(
      partial1b,
      __riscv_vslideup_vx_i16m1(vec_zero_i16m1, lines_5, (8 - 6), vl), vl);
  // Directions 1 and 3 spill past two vectors; a third partial picks up the
  // overflow (see the block comment above this function).
  vint16m1_t partial1c = __riscv_vslidedown_vx_i16m1(lines_5, 6, VL_SLIDE_DOWN);
  partial3b = __riscv_vadd_vv_i16m1(
      partial3b, __riscv_vslidedown_vx_i16m1(lines_4, 2, VL_SLIDE_DOWN), vl);
  partial3b = __riscv_vadd_vv_i16m1(
      partial3b, __riscv_vslidedown_vx_i16m1(lines_5, 4, VL_SLIDE_DOWN), vl);
  vint16m1_t partial3c =
      __riscv_vslideup_vx_i16m1(vec_zero_i16m1, lines_4, (8 - 2), vl);
  partial3c = __riscv_vadd_vv_i16m1(
      partial3c,
      __riscv_vslideup_vx_i16m1(vec_zero_i16m1, lines_5, (8 - 4), vl), vl);

  // Partial sums for lines 6 and 7.
  partial0a = __riscv_vadd_vv_i16m1(
      partial0a,
      __riscv_vslideup_vx_i16m1(vec_zero_i16m1, lines_6, (8 - 2), vl), vl);
  partial0a = __riscv_vadd_vv_i16m1(
      partial0a,
      __riscv_vslideup_vx_i16m1(vec_zero_i16m1, lines_7, (8 - 1), vl), vl);
  partial0b = __riscv_vadd_vv_i16m1(
      partial0b, __riscv_vslidedown_vx_i16m1(lines_6, 2, VL_SLIDE_DOWN), vl);
  partial0b = __riscv_vadd_vv_i16m1(
      partial0b, __riscv_vslide1down_vx_i16m1(lines_7, 0, vl), vl);
  partial1b = __riscv_vadd_vv_i16m1(
      partial1b,
      __riscv_vslideup_vx_i16m1(vec_zero_i16m1, lines_6, (8 - 4), vl), vl);
  partial1b = __riscv_vadd_vv_i16m1(
      partial1b,
      __riscv_vslideup_vx_i16m1(vec_zero_i16m1, lines_7, (8 - 2), vl), vl);
  partial1c = __riscv_vadd_vv_i16m1(
      partial1c, __riscv_vslidedown_vx_i16m1(lines_6, 4, VL_SLIDE_DOWN), vl);
  partial1c = __riscv_vadd_vv_i16m1(
      partial1c, __riscv_vslidedown_vx_i16m1(lines_7, 2, VL_SLIDE_DOWN), vl);
  partial3b = __riscv_vadd_vv_i16m1(
      partial3b, __riscv_vslidedown_vx_i16m1(lines_6, 6, VL_SLIDE_DOWN), vl);
  partial3c = __riscv_vadd_vv_i16m1(
      partial3c,
      __riscv_vslideup_vx_i16m1(vec_zero_i16m1, lines_6, (8 - 6), vl), vl);
  partial3c = __riscv_vadd_vv_i16m1(partial3c, lines_7, vl);

  // Special case for direction 2 as it's just a sum along each line.
  vint32m1_t partial2a =
      horizontal_add_4d_s16x8(lines_0, lines_1, lines_2, lines_3);
  vint32m1_t partial2b =
      horizontal_add_4d_s16x8(lines_4, lines_5, lines_6, lines_7);
  vuint32m1_t partial2a_u32 = __riscv_vreinterpret_v_i32m1_u32m1(
      __riscv_vmul_vv_i32m1(partial2a, partial2a, 4));
  vuint32m1_t partial2b_u32 = __riscv_vreinterpret_v_i32m1_u32m1(
      __riscv_vmul_vv_i32m1(partial2b, partial2b, 4));

  // Per-lane weight vectors, built back-to-front with vslide1up.
  // const0 = { 840, 420, 280, 210, }
  vuint32m1_t const0 = __riscv_vmv_s_x_u32m1(210, 4);
  const0 = __riscv_vslide1up_vx_u32m1(const0, 280, 4);
  const0 = __riscv_vslide1up_vx_u32m1(const0, 420, 4);
  const0 = __riscv_vslide1up_vx_u32m1(const0, 840, 4);

  // const1 = { 168, 140, 120, 105, }
  vuint32m1_t const1 = __riscv_vmv_s_x_u32m1(105, 4);
  const1 = __riscv_vslide1up_vx_u32m1(const1, 120, 4);
  const1 = __riscv_vslide1up_vx_u32m1(const1, 140, 4);
  const1 = __riscv_vslide1up_vx_u32m1(const1, 168, 4);

  // const2 = { 420, 210, 140, 105, };
  vuint32m1_t const2 = __riscv_vmv_s_x_u32m1(105, 4);
  const2 = __riscv_vslide1up_vx_u32m1(const2, 140, 4);
  const2 = __riscv_vslide1up_vx_u32m1(const2, 210, 4);
  const2 = __riscv_vslide1up_vx_u32m1(const2, 420, 4);

  // Shared template for the two gather patterns derived below.
  static const uint16_t tab_u16[8] = {
    0, 6, 5, 4, 3, 2, 1, 0,
  };
  vuint32m1_t costs_0, costs_1, costs_2, costs_3;
  vuint16m1_t template_u16m1 = __riscv_vle16_v_u16m1(tab_u16, 8);

  // Reverse partial0b before folding with partial0a.
  // pattern = { 6, 5, 4, 3, 2, 1, 0, 7, }
  vuint16m1_t index_u16m1 = __riscv_vslide1down_vx_u16m1(template_u16m1, 7, 8);
  vint16m1_t partial0b_rv =
      __riscv_vrgather_vv_i16m1(partial0b, index_u16m1, 8);
  costs_0 = fold_mul_and_sum_rvv(partial0a, partial0b_rv, const0, const1);

  // Reverse the third (overflow) partial for the pairwise fold.
  // pattern = { 5, 4, 3, 2, 1, 0, 6, 7, }
  vuint16m1_t index_pair_u16m1 =
      __riscv_vslide1down_vx_u16m1(template_u16m1, 6, 8);
  index_pair_u16m1 = __riscv_vslide1down_vx_u16m1(index_pair_u16m1, 7, 8);
  vint16m1_t partialc_rv =
      __riscv_vrgather_vv_i16m1(partial1c, index_pair_u16m1, 8);
  costs_1 =
      fold_mul_and_sum_pairwise_rvv(partial1a, partial1b, partialc_rv, const2);

  // Direction 2: every term carries the same fixed weight (105).
  costs_2 = __riscv_vadd_vv_u32m1(partial2a_u32, partial2b_u32, 4);
  costs_2 = __riscv_vmul_vx_u32m1(costs_2, 105, 4);

  vint16m1_t partial3a_rv =
      __riscv_vrgather_vv_i16m1(partial3a, index_pair_u16m1, 8);
  costs_3 =
      fold_mul_and_sum_pairwise_rvv(partial3c, partial3b, partial3a_rv, const2);

  // combine values: horizontally reduce each 4-lane cost vector, pack the
  // four totals into one vector and store it to the caller's cost array.
  vuint32m1_t vec_scalar_u32m1 = __riscv_vmv_s_x_u32m1(0, 1);
  vuint32m1_t cost0_sum =
      __riscv_vredsum_vs_u32m1_u32m1(costs_0, vec_scalar_u32m1, 4);
  vuint32m1_t cost1_sum =
      __riscv_vredsum_vs_u32m1_u32m1(costs_1, vec_scalar_u32m1, 4);
  vuint32m1_t cost2_sum =
      __riscv_vredsum_vs_u32m1_u32m1(costs_2, vec_scalar_u32m1, 4);
  vuint32m1_t cost3_sum =
      __riscv_vredsum_vs_u32m1_u32m1(costs_3, vec_scalar_u32m1, 4);

  costs_0 = __riscv_vslideup_vx_u32m1(cost0_sum, cost1_sum, 1, 4);
  costs_0 = __riscv_vslideup_vx_u32m1(costs_0, cost2_sum, 2, 4);
  costs_0 = __riscv_vslideup_vx_u32m1(costs_0, cost3_sum, 3, 4);
  __riscv_vse32_v_u32m1(&cost[0], costs_0, 4);
  return costs_0;
}
    489 
    490 int cdef_find_dir_rvv(const uint16_t *img, int stride, int32_t *var,
    491                      int coeff_shift) {
    492  size_t vl = 8;
    493  size_t vlmax = __riscv_vsetvlmax_e16m1();
    494  vuint16m1_t s;
    495  vint16m1_t lines_0, lines_1, lines_2, lines_3;
    496  vint16m1_t lines_4, lines_5, lines_6, lines_7;
    497  vuint16m1_t vec_zero_u16m1 =
    498      __riscv_vmv_v_x_u16m1(0, __riscv_vsetvl_e16m1(16));
    499 
    500  if (vlmax == 8)
    501    s = __riscv_vle16_v_u16m1(img, vl);
    502  else
    503    s = __riscv_vle16_v_u16m1_tu(vec_zero_u16m1, img, vl);
    504  lines_0 = __riscv_vreinterpret_v_u16m1_i16m1(
    505      __riscv_vsrl_vx_u16m1(s, coeff_shift, vl));
    506  lines_0 = __riscv_vsub_vx_i16m1(lines_0, 128, vl);
    507 
    508  img += stride;
    509  if (vlmax == 8)
    510    s = __riscv_vle16_v_u16m1(img, vl);
    511  else
    512    s = __riscv_vle16_v_u16m1_tu(vec_zero_u16m1, img, vl);
    513  lines_1 = __riscv_vreinterpret_v_u16m1_i16m1(
    514      __riscv_vsrl_vx_u16m1(s, coeff_shift, vl));
    515  lines_1 = __riscv_vsub_vx_i16m1(lines_1, 128, vl);
    516 
    517  img += stride;
    518  if (vlmax == 8)
    519    s = __riscv_vle16_v_u16m1(img, vl);
    520  else
    521    s = __riscv_vle16_v_u16m1_tu(vec_zero_u16m1, img, vl);
    522  lines_2 = __riscv_vreinterpret_v_u16m1_i16m1(
    523      __riscv_vsrl_vx_u16m1(s, coeff_shift, vl));
    524  lines_2 = __riscv_vsub_vx_i16m1(lines_2, 128, vl);
    525 
    526  img += stride;
    527  if (vlmax == 8)
    528    s = __riscv_vle16_v_u16m1(img, vl);
    529  else
    530    s = __riscv_vle16_v_u16m1_tu(vec_zero_u16m1, img, vl);
    531  lines_3 = __riscv_vreinterpret_v_u16m1_i16m1(
    532      __riscv_vsrl_vx_u16m1(s, coeff_shift, vl));
    533  lines_3 = __riscv_vsub_vx_i16m1(lines_3, 128, vl);
    534 
    535  img += stride;
    536  if (vlmax == 8)
    537    s = __riscv_vle16_v_u16m1(img, vl);
    538  else
    539    s = __riscv_vle16_v_u16m1_tu(vec_zero_u16m1, img, vl);
    540  lines_4 = __riscv_vreinterpret_v_u16m1_i16m1(
    541      __riscv_vsrl_vx_u16m1(s, coeff_shift, vl));
    542  lines_4 = __riscv_vsub_vx_i16m1(lines_4, 128, vl);
    543 
    544  img += stride;
    545  if (vlmax == 8)
    546    s = __riscv_vle16_v_u16m1(img, vl);
    547  else
    548    s = __riscv_vle16_v_u16m1_tu(vec_zero_u16m1, img, vl);
    549  lines_5 = __riscv_vreinterpret_v_u16m1_i16m1(
    550      __riscv_vsrl_vx_u16m1(s, coeff_shift, vl));
    551  lines_5 = __riscv_vsub_vx_i16m1(lines_5, 128, vl);
    552 
    553  img += stride;
    554  if (vlmax == 8)
    555    s = __riscv_vle16_v_u16m1(img, vl);
    556  else
    557    s = __riscv_vle16_v_u16m1_tu(vec_zero_u16m1, img, vl);
    558  lines_6 = __riscv_vreinterpret_v_u16m1_i16m1(
    559      __riscv_vsrl_vx_u16m1(s, coeff_shift, vl));
    560  lines_6 = __riscv_vsub_vx_i16m1(lines_6, 128, vl);
    561 
    562  img += stride;
    563  if (vlmax == 8)
    564    s = __riscv_vle16_v_u16m1(img, vl);
    565  else
    566    s = __riscv_vle16_v_u16m1_tu(vec_zero_u16m1, img, vl);
    567  lines_7 = __riscv_vreinterpret_v_u16m1_i16m1(
    568      __riscv_vsrl_vx_u16m1(s, coeff_shift, vl));
    569  lines_7 = __riscv_vsub_vx_i16m1(lines_7, 128, vl);
    570 
    571  // Compute "mostly vertical" directions.
    572  uint32_t cost[8];
    573  vuint32m1_t cost47 =
    574      compute_vert_directions_rvv(lines_0, lines_1, lines_2, lines_3, lines_4,
    575                                  lines_5, lines_6, lines_7, cost + 4, vl);
    576 
    577  // Compute "mostly horizontal" directions.
    578  vuint32m1_t cost03 =
    579      compute_horiz_directions_rvv(lines_0, lines_1, lines_2, lines_3, lines_4,
    580                                   lines_5, lines_6, lines_7, cost, vl);
    581 
    582  // Find max cost as well as its index to get best_dir.
    583  // The max cost needs to be propagated in the whole vector to find its
    584  // position in the original cost vectors cost03 and cost47.
    585  vuint32m1_t vec_scalar_u32m1 = __riscv_vmv_s_x_u32m1(0, 1);
    586  vuint32m1_t cost07 = __riscv_vmaxu_vv_u32m1(cost03, cost47, 4);
    587  uint32_t best_cost = __riscv_vmv_x_s_u32m1_u32(
    588      __riscv_vredmaxu_vs_u32m1_u32m1(cost07, vec_scalar_u32m1, 4));
    589  vbool32_t mask_cost = __riscv_vmseq_vx_u32m1_b32(cost03, best_cost, 4);
    590  long best_dir = __riscv_vfirst_m_b32(mask_cost, 4);
    591  if (best_dir == -1) {
    592    mask_cost = __riscv_vmseq_vx_u32m1_b32(cost47, best_cost, 4);
    593    best_dir = __riscv_vfirst_m_b32(mask_cost, 4);
    594    best_dir += 4;
    595  }
    596 
    597  // Difference between the optimal variance and the variance along the
    598  // orthogonal direction. Again, the sum(x^2) terms cancel out.
    599  *var = best_cost - cost[(best_dir + 4) & 7];
    600 
    601  // We'd normally divide by 840, but dividing by 1024 is close enough
    602  // for what we're going to do with this.
    603  *var >>= 10;
    604  return (int)best_dir;
    605 }
    606 
    607 void cdef_copy_rect8_8bit_to_16bit_rvv(uint16_t *dst, int dstride,
    608                                       const uint8_t *src, int sstride,
    609                                       int width, int height) {
    610  do {
    611    int w = 0;
    612    size_t num_cols = width;
    613    while (num_cols > 0) {
    614      size_t vl = __riscv_vsetvl_e8mf2(num_cols);
    615      vuint8mf2_t u8_src = __riscv_vle8_v_u8mf2(src + w, vl);
    616      vuint16m1_t u16_src = __riscv_vwcvtu_x_x_v_u16m1(u8_src, vl);
    617      __riscv_vse16_v_u16m1(dst + w, u16_src, vl);
    618 
    619      w += vl;
    620      num_cols -= vl;
    621    }
    622    src += sstride;
    623    dst += dstride;
    624  } while (--height != 0);
    625 }
    626 
    627 void cdef_copy_rect8_16bit_to_16bit_rvv(uint16_t *dst, int dstride,
    628                                        const uint16_t *src, int sstride,
    629                                        int width, int height) {
    630  do {
    631    int w = 0;
    632    size_t num_cols = width;
    633    while (num_cols > 0) {
    634      size_t vl = __riscv_vsetvl_e16m1(num_cols);
    635      vuint16m1_t u16_src = __riscv_vle16_v_u16m1(src + w, vl);
    636      __riscv_vse16_v_u16m1(dst + w, u16_src, vl);
    637 
    638      w += vl;
    639      num_cols -= vl;
    640    }
    641    src += sstride;
    642    dst += dstride;
    643  } while (--height != 0);
    644 }
    645 
// Vectorized CDEF constraint function:
//   sign(a - b) * min(|a - b|, max(0, threshold - (|a - b| >> adjdamp)))
// A zero threshold short-circuits to an all-zero vector. The sign handling
// uses masked (tumu) negation: only lanes where a < b are flipped.
static inline vint16m1_t constrain16(vint16m1_t a, vint16m1_t b,
                                     int16_t threshold, int16_t adjdamp,
                                     size_t vl) {
  if (!threshold) return __riscv_vmv_v_x_i16m1(0, vl);
  // mask marks lanes with a < b, i.e. lanes where (a - b) is negative.
  const vbool16_t mask = __riscv_vmslt_vv_i16m1_b16(a, b, vl);
  const vint16m1_t diff = __riscv_vsub_vv_i16m1(a, b, vl);
  // Masked negate of only the negative lanes yields |a - b|.
  const vint16m1_t abs_diff = __riscv_vneg_v_i16m1_tumu(mask, diff, diff, vl);
  const vint16m1_t shift = __riscv_vsra_vx_i16m1(abs_diff, adjdamp, vl);
  const vint16m1_t thr = __riscv_vmv_v_x_i16m1(threshold, vl);
  const vint16m1_t sub = __riscv_vsub_vv_i16m1(thr, shift, vl);
  const vint16m1_t max = __riscv_vmax_vx_i16m1(sub, 0, vl);
  const vint16m1_t min = __riscv_vmin_vv_i16m1(abs_diff, max, vl);
  // Re-apply the sign of the original difference to the clamped magnitude.
  return __riscv_vneg_v_i16m1_tumu(mask, min, min, vl);
}
    660 
// Lane-wise max of a and b that ignores lanes of `a` equal to
// CDEF_VERY_LARGE (the sentinel stored in unavailable border pixels):
// those lanes are first replaced by the corresponding lane of `b`, so the
// sentinel never wins the max.
static inline vint16m1_t vmax_mask(vint16m1_t a, vint16m1_t b, size_t vl) {
  const vbool16_t mask =
      __riscv_vmseq_vx_i16m1_b16(a, (int16_t)CDEF_VERY_LARGE, vl);
  const vint16m1_t val = __riscv_vmerge_vvm_i16m1(a, b, mask, vl);
  return __riscv_vmax_vv_i16m1(val, b, vl);
}
    667 
// Load two 4-element int16 rows separated by `stride` and pack them into a
// single vector: row 0 occupies lanes 0..3, row 1 lanes 4..7. Used by the
// 4-pixel-wide block paths to process two rows per vector op.
static inline vint16m1_t load_strided_i16_4x2(int16_t *addr,
                                              const ptrdiff_t stride,
                                              size_t vl) {
  const vint16m1_t px_l1 = __riscv_vle16_v_i16m1(addr + stride, vl);
  const vint16m1_t px_l0 = __riscv_vle16_v_i16m1(addr, vl);
  // Slide row 1 up by 4 lanes on top of row 0 to form [r0 | r1].
  return __riscv_vslideup_vx_i16m1(px_l0, px_l1, 4, vl);
}
    675 
// Store a packed two-row byte vector (4+4 lanes) as two rows separated by
// `stride`. `vl` is the combined length (8); each store writes vl/2 lanes.
static inline void store_strided_u8_4x2(uint8_t *addr, vuint8mf2_t vdst,
                                        const ptrdiff_t stride, size_t vl) {
  __riscv_vse8_v_u8mf2(addr, vdst, vl >> 1);
  // Bring lanes 4..7 (the second row) down to the front, then store them.
  vdst = __riscv_vslidedown_vx_u8mf2(vdst, 4, vl);
  __riscv_vse8_v_u8mf2(addr + stride, vdst, vl >> 1);
}
    682 
// 16-bit variant of store_strided_u8_4x2: store a packed two-row vector
// (4+4 lanes) as two rows separated by `stride`.
static inline void store_strided_u16_4x2(uint16_t *addr, vuint16m1_t vdst,
                                         const ptrdiff_t stride, size_t vl) {
  __riscv_vse16_v_u16m1(addr, vdst, vl >> 1);
  // Bring lanes 4..7 (the second row) down to the front, then store them.
  vdst = __riscv_vslidedown_vx_u16m1(vdst, 4, vl);
  __riscv_vse16_v_u16m1(addr + stride, vdst, vl >> 1);
}
    689 
// The macros below form the shared skeleton of the eight cdef_filter_*_rvv
// kernels. They expect `vl` (vector length), `in` (padded 16-bit source),
// and the per-function destination pointers in scope at expansion site.

// Load one 8-wide row of center pixels into `px` and zero the accumulator.
#define LOAD_PIX(addr)                                              \
  const vint16m1_t px = __riscv_vle16_v_i16m1((int16_t *)addr, vl); \
  vint16m1_t sum = __riscv_vmv_v_x_i16m1(0, vl)

// 4-wide variant: two consecutive rows packed into one vector.
#define LOAD_PIX4(addr)                                        \
  const vint16m1_t px =                                        \
      load_strided_i16_4x2((int16_t *)addr, CDEF_BSTRIDE, vl); \
  vint16m1_t sum = __riscv_vmv_v_x_i16m1(0, vl)

// Load the four taps at offsets +o0, -o0, +o1, -o1 along a direction.
#define LOAD_DIR(p, addr, o0, o1)                                          \
  const vint16m1_t p##0 = __riscv_vle16_v_i16m1((int16_t *)addr + o0, vl); \
  const vint16m1_t p##1 = __riscv_vle16_v_i16m1((int16_t *)addr - o0, vl); \
  const vint16m1_t p##2 = __riscv_vle16_v_i16m1((int16_t *)addr + o1, vl); \
  const vint16m1_t p##3 = __riscv_vle16_v_i16m1((int16_t *)addr - o1, vl)

// 4-wide variant of LOAD_DIR (two rows per vector).
#define LOAD_DIR4(p, addr, o0, o1)                                  \
  const vint16m1_t p##0 =                                           \
      load_strided_i16_4x2((int16_t *)addr + o0, CDEF_BSTRIDE, vl); \
  const vint16m1_t p##1 =                                           \
      load_strided_i16_4x2((int16_t *)addr - o0, CDEF_BSTRIDE, vl); \
  const vint16m1_t p##2 =                                           \
      load_strided_i16_4x2((int16_t *)addr + o1, CDEF_BSTRIDE, vl); \
  const vint16m1_t p##3 =                                           \
      load_strided_i16_4x2((int16_t *)addr - o1, CDEF_BSTRIDE, vl)

// Select the primary tap pair; the table row depends on the parity of the
// shifted primary strength, matching the scalar CDEF implementation.
#define MAKE_TAPS                                                         \
  const int *pri_taps = cdef_pri_taps[(pri_strength >> coeff_shift) & 1]; \
  const int16_t tap0 = (int16_t)(pri_taps[0]);                            \
  const int16_t tap1 = (int16_t)(pri_taps[1])

// Apply the constraint function to each of the four direction taps.
#define CONSTRAIN(p, strength, shift)                               \
  vint16m1_t p##_c0 =                                               \
      constrain16(p##0, px, (int16_t)strength, (int16_t)shift, vl); \
  vint16m1_t p##_c1 =                                               \
      constrain16(p##1, px, (int16_t)strength, (int16_t)shift, vl); \
  vint16m1_t p##_c2 =                                               \
      constrain16(p##2, px, (int16_t)strength, (int16_t)shift, vl); \
  vint16m1_t p##_c3 =                                               \
      constrain16(p##3, px, (int16_t)strength, (int16_t)shift, vl)

// Initialize the output clamp range to the center pixel.
#define SETUP_MINMAX   \
  vint16m1_t max = px; \
  vint16m1_t min = px

// Fold four taps into the clamp range; vmax_mask skips CDEF_VERY_LARGE
// (border sentinel) lanes so padding never widens the max.
#define MIN_MAX(p)                              \
  do {                                          \
    max = vmax_mask(p##0, max, vl);             \
    min = __riscv_vmin_vv_i16m1(p##0, min, vl); \
    max = vmax_mask(p##1, max, vl);             \
    min = __riscv_vmin_vv_i16m1(p##1, min, vl); \
    max = vmax_mask(p##2, max, vl);             \
    min = __riscv_vmin_vv_i16m1(p##2, min, vl); \
    max = vmax_mask(p##3, max, vl);             \
    min = __riscv_vmin_vv_i16m1(p##3, min, vl); \
  } while (0)

// Accumulate primary taps: the +/-o0 pair weighted by tap0, the +/-o1
// pair by tap1.
#define PRI_0_UPDATE_SUM(p)                                             \
  const vint16m1_t p##sum0 = __riscv_vadd_vv_i16m1(p##_c0, p##_c1, vl); \
  const vint16m1_t p##sum1 = __riscv_vadd_vv_i16m1(p##_c2, p##_c3, vl); \
  sum = __riscv_vmacc_vx_i16m1(sum, tap0, p##sum0, vl);                 \
  sum = __riscv_vmacc_vx_i16m1(sum, tap1, p##sum1, vl)

// Accumulate four taps with weight 1 (second secondary tap ring).
#define UPDATE_SUM(p)                                                   \
  const vint16m1_t p##sum0 = __riscv_vadd_vv_i16m1(p##_c0, p##_c1, vl); \
  const vint16m1_t p##sum1 = __riscv_vadd_vv_i16m1(p##_c2, p##_c3, vl); \
  sum = __riscv_vadd_vv_i16m1(sum, p##sum0, vl);                        \
  sum = __riscv_vadd_vv_i16m1(sum, p##sum1, vl)

// Accumulate four taps with weight 2 (first secondary tap ring; the
// shift-left-by-1 is the x2 weighting).
#define SEC_0_UPDATE_SUM(p)                                               \
  const vint16m1_t p##sum0 = __riscv_vadd_vv_i16m1(p##_c0, p##_c1, vl);   \
  const vint16m1_t p##sum1 = __riscv_vadd_vv_i16m1(p##_c2, p##_c3, vl);   \
  const vint16m1_t p##sum2 = __riscv_vadd_vv_i16m1(p##sum0, p##sum1, vl); \
  sum = __riscv_vadd_vv_i16m1(sum, __riscv_vsll_vx_i16m1(p##sum2, 1, vl), vl)

// Rounding step: unclamped = px + ((8 + sum - (sum < 0)) >> 4). The bias
// is 8 for non-negative sums and 7 for negative ones (round toward zero).
#define BIAS                                                                  \
  const vbool16_t mask = __riscv_vmslt_vx_i16m1_b16(sum, 0, vl);              \
  const vint16m1_t v_8 = __riscv_vmv_v_x_i16m1(8, vl);                        \
  const vint16m1_t bias = __riscv_vsub_vx_i16m1_tumu(mask, v_8, v_8, 1, vl);  \
  const vint16m1_t unclamped = __riscv_vadd_vv_i16m1(                         \
      px, __riscv_vsra_vx_i16m1(__riscv_vadd_vv_i16m1(bias, sum, vl), 4, vl), \
      vl)

// Store two 4-wide 8-bit rows and advance source/dest by two rows.
#define STORE4                                     \
  do {                                             \
    store_strided_u8_4x2(dst8, vdst, dstride, vl); \
                                                   \
    in += (CDEF_BSTRIDE << 1);                     \
    dst8 += (dstride << 1);                        \
  } while (0)

// 4-wide 8-bit store with the result clamped to the [min, max] range.
#define STORE4_CLAMPED                                       \
  do {                                                       \
    BIAS;                                                    \
    vint16m1_t clamped = __riscv_vmin_vv_i16m1(              \
        __riscv_vmax_vv_i16m1(unclamped, min, vl), max, vl); \
    vuint8mf2_t vdst = __riscv_vncvt_x_x_w_u8mf2(            \
        __riscv_vreinterpret_v_i16m1_u16m1(clamped), vl);    \
    STORE4;                                                  \
  } while (0)

// 4-wide 8-bit store without range clamping (single-pass filters).
#define STORE4_UNCLAMPED                                    \
  do {                                                      \
    BIAS;                                                   \
    vuint8mf2_t vdst = __riscv_vncvt_x_x_w_u8mf2(           \
        __riscv_vreinterpret_v_i16m1_u16m1(unclamped), vl); \
    STORE4;                                                 \
  } while (0)

// Store one 8-wide 8-bit row and advance source/dest by one row.
#define STORE8                            \
  do {                                    \
    __riscv_vse8_v_u8mf2(dst8, vdst, vl); \
                                          \
    in += CDEF_BSTRIDE;                   \
    dst8 += dstride;                      \
  } while (0)

// 8-wide 8-bit store with the result clamped to the [min, max] range.
#define STORE8_CLAMPED                                       \
  do {                                                       \
    BIAS;                                                    \
    vint16m1_t clamped = __riscv_vmin_vv_i16m1(              \
        __riscv_vmax_vv_i16m1(unclamped, min, vl), max, vl); \
    vuint8mf2_t vdst = __riscv_vncvt_x_x_w_u8mf2(            \
        __riscv_vreinterpret_v_i16m1_u16m1(clamped), vl);    \
    STORE8;                                                  \
  } while (0)

// 8-wide 8-bit store without range clamping.
#define STORE8_UNCLAMPED                                    \
  do {                                                      \
    BIAS;                                                   \
    vuint8mf2_t vdst = __riscv_vncvt_x_x_w_u8mf2(           \
        __riscv_vreinterpret_v_i16m1_u16m1(unclamped), vl); \
    STORE8;                                                 \
  } while (0)

// Store two 4-wide 16-bit rows and advance source/dest by two rows.
#define STORE16_4                                    \
  do {                                               \
    store_strided_u16_4x2(dst16, vdst, dstride, vl); \
                                                     \
    in += (CDEF_BSTRIDE << 1);                       \
    dst16 += (dstride << 1);                         \
  } while (0)

// 4-wide 16-bit store with the result clamped to the [min, max] range.
#define STORE16_4_CLAMPED                                           \
  do {                                                              \
    BIAS;                                                           \
    vint16m1_t clamped = __riscv_vmin_vv_i16m1(                     \
        __riscv_vmax_vv_i16m1(unclamped, min, vl), max, vl);        \
    vuint16m1_t vdst = __riscv_vreinterpret_v_i16m1_u16m1(clamped); \
    STORE16_4;                                                      \
  } while (0)

// 4-wide 16-bit store without range clamping.
#define STORE16_4_UNCLAMPED                                           \
  do {                                                                \
    BIAS;                                                             \
    vuint16m1_t vdst = __riscv_vreinterpret_v_i16m1_u16m1(unclamped); \
    STORE16_4;                                                        \
  } while (0)

// Store one 8-wide 16-bit row and advance source/dest by one row.
#define STORE16                             \
  do {                                      \
    __riscv_vse16_v_u16m1(dst16, vdst, vl); \
                                            \
    in += CDEF_BSTRIDE;                     \
    dst16 += dstride;                       \
  } while (0)

// 8-wide 16-bit store with the result clamped to the [min, max] range.
#define STORE16_CLAMPED                                             \
  do {                                                              \
    BIAS;                                                           \
    vint16m1_t clamped = __riscv_vmin_vv_i16m1(                     \
        __riscv_vmax_vv_i16m1(unclamped, min, vl), max, vl);        \
    vuint16m1_t vdst = __riscv_vreinterpret_v_i16m1_u16m1(clamped); \
    STORE16;                                                        \
  } while (0)

// 8-wide 16-bit store without range clamping.
#define STORE16_UNCLAMPED                                             \
  do {                                                                \
    BIAS;                                                             \
    vuint16m1_t vdst = __riscv_vreinterpret_v_i16m1_u16m1(unclamped); \
    STORE16;                                                          \
  } while (0)
    871 
// CDEF filter, 8-bit output, both primary and secondary strengths active.
// `in` is the padded 16-bit working buffer (stride CDEF_BSTRIDE); the
// result is clamped to the min/max of the center pixel and all taps.
// NOTE(review): the dir - 2 / dir + 2 table indexing relies on
// cdef_directions being a padded table (as declared in cdef_block.h).
void cdef_filter_8_0_rvv(void *dest, int dstride, const uint16_t *in,
                         int pri_strength, int sec_strength, int dir,
                         int pri_damping, int sec_damping, int coeff_shift,
                         int block_width, int block_height) {
  const int po1 = cdef_directions[dir][0];
  const int po2 = cdef_directions[dir][1];
  const int s1o1 = cdef_directions[dir + 2][0];
  const int s1o2 = cdef_directions[dir + 2][1];
  const int s2o1 = cdef_directions[dir - 2][0];
  const int s2o2 = cdef_directions[dir - 2][1];
  MAKE_TAPS;

  // Reduce damping so the constraint shift matches the strength magnitude.
  if (pri_strength) {
    pri_damping = AOMMAX(0, pri_damping - get_msb(pri_strength));
  }
  if (sec_strength) {
    sec_damping = AOMMAX(0, sec_damping - get_msb(sec_strength));
  }

  if (block_width == 8) {
    uint8_t *dst8 = (uint8_t *)dest;

    // One 8-wide row per iteration.
    int h = block_height;
    const size_t vl = block_width;
    do {
      LOAD_PIX(in);
      SETUP_MINMAX;

      // Primary pass
      LOAD_DIR(p, in, po1, po2);
      CONSTRAIN(p, pri_strength, pri_damping);
      MIN_MAX(p);
      PRI_0_UPDATE_SUM(p);

      // Secondary pass 1
      LOAD_DIR(s, in, s1o1, s2o1);
      CONSTRAIN(s, sec_strength, sec_damping);
      MIN_MAX(s);
      SEC_0_UPDATE_SUM(s);

      // Secondary pass 2
      LOAD_DIR(s2, in, s1o2, s2o2);
      CONSTRAIN(s2, sec_strength, sec_damping);
      MIN_MAX(s2);
      UPDATE_SUM(s2);

      // Store
      STORE8_CLAMPED;
    } while (--h != 0);
  } else {
    uint8_t *dst8 = (uint8_t *)dest;

    // 4-wide blocks: two rows packed per vector, so step height by 2.
    int h = block_height;
    const size_t vl = block_width << 1;
    do {
      LOAD_PIX4(in);
      SETUP_MINMAX;

      // Primary pass
      LOAD_DIR4(p, in, po1, po2);
      CONSTRAIN(p, pri_strength, pri_damping);
      MIN_MAX(p);
      PRI_0_UPDATE_SUM(p);

      // Secondary pass 1
      LOAD_DIR4(s, in, s1o1, s2o1);
      CONSTRAIN(s, sec_strength, sec_damping);
      MIN_MAX(s);
      SEC_0_UPDATE_SUM(s);

      // Secondary pass 2
      LOAD_DIR4(s2, in, s1o2, s2o2);
      CONSTRAIN(s2, sec_strength, sec_damping);
      MIN_MAX(s2);
      UPDATE_SUM(s2);

      // Store
      STORE4_CLAMPED;

      h -= 2;
    } while (h != 0);
  }
}
    955 
// CDEF filter, 8-bit output, primary strength only (sec_strength == 0).
// No min/max clamping is needed: with a single pass the sum stays within
// the clamp range, so the unclamped store is used.
void cdef_filter_8_1_rvv(void *dest, int dstride, const uint16_t *in,
                         int pri_strength, int sec_strength, int dir,
                         int pri_damping, int sec_damping, int coeff_shift,
                         int block_width, int block_height) {
  (void)sec_strength;
  (void)sec_damping;

  const int po1 = cdef_directions[dir][0];
  const int po2 = cdef_directions[dir][1];
  MAKE_TAPS;

  // Reduce damping so the constraint shift matches the strength magnitude.
  if (pri_strength) {
    pri_damping = AOMMAX(0, pri_damping - get_msb(pri_strength));
  }

  if (block_width == 8) {
    uint8_t *dst8 = (uint8_t *)dest;

    // One 8-wide row per iteration.
    int h = block_height;
    const size_t vl = block_width;
    do {
      LOAD_PIX(in);

      // Primary pass
      LOAD_DIR(p, in, po1, po2);
      CONSTRAIN(p, pri_strength, pri_damping);
      PRI_0_UPDATE_SUM(p);

      // Store
      STORE8_UNCLAMPED;
    } while (--h != 0);
  } else {
    uint8_t *dst8 = (uint8_t *)dest;

    // 4-wide blocks: two rows packed per vector, so step height by 2.
    int h = block_height;
    const size_t vl = block_width << 1;
    do {
      LOAD_PIX4(in);

      // Primary pass
      LOAD_DIR4(p, in, po1, po2);
      CONSTRAIN(p, pri_strength, pri_damping);
      PRI_0_UPDATE_SUM(p);

      // Store
      STORE4_UNCLAMPED;

      h -= 2;
    } while (h != 0);
  }
}
   1007 
// CDEF filter, 8-bit output, secondary strength only (pri_strength == 0).
// Two secondary tap rings are accumulated (weight 2 then weight 1) and
// the result is stored without clamping.
void cdef_filter_8_2_rvv(void *dest, int dstride, const uint16_t *in,
                         int pri_strength, int sec_strength, int dir,
                         int pri_damping, int sec_damping, int coeff_shift,
                         int block_width, int block_height) {
  (void)pri_strength;
  (void)pri_damping;
  (void)coeff_shift;

  const int s1o1 = cdef_directions[dir + 2][0];
  const int s1o2 = cdef_directions[dir + 2][1];
  const int s2o1 = cdef_directions[dir - 2][0];
  const int s2o2 = cdef_directions[dir - 2][1];

  // Reduce damping so the constraint shift matches the strength magnitude.
  if (sec_strength) {
    sec_damping = AOMMAX(0, sec_damping - get_msb(sec_strength));
  }

  if (block_width == 8) {
    uint8_t *dst8 = (uint8_t *)dest;

    // One 8-wide row per iteration.
    int h = block_height;
    const size_t vl = block_width;
    do {
      LOAD_PIX(in);

      // Secondary pass 1
      LOAD_DIR(s, in, s1o1, s2o1);
      CONSTRAIN(s, sec_strength, sec_damping);
      SEC_0_UPDATE_SUM(s);

      // Secondary pass 2
      LOAD_DIR(s2, in, s1o2, s2o2);
      CONSTRAIN(s2, sec_strength, sec_damping);
      UPDATE_SUM(s2);

      // Store
      STORE8_UNCLAMPED;
    } while (--h != 0);
  } else {
    uint8_t *dst8 = (uint8_t *)dest;

    // 4-wide blocks: two rows packed per vector, so step height by 2.
    int h = block_height;
    const size_t vl = block_width << 1;
    do {
      LOAD_PIX4(in);

      // Secondary pass 1
      LOAD_DIR4(s, in, s1o1, s2o1);
      CONSTRAIN(s, sec_strength, sec_damping);
      SEC_0_UPDATE_SUM(s);

      // Secondary pass 2
      LOAD_DIR4(s2, in, s1o2, s2o2);
      CONSTRAIN(s2, sec_strength, sec_damping);
      UPDATE_SUM(s2);

      // Store
      STORE4_UNCLAMPED;

      h -= 2;
    } while (h != 0);
  }
}
   1071 
   1072 void cdef_filter_8_3_rvv(void *dest, int dstride, const uint16_t *in,
   1073                         int pri_strength, int sec_strength, int dir,
   1074                         int pri_damping, int sec_damping, int coeff_shift,
   1075                         int block_width, int block_height) {
   1076  (void)pri_strength;
   1077  (void)sec_strength;
   1078  (void)dir;
   1079  (void)pri_damping;
   1080  (void)sec_damping;
   1081  (void)coeff_shift;
   1082 
   1083  if (block_width == 8) {
   1084    uint8_t *dst8 = (uint8_t *)dest;
   1085 
   1086    int h = block_height;
   1087    const size_t vl = block_width;
   1088    do {
   1089      const vuint16m1_t px = __riscv_vle16_v_u16m1(in, vl);
   1090      const vuint8mf2_t vdst = __riscv_vncvt_x_x_w_u8mf2(px, vl);
   1091      __riscv_vse8_v_u8mf2(dst8, vdst, vl);
   1092 
   1093      in += CDEF_BSTRIDE;
   1094      dst8 += dstride;
   1095    } while (--h != 0);
   1096  } else {
   1097    uint8_t *dst8 = (uint8_t *)dest;
   1098 
   1099    int h = block_height;
   1100    const size_t vl = block_width << 1;
   1101    do {
   1102      const vint16m1_t px =
   1103          load_strided_i16_4x2((int16_t *)in, CDEF_BSTRIDE, vl);
   1104      vuint8mf2_t vdst =
   1105          __riscv_vncvt_x_x_w_u8mf2(__riscv_vreinterpret_v_i16m1_u16m1(px), vl);
   1106      store_strided_u8_4x2(dst8, vdst, dstride, vl);
   1107 
   1108      in += 2 * CDEF_BSTRIDE;
   1109      dst8 += 2 * dstride;
   1110      h -= 2;
   1111    } while (h != 0);
   1112  }
   1113 }
   1114 
// CDEF filter, 16-bit (high-bit-depth) output, both primary and secondary
// strengths active. Mirrors cdef_filter_8_0_rvv but stores 16-bit pixels.
// NOTE(review): the dir - 2 / dir + 2 table indexing relies on
// cdef_directions being a padded table (as declared in cdef_block.h).
void cdef_filter_16_0_rvv(void *dest, int dstride, const uint16_t *in,
                          int pri_strength, int sec_strength, int dir,
                          int pri_damping, int sec_damping, int coeff_shift,
                          int block_width, int block_height) {
  const int po1 = cdef_directions[dir][0];
  const int po2 = cdef_directions[dir][1];
  const int s1o1 = cdef_directions[dir + 2][0];
  const int s1o2 = cdef_directions[dir + 2][1];
  const int s2o1 = cdef_directions[dir - 2][0];
  const int s2o2 = cdef_directions[dir - 2][1];
  MAKE_TAPS;

  // Reduce damping so the constraint shift matches the strength magnitude.
  if (pri_strength) {
    pri_damping = AOMMAX(0, pri_damping - get_msb(pri_strength));
  }
  if (sec_strength) {
    sec_damping = AOMMAX(0, sec_damping - get_msb(sec_strength));
  }

  if (block_width == 8) {
    uint16_t *dst16 = (uint16_t *)dest;

    // One 8-wide row per iteration.
    int h = block_height;
    const size_t vl = block_width;
    do {
      LOAD_PIX(in);
      SETUP_MINMAX;

      // Primary pass
      LOAD_DIR(p, in, po1, po2);
      CONSTRAIN(p, pri_strength, pri_damping);
      MIN_MAX(p);
      PRI_0_UPDATE_SUM(p);

      // Secondary pass 1
      LOAD_DIR(s, in, s1o1, s2o1);
      CONSTRAIN(s, sec_strength, sec_damping);
      MIN_MAX(s);
      SEC_0_UPDATE_SUM(s);

      // Secondary pass 2
      LOAD_DIR(s2, in, s1o2, s2o2);
      CONSTRAIN(s2, sec_strength, sec_damping);
      MIN_MAX(s2);
      UPDATE_SUM(s2);

      // Store
      STORE16_CLAMPED;
    } while (--h != 0);
  } else {
    uint16_t *dst16 = (uint16_t *)dest;

    // 4-wide blocks: two rows packed per vector, so step height by 2.
    int h = block_height;
    const size_t vl = block_width << 1;
    do {
      LOAD_PIX4(in);
      SETUP_MINMAX;

      // Primary pass
      LOAD_DIR4(p, in, po1, po2);
      CONSTRAIN(p, pri_strength, pri_damping);
      MIN_MAX(p);
      PRI_0_UPDATE_SUM(p);

      // Secondary pass 1
      LOAD_DIR4(s, in, s1o1, s2o1);
      CONSTRAIN(s, sec_strength, sec_damping);
      MIN_MAX(s);
      SEC_0_UPDATE_SUM(s);

      // Secondary pass 2
      LOAD_DIR4(s2, in, s1o2, s2o2);
      CONSTRAIN(s2, sec_strength, sec_damping);
      MIN_MAX(s2);
      UPDATE_SUM(s2);

      // Store
      STORE16_4_CLAMPED;

      h -= 2;
    } while (h != 0);
  }
}
   1198 
// CDEF filter, 16-bit output, primary strength only (sec_strength == 0).
// Single-pass, so the unclamped store is used.
void cdef_filter_16_1_rvv(void *dest, int dstride, const uint16_t *in,
                          int pri_strength, int sec_strength, int dir,
                          int pri_damping, int sec_damping, int coeff_shift,
                          int block_width, int block_height) {
  (void)sec_strength;
  (void)sec_damping;

  const int po1 = cdef_directions[dir][0];
  const int po2 = cdef_directions[dir][1];
  MAKE_TAPS;

  // Reduce damping so the constraint shift matches the strength magnitude.
  if (pri_strength) {
    pri_damping = AOMMAX(0, pri_damping - get_msb(pri_strength));
  }

  if (block_width == 8) {
    uint16_t *dst16 = (uint16_t *)dest;

    // One 8-wide row per iteration.
    int h = block_height;
    const size_t vl = block_width;
    do {
      LOAD_PIX(in);

      // Primary pass
      LOAD_DIR(p, in, po1, po2);
      CONSTRAIN(p, pri_strength, pri_damping);
      PRI_0_UPDATE_SUM(p);

      // Store
      STORE16_UNCLAMPED;
    } while (--h != 0);
  } else {
    uint16_t *dst16 = (uint16_t *)dest;

    // 4-wide blocks: two rows packed per vector, so step height by 2.
    int h = block_height;
    const size_t vl = block_width << 1;
    do {
      LOAD_PIX4(in);

      // Primary pass
      LOAD_DIR4(p, in, po1, po2);
      CONSTRAIN(p, pri_strength, pri_damping);
      PRI_0_UPDATE_SUM(p);

      // Store
      STORE16_4_UNCLAMPED;

      h -= 2;
    } while (h != 0);
  }
}
   1250 
// CDEF filter, 16-bit output, secondary strength only (pri_strength == 0).
// Two secondary tap rings are accumulated (weight 2 then weight 1) and
// the result is stored without clamping.
void cdef_filter_16_2_rvv(void *dest, int dstride, const uint16_t *in,
                          int pri_strength, int sec_strength, int dir,
                          int pri_damping, int sec_damping, int coeff_shift,
                          int block_width, int block_height) {
  (void)pri_strength;
  (void)pri_damping;
  (void)coeff_shift;

  const int s1o1 = cdef_directions[dir + 2][0];
  const int s1o2 = cdef_directions[dir + 2][1];
  const int s2o1 = cdef_directions[dir - 2][0];
  const int s2o2 = cdef_directions[dir - 2][1];

  // Reduce damping so the constraint shift matches the strength magnitude.
  if (sec_strength) {
    sec_damping = AOMMAX(0, sec_damping - get_msb(sec_strength));
  }

  if (block_width == 8) {
    uint16_t *dst16 = (uint16_t *)dest;

    // One 8-wide row per iteration.
    int h = block_height;
    const size_t vl = block_width;
    do {
      LOAD_PIX(in);

      // Secondary pass 1
      LOAD_DIR(s, in, s1o1, s2o1);
      CONSTRAIN(s, sec_strength, sec_damping);
      SEC_0_UPDATE_SUM(s);

      // Secondary pass 2
      LOAD_DIR(s2, in, s1o2, s2o2);
      CONSTRAIN(s2, sec_strength, sec_damping);
      UPDATE_SUM(s2);

      // Store
      STORE16_UNCLAMPED;
    } while (--h != 0);
  } else {
    uint16_t *dst16 = (uint16_t *)dest;

    // 4-wide blocks: two rows packed per vector, so step height by 2.
    int h = block_height;
    const size_t vl = block_width << 1;
    do {
      LOAD_PIX4(in);

      // Secondary pass 1
      LOAD_DIR4(s, in, s1o1, s2o1);
      CONSTRAIN(s, sec_strength, sec_damping);
      SEC_0_UPDATE_SUM(s);

      // Secondary pass 2
      LOAD_DIR4(s2, in, s1o2, s2o2);
      CONSTRAIN(s2, sec_strength, sec_damping);
      UPDATE_SUM(s2);

      // Store
      STORE16_4_UNCLAMPED;

      h -= 2;
    } while (h != 0);
  }
}
   1314 
   1315 void cdef_filter_16_3_rvv(void *dest, int dstride, const uint16_t *in,
   1316                          int pri_strength, int sec_strength, int dir,
   1317                          int pri_damping, int sec_damping, int coeff_shift,
   1318                          int block_width, int block_height) {
   1319  (void)pri_strength;
   1320  (void)sec_strength;
   1321  (void)dir;
   1322  (void)pri_damping;
   1323  (void)sec_damping;
   1324  (void)coeff_shift;
   1325 
   1326  if (block_width == 8) {
   1327    uint16_t *dst16 = (uint16_t *)dest;
   1328 
   1329    int h = block_height;
   1330    const size_t vl = block_width;
   1331    do {
   1332      const vuint16m1_t px = __riscv_vle16_v_u16m1(in, vl);
   1333      __riscv_vse16_v_u16m1(dst16, px, vl);
   1334 
   1335      in += CDEF_BSTRIDE;
   1336      dst16 += dstride;
   1337    } while (--h != 0);
   1338  } else {
   1339    uint16_t *dst16 = (uint16_t *)dest;
   1340 
   1341    int h = block_height;
   1342    const size_t vl = block_width << 1;
   1343    do {
   1344      const vint16m1_t px =
   1345          load_strided_i16_4x2((int16_t *)in, CDEF_BSTRIDE, vl);
   1346      vuint16m1_t vdst = __riscv_vreinterpret_v_i16m1_u16m1(px);
   1347      store_strided_u16_4x2(dst16, vdst, dstride, vl);
   1348 
   1349      in += 2 * CDEF_BSTRIDE;
   1350      dst16 += 2 * dstride;
   1351      h -= 2;
   1352    } while (h != 0);
   1353  }
   1354 }