cdef_block_rvv.c (51725B)
/*
 * Copyright (c) 2025, Alliance for Open Media. All rights reserved.
 *
 * This source code is subject to the terms of the BSD 2 Clause License and
 * the Alliance for Open Media Patent License 1.0. If the BSD 2 Clause License
 * was not distributed with this source code in the LICENSE file, you can
 * obtain it at www.aomedia.org/license/software. If the Alliance for Open
 * Media Patent License 1.0 was not distributed with this source code in the
 * PATENTS file, you can obtain it at www.aomedia.org/license/patent.
 */

#include <riscv_vector.h>

#include "config/aom_config.h"
#include "config/av1_rtcd.h"
#include "av1/common/cdef_block.h"

// partial A is a 16-bit vector of the form:
// [x8 x7 x6 x5 x4 x3 x2 x1] and partial B has the form:
// [0 y1 y2 y3 y4 y5 y6 y7].
// This function computes (x1^2+y1^2)*C1 + (x2^2+y2^2)*C2 + ...
// (x7^2+y7^2)*C7 + (x8^2+0^2)*C8 where the C1..C8 constants are in const1
// and const2.
static inline vuint32m1_t fold_mul_and_sum_rvv(vint16m1_t partiala,
                                               vint16m1_t partialb,
                                               vuint32m1_t const1,
                                               vuint32m1_t const2) {
  // Square and add the corresponding x and y values, widening i16 -> i32 so
  // the squares and their sum cannot overflow.
  vint32m2_t cost = __riscv_vwmul_vv_i32m2(partiala, partiala, 8);
  cost = __riscv_vwmacc_vv_i32m2(cost, partialb, partialb, 8);

  // Multiply by constant: the low four lanes are scaled by const1, the high
  // four lanes (slid down into the low positions) by const2, and both are
  // accumulated into a single 4-lane u32 result.
  vuint32m2_t tmp1_u32m2 = __riscv_vreinterpret_v_i32m2_u32m2(cost);
  vuint32m1_t cost_u32m1 = __riscv_vmul_vv_u32m1(
      __riscv_vlmul_trunc_v_u32m2_u32m1(tmp1_u32m2), const1, 4);
  tmp1_u32m2 = __riscv_vslidedown_vx_u32m2(tmp1_u32m2, 4, 8);
  vuint32m1_t ret = __riscv_vmacc_vv_u32m1(
      cost_u32m1, __riscv_vlmul_trunc_v_u32m2_u32m1(tmp1_u32m2), const2, 4);
  return ret;
}

// This function computes the cost along directions 4, 5, 6, 7. (4 is diagonal
// down-right, 6 is vertical).
//
// For each direction the lines are shifted so that we can perform a
// basic sum on each vector element.
// For example, direction 5 is "south by
// southeast", so we need to add the pixels along each line i below:
//
// 0 1 2 3 4 5 6 7
// 0 1 2 3 4 5 6 7
// 8 0 1 2 3 4 5 6
// 8 0 1 2 3 4 5 6
// 9 8 0 1 2 3 4 5
// 9 8 0 1 2 3 4 5
// 10 9 8 0 1 2 3 4
// 10 9 8 0 1 2 3 4
//
// For this to fit nicely in vectors, the lines need to be shifted like so:
//       0 1 2 3 4 5 6 7
//       0 1 2 3 4 5 6 7
//     8 0 1 2 3 4 5 6
//     8 0 1 2 3 4 5 6
//   9 8 0 1 2 3 4 5
//   9 8 0 1 2 3 4 5
// 10 9 8 0 1 2 3 4
// 10 9 8 0 1 2 3 4
//
// In this configuration we can now perform SIMD additions to get the cost
// along direction 5. Since this won't fit into a single 128-bit vector, we use
// two of them to compute each half of the new configuration, and pad the empty
// spaces with zeros. Similar shifting is done for other directions, except
// direction 6 which is straightforward as it's the vertical direction.
//
// partialXa accumulates the positions of direction X that fall in the first
// (left) vector, partialXb those that fall in the second (right) vector.
// Directions 5 and 7 advance by one pixel every two rows, so rows are summed
// in pairs (tmp1_i16m1) before shifting.
static vuint32m1_t compute_vert_directions_rvv(
    vint16m1_t lines_0, vint16m1_t lines_1, vint16m1_t lines_2,
    vint16m1_t lines_3, vint16m1_t lines_4, vint16m1_t lines_5,
    vint16m1_t lines_6, vint16m1_t lines_7, uint32_t cost[4], size_t vl) {
  // NOTE(review): vsetvl_e16m1(16) requests up to 16 elements so that, when
  // VLEN > 128, vslidedown shifts in the zero-initialized tail lanes (the
  // loads in cdef_find_dir_rvv zero lanes 8..15 via the tail-undisturbed
  // forms); with VLEN == 128 it clamps to 8.
  size_t VL_SLIDE_DOWN = __riscv_vsetvl_e16m1(16);
  vint16m1_t vec_zero_i16m1 = __riscv_vmv_v_x_i16m1(0, vl);

  // Partial sums for lines 0 and 1.
  vint16m1_t partial4a =
      __riscv_vslideup_vx_i16m1(vec_zero_i16m1, lines_0, (8 - 1), vl);
  vint16m1_t tmp1_i16m1 =
      __riscv_vslideup_vx_i16m1(vec_zero_i16m1, lines_1, (8 - 2), vl);
  partial4a = __riscv_vadd_vv_i16m1(partial4a, tmp1_i16m1, vl);
  vint16m1_t partial4b = __riscv_vslide1down_vx_i16m1(lines_0, 0, vl);
  tmp1_i16m1 = __riscv_vslidedown_vx_i16m1(lines_1, 2, VL_SLIDE_DOWN);
  partial4b = __riscv_vadd_vv_i16m1(partial4b, tmp1_i16m1, vl);
  tmp1_i16m1 = __riscv_vadd_vv_i16m1(lines_0, lines_1, VL_SLIDE_DOWN);
  vint16m1_t partial5a =
      __riscv_vslideup_vx_i16m1(vec_zero_i16m1, tmp1_i16m1, (8 - 3), vl);
  vint16m1_t partial5b =
      __riscv_vslidedown_vx_i16m1(tmp1_i16m1, 3, VL_SLIDE_DOWN);
  vint16m1_t partial7a =
      __riscv_vslideup_vx_i16m1(vec_zero_i16m1, tmp1_i16m1, (8 - 6), vl);
  vint16m1_t partial7b =
      __riscv_vslidedown_vx_i16m1(tmp1_i16m1, 6, VL_SLIDE_DOWN);
  // Direction 6 (vertical) needs no shifting: just sum the rows.
  vint16m1_t partial6 = __riscv_vmv_v_v_i16m1(tmp1_i16m1, vl);

  // Partial sums for lines 2 and 3.
  tmp1_i16m1 = __riscv_vslideup_vx_i16m1(vec_zero_i16m1, lines_2, (8 - 3), vl);
  partial4a = __riscv_vadd_vv_i16m1(partial4a, tmp1_i16m1, vl);
  tmp1_i16m1 = __riscv_vslideup_vx_i16m1(vec_zero_i16m1, lines_3, (8 - 4), vl);
  partial4a = __riscv_vadd_vv_i16m1(partial4a, tmp1_i16m1, vl);
  tmp1_i16m1 = __riscv_vslidedown_vx_i16m1(lines_2, 3, VL_SLIDE_DOWN);
  partial4b = __riscv_vadd_vv_i16m1(partial4b, tmp1_i16m1, vl);
  tmp1_i16m1 = __riscv_vslidedown_vx_i16m1(lines_3, 4, VL_SLIDE_DOWN);
  partial4b = __riscv_vadd_vv_i16m1(partial4b, tmp1_i16m1, vl);
  tmp1_i16m1 = __riscv_vadd_vv_i16m1(lines_2, lines_3, VL_SLIDE_DOWN);
  partial5a = __riscv_vadd_vv_i16m1(
      partial5a,
      __riscv_vslideup_vx_i16m1(vec_zero_i16m1, tmp1_i16m1, (8 - 4), vl), vl);
  partial5b = __riscv_vadd_vv_i16m1(
      partial5b, __riscv_vslidedown_vx_i16m1(tmp1_i16m1, 4, VL_SLIDE_DOWN), vl);
  partial7a = __riscv_vadd_vv_i16m1(
      partial7a,
      __riscv_vslideup_vx_i16m1(vec_zero_i16m1, tmp1_i16m1, (8 - 5), vl), vl);
  partial7b = __riscv_vadd_vv_i16m1(
      partial7b, __riscv_vslidedown_vx_i16m1(tmp1_i16m1, 5, VL_SLIDE_DOWN), vl);
  partial6 = __riscv_vadd_vv_i16m1(partial6, tmp1_i16m1, vl);

  // Partial sums for lines 4 and 5.
  partial4a = __riscv_vadd_vv_i16m1(
      partial4a,
      __riscv_vslideup_vx_i16m1(vec_zero_i16m1, lines_4, (8 - 5), vl), vl);
  partial4a = __riscv_vadd_vv_i16m1(
      partial4a,
      __riscv_vslideup_vx_i16m1(vec_zero_i16m1, lines_5, (8 - 6), vl), vl);
  partial4b = __riscv_vadd_vv_i16m1(
      partial4b, __riscv_vslidedown_vx_i16m1(lines_4, 5, VL_SLIDE_DOWN), vl);
  partial4b = __riscv_vadd_vv_i16m1(
      partial4b, __riscv_vslidedown_vx_i16m1(lines_5, 6, VL_SLIDE_DOWN), vl);
  tmp1_i16m1 = __riscv_vadd_vv_i16m1(lines_4, lines_5, VL_SLIDE_DOWN);
  partial5a = __riscv_vadd_vv_i16m1(
      partial5a,
      __riscv_vslideup_vx_i16m1(vec_zero_i16m1, tmp1_i16m1, (8 - 5), vl), vl);
  partial5b = __riscv_vadd_vv_i16m1(
      partial5b, __riscv_vslidedown_vx_i16m1(tmp1_i16m1, 5, VL_SLIDE_DOWN), vl);
  partial7a = __riscv_vadd_vv_i16m1(
      partial7a,
      __riscv_vslideup_vx_i16m1(vec_zero_i16m1, tmp1_i16m1, (8 - 4), vl), vl);
  partial7b = __riscv_vadd_vv_i16m1(
      partial7b, __riscv_vslidedown_vx_i16m1(tmp1_i16m1, 4, VL_SLIDE_DOWN), vl);
  partial6 = __riscv_vadd_vv_i16m1(partial6, tmp1_i16m1, vl);

  // Partial sums for lines 6 and 7.
  partial4a = __riscv_vadd_vv_i16m1(
      partial4a,
      __riscv_vslideup_vx_i16m1(vec_zero_i16m1, lines_6, (8 - 7), vl), vl);
  partial4a = __riscv_vadd_vv_i16m1(partial4a, lines_7, vl);
  partial4b = __riscv_vadd_vv_i16m1(
      partial4b, __riscv_vslidedown_vx_i16m1(lines_6, 7, VL_SLIDE_DOWN), vl);
  tmp1_i16m1 = __riscv_vadd_vv_i16m1(lines_6, lines_7, VL_SLIDE_DOWN);
  partial5a = __riscv_vadd_vv_i16m1(
      partial5a,
      __riscv_vslideup_vx_i16m1(vec_zero_i16m1, tmp1_i16m1, (8 - 6), vl), vl);
  partial5b = __riscv_vadd_vv_i16m1(
      partial5b, __riscv_vslidedown_vx_i16m1(tmp1_i16m1, 6, VL_SLIDE_DOWN), vl);
  partial7a = __riscv_vadd_vv_i16m1(
      partial7a,
      __riscv_vslideup_vx_i16m1(vec_zero_i16m1, tmp1_i16m1, (8 - 3), vl), vl);
  partial7b = __riscv_vadd_vv_i16m1(
      partial7b, __riscv_vslidedown_vx_i16m1(tmp1_i16m1, 3, VL_SLIDE_DOWN), vl);
  partial6 = __riscv_vadd_vv_i16m1(partial6, tmp1_i16m1, vl);

  // Normalization constants (built lane by lane with slide1up/slide1down).
  // const0 = { 840, 420, 280, 210, }
  vuint32m1_t const0 = __riscv_vmv_s_x_u32m1(210, 4);
  const0 = __riscv_vslide1up_vx_u32m1(const0, 280, 4);
  const0 = __riscv_vslide1up_vx_u32m1(const0, 420, 4);
  const0 = __riscv_vslide1up_vx_u32m1(const0, 840, 4);

  // const1 = { 168, 140, 120, 105, }
  vuint32m1_t const1 = __riscv_vmv_s_x_u32m1(105, 4);
  const1 = __riscv_vslide1up_vx_u32m1(const1, 120, 4);
  const1 = __riscv_vslide1up_vx_u32m1(const1, 140, 4);
  const1 = __riscv_vslide1up_vx_u32m1(const1, 168, 4);

  // const2 = { 0, 0, 420, 210, }
  vuint32m1_t const2 = __riscv_vmv_v_x_u32m1(0, 4);
  const2 = __riscv_vslide1down_vx_u32m1(const2, 420, 4);
  const2 = __riscv_vslide1down_vx_u32m1(const2, 210, 4);

  // const3 = { 140, 105, 105, 105, };
  vuint32m1_t const3 = __riscv_vmv_v_x_u32m1(105, 4);
  const3 = __riscv_vslide1up_vx_u32m1(const3, 140, 4);

  // Compute costs in terms of partial sums.
  // Direction 6: square the per-column sums and add the two 4-lane halves.
  vint32m2_t tmp1_i32m2 = __riscv_vwmul_vv_i32m2(partial6, partial6, vl);
  vint32m2_t partial6_s32 = __riscv_vslidedown_vx_i32m2(tmp1_i32m2, 4, vl);
  partial6_s32 = __riscv_vadd_vv_i32m2(partial6_s32, tmp1_i32m2, 4);

  // Reverse partial B so its lanes pair up with partial A in
  // fold_mul_and_sum_rvv.
  // pattern = { 6, 5, 4, 3, 2, 1, 0, 7, }.
  vuint32m1_t costs_0, costs_1, costs_2, costs_3;
  static const uint16_t tab_u16[8] = {
    6, 5, 4, 3, 2, 1, 0, 7,
  };
  vuint16m1_t index_u16m1 = __riscv_vle16_v_u16m1(tab_u16, 8);
  vint16m1_t partial4b_rv =
      __riscv_vrgather_vv_i16m1(partial4b, index_u16m1, 8);
  costs_0 = fold_mul_and_sum_rvv(partial4a, partial4b_rv, const0, const1);
  vuint32m1_t partial6_u32 = __riscv_vreinterpret_v_i32m1_u32m1(
      __riscv_vlmul_trunc_v_i32m2_i32m1(partial6_s32));
  costs_2 = __riscv_vmul_vx_u32m1(partial6_u32, 105, 4);
  vint16m1_t partial5b_rv =
      __riscv_vrgather_vv_i16m1(partial5b, index_u16m1, 8);
  costs_1 = fold_mul_and_sum_rvv(partial5a, partial5b_rv, const2, const3);
  vint16m1_t partial7b_rv =
      __riscv_vrgather_vv_i16m1(partial7b, index_u16m1, 8);
  costs_3 = fold_mul_and_sum_rvv(partial7a, partial7b_rv, const2, const3);

  // combine values: reduce each 4-lane cost vector to a scalar, pack the four
  // direction costs into one vector, and also store them to cost[0..3].
  vuint32m1_t vec_scalar_u32m1 = __riscv_vmv_s_x_u32m1(0, 1);
  vuint32m1_t cost0_sum =
      __riscv_vredsum_vs_u32m1_u32m1(costs_0, vec_scalar_u32m1, 4);
  vuint32m1_t cost1_sum =
      __riscv_vredsum_vs_u32m1_u32m1(costs_1, vec_scalar_u32m1, 4);
  vuint32m1_t cost2_sum =
      __riscv_vredsum_vs_u32m1_u32m1(costs_2, vec_scalar_u32m1, 4);
  vuint32m1_t cost3_sum =
      __riscv_vredsum_vs_u32m1_u32m1(costs_3, vec_scalar_u32m1, 4);

  vuint32m1_t cost47 = __riscv_vslideup_vx_u32m1(cost0_sum, cost1_sum, 1, 4);
  cost47 = __riscv_vslideup_vx_u32m1(cost47, cost2_sum, 2, 4);
  cost47 = __riscv_vslideup_vx_u32m1(cost47, cost3_sum, 3, 4);
  __riscv_vse32_v_u32m1(&cost[0], cost47, 4);
  return cost47;
}

// Variant of fold_mul_and_sum_rvv for the directions whose shifted lines span
// three vectors (1 and 3): adjacent lane pairs of each partial are summed
// before squaring.
static inline vuint32m1_t fold_mul_and_sum_pairwise_rvv(vint16m1_t
                                                            partiala,
                                                        vint16m1_t partialb,
                                                        vint16m1_t partialc,
                                                        vuint32m1_t const0) {
  // index = { 0, 2, 4, 6 }: selects the four non-overlapping pair sums out of
  // the sliding pairwise-sum vectors built below.
  vuint16m1_t vid_u16m1 = __riscv_vid_v_u16m1(4);
  vuint16m1_t index_u16m1 = __riscv_vsll_vx_u16m1(vid_u16m1, 1, 4);
  // Pairwise-add adjacent lanes (lane i + lane i+1) of each partial, widening
  // to i32.
  vint16m1_t tmp_i16m1 = __riscv_vslide1down_vx_i16m1(partiala, 0, 8);
  vint32m2_t partiala_i32m2 = __riscv_vwadd_vv_i32m2(partiala, tmp_i16m1, 8);
  tmp_i16m1 = __riscv_vslide1down_vx_i16m1(partialb, 0, 8);
  vint32m2_t partialb_i32m2 = __riscv_vwadd_vv_i32m2(partialb, tmp_i16m1, 8);

  tmp_i16m1 = __riscv_vslide1down_vx_i16m1(partialc, 0, 8);
  vint32m2_t partialc_i32m2 = __riscv_vwadd_vv_i32m2(partialc, tmp_i16m1, 8);
  // Square the pairwise sums; combine A and C, keep B separate so it can be
  // scaled by the flat 105 weight below.
  partiala_i32m2 = __riscv_vmul_vv_i32m2(partiala_i32m2, partiala_i32m2, 8);
  partialb_i32m2 = __riscv_vmul_vv_i32m2(partialb_i32m2, partialb_i32m2, 8);
  vint32m1_t partialb_i32m1 = __riscv_vlmul_trunc_v_i32m2_i32m1(
      __riscv_vrgatherei16_vv_i32m2(partialb_i32m2, index_u16m1, 4));
  partialc_i32m2 = __riscv_vmul_vv_i32m2(partialc_i32m2, partialc_i32m2, 8);
  partiala_i32m2 = __riscv_vadd_vv_i32m2(partiala_i32m2, partialc_i32m2, 8);
  vint32m1_t partiala_i32m1 = __riscv_vlmul_trunc_v_i32m2_i32m1(
      __riscv_vrgatherei16_vv_i32m2(partiala_i32m2, index_u16m1, 4));

  // cost = 105 * B + const0 * (A + C), per 4-lane group.
  vuint32m1_t cost = __riscv_vmul_vx_u32m1(
      __riscv_vreinterpret_v_i32m1_u32m1(partialb_i32m1), 105, 4);
  cost = __riscv_vmacc_vv_u32m1(
      cost, __riscv_vreinterpret_v_i32m1_u32m1(partiala_i32m1), const0, 4);
  return cost;
}

// Reduces each of the four 8-lane i16 vectors to its scalar sum and packs the
// four sums into lanes 0..3 of an i32 vector.
static inline vint32m1_t horizontal_add_4d_s16x8(vint16m1_t lines_0,
                                                 vint16m1_t lines_1,
                                                 vint16m1_t lines_2,
                                                 vint16m1_t lines_3) {
  vint32m1_t vec_scalar_i32m1 = __riscv_vmv_s_x_i32m1(0, 1);
  vint32m1_t lines0_sum =
      __riscv_vwredsum_vs_i16m1_i32m1(lines_0, vec_scalar_i32m1, 8);
  vint32m1_t lines1_sum =
      __riscv_vwredsum_vs_i16m1_i32m1(lines_1, vec_scalar_i32m1, 8);
  vint32m1_t lines2_sum =
      __riscv_vwredsum_vs_i16m1_i32m1(lines_2, vec_scalar_i32m1, 8);
  vint32m1_t lines3_sum =
      __riscv_vwredsum_vs_i16m1_i32m1(lines_3, vec_scalar_i32m1, 8);

  vint32m1_t ret = __riscv_vslideup_vx_i32m1(lines0_sum, lines1_sum, 1, 4);
  ret = __riscv_vslideup_vx_i32m1(ret, lines2_sum, 2, 4);
  ret = __riscv_vslideup_vx_i32m1(ret, lines3_sum, 3, 4);
  return ret;
}

// This function computes the cost along directions 0, 1, 2, 3. (0 means
// 45-degree up-right, 2 is horizontal).
//
// For direction 1 and 3 ("east northeast" and "east southeast") the shifted
// lines need three vectors instead of two. For direction 1 for example, we need
// to compute the sums along the line i below:
// 0 0 1 1 2 2 3 3
// 1 1 2 2 3 3 4 4
// 2 2 3 3 4 4 5 5
// 3 3 4 4 5 5 6 6
// 4 4 5 5 6 6 7 7
// 5 5 6 6 7 7 8 8
// 6 6 7 7 8 8 9 9
// 7 7 8 8 9 9 10 10
//
// Which means we need the following configuration:
// 0 0 1 1 2 2 3 3
//     1 1 2 2 3 3 4 4
//         2 2 3 3 4 4 5 5
//             3 3 4 4 5 5 6 6
//                 4 4 5 5 6 6 7 7
//                     5 5 6 6 7 7 8 8
//                         6 6 7 7 8 8 9 9
//                             7 7 8 8 9 9 10 10
//
// Three vectors are needed to compute this, as well as some extra pairwise
// additions.
static vuint32m1_t compute_horiz_directions_rvv(
    vint16m1_t lines_0, vint16m1_t lines_1, vint16m1_t lines_2,
    vint16m1_t lines_3, vint16m1_t lines_4, vint16m1_t lines_5,
    vint16m1_t lines_6, vint16m1_t lines_7, uint32_t cost[4], size_t vl) {
  // Compute diagonal directions (1, 2, 3).
  // Partial sums for lines 0 and 1.
  // NOTE(review): as in compute_vert_directions_rvv, vsetvl_e16m1(16) lets
  // vslidedown pull in the zero-initialized tail lanes when VLEN > 128.
  size_t VL_SLIDE_DOWN = __riscv_vsetvl_e16m1(16);
  vint16m1_t vec_zero_i16m1 = __riscv_vmv_v_x_i16m1(0, vl);
  vint16m1_t partial0a = __riscv_vmv_v_v_i16m1(lines_0, vl);
  partial0a = __riscv_vadd_vv_i16m1(
      partial0a,
      __riscv_vslideup_vx_i16m1(vec_zero_i16m1, lines_1, (8 - 7), vl), vl);
  vint16m1_t partial0b = __riscv_vslidedown_vx_i16m1(lines_1, 7, VL_SLIDE_DOWN);
  vint16m1_t partial1a = __riscv_vadd_vv_i16m1(
      lines_0, __riscv_vslideup_vx_i16m1(vec_zero_i16m1, lines_1, (8 - 6), vl),
      vl);
  vint16m1_t partial1b = __riscv_vslidedown_vx_i16m1(lines_1, 6, VL_SLIDE_DOWN);
  vint16m1_t partial3a = __riscv_vslidedown_vx_i16m1(lines_0, 2, VL_SLIDE_DOWN);
  partial3a = __riscv_vadd_vv_i16m1(
      partial3a, __riscv_vslidedown_vx_i16m1(lines_1, 4, VL_SLIDE_DOWN), vl);
  vint16m1_t partial3b =
      __riscv_vslideup_vx_i16m1(vec_zero_i16m1, lines_0, (8 - 2), vl);
  partial3b = __riscv_vadd_vv_i16m1(
      partial3b, __riscv_vslideup_vx_i16m1(vec_zero_i16m1, lines_1, 4, vl), vl);

  // Partial sums for lines 2 and 3.
  partial0a = __riscv_vadd_vv_i16m1(
      partial0a,
      __riscv_vslideup_vx_i16m1(vec_zero_i16m1, lines_2, (8 - 6), vl), vl);
  partial0a = __riscv_vadd_vv_i16m1(
      partial0a,
      __riscv_vslideup_vx_i16m1(vec_zero_i16m1, lines_3, (8 - 5), vl), vl);
  partial0b = __riscv_vadd_vv_i16m1(
      partial0b, __riscv_vslidedown_vx_i16m1(lines_2, 6, VL_SLIDE_DOWN), vl);
  partial0b = __riscv_vadd_vv_i16m1(
      partial0b, __riscv_vslidedown_vx_i16m1(lines_3, 5, VL_SLIDE_DOWN), vl);
  partial1a = __riscv_vadd_vv_i16m1(
      partial1a,
      __riscv_vslideup_vx_i16m1(vec_zero_i16m1, lines_2, (8 - 4), vl), vl);
  partial1a = __riscv_vadd_vv_i16m1(
      partial1a,
      __riscv_vslideup_vx_i16m1(vec_zero_i16m1, lines_3, (8 - 2), vl), vl);
  partial1b = __riscv_vadd_vv_i16m1(
      partial1b, __riscv_vslidedown_vx_i16m1(lines_2, 4, VL_SLIDE_DOWN), vl);
  partial1b = __riscv_vadd_vv_i16m1(
      partial1b, __riscv_vslidedown_vx_i16m1(lines_3, 2, VL_SLIDE_DOWN), vl);
  partial3a = __riscv_vadd_vv_i16m1(
      partial3a, __riscv_vslidedown_vx_i16m1(lines_2, 6, VL_SLIDE_DOWN), vl);
  partial3b = __riscv_vadd_vv_i16m1(
      partial3b,
      __riscv_vslideup_vx_i16m1(vec_zero_i16m1, lines_2, (8 - 6), vl), vl);
  partial3b = __riscv_vadd_vv_i16m1(partial3b, lines_3, vl);

  // Partial sums for lines 4 and 5.
  partial0a = __riscv_vadd_vv_i16m1(
      partial0a,
      __riscv_vslideup_vx_i16m1(vec_zero_i16m1, lines_4, (8 - 4), vl), vl);
  partial0a = __riscv_vadd_vv_i16m1(
      partial0a,
      __riscv_vslideup_vx_i16m1(vec_zero_i16m1, lines_5, (8 - 3), vl), vl);
  partial0b = __riscv_vadd_vv_i16m1(
      partial0b, __riscv_vslidedown_vx_i16m1(lines_4, 4, VL_SLIDE_DOWN), vl);
  partial0b = __riscv_vadd_vv_i16m1(
      partial0b, __riscv_vslidedown_vx_i16m1(lines_5, 3, VL_SLIDE_DOWN), vl);
  partial1b = __riscv_vadd_vv_i16m1(partial1b, lines_4, vl);
  partial1b = __riscv_vadd_vv_i16m1(
      partial1b,
      __riscv_vslideup_vx_i16m1(vec_zero_i16m1, lines_5, (8 - 6), vl), vl);
  // Directions 1 and 3 spill into a third vector from here on.
  vint16m1_t partial1c = __riscv_vslidedown_vx_i16m1(lines_5, 6, VL_SLIDE_DOWN);
  partial3b = __riscv_vadd_vv_i16m1(
      partial3b, __riscv_vslidedown_vx_i16m1(lines_4, 2, VL_SLIDE_DOWN), vl);
  partial3b = __riscv_vadd_vv_i16m1(
      partial3b, __riscv_vslidedown_vx_i16m1(lines_5, 4, VL_SLIDE_DOWN), vl);
  vint16m1_t partial3c =
      __riscv_vslideup_vx_i16m1(vec_zero_i16m1, lines_4, (8 - 2), vl);
  partial3c = __riscv_vadd_vv_i16m1(
      partial3c,
      __riscv_vslideup_vx_i16m1(vec_zero_i16m1, lines_5, (8 - 4), vl), vl);

  // Partial sums for lines 6 and 7.
  partial0a = __riscv_vadd_vv_i16m1(
      partial0a,
      __riscv_vslideup_vx_i16m1(vec_zero_i16m1, lines_6, (8 - 2), vl), vl);
  partial0a = __riscv_vadd_vv_i16m1(
      partial0a,
      __riscv_vslideup_vx_i16m1(vec_zero_i16m1, lines_7, (8 - 1), vl), vl);
  partial0b = __riscv_vadd_vv_i16m1(
      partial0b, __riscv_vslidedown_vx_i16m1(lines_6, 2, VL_SLIDE_DOWN), vl);
  partial0b = __riscv_vadd_vv_i16m1(
      partial0b, __riscv_vslide1down_vx_i16m1(lines_7, 0, vl), vl);
  partial1b = __riscv_vadd_vv_i16m1(
      partial1b,
      __riscv_vslideup_vx_i16m1(vec_zero_i16m1, lines_6, (8 - 4), vl), vl);
  partial1b = __riscv_vadd_vv_i16m1(
      partial1b,
      __riscv_vslideup_vx_i16m1(vec_zero_i16m1, lines_7, (8 - 2), vl), vl);
  partial1c = __riscv_vadd_vv_i16m1(
      partial1c, __riscv_vslidedown_vx_i16m1(lines_6, 4, VL_SLIDE_DOWN), vl);
  partial1c = __riscv_vadd_vv_i16m1(
      partial1c, __riscv_vslidedown_vx_i16m1(lines_7, 2, VL_SLIDE_DOWN), vl);
  partial3b = __riscv_vadd_vv_i16m1(
      partial3b, __riscv_vslidedown_vx_i16m1(lines_6, 6, VL_SLIDE_DOWN), vl);
  partial3c = __riscv_vadd_vv_i16m1(
      partial3c,
      __riscv_vslideup_vx_i16m1(vec_zero_i16m1, lines_6, (8 - 6), vl), vl);
  partial3c = __riscv_vadd_vv_i16m1(partial3c, lines_7, vl);

  // Special case for direction 2 as it's just a sum along each line.
  vint32m1_t partial2a =
      horizontal_add_4d_s16x8(lines_0, lines_1, lines_2, lines_3);
  vint32m1_t partial2b =
      horizontal_add_4d_s16x8(lines_4, lines_5, lines_6, lines_7);
  vuint32m1_t partial2a_u32 = __riscv_vreinterpret_v_i32m1_u32m1(
      __riscv_vmul_vv_i32m1(partial2a, partial2a, 4));
  vuint32m1_t partial2b_u32 = __riscv_vreinterpret_v_i32m1_u32m1(
      __riscv_vmul_vv_i32m1(partial2b, partial2b, 4));

  // const0 = { 840, 420, 280, 210, }
  vuint32m1_t const0 = __riscv_vmv_s_x_u32m1(210, 4);
  const0 = __riscv_vslide1up_vx_u32m1(const0, 280, 4);
  const0 = __riscv_vslide1up_vx_u32m1(const0, 420, 4);
  const0 = __riscv_vslide1up_vx_u32m1(const0, 840, 4);

  // const1 = { 168, 140, 120, 105, }
  vuint32m1_t const1 = __riscv_vmv_s_x_u32m1(105, 4);
  const1 = __riscv_vslide1up_vx_u32m1(const1, 120, 4);
  const1 = __riscv_vslide1up_vx_u32m1(const1, 140, 4);
  const1 = __riscv_vslide1up_vx_u32m1(const1, 168, 4);

  // const2 = { 420, 210, 140, 105, };
  vuint32m1_t const2 = __riscv_vmv_s_x_u32m1(105, 4);
  const2 = __riscv_vslide1up_vx_u32m1(const2, 140, 4);
  const2 = __riscv_vslide1up_vx_u32m1(const2, 210, 4);
  const2 = __riscv_vslide1up_vx_u32m1(const2, 420, 4);

  // Shared template for the reversal index patterns derived below.
  static const uint16_t tab_u16[8] = {
    0, 6, 5, 4, 3, 2, 1, 0,
  };
  vuint32m1_t costs_0, costs_1, costs_2, costs_3;
  vuint16m1_t template_u16m1 = __riscv_vle16_v_u16m1(tab_u16, 8);

  // Reverse partial b.
  // pattern = { 6, 5, 4, 3, 2, 1, 0, 7, }
  vuint16m1_t index_u16m1 = __riscv_vslide1down_vx_u16m1(template_u16m1, 7, 8);
  vint16m1_t partial0b_rv =
      __riscv_vrgather_vv_i16m1(partial0b, index_u16m1, 8);
  costs_0 = fold_mul_and_sum_rvv(partial0a, partial0b_rv, const0, const1);

  // Reverse partial c.
  // pattern = { 5, 4, 3, 2, 1, 0, 6, 7, }
  vuint16m1_t index_pair_u16m1 =
      __riscv_vslide1down_vx_u16m1(template_u16m1, 6, 8);
  index_pair_u16m1 = __riscv_vslide1down_vx_u16m1(index_pair_u16m1, 7, 8);
  vint16m1_t partialc_rv =
      __riscv_vrgather_vv_i16m1(partial1c, index_pair_u16m1, 8);
  costs_1 =
      fold_mul_and_sum_pairwise_rvv(partial1a, partial1b, partialc_rv, const2);

  costs_2 = __riscv_vadd_vv_u32m1(partial2a_u32, partial2b_u32, 4);
  costs_2 = __riscv_vmul_vx_u32m1(costs_2, 105, 4);

  vint16m1_t partial3a_rv =
      __riscv_vrgather_vv_i16m1(partial3a, index_pair_u16m1, 8);
  costs_3 =
      fold_mul_and_sum_pairwise_rvv(partial3c, partial3b, partial3a_rv, const2);

  // combine values: reduce each 4-lane cost vector to a scalar, pack the four
  // direction costs into one vector, and also store them to cost[0..3].
  vuint32m1_t vec_scalar_u32m1 = __riscv_vmv_s_x_u32m1(0, 1);
  vuint32m1_t cost0_sum =
      __riscv_vredsum_vs_u32m1_u32m1(costs_0, vec_scalar_u32m1, 4);
  vuint32m1_t cost1_sum =
      __riscv_vredsum_vs_u32m1_u32m1(costs_1, vec_scalar_u32m1, 4);
  vuint32m1_t cost2_sum =
      __riscv_vredsum_vs_u32m1_u32m1(costs_2, vec_scalar_u32m1, 4);
  vuint32m1_t cost3_sum =
      __riscv_vredsum_vs_u32m1_u32m1(costs_3, vec_scalar_u32m1, 4);

  costs_0 = __riscv_vslideup_vx_u32m1(cost0_sum, cost1_sum, 1, 4);
  costs_0 = __riscv_vslideup_vx_u32m1(costs_0, cost2_sum, 2, 4);
  costs_0 = __riscv_vslideup_vx_u32m1(costs_0, cost3_sum, 3, 4);
  __riscv_vse32_v_u32m1(&cost[0], costs_0, 4);
  return costs_0;
}

// Finds the best of the 8 CDEF filter directions for the 8x8 block at img
// (16-bit pixels, row stride `stride`, right-shifted by coeff_shift before
// use). Returns the direction index 0..7 and writes to *var the scaled
// difference between the best direction's cost and the cost of the orthogonal
// direction.
int cdef_find_dir_rvv(const uint16_t *img, int stride, int32_t *var,
                      int coeff_shift) {
  size_t vl = 8;
  size_t vlmax = __riscv_vsetvlmax_e16m1();
  vuint16m1_t s;
  vint16m1_t lines_0, lines_1, lines_2, lines_3;
  vint16m1_t lines_4, lines_5, lines_6, lines_7;
  vuint16m1_t vec_zero_u16m1 =
      __riscv_vmv_v_x_u16m1(0, __riscv_vsetvl_e16m1(16));

  // Load each of the 8 rows; when vlmax > 8 the tail-undisturbed load keeps
  // lanes 8..15 zeroed so the slide-downs in the helpers shift in zeros.
  if (vlmax == 8)
    s = __riscv_vle16_v_u16m1(img, vl);
  else
    s = __riscv_vle16_v_u16m1_tu(vec_zero_u16m1, img, vl);
  lines_0 =
      __riscv_vreinterpret_v_u16m1_i16m1(
      __riscv_vsrl_vx_u16m1(s, coeff_shift, vl));
  // Bias each row to be signed around zero (pixel >> coeff_shift, minus 128).
  lines_0 = __riscv_vsub_vx_i16m1(lines_0, 128, vl);

  img += stride;
  if (vlmax == 8)
    s = __riscv_vle16_v_u16m1(img, vl);
  else
    s = __riscv_vle16_v_u16m1_tu(vec_zero_u16m1, img, vl);
  lines_1 = __riscv_vreinterpret_v_u16m1_i16m1(
      __riscv_vsrl_vx_u16m1(s, coeff_shift, vl));
  lines_1 = __riscv_vsub_vx_i16m1(lines_1, 128, vl);

  img += stride;
  if (vlmax == 8)
    s = __riscv_vle16_v_u16m1(img, vl);
  else
    s = __riscv_vle16_v_u16m1_tu(vec_zero_u16m1, img, vl);
  lines_2 = __riscv_vreinterpret_v_u16m1_i16m1(
      __riscv_vsrl_vx_u16m1(s, coeff_shift, vl));
  lines_2 = __riscv_vsub_vx_i16m1(lines_2, 128, vl);

  img += stride;
  if (vlmax == 8)
    s = __riscv_vle16_v_u16m1(img, vl);
  else
    s = __riscv_vle16_v_u16m1_tu(vec_zero_u16m1, img, vl);
  lines_3 = __riscv_vreinterpret_v_u16m1_i16m1(
      __riscv_vsrl_vx_u16m1(s, coeff_shift, vl));
  lines_3 = __riscv_vsub_vx_i16m1(lines_3, 128, vl);

  img += stride;
  if (vlmax == 8)
    s = __riscv_vle16_v_u16m1(img, vl);
  else
    s = __riscv_vle16_v_u16m1_tu(vec_zero_u16m1, img, vl);
  lines_4 = __riscv_vreinterpret_v_u16m1_i16m1(
      __riscv_vsrl_vx_u16m1(s, coeff_shift, vl));
  lines_4 = __riscv_vsub_vx_i16m1(lines_4, 128, vl);

  img += stride;
  if (vlmax == 8)
    s = __riscv_vle16_v_u16m1(img, vl);
  else
    s = __riscv_vle16_v_u16m1_tu(vec_zero_u16m1, img, vl);
  lines_5 = __riscv_vreinterpret_v_u16m1_i16m1(
      __riscv_vsrl_vx_u16m1(s, coeff_shift, vl));
  lines_5 = __riscv_vsub_vx_i16m1(lines_5, 128, vl);

  img += stride;
  if (vlmax == 8)
    s = __riscv_vle16_v_u16m1(img, vl);
  else
    s = __riscv_vle16_v_u16m1_tu(vec_zero_u16m1, img, vl);
  lines_6 = __riscv_vreinterpret_v_u16m1_i16m1(
      __riscv_vsrl_vx_u16m1(s, coeff_shift, vl));
  lines_6 = __riscv_vsub_vx_i16m1(lines_6, 128, vl);

  img += stride;
  if (vlmax == 8)
    s = __riscv_vle16_v_u16m1(img, vl);
  else
    s = __riscv_vle16_v_u16m1_tu(vec_zero_u16m1, img, vl);
  lines_7 = __riscv_vreinterpret_v_u16m1_i16m1(
      __riscv_vsrl_vx_u16m1(s, coeff_shift, vl));
  lines_7 = __riscv_vsub_vx_i16m1(lines_7, 128, vl);

  // Compute "mostly vertical" directions (4..7), stored to cost[4..7].
  uint32_t cost[8];
  vuint32m1_t cost47 =
      compute_vert_directions_rvv(lines_0, lines_1, lines_2, lines_3, lines_4,
                                  lines_5, lines_6, lines_7, cost + 4, vl);

  // Compute "mostly horizontal" directions (0..3), stored to cost[0..3].
  vuint32m1_t cost03 =
      compute_horiz_directions_rvv(lines_0, lines_1, lines_2, lines_3, lines_4,
                                   lines_5, lines_6, lines_7, cost, vl);

  // Find max cost as well as its index to get best_dir.
  // The max cost needs to be propagated in the whole vector to find its
  // position in the original cost vectors cost03 and cost47.
  vuint32m1_t vec_scalar_u32m1 = __riscv_vmv_s_x_u32m1(0, 1);
  vuint32m1_t cost07 = __riscv_vmaxu_vv_u32m1(cost03, cost47, 4);
  uint32_t best_cost = __riscv_vmv_x_s_u32m1_u32(
      __riscv_vredmaxu_vs_u32m1_u32m1(cost07, vec_scalar_u32m1, 4));
  // Search cost03 first; if the maximum is not there, it must be in cost47.
  vbool32_t mask_cost = __riscv_vmseq_vx_u32m1_b32(cost03, best_cost, 4);
  long best_dir = __riscv_vfirst_m_b32(mask_cost, 4);
  if (best_dir == -1) {
    mask_cost = __riscv_vmseq_vx_u32m1_b32(cost47, best_cost, 4);
    best_dir = __riscv_vfirst_m_b32(mask_cost, 4);
    best_dir += 4;
  }

  // Difference between the optimal variance and the variance along the
  // orthogonal direction. Again, the sum(x^2) terms cancel out.
  *var = best_cost - cost[(best_dir + 4) & 7];

  // We'd normally divide by 840, but dividing by 1024 is close enough
  // for what we're going to do with this.
  *var >>= 10;
  return (int)best_dir;
}

// Copies a width x height block of 8-bit source pixels into the 16-bit CDEF
// working buffer, widening each byte, one row at a time in vl-sized chunks.
void cdef_copy_rect8_8bit_to_16bit_rvv(uint16_t *dst, int dstride,
                                       const uint8_t *src, int sstride,
                                       int width, int height) {
  do {
    int w = 0;
    size_t num_cols = width;
    while (num_cols > 0) {
      size_t vl = __riscv_vsetvl_e8mf2(num_cols);
      vuint8mf2_t u8_src = __riscv_vle8_v_u8mf2(src + w, vl);
      vuint16m1_t u16_src = __riscv_vwcvtu_x_x_v_u16m1(u8_src, vl);
      __riscv_vse16_v_u16m1(dst + w, u16_src, vl);

      w += vl;
      num_cols -= vl;
    }
    src += sstride;
    dst += dstride;
  } while (--height != 0);
}

// Copies a width x height block of 16-bit source pixels into the 16-bit CDEF
// working buffer, one row at a time in vl-sized chunks.
void cdef_copy_rect8_16bit_to_16bit_rvv(uint16_t *dst, int dstride,
                                        const uint16_t *src, int sstride,
                                        int width, int height) {
  do {
    int w = 0;
    size_t num_cols = width;
    while (num_cols > 0) {
      size_t vl = __riscv_vsetvl_e16m1(num_cols);
      vuint16m1_t u16_src = __riscv_vle16_v_u16m1(src + w, vl);
      __riscv_vse16_v_u16m1(dst + w, u16_src, vl);

      w += vl;
      num_cols -= vl;
    }
    src += sstride;
    dst += dstride;
  } while (--height != 0);
}

// Per-lane CDEF constraint function:
//   sign(a - b) * min(|a - b|, max(0, threshold - (|a - b| >> adjdamp)))
// A zero threshold short-circuits to an all-zero vector.
static inline vint16m1_t constrain16(vint16m1_t a, vint16m1_t b,
                                     int16_t threshold, int16_t adjdamp,
                                     size_t vl) {
  if (!threshold) return __riscv_vmv_v_x_i16m1(0, vl);
  // mask marks lanes where a < b, i.e. where diff is negative; the masked
  // negations below compute |diff| and then restore the sign at the end.
  const vbool16_t mask = __riscv_vmslt_vv_i16m1_b16(a, b, vl);
  const vint16m1_t diff = __riscv_vsub_vv_i16m1(a, b, vl);
  const vint16m1_t abs_diff = __riscv_vneg_v_i16m1_tumu(mask, diff, diff, vl);
  const vint16m1_t shift = __riscv_vsra_vx_i16m1(abs_diff, adjdamp, vl);
  const vint16m1_t thr = __riscv_vmv_v_x_i16m1(threshold, vl);
  const vint16m1_t sub = __riscv_vsub_vv_i16m1(thr, shift, vl);
  const vint16m1_t max = __riscv_vmax_vx_i16m1(sub, 0, vl);
  const vint16m1_t min = __riscv_vmin_vv_i16m1(abs_diff, max, vl);
  return __riscv_vneg_v_i16m1_tumu(mask, min, min, vl);
}

// Per-lane max that ignores unavailable pixels: lanes of `a` equal to
// CDEF_VERY_LARGE (border padding) are replaced by `b` before the max.
static inline vint16m1_t vmax_mask(vint16m1_t a, vint16m1_t b, size_t vl) {
  const
vbool16_t mask =
      __riscv_vmseq_vx_i16m1_b16(a, (int16_t)CDEF_VERY_LARGE, vl);
  const vint16m1_t val = __riscv_vmerge_vvm_i16m1(a, b, mask, vl);
  return __riscv_vmax_vv_i16m1(val, b, vl);
}

// Loads two 4-wide rows (addr and addr + stride) packed into one 8-lane
// vector: row 0 in lanes 0..3, row 1 in lanes 4..7.
static inline vint16m1_t load_strided_i16_4x2(int16_t *addr,
                                              const ptrdiff_t stride,
                                              size_t vl) {
  const vint16m1_t px_l1 = __riscv_vle16_v_i16m1(addr + stride, vl);
  const vint16m1_t px_l0 = __riscv_vle16_v_i16m1(addr, vl);
  return __riscv_vslideup_vx_i16m1(px_l0, px_l1, 4, vl);
}

// Stores an 8-lane vector as two 4-wide rows: lanes 0..3 to addr, lanes 4..7
// to addr + stride.
static inline void store_strided_u8_4x2(uint8_t *addr, vuint8mf2_t vdst,
                                        const ptrdiff_t stride, size_t vl) {
  __riscv_vse8_v_u8mf2(addr, vdst, vl >> 1);
  vdst = __riscv_vslidedown_vx_u8mf2(vdst, 4, vl);
  __riscv_vse8_v_u8mf2(addr + stride, vdst, vl >> 1);
}

// 16-bit counterpart of store_strided_u8_4x2.
static inline void store_strided_u16_4x2(uint16_t *addr, vuint16m1_t vdst,
                                         const ptrdiff_t stride, size_t vl) {
  __riscv_vse16_v_u16m1(addr, vdst, vl >> 1);
  vdst = __riscv_vslidedown_vx_u16m1(vdst, 4, vl);
  __riscv_vse16_v_u16m1(addr + stride, vdst, vl >> 1);
}

// Load the center pixel(s) `px` and zero the filter accumulator `sum`.
// LOAD_PIX handles one 8-wide row, LOAD_PIX4 two 4-wide rows at once.
#define LOAD_PIX(addr)                                                  \
  const vint16m1_t px = __riscv_vle16_v_i16m1((int16_t *)addr, vl);     \
  vint16m1_t sum = __riscv_vmv_v_x_i16m1(0, vl)

#define LOAD_PIX4(addr)                                                 \
  const vint16m1_t px =                                                 \
      load_strided_i16_4x2((int16_t *)addr, CDEF_BSTRIDE, vl);          \
  vint16m1_t sum = __riscv_vmv_v_x_i16m1(0, vl)

// Load the four taps along a direction: +-o0 (near pair) and +-o1 (far pair).
#define LOAD_DIR(p, addr, o0, o1)                                          \
  const vint16m1_t p##0 = __riscv_vle16_v_i16m1((int16_t *)addr + o0, vl); \
  const vint16m1_t p##1 = __riscv_vle16_v_i16m1((int16_t *)addr - o0, vl); \
  const vint16m1_t p##2 = __riscv_vle16_v_i16m1((int16_t *)addr + o1, vl); \
  const vint16m1_t p##3 = __riscv_vle16_v_i16m1((int16_t *)addr - o1, vl)

#define LOAD_DIR4(p, addr, o0, o1)                                  \
  const vint16m1_t p##0 =                                           \
      load_strided_i16_4x2((int16_t *)addr + o0, CDEF_BSTRIDE, vl); \
  const vint16m1_t p##1 =                                           \
      load_strided_i16_4x2((int16_t *)addr - o0, CDEF_BSTRIDE, vl); \
  const vint16m1_t p##2 =                                           \
      load_strided_i16_4x2((int16_t *)addr + o1, CDEF_BSTRIDE, vl); \
  const vint16m1_t p##3 =                                           \
      load_strided_i16_4x2((int16_t *)addr - o1, CDEF_BSTRIDE, vl)

// Select the primary tap weights; the table row depends on the low bit of the
// shifted primary strength.
#define MAKE_TAPS                                                          \
  const int *pri_taps = cdef_pri_taps[(pri_strength >> coeff_shift) & 1];  \
  const int16_t tap0 = (int16_t)(pri_taps[0]);                             \
  const int16_t tap1 = (int16_t)(pri_taps[1])

// Apply constrain16 to all four taps of direction group `p`.
#define CONSTRAIN(p, strength, shift)                                \
  vint16m1_t p##_c0 =                                                \
      constrain16(p##0, px, (int16_t)strength, (int16_t)shift, vl);  \
  vint16m1_t p##_c1 =                                                \
      constrain16(p##1, px, (int16_t)strength, (int16_t)shift, vl);  \
  vint16m1_t p##_c2 =                                                \
      constrain16(p##2, px, (int16_t)strength, (int16_t)shift, vl);  \
  vint16m1_t p##_c3 =                                                \
      constrain16(p##3, px, (int16_t)strength, (int16_t)shift, vl)

// Initialize the clamping range to the center pixel.
#define SETUP_MINMAX          \
  vint16m1_t max = px;        \
  vint16m1_t min = px

// Fold the four taps of group `p` into the clamping range; vmax_mask skips
// CDEF_VERY_LARGE padding lanes on the max side.
#define MIN_MAX(p)                                \
  do {                                            \
    max = vmax_mask(p##0, max, vl);               \
    min = __riscv_vmin_vv_i16m1(p##0, min, vl);   \
    max = vmax_mask(p##1, max, vl);               \
    min = __riscv_vmin_vv_i16m1(p##1, min, vl);   \
    max = vmax_mask(p##2, max, vl);               \
    min = __riscv_vmin_vv_i16m1(p##2, min, vl);   \
    max = vmax_mask(p##3, max, vl);               \
    min = __riscv_vmin_vv_i16m1(p##3, min, vl);   \
  } while (0)

// Primary-tap accumulation: near pair weighted by tap0, far pair by tap1.
#define PRI_0_UPDATE_SUM(p)                                             \
  const vint16m1_t p##sum0 = __riscv_vadd_vv_i16m1(p##_c0, p##_c1, vl); \
  const vint16m1_t p##sum1 = __riscv_vadd_vv_i16m1(p##_c2, p##_c3, vl); \
  sum = __riscv_vmacc_vx_i16m1(sum, tap0, p##sum0, vl);                 \
  sum = __riscv_vmacc_vx_i16m1(sum, tap1, p##sum1, vl)

// Unweighted accumulation of all four taps.
#define UPDATE_SUM(p)                                                   \
  const vint16m1_t p##sum0 = __riscv_vadd_vv_i16m1(p##_c0, p##_c1, vl); \
  const vint16m1_t p##sum1 = __riscv_vadd_vv_i16m1(p##_c2, p##_c3, vl); \
  sum = __riscv_vadd_vv_i16m1(sum, p##sum0, vl);                        \
  sum = __riscv_vadd_vv_i16m1(sum, p##sum1, vl)

// Secondary-tap accumulation with weight 2 (shift left by one).
#define SEC_0_UPDATE_SUM(p)                                               \
  const vint16m1_t p##sum0 = __riscv_vadd_vv_i16m1(p##_c0, p##_c1, vl);   \
  const vint16m1_t p##sum1 = __riscv_vadd_vv_i16m1(p##_c2, p##_c3, vl);   \
  const vint16m1_t p##sum2 = __riscv_vadd_vv_i16m1(p##sum0, p##sum1, vl); \
  sum = __riscv_vadd_vv_i16m1(sum, __riscv_vsll_vx_i16m1(p##sum2, 1, vl), vl)

// Rounding bias: unclamped = px + ((8 - (sum < 0) + sum) >> 4).
#define BIAS                                                                  \
  const vbool16_t mask = __riscv_vmslt_vx_i16m1_b16(sum, 0, vl);              \
  const vint16m1_t v_8 = __riscv_vmv_v_x_i16m1(8, vl);                        \
  const vint16m1_t bias = __riscv_vsub_vx_i16m1_tumu(mask, v_8, v_8, 1, vl);  \
  const vint16m1_t unclamped = __riscv_vadd_vv_i16m1(                         \
      px, __riscv_vsra_vx_i16m1(__riscv_vadd_vv_i16m1(bias, sum, vl), 4, vl), \
      vl)

// Store helpers: write the result and advance the input/output pointers.
// STORE4 writes two 4-wide 8-bit rows, STORE8 one 8-wide 8-bit row; the
// STORE16 variants are the 16-bit (high bit depth) counterparts. *_CLAMPED
// clamps to [min, max] first; *_UNCLAMPED stores the biased value directly.
#define STORE4                                         \
  do {                                                 \
    store_strided_u8_4x2(dst8, vdst, dstride, vl);     \
                                                       \
    in += (CDEF_BSTRIDE << 1);                         \
    dst8 += (dstride << 1);                            \
  } while (0)

#define STORE4_CLAMPED                                       \
  do {                                                       \
    BIAS;                                                    \
    vint16m1_t clamped = __riscv_vmin_vv_i16m1(              \
        __riscv_vmax_vv_i16m1(unclamped, min, vl), max, vl); \
    vuint8mf2_t vdst = __riscv_vncvt_x_x_w_u8mf2(            \
        __riscv_vreinterpret_v_i16m1_u16m1(clamped), vl);    \
    STORE4;                                                  \
  } while (0)

#define STORE4_UNCLAMPED                                     \
  do {                                                       \
    BIAS;                                                    \
    vuint8mf2_t vdst = __riscv_vncvt_x_x_w_u8mf2(            \
        __riscv_vreinterpret_v_i16m1_u16m1(unclamped), vl);  \
    STORE4;                                                  \
  } while (0)

#define STORE8                                \
  do {                                        \
    __riscv_vse8_v_u8mf2(dst8, vdst, vl);     \
                                              \
    in += CDEF_BSTRIDE;                       \
    dst8 += dstride;                          \
  } while (0)

#define STORE8_CLAMPED                                       \
  do {                                                       \
    BIAS;                                                    \
    vint16m1_t clamped = __riscv_vmin_vv_i16m1(              \
        __riscv_vmax_vv_i16m1(unclamped, min, vl), max, vl); \
    vuint8mf2_t vdst = __riscv_vncvt_x_x_w_u8mf2(            \
        __riscv_vreinterpret_v_i16m1_u16m1(clamped), vl);    \
    STORE8;                                                  \
  } while (0)

#define STORE8_UNCLAMPED                                     \
  do {                                                       \
    BIAS;                                                    \
    vuint8mf2_t vdst = __riscv_vncvt_x_x_w_u8mf2(            \
        __riscv_vreinterpret_v_i16m1_u16m1(unclamped), vl);  \
    STORE8;                                                  \
  } while (0)

#define STORE16_4                                       \
  do {                                                  \
    store_strided_u16_4x2(dst16, vdst, dstride, vl);    \
                                                        \
    in += (CDEF_BSTRIDE << 1);                          \
    dst16 += (dstride << 1);                            \
  } while (0)

#define STORE16_4_CLAMPED                                            \
  do {                                                               \
    BIAS;                                                            \
    vint16m1_t clamped = __riscv_vmin_vv_i16m1(                      \
        __riscv_vmax_vv_i16m1(unclamped, min, vl), max, vl);         \
    vuint16m1_t vdst = __riscv_vreinterpret_v_i16m1_u16m1(clamped);  \
    STORE16_4;                                                       \
  } while (0)

#define STORE16_4_UNCLAMPED                                            \
  do {                                                                 \
    BIAS;                                                              \
    vuint16m1_t vdst = __riscv_vreinterpret_v_i16m1_u16m1(unclamped);  \
    STORE16_4;                                                         \
  } while (0)

#define STORE16                                 \
  do {                                          \
    __riscv_vse16_v_u16m1(dst16, vdst, vl);     \
                                                \
    in += CDEF_BSTRIDE;                         \
    dst16 += dstride;                           \
  } while (0)

#define STORE16_CLAMPED                                              \
  do {                                                               \
    BIAS;                                                            \
    vint16m1_t clamped = __riscv_vmin_vv_i16m1(                      \
        __riscv_vmax_vv_i16m1(unclamped, min, vl), max, vl);         \
    vuint16m1_t vdst = __riscv_vreinterpret_v_i16m1_u16m1(clamped);  \
    STORE16;                                                         \
  } while (0)

#define STORE16_UNCLAMPED                                              \
  do {                                                                 \
    BIAS;                                                              \
    vuint16m1_t vdst = __riscv_vreinterpret_v_i16m1_u16m1(unclamped);  \
    STORE16;                                                           \
  } while (0)

// NOTE(review): this function is truncated at the end of the visible chunk;
// the remainder of its body lies outside this view.
void cdef_filter_8_0_rvv(void *dest, int dstride, const uint16_t *in,
                         int pri_strength, int sec_strength, int dir,
                         int pri_damping, int sec_damping, int coeff_shift,
                         int block_width, int block_height) {
  // Primary offsets along `dir`; secondary offsets along dir +- 2.
  const int po1 = cdef_directions[dir][0];
  const int po2 = cdef_directions[dir][1];
  const int s1o1 = cdef_directions[dir + 2][0];
  const int s1o2 = cdef_directions[dir + 2][1];
  const int s2o1 = cdef_directions[dir - 2][0];
  const int s2o2 = cdef_directions[dir - 2][1];
  MAKE_TAPS;

  // Reduce damping by the strength's MSB, clamped at zero.
  if (pri_strength) {
    pri_damping = AOMMAX(0, pri_damping - get_msb(pri_strength));
  }
  if (sec_strength) {
    sec_damping = AOMMAX(0, sec_damping - get_msb(sec_strength));
  }

  if (block_width == 8) {
    uint8_t *dst8 = (uint8_t *)dest;

    int h =
block_height; 895 const size_t vl = block_width; 896 do { 897 LOAD_PIX(in); 898 SETUP_MINMAX; 899 900 // Primary pass 901 LOAD_DIR(p, in, po1, po2); 902 CONSTRAIN(p, pri_strength, pri_damping); 903 MIN_MAX(p); 904 PRI_0_UPDATE_SUM(p); 905 906 // Secondary pass 1 907 LOAD_DIR(s, in, s1o1, s2o1); 908 CONSTRAIN(s, sec_strength, sec_damping); 909 MIN_MAX(s); 910 SEC_0_UPDATE_SUM(s); 911 912 // Secondary pass 2 913 LOAD_DIR(s2, in, s1o2, s2o2); 914 CONSTRAIN(s2, sec_strength, sec_damping); 915 MIN_MAX(s2); 916 UPDATE_SUM(s2); 917 918 // Store 919 STORE8_CLAMPED; 920 } while (--h != 0); 921 } else { 922 uint8_t *dst8 = (uint8_t *)dest; 923 924 int h = block_height; 925 const size_t vl = block_width << 1; 926 do { 927 LOAD_PIX4(in); 928 SETUP_MINMAX; 929 930 // Primary pass 931 LOAD_DIR4(p, in, po1, po2); 932 CONSTRAIN(p, pri_strength, pri_damping); 933 MIN_MAX(p); 934 PRI_0_UPDATE_SUM(p); 935 936 // Secondary pass 1 937 LOAD_DIR4(s, in, s1o1, s2o1); 938 CONSTRAIN(s, sec_strength, sec_damping); 939 MIN_MAX(s); 940 SEC_0_UPDATE_SUM(s); 941 942 // Secondary pass 2 943 LOAD_DIR4(s2, in, s1o2, s2o2); 944 CONSTRAIN(s2, sec_strength, sec_damping); 945 MIN_MAX(s2); 946 UPDATE_SUM(s2); 947 948 // Store 949 STORE4_CLAMPED; 950 951 h -= 2; 952 } while (h != 0); 953 } 954 } 955 956 void cdef_filter_8_1_rvv(void *dest, int dstride, const uint16_t *in, 957 int pri_strength, int sec_strength, int dir, 958 int pri_damping, int sec_damping, int coeff_shift, 959 int block_width, int block_height) { 960 (void)sec_strength; 961 (void)sec_damping; 962 963 const int po1 = cdef_directions[dir][0]; 964 const int po2 = cdef_directions[dir][1]; 965 MAKE_TAPS; 966 967 if (pri_strength) { 968 pri_damping = AOMMAX(0, pri_damping - get_msb(pri_strength)); 969 } 970 971 if (block_width == 8) { 972 uint8_t *dst8 = (uint8_t *)dest; 973 974 int h = block_height; 975 const size_t vl = block_width; 976 do { 977 LOAD_PIX(in); 978 979 // Primary pass 980 LOAD_DIR(p, in, po1, po2); 981 CONSTRAIN(p, 
pri_strength, pri_damping); 982 PRI_0_UPDATE_SUM(p); 983 984 // Store 985 STORE8_UNCLAMPED; 986 } while (--h != 0); 987 } else { 988 uint8_t *dst8 = (uint8_t *)dest; 989 990 int h = block_height; 991 const size_t vl = block_width << 1; 992 do { 993 LOAD_PIX4(in); 994 995 // Primary pass 996 LOAD_DIR4(p, in, po1, po2); 997 CONSTRAIN(p, pri_strength, pri_damping); 998 PRI_0_UPDATE_SUM(p); 999 1000 // Store 1001 STORE4_UNCLAMPED; 1002 1003 h -= 2; 1004 } while (h != 0); 1005 } 1006 } 1007 1008 void cdef_filter_8_2_rvv(void *dest, int dstride, const uint16_t *in, 1009 int pri_strength, int sec_strength, int dir, 1010 int pri_damping, int sec_damping, int coeff_shift, 1011 int block_width, int block_height) { 1012 (void)pri_strength; 1013 (void)pri_damping; 1014 (void)coeff_shift; 1015 1016 const int s1o1 = cdef_directions[dir + 2][0]; 1017 const int s1o2 = cdef_directions[dir + 2][1]; 1018 const int s2o1 = cdef_directions[dir - 2][0]; 1019 const int s2o2 = cdef_directions[dir - 2][1]; 1020 1021 if (sec_strength) { 1022 sec_damping = AOMMAX(0, sec_damping - get_msb(sec_strength)); 1023 } 1024 1025 if (block_width == 8) { 1026 uint8_t *dst8 = (uint8_t *)dest; 1027 1028 int h = block_height; 1029 const size_t vl = block_width; 1030 do { 1031 LOAD_PIX(in); 1032 1033 // Secondary pass 1 1034 LOAD_DIR(s, in, s1o1, s2o1); 1035 CONSTRAIN(s, sec_strength, sec_damping); 1036 SEC_0_UPDATE_SUM(s); 1037 1038 // Secondary pass 2 1039 LOAD_DIR(s2, in, s1o2, s2o2); 1040 CONSTRAIN(s2, sec_strength, sec_damping); 1041 UPDATE_SUM(s2); 1042 1043 // Store 1044 STORE8_UNCLAMPED; 1045 } while (--h != 0); 1046 } else { 1047 uint8_t *dst8 = (uint8_t *)dest; 1048 1049 int h = block_height; 1050 const size_t vl = block_width << 1; 1051 do { 1052 LOAD_PIX4(in); 1053 1054 // Secondary pass 1 1055 LOAD_DIR4(s, in, s1o1, s2o1); 1056 CONSTRAIN(s, sec_strength, sec_damping); 1057 SEC_0_UPDATE_SUM(s); 1058 1059 // Secondary pass 2 1060 LOAD_DIR4(s2, in, s1o2, s2o2); 1061 CONSTRAIN(s2, sec_strength, 
sec_damping); 1062 UPDATE_SUM(s2); 1063 1064 // Store 1065 STORE4_UNCLAMPED; 1066 1067 h -= 2; 1068 } while (h != 0); 1069 } 1070 } 1071 1072 void cdef_filter_8_3_rvv(void *dest, int dstride, const uint16_t *in, 1073 int pri_strength, int sec_strength, int dir, 1074 int pri_damping, int sec_damping, int coeff_shift, 1075 int block_width, int block_height) { 1076 (void)pri_strength; 1077 (void)sec_strength; 1078 (void)dir; 1079 (void)pri_damping; 1080 (void)sec_damping; 1081 (void)coeff_shift; 1082 1083 if (block_width == 8) { 1084 uint8_t *dst8 = (uint8_t *)dest; 1085 1086 int h = block_height; 1087 const size_t vl = block_width; 1088 do { 1089 const vuint16m1_t px = __riscv_vle16_v_u16m1(in, vl); 1090 const vuint8mf2_t vdst = __riscv_vncvt_x_x_w_u8mf2(px, vl); 1091 __riscv_vse8_v_u8mf2(dst8, vdst, vl); 1092 1093 in += CDEF_BSTRIDE; 1094 dst8 += dstride; 1095 } while (--h != 0); 1096 } else { 1097 uint8_t *dst8 = (uint8_t *)dest; 1098 1099 int h = block_height; 1100 const size_t vl = block_width << 1; 1101 do { 1102 const vint16m1_t px = 1103 load_strided_i16_4x2((int16_t *)in, CDEF_BSTRIDE, vl); 1104 vuint8mf2_t vdst = 1105 __riscv_vncvt_x_x_w_u8mf2(__riscv_vreinterpret_v_i16m1_u16m1(px), vl); 1106 store_strided_u8_4x2(dst8, vdst, dstride, vl); 1107 1108 in += 2 * CDEF_BSTRIDE; 1109 dst8 += 2 * dstride; 1110 h -= 2; 1111 } while (h != 0); 1112 } 1113 } 1114 1115 void cdef_filter_16_0_rvv(void *dest, int dstride, const uint16_t *in, 1116 int pri_strength, int sec_strength, int dir, 1117 int pri_damping, int sec_damping, int coeff_shift, 1118 int block_width, int block_height) { 1119 const int po1 = cdef_directions[dir][0]; 1120 const int po2 = cdef_directions[dir][1]; 1121 const int s1o1 = cdef_directions[dir + 2][0]; 1122 const int s1o2 = cdef_directions[dir + 2][1]; 1123 const int s2o1 = cdef_directions[dir - 2][0]; 1124 const int s2o2 = cdef_directions[dir - 2][1]; 1125 MAKE_TAPS; 1126 1127 if (pri_strength) { 1128 pri_damping = AOMMAX(0, pri_damping - 
get_msb(pri_strength)); 1129 } 1130 if (sec_strength) { 1131 sec_damping = AOMMAX(0, sec_damping - get_msb(sec_strength)); 1132 } 1133 1134 if (block_width == 8) { 1135 uint16_t *dst16 = (uint16_t *)dest; 1136 1137 int h = block_height; 1138 const size_t vl = block_width; 1139 do { 1140 LOAD_PIX(in); 1141 SETUP_MINMAX; 1142 1143 // Primary pass 1144 LOAD_DIR(p, in, po1, po2); 1145 CONSTRAIN(p, pri_strength, pri_damping); 1146 MIN_MAX(p); 1147 PRI_0_UPDATE_SUM(p); 1148 1149 // Secondary pass 1 1150 LOAD_DIR(s, in, s1o1, s2o1); 1151 CONSTRAIN(s, sec_strength, sec_damping); 1152 MIN_MAX(s); 1153 SEC_0_UPDATE_SUM(s); 1154 1155 // Secondary pass 2 1156 LOAD_DIR(s2, in, s1o2, s2o2); 1157 CONSTRAIN(s2, sec_strength, sec_damping); 1158 MIN_MAX(s2); 1159 UPDATE_SUM(s2); 1160 1161 // Store 1162 STORE16_CLAMPED; 1163 } while (--h != 0); 1164 } else { 1165 uint16_t *dst16 = (uint16_t *)dest; 1166 1167 int h = block_height; 1168 const size_t vl = block_width << 1; 1169 do { 1170 LOAD_PIX4(in); 1171 SETUP_MINMAX; 1172 1173 // Primary pass 1174 LOAD_DIR4(p, in, po1, po2); 1175 CONSTRAIN(p, pri_strength, pri_damping); 1176 MIN_MAX(p); 1177 PRI_0_UPDATE_SUM(p); 1178 1179 // Secondary pass 1 1180 LOAD_DIR4(s, in, s1o1, s2o1); 1181 CONSTRAIN(s, sec_strength, sec_damping); 1182 MIN_MAX(s); 1183 SEC_0_UPDATE_SUM(s); 1184 1185 // Secondary pass 2 1186 LOAD_DIR4(s2, in, s1o2, s2o2); 1187 CONSTRAIN(s2, sec_strength, sec_damping); 1188 MIN_MAX(s2); 1189 UPDATE_SUM(s2); 1190 1191 // Store 1192 STORE16_4_CLAMPED; 1193 1194 h -= 2; 1195 } while (h != 0); 1196 } 1197 } 1198 1199 void cdef_filter_16_1_rvv(void *dest, int dstride, const uint16_t *in, 1200 int pri_strength, int sec_strength, int dir, 1201 int pri_damping, int sec_damping, int coeff_shift, 1202 int block_width, int block_height) { 1203 (void)sec_strength; 1204 (void)sec_damping; 1205 1206 const int po1 = cdef_directions[dir][0]; 1207 const int po2 = cdef_directions[dir][1]; 1208 MAKE_TAPS; 1209 1210 if (pri_strength) { 1211 
pri_damping = AOMMAX(0, pri_damping - get_msb(pri_strength)); 1212 } 1213 1214 if (block_width == 8) { 1215 uint16_t *dst16 = (uint16_t *)dest; 1216 1217 int h = block_height; 1218 const size_t vl = block_width; 1219 do { 1220 LOAD_PIX(in); 1221 1222 // Primary pass 1223 LOAD_DIR(p, in, po1, po2); 1224 CONSTRAIN(p, pri_strength, pri_damping); 1225 PRI_0_UPDATE_SUM(p); 1226 1227 // Store 1228 STORE16_UNCLAMPED; 1229 } while (--h != 0); 1230 } else { 1231 uint16_t *dst16 = (uint16_t *)dest; 1232 1233 int h = block_height; 1234 const size_t vl = block_width << 1; 1235 do { 1236 LOAD_PIX4(in); 1237 1238 // Primary pass 1239 LOAD_DIR4(p, in, po1, po2); 1240 CONSTRAIN(p, pri_strength, pri_damping); 1241 PRI_0_UPDATE_SUM(p); 1242 1243 // Store 1244 STORE16_4_UNCLAMPED; 1245 1246 h -= 2; 1247 } while (h != 0); 1248 } 1249 } 1250 1251 void cdef_filter_16_2_rvv(void *dest, int dstride, const uint16_t *in, 1252 int pri_strength, int sec_strength, int dir, 1253 int pri_damping, int sec_damping, int coeff_shift, 1254 int block_width, int block_height) { 1255 (void)pri_strength; 1256 (void)pri_damping; 1257 (void)coeff_shift; 1258 1259 const int s1o1 = cdef_directions[dir + 2][0]; 1260 const int s1o2 = cdef_directions[dir + 2][1]; 1261 const int s2o1 = cdef_directions[dir - 2][0]; 1262 const int s2o2 = cdef_directions[dir - 2][1]; 1263 1264 if (sec_strength) { 1265 sec_damping = AOMMAX(0, sec_damping - get_msb(sec_strength)); 1266 } 1267 1268 if (block_width == 8) { 1269 uint16_t *dst16 = (uint16_t *)dest; 1270 1271 int h = block_height; 1272 const size_t vl = block_width; 1273 do { 1274 LOAD_PIX(in); 1275 1276 // Secondary pass 1 1277 LOAD_DIR(s, in, s1o1, s2o1); 1278 CONSTRAIN(s, sec_strength, sec_damping); 1279 SEC_0_UPDATE_SUM(s); 1280 1281 // Secondary pass 2 1282 LOAD_DIR(s2, in, s1o2, s2o2); 1283 CONSTRAIN(s2, sec_strength, sec_damping); 1284 UPDATE_SUM(s2); 1285 1286 // Store 1287 STORE16_UNCLAMPED; 1288 } while (--h != 0); 1289 } else { 1290 uint16_t *dst16 = (uint16_t 
*)dest; 1291 1292 int h = block_height; 1293 const size_t vl = block_width << 1; 1294 do { 1295 LOAD_PIX4(in); 1296 1297 // Secondary pass 1 1298 LOAD_DIR4(s, in, s1o1, s2o1); 1299 CONSTRAIN(s, sec_strength, sec_damping); 1300 SEC_0_UPDATE_SUM(s); 1301 1302 // Secondary pass 2 1303 LOAD_DIR4(s2, in, s1o2, s2o2); 1304 CONSTRAIN(s2, sec_strength, sec_damping); 1305 UPDATE_SUM(s2); 1306 1307 // Store 1308 STORE16_4_UNCLAMPED; 1309 1310 h -= 2; 1311 } while (h != 0); 1312 } 1313 } 1314 1315 void cdef_filter_16_3_rvv(void *dest, int dstride, const uint16_t *in, 1316 int pri_strength, int sec_strength, int dir, 1317 int pri_damping, int sec_damping, int coeff_shift, 1318 int block_width, int block_height) { 1319 (void)pri_strength; 1320 (void)sec_strength; 1321 (void)dir; 1322 (void)pri_damping; 1323 (void)sec_damping; 1324 (void)coeff_shift; 1325 1326 if (block_width == 8) { 1327 uint16_t *dst16 = (uint16_t *)dest; 1328 1329 int h = block_height; 1330 const size_t vl = block_width; 1331 do { 1332 const vuint16m1_t px = __riscv_vle16_v_u16m1(in, vl); 1333 __riscv_vse16_v_u16m1(dst16, px, vl); 1334 1335 in += CDEF_BSTRIDE; 1336 dst16 += dstride; 1337 } while (--h != 0); 1338 } else { 1339 uint16_t *dst16 = (uint16_t *)dest; 1340 1341 int h = block_height; 1342 const size_t vl = block_width << 1; 1343 do { 1344 const vint16m1_t px = 1345 load_strided_i16_4x2((int16_t *)in, CDEF_BSTRIDE, vl); 1346 vuint16m1_t vdst = __riscv_vreinterpret_v_i16m1_u16m1(px); 1347 store_strided_u16_4x2(dst16, vdst, dstride, vl); 1348 1349 in += 2 * CDEF_BSTRIDE; 1350 dst16 += 2 * dstride; 1351 h -= 2; 1352 } while (h != 0); 1353 } 1354 }