convolve_rvv.h (21227B)
/*
 * Copyright (c) 2025, Alliance for Open Media. All rights reserved.
 *
 * This source code is subject to the terms of the BSD 2 Clause License and
 * the Alliance for Open Media Patent License 1.0. If the BSD 2 Clause License
 * was not distributed with this source code in the LICENSE file, you can
 * obtain it at www.aomedia.org/license/software. If the Alliance for Open
 * Media Patent License 1.0 was not distributed with this source code in the
 * PATENTS file, you can obtain it at www.aomedia.org/license/patent.
 */

#ifndef AOM_AV1_COMMON_RISCV_CONVOLVE_RVV_H_
#define AOM_AV1_COMMON_RISCV_CONVOLVE_RVV_H_

#include "config/aom_config.h"

#include "av1/common/convolve.h"
#include "av1/common/filter.h"

// RISC-V Vector (RVV) helpers and vertical-pass kernels for the 2D
// scaled-reference convolution. The *_2d_v_rvv helpers consume the
// intermediate int16 output of the horizontal pass and produce clamped
// 8-bit pixels; the convolve_2d_sr_vert_*tap_rvv drivers walk the block.

// Load two 4-pixel rows (at addr and addr + stride) and pack them into one
// vector: elements [0, vl/2) come from row 0, elements [vl/2, vl) from row 1.
// NOTE(review): both row loads use `vl` elements even though only the first
// vl/2 of each row are kept by the slideup — this reads vl/2 extra bytes past
// each 4-pixel row. Presumably the surrounding buffers are padded; confirm
// against callers (the int16 variant below loads only vl/2 per row).
static inline vuint8mf2_t load_strided_u8_4xN(uint8_t *addr, ptrdiff_t stride,
                                              size_t vl) {
  const vuint8mf2_t px_l1 = __riscv_vle8_v_u8mf2(addr + stride, vl);
  const vuint8mf2_t px_l0 = __riscv_vle8_v_u8mf2(addr, vl);
  return __riscv_vslideup_vx_u8mf2(px_l0, px_l1, vl >> 1, vl);
}

// Inverse of load_strided_u8_4xN: store the low half of vdst to addr and the
// high half to addr + stride (vl/2 bytes each).
static inline void store_strided_u8_4xN(uint8_t *addr, vuint8mf2_t vdst,
                                        ptrdiff_t stride, size_t vl) {
  __riscv_vse8_v_u8mf2(addr, vdst, vl >> 1);
  vdst = __riscv_vslidedown_vx_u8mf2(vdst, vl >> 1, vl);
  __riscv_vse8_v_u8mf2(addr + stride, vdst, vl >> 1);
}

// Load two 4-sample int16 rows (at addr and addr + stride) into one vector:
// low half from row 0, high half from row 1. Each row load reads exactly
// vl/2 elements.
static inline vint16m1_t load_strided_i16_4xN(int16_t *addr, ptrdiff_t stride,
                                              size_t vl) {
  const vint16m1_t px_l1 = __riscv_vle16_v_i16m1(addr + stride, vl >> 1);
  const vint16m1_t px_l0 = __riscv_vle16_v_i16m1(addr, vl >> 1);
  return __riscv_vslideup_vx_i16m1(px_l0, px_l1, vl >> 1, vl);
}

// Inverse of load_strided_i16_4xN: low half of vdst goes to addr, high half
// to addr + stride (vl/2 samples each).
static inline void store_strided_i16_4xN(int16_t *addr, vint16m1_t vdst,
                                         ptrdiff_t stride, size_t vl) {
  __riscv_vse16_v_i16m1(addr, vdst, vl >> 1);
  vdst = __riscv_vslidedown_vx_i16m1(vdst, vl >> 1, vl);
  __riscv_vse16_v_i16m1(addr + stride, vdst, vl >> 1);
}

// 12-tap vertical filter for one output row (per lane): widening multiply-
// accumulate of 12 source rows with y_filter[0..11] into 32 bits, round by
// (2*FILTER_BITS - ROUND0_BITS) with vert_const as the rounding offset,
// subtract sub_const (the offset carried over from the horizontal pass),
// clamp to [0, 255], and narrow to u8.
static inline vuint8mf2_t convolve12_2d_v_rvv(
    const vint16m1_t s0, const vint16m1_t s1, const vint16m1_t s2,
    const vint16m1_t s3, const vint16m1_t s4, const vint16m1_t s5,
    const vint16m1_t s6, const vint16m1_t s7, const vint16m1_t s8,
    const vint16m1_t s9, const vint16m1_t s10, const vint16m1_t s11,
    const int16_t *y_filter, const int16_t sub_const, const int vert_const,
    size_t vl) {
  vint32m2_t sum = __riscv_vwmul_vx_i32m2(s0, y_filter[0], vl);
  sum = __riscv_vwmacc_vx_i32m2(sum, y_filter[1], s1, vl);
  sum = __riscv_vwmacc_vx_i32m2(sum, y_filter[2], s2, vl);
  sum = __riscv_vwmacc_vx_i32m2(sum, y_filter[3], s3, vl);
  sum = __riscv_vwmacc_vx_i32m2(sum, y_filter[4], s4, vl);
  sum = __riscv_vwmacc_vx_i32m2(sum, y_filter[5], s5, vl);
  sum = __riscv_vwmacc_vx_i32m2(sum, y_filter[6], s6, vl);
  sum = __riscv_vwmacc_vx_i32m2(sum, y_filter[7], s7, vl);
  sum = __riscv_vwmacc_vx_i32m2(sum, y_filter[8], s8, vl);
  sum = __riscv_vwmacc_vx_i32m2(sum, y_filter[9], s9, vl);
  sum = __riscv_vwmacc_vx_i32m2(sum, y_filter[10], s10, vl);
  sum = __riscv_vwmacc_vx_i32m2(sum, y_filter[11], s11, vl);
  // Add rounding offset before the arithmetic right shift below.
  sum = __riscv_vadd_vx_i32m2(sum, vert_const, vl);

  vint16m1_t i16_sum =
      __riscv_vnsra_wx_i16m1(sum, ((FILTER_BITS << 1) - ROUND0_BITS), vl);
  // Remove the horizontal-pass offset, then clamp to the 8-bit pixel range.
  i16_sum = __riscv_vsub_vx_i16m1(i16_sum, sub_const, vl);
  vint16m1_t iclip_sum =
      __riscv_vmin_vx_i16m1(__riscv_vmax_vx_i16m1(i16_sum, 0, vl), 255, vl);

  return __riscv_vncvt_x_x_w_u8mf2(
      __riscv_vreinterpret_v_i16m1_u16m1(iclip_sum), vl);
}

// Vertical 12-tap pass over a w x h block of horizontal-pass int16 output.
// src_ptr points at the first of the 11 rows of context above the block.
// Processes 4 output rows per iteration; h is assumed to be a multiple of 4
// and, in the wide path, w a multiple of vl (loop exits test != 0).
static inline void convolve_2d_sr_vert_12tap_rvv(
    int16_t *src_ptr, int src_stride, uint8_t *dst_ptr, int dst_stride, int w,
    int h, const int16_t *y_filter_ptr, size_t vl) {
  // Rounding offset: half of the divisor implied by the final shift.
  const int vert_const = (1 << ((FILTER_BITS << 1) - ROUND0_BITS)) >> 1;
  const int16_t sub_const = 1 << FILTER_BITS;

  if (w == 4) {
    // Each vector holds two 4-sample rows, so double the element count.
    vl = vl << 1;

    // Prime the 12-row sliding window (rows 0..10; each sN holds rows N
    // and N+1, advancing one row per load).
    vint16m1_t s0 = load_strided_i16_4xN(src_ptr, src_stride, vl);
    src_ptr += src_stride;
    vint16m1_t s1 = load_strided_i16_4xN(src_ptr, src_stride, vl);
    src_ptr += src_stride;
    vint16m1_t s2 = load_strided_i16_4xN(src_ptr, src_stride, vl);
    src_ptr += src_stride;
    vint16m1_t s3 = load_strided_i16_4xN(src_ptr, src_stride, vl);
    src_ptr += src_stride;
    vint16m1_t s4 = load_strided_i16_4xN(src_ptr, src_stride, vl);
    src_ptr += src_stride;
    vint16m1_t s5 = load_strided_i16_4xN(src_ptr, src_stride, vl);
    src_ptr += src_stride;
    vint16m1_t s6 = load_strided_i16_4xN(src_ptr, src_stride, vl);
    src_ptr += src_stride;
    vint16m1_t s7 = load_strided_i16_4xN(src_ptr, src_stride, vl);
    src_ptr += src_stride;
    vint16m1_t s8 = load_strided_i16_4xN(src_ptr, src_stride, vl);
    src_ptr += src_stride;
    vint16m1_t s9 = load_strided_i16_4xN(src_ptr, src_stride, vl);
    src_ptr += src_stride;

    do {
      vint16m1_t s10 = load_strided_i16_4xN(src_ptr, src_stride, vl);
      src_ptr += src_stride;
      vint16m1_t s11 = load_strided_i16_4xN(src_ptr, src_stride, vl);
      src_ptr += src_stride;
      vint16m1_t s12 = load_strided_i16_4xN(src_ptr, src_stride, vl);
      src_ptr += src_stride;
      vint16m1_t s13 = load_strided_i16_4xN(src_ptr, src_stride, vl);
      src_ptr += src_stride;

      // d0 covers output rows 0-1, d1 rows 2-3 (two rows per vector).
      vuint8mf2_t d0 =
          convolve12_2d_v_rvv(s0, s1, s2, s3, s4, s5, s6, s7, s8, s9, s10, s11,
                              y_filter_ptr, sub_const, vert_const, vl);
      vuint8mf2_t d1 =
          convolve12_2d_v_rvv(s2, s3, s4, s5, s6, s7, s8, s9, s10, s11, s12,
                              s13, y_filter_ptr, sub_const, vert_const, vl);

      store_strided_u8_4xN(dst_ptr, d0, dst_stride, vl);
      dst_ptr += dst_stride << 1;
      store_strided_u8_4xN(dst_ptr, d1, dst_stride, vl);
      dst_ptr += dst_stride << 1;

      // Rotate the window down by the 4 rows just consumed.
      s0 = s4;
      s1 = s5;
      s2 = s6;
      s3 = s7;
      s4 = s8;
      s5 = s9;
      s6 = s10;
      s7 = s11;
      s8 = s12;
      s9 = s13;

      h -= 4;
    } while (h != 0);
  } else {
    // Wide path: process the block in vl-wide column strips.
    do {
      int height = h;
      int16_t *s = src_ptr;
      uint8_t *d = dst_ptr;

      // Prime rows 0..10 of the 12-row window for this strip.
      vint16m1_t s0 = __riscv_vle16_v_i16m1(s, vl);
      s += src_stride;
      vint16m1_t s1 = __riscv_vle16_v_i16m1(s, vl);
      s += src_stride;
      vint16m1_t s2 = __riscv_vle16_v_i16m1(s, vl);
      s += src_stride;
      vint16m1_t s3 = __riscv_vle16_v_i16m1(s, vl);
      s += src_stride;
      vint16m1_t s4 = __riscv_vle16_v_i16m1(s, vl);
      s += src_stride;
      vint16m1_t s5 = __riscv_vle16_v_i16m1(s, vl);
      s += src_stride;
      vint16m1_t s6 = __riscv_vle16_v_i16m1(s, vl);
      s += src_stride;
      vint16m1_t s7 = __riscv_vle16_v_i16m1(s, vl);
      s += src_stride;
      vint16m1_t s8 = __riscv_vle16_v_i16m1(s, vl);
      s += src_stride;
      vint16m1_t s9 = __riscv_vle16_v_i16m1(s, vl);
      s += src_stride;
      vint16m1_t s10 = __riscv_vle16_v_i16m1(s, vl);
      s += src_stride;

      do {
        vint16m1_t s11 = __riscv_vle16_v_i16m1(s, vl);
        s += src_stride;
        vint16m1_t s12 = __riscv_vle16_v_i16m1(s, vl);
        s += src_stride;
        vint16m1_t s13 = __riscv_vle16_v_i16m1(s, vl);
        s += src_stride;
        vint16m1_t s14 = __riscv_vle16_v_i16m1(s, vl);
        s += src_stride;

        // Four consecutive output rows from four shifted 12-row windows.
        vuint8mf2_t d0 =
            convolve12_2d_v_rvv(s0, s1, s2, s3, s4, s5, s6, s7, s8, s9, s10,
                                s11, y_filter_ptr, sub_const, vert_const, vl);
        vuint8mf2_t d1 =
            convolve12_2d_v_rvv(s1, s2, s3, s4, s5, s6, s7, s8, s9, s10, s11,
                                s12, y_filter_ptr, sub_const, vert_const, vl);
        vuint8mf2_t d2 =
            convolve12_2d_v_rvv(s2, s3, s4, s5, s6, s7, s8, s9, s10, s11, s12,
                                s13, y_filter_ptr, sub_const, vert_const, vl);
        vuint8mf2_t d3 =
            convolve12_2d_v_rvv(s3, s4, s5, s6, s7, s8, s9, s10, s11, s12, s13,
                                s14, y_filter_ptr, sub_const, vert_const, vl);

        __riscv_vse8_v_u8mf2(d, d0, vl);
        d += dst_stride;
        __riscv_vse8_v_u8mf2(d, d1, vl);
        d += dst_stride;
        __riscv_vse8_v_u8mf2(d, d2, vl);
        d += dst_stride;
        __riscv_vse8_v_u8mf2(d, d3, vl);
        d += dst_stride;

        // Rotate the window down by 4 rows.
        s0 = s4;
        s1 = s5;
        s2 = s6;
        s3 = s7;
        s4 = s8;
        s5 = s9;
        s6 = s10;
        s7 = s11;
        s8 = s12;
        s9 = s13;
        s10 = s14;

        height -= 4;
      } while (height != 0);

      // Advance to the next vl-wide column strip.
      src_ptr += vl;
      dst_ptr += vl;
      w -= vl;
    } while (w != 0);
  }
}

// 8-tap vertical filter for one output row (per lane); same round / offset-
// removal / clamp / narrow sequence as convolve12_2d_v_rvv, with 8 taps.
static inline vuint8mf2_t convolve8_2d_v_rvv(
    const vint16m1_t s0, const vint16m1_t s1, const vint16m1_t s2,
    const vint16m1_t s3, const vint16m1_t s4, const vint16m1_t s5,
    const vint16m1_t s6, const vint16m1_t s7, const int16_t *y_filter,
    const int16_t sub_const, const int vert_const, size_t vl) {
  vint32m2_t sum = __riscv_vwmul_vx_i32m2(s0, y_filter[0], vl);
  sum = __riscv_vwmacc_vx_i32m2(sum, y_filter[1], s1, vl);
  sum = __riscv_vwmacc_vx_i32m2(sum, y_filter[2], s2, vl);
  sum = __riscv_vwmacc_vx_i32m2(sum, y_filter[3], s3, vl);
  sum = __riscv_vwmacc_vx_i32m2(sum, y_filter[4], s4, vl);
  sum = __riscv_vwmacc_vx_i32m2(sum, y_filter[5], s5, vl);
  sum = __riscv_vwmacc_vx_i32m2(sum, y_filter[6], s6, vl);
  sum = __riscv_vwmacc_vx_i32m2(sum, y_filter[7], s7, vl);
  // Round, drop the horizontal-pass offset, clamp to [0, 255], narrow.
  sum = __riscv_vadd_vx_i32m2(sum, vert_const, vl);

  vint16m1_t i16_sum =
      __riscv_vnsra_wx_i16m1(sum, ((FILTER_BITS << 1) - ROUND0_BITS), vl);
  i16_sum = __riscv_vsub_vx_i16m1(i16_sum, sub_const, vl);
  vint16m1_t iclip_sum =
      __riscv_vmin_vx_i16m1(__riscv_vmax_vx_i16m1(i16_sum, 0, vl), 255, vl);

  return __riscv_vncvt_x_x_w_u8mf2(
      __riscv_vreinterpret_v_i16m1_u16m1(iclip_sum), vl);
}

// Vertical 8-tap pass. Narrow path (w <= 4) emits 2 rows per iteration
// (h assumed to be a multiple of 2); wide path emits 1 row per iteration in
// vl-wide column strips (w assumed to be a multiple of vl).
static inline void convolve_2d_sr_vert_8tap_rvv(
    int16_t *src_ptr, int src_stride, uint8_t *dst_ptr, int dst_stride, int w,
    int h, const int16_t *y_filter_ptr, size_t vl) {
  const int vert_const = (1 << ((FILTER_BITS << 1) - ROUND0_BITS)) >> 1;
  const int16_t sub_const = 1 << FILTER_BITS;

  if (w <= 4) {
    // Two rows per vector.
    vl = vl << 1;

    // Prime rows 0..6 of the 8-row window (each sN holds rows N and N+1).
    vint16m1_t s0 = load_strided_i16_4xN(src_ptr, src_stride, vl);
    src_ptr += src_stride;
    vint16m1_t s1 = load_strided_i16_4xN(src_ptr, src_stride, vl);
    src_ptr += src_stride;
    vint16m1_t s2 = load_strided_i16_4xN(src_ptr, src_stride, vl);
    src_ptr += src_stride;
    vint16m1_t s3 = load_strided_i16_4xN(src_ptr, src_stride, vl);
    src_ptr += src_stride;
    vint16m1_t s4 = load_strided_i16_4xN(src_ptr, src_stride, vl);
    src_ptr += src_stride;
    vint16m1_t s5 = load_strided_i16_4xN(src_ptr, src_stride, vl);
    src_ptr += src_stride;

    do {
      vint16m1_t s6 = load_strided_i16_4xN(src_ptr, src_stride, vl);
      src_ptr += src_stride;
      vint16m1_t s7 = load_strided_i16_4xN(src_ptr, src_stride, vl);
      src_ptr += src_stride;

      vuint8mf2_t d0 =
          convolve8_2d_v_rvv(s0, s1, s2, s3, s4, s5, s6, s7, y_filter_ptr,
                             sub_const, vert_const, vl);

      store_strided_u8_4xN(dst_ptr, d0, dst_stride, vl);
      dst_ptr += dst_stride << 1;

      // Rotate the window down by the 2 rows just produced.
      s0 = s2;
      s1 = s3;
      s2 = s4;
      s3 = s5;
      s4 = s6;
      s5 = s7;

      h -= 2;
    } while (h != 0);
  } else {
    do {
      int height = h;
      int16_t *s = src_ptr;
      uint8_t *d = dst_ptr;

      // Prime rows 0..6 for this column strip.
      vint16m1_t s0 = __riscv_vle16_v_i16m1(s, vl);
      s += src_stride;
      vint16m1_t s1 = __riscv_vle16_v_i16m1(s, vl);
      s += src_stride;
      vint16m1_t s2 = __riscv_vle16_v_i16m1(s, vl);
      s += src_stride;
      vint16m1_t s3 = __riscv_vle16_v_i16m1(s, vl);
      s += src_stride;
      vint16m1_t s4 = __riscv_vle16_v_i16m1(s, vl);
      s += src_stride;
      vint16m1_t s5 = __riscv_vle16_v_i16m1(s, vl);
      s += src_stride;
      vint16m1_t s6 = __riscv_vle16_v_i16m1(s, vl);
      s += src_stride;

      do {
        vint16m1_t s7 = __riscv_vle16_v_i16m1(s, vl);
        vuint8mf2_t d0 =
            convolve8_2d_v_rvv(s0, s1, s2, s3, s4, s5, s6, s7, y_filter_ptr,
                               sub_const, vert_const, vl);
        __riscv_vse8_v_u8mf2(d, d0, vl);

        // Rotate the window down by one row.
        s0 = s1;
        s1 = s2;
        s2 = s3;
        s3 = s4;
        s4 = s5;
        s5 = s6;
        s6 = s7;
        s += src_stride;
        d += dst_stride;
        height--;
      } while (height != 0);

      src_ptr += vl;
      dst_ptr += vl;
      w -= vl;
    } while (w != 0);
  }
}

// 6-tap vertical filter for one output row (per lane); same round / offset-
// removal / clamp / narrow sequence as the 12- and 8-tap variants.
static inline vuint8mf2_t convolve6_2d_v_rvv(
    const vint16m1_t s0, const vint16m1_t s1, const vint16m1_t s2,
    const vint16m1_t s3, const vint16m1_t s4, const vint16m1_t s5,
    const int16_t *y_filter, const int16_t sub_const, const int vert_const,
    size_t vl) {
  vint32m2_t sum = __riscv_vwmul_vx_i32m2(s0, y_filter[0], vl);
  sum = __riscv_vwmacc_vx_i32m2(sum, y_filter[1], s1, vl);
  sum = __riscv_vwmacc_vx_i32m2(sum, y_filter[2], s2, vl);
  sum = __riscv_vwmacc_vx_i32m2(sum, y_filter[3], s3, vl);
  sum = __riscv_vwmacc_vx_i32m2(sum, y_filter[4], s4, vl);
  sum = __riscv_vwmacc_vx_i32m2(sum, y_filter[5], s5, vl);
  sum = __riscv_vadd_vx_i32m2(sum, vert_const, vl);

  vint16m1_t i16_sum =
      __riscv_vnsra_wx_i16m1(sum, ((FILTER_BITS << 1) - ROUND0_BITS), vl);
  i16_sum = __riscv_vsub_vx_i16m1(i16_sum, sub_const, vl);
  vint16m1_t iclip_sum =
      __riscv_vmin_vx_i16m1(__riscv_vmax_vx_i16m1(i16_sum, 0, vl), 255, vl);

  return __riscv_vncvt_x_x_w_u8mf2(
      __riscv_vreinterpret_v_i16m1_u16m1(iclip_sum), vl);
}

// Vertical 6-tap pass. Processes 4 output rows per iteration in both paths
// (h assumed to be a multiple of 4; wide path assumes w a multiple of vl).
static inline void convolve_2d_sr_vert_6tap_rvv(
    int16_t *src_ptr, int src_stride, uint8_t *dst_ptr, int dst_stride, int w,
    int h, const int16_t *y_filter_ptr, size_t vl) {
  const int vert_const = (1 << ((FILTER_BITS << 1) - ROUND0_BITS)) >> 1;
  const int16_t sub_const = 1 << FILTER_BITS;

  // The 6 non-zero taps sit at offset 1 within the 8-tap filter array.
  const int16_t *filter = y_filter_ptr + 1;

  if (w <= 4) {
    // Two rows per vector.
    vl = vl << 1;

    // Prime rows 0..4 of the 6-row window (each sN holds rows N and N+1).
    vint16m1_t s0 = load_strided_i16_4xN(src_ptr, src_stride, vl);
    src_ptr += src_stride;
    vint16m1_t s1 = load_strided_i16_4xN(src_ptr, src_stride, vl);
    src_ptr += src_stride;
    vint16m1_t s2 = load_strided_i16_4xN(src_ptr, src_stride, vl);
    src_ptr += src_stride;
    vint16m1_t s3 = load_strided_i16_4xN(src_ptr, src_stride, vl);
    src_ptr += src_stride;

    do {
      vint16m1_t s4 = load_strided_i16_4xN(src_ptr, src_stride, vl);
      src_ptr += src_stride;
      vint16m1_t s5 = load_strided_i16_4xN(src_ptr, src_stride, vl);
      src_ptr += src_stride;
      vint16m1_t s6 = load_strided_i16_4xN(src_ptr, src_stride, vl);
      src_ptr += src_stride;
      vint16m1_t s7 = load_strided_i16_4xN(src_ptr, src_stride, vl);
      src_ptr += src_stride;

      // d0 covers output rows 0-1, d1 rows 2-3.
      vuint8mf2_t d0 = convolve6_2d_v_rvv(s0, s1, s2, s3, s4, s5, filter,
                                          sub_const, vert_const, vl);
      vuint8mf2_t d1 = convolve6_2d_v_rvv(s2, s3, s4, s5, s6, s7, filter,
                                          sub_const, vert_const, vl);

      store_strided_u8_4xN(dst_ptr, d0, dst_stride, vl);
      dst_ptr += dst_stride << 1;
      store_strided_u8_4xN(dst_ptr, d1, dst_stride, vl);
      dst_ptr += dst_stride << 1;

      // Rotate the window down by 4 rows.
      s0 = s4;
      s1 = s5;
      s2 = s6;
      s3 = s7;

      h -= 4;
    } while (h != 0);
  } else {
    do {
      int height = h;
      int16_t *s = src_ptr;
      uint8_t *d = dst_ptr;

      // Prime rows 0..4 for this column strip.
      vint16m1_t s0 = __riscv_vle16_v_i16m1(s, vl);
      s += src_stride;
      vint16m1_t s1 = __riscv_vle16_v_i16m1(s, vl);
      s += src_stride;
      vint16m1_t s2 = __riscv_vle16_v_i16m1(s, vl);
      s += src_stride;
      vint16m1_t s3 = __riscv_vle16_v_i16m1(s, vl);
      s += src_stride;
      vint16m1_t s4 = __riscv_vle16_v_i16m1(s, vl);
      s += src_stride;

      do {
        vint16m1_t s5 = __riscv_vle16_v_i16m1(s, vl);
        s += src_stride;
        vint16m1_t s6 = __riscv_vle16_v_i16m1(s, vl);
        s += src_stride;
        vint16m1_t s7 = __riscv_vle16_v_i16m1(s, vl);
        s += src_stride;
        vint16m1_t s8 = __riscv_vle16_v_i16m1(s, vl);
        s += src_stride;

        // Four consecutive output rows from four shifted 6-row windows.
        vuint8mf2_t d0 = convolve6_2d_v_rvv(s0, s1, s2, s3, s4, s5, filter,
                                            sub_const, vert_const, vl);
        vuint8mf2_t d1 = convolve6_2d_v_rvv(s1, s2, s3, s4, s5, s6, filter,
                                            sub_const, vert_const, vl);
        vuint8mf2_t d2 = convolve6_2d_v_rvv(s2, s3, s4, s5, s6, s7, filter,
                                            sub_const, vert_const, vl);
        vuint8mf2_t d3 = convolve6_2d_v_rvv(s3, s4, s5, s6, s7, s8, filter,
                                            sub_const, vert_const, vl);

        __riscv_vse8_v_u8mf2(d, d0, vl);
        d += dst_stride;
        __riscv_vse8_v_u8mf2(d, d1, vl);
        d += dst_stride;
        __riscv_vse8_v_u8mf2(d, d2, vl);
        d += dst_stride;
        __riscv_vse8_v_u8mf2(d, d3, vl);
        d += dst_stride;

        // Rotate the window down by 4 rows.
        s0 = s4;
        s1 = s5;
        s2 = s6;
        s3 = s7;
        s4 = s8;

        height -= 4;
      } while (height != 0);

      src_ptr += vl;
      dst_ptr += vl;
      w -= vl;
    } while (w != 0);
  }
}

// 4-tap vertical filter for one output row (per lane); same round / offset-
// removal / clamp / narrow sequence as the wider variants.
static inline vuint8mf2_t convolve4_2d_v_rvv(
    const vint16m1_t s0, const vint16m1_t s1, const vint16m1_t s2,
    const vint16m1_t s3, const int16_t *y_filter, const int16_t sub_const,
    const int vert_const, size_t vl) {
  vint32m2_t sum = __riscv_vwmul_vx_i32m2(s0, y_filter[0], vl);
  sum = __riscv_vwmacc_vx_i32m2(sum, y_filter[1], s1, vl);
  sum = __riscv_vwmacc_vx_i32m2(sum, y_filter[2], s2, vl);
  sum = __riscv_vwmacc_vx_i32m2(sum, y_filter[3], s3, vl);
  sum = __riscv_vadd_vx_i32m2(sum, vert_const, vl);

  vint16m1_t i16_sum =
      __riscv_vnsra_wx_i16m1(sum, ((FILTER_BITS << 1) - ROUND0_BITS), vl);
  i16_sum = __riscv_vsub_vx_i16m1(i16_sum, sub_const, vl);
  vint16m1_t iclip_sum =
      __riscv_vmin_vx_i16m1(__riscv_vmax_vx_i16m1(i16_sum, 0, vl), 255, vl);

  return __riscv_vncvt_x_x_w_u8mf2(
      __riscv_vreinterpret_v_i16m1_u16m1(iclip_sum), vl);
}

// Vertical 4-tap pass. Processes 4 output rows per iteration in both paths
// (h assumed to be a multiple of 4; wide path assumes w a multiple of vl).
static inline void convolve_2d_sr_vert_4tap_rvv(
    int16_t *src_ptr, int src_stride, uint8_t *dst_ptr, int dst_stride, int w,
    int h, const int16_t *y_filter_ptr, size_t vl) {
  const int vert_const = (1 << ((FILTER_BITS << 1) - ROUND0_BITS)) >> 1;
  const int16_t sub_const = 1 << FILTER_BITS;
  // Filter values are at offset 2
  const int16_t *filter = y_filter_ptr + 2;

  if (w <= 4) {
    // Two rows per vector.
    vl = vl << 1;

    // Prime rows 0..2 of the 4-row window (each sN holds rows N and N+1).
    vint16m1_t s0 = load_strided_i16_4xN(src_ptr, src_stride, vl);
    src_ptr += src_stride;
    vint16m1_t s1 = load_strided_i16_4xN(src_ptr, src_stride, vl);
    src_ptr += src_stride;

    do {
      vint16m1_t s2 = load_strided_i16_4xN(src_ptr, src_stride, vl);
      src_ptr += src_stride;
      vint16m1_t s3 = load_strided_i16_4xN(src_ptr, src_stride, vl);
      src_ptr += src_stride;
      vint16m1_t s4 = load_strided_i16_4xN(src_ptr, src_stride, vl);
      src_ptr += src_stride;
      vint16m1_t s5 = load_strided_i16_4xN(src_ptr, src_stride, vl);
      src_ptr += src_stride;

      // d0 covers output rows 0-1, d1 rows 2-3.
      vuint8mf2_t d0 =
          convolve4_2d_v_rvv(s0, s1, s2, s3, filter, sub_const, vert_const, vl);
      vuint8mf2_t d1 =
          convolve4_2d_v_rvv(s2, s3, s4, s5, filter, sub_const, vert_const, vl);

      store_strided_u8_4xN(dst_ptr, d0, dst_stride, vl);
      dst_ptr += dst_stride << 1;
      store_strided_u8_4xN(dst_ptr, d1, dst_stride, vl);
      dst_ptr += dst_stride << 1;

      // Rotate the window down by 4 rows.
      s0 = s4;
      s1 = s5;

      h -= 4;
    } while (h != 0);
  } else {
    do {
      int height = h;
      int16_t *s = src_ptr;
      uint8_t *d = dst_ptr;

      // Prime rows 0..2 for this column strip.
      vint16m1_t s0 = __riscv_vle16_v_i16m1(s, vl);
      s += src_stride;
      vint16m1_t s1 = __riscv_vle16_v_i16m1(s, vl);
      s += src_stride;
      vint16m1_t s2 = __riscv_vle16_v_i16m1(s, vl);
      s += src_stride;

      do {
        vint16m1_t s3 = __riscv_vle16_v_i16m1(s, vl);
        s += src_stride;
        vint16m1_t s4 = __riscv_vle16_v_i16m1(s, vl);
        s += src_stride;
        vint16m1_t s5 = __riscv_vle16_v_i16m1(s, vl);
        s += src_stride;
        vint16m1_t s6 = __riscv_vle16_v_i16m1(s, vl);
        s += src_stride;

        // Four consecutive output rows from four shifted 4-row windows.
        vuint8mf2_t d0 = convolve4_2d_v_rvv(s0, s1, s2, s3, filter, sub_const,
                                            vert_const, vl);
        vuint8mf2_t d1 = convolve4_2d_v_rvv(s1, s2, s3, s4, filter, sub_const,
                                            vert_const, vl);
        vuint8mf2_t d2 = convolve4_2d_v_rvv(s2, s3, s4, s5, filter, sub_const,
                                            vert_const, vl);
        vuint8mf2_t d3 = convolve4_2d_v_rvv(s3, s4, s5, s6, filter, sub_const,
                                            vert_const, vl);

        __riscv_vse8_v_u8mf2(d, d0, vl);
        d += dst_stride;
        __riscv_vse8_v_u8mf2(d, d1, vl);
        d += dst_stride;
        __riscv_vse8_v_u8mf2(d, d2, vl);
        d += dst_stride;
        __riscv_vse8_v_u8mf2(d, d3, vl);
        d += dst_stride;

        // Rotate the window down by 4 rows.
        s0 = s4;
        s1 = s5;
        s2 = s6;

        height -= 4;
      } while (height != 0);

      src_ptr += vl;
      dst_ptr += vl;
      w -= vl;
    } while (w != 0);
  }
}

#endif  // AOM_AV1_COMMON_RISCV_CONVOLVE_RVV_H_