convolve_rvv.c (67649B)
1 /* 2 * Copyright (c) 2025, Alliance for Open Media. All rights reserved. 3 * 4 * This source code is subject to the terms of the BSD 2 Clause License and 5 * the Alliance for Open Media Patent License 1.0. If the BSD 2 Clause License 6 * was not distributed with this source code in the LICENSE file, you can 7 * obtain it at www.aomedia.org/license/software. If the Alliance for Open 8 * Media Patent License 1.0 was not distributed with this source code in the 9 * PATENTS file, you can obtain it at www.aomedia.org/license/patent. 10 */ 11 12 #include <assert.h> 13 #include <riscv_vector.h> 14 15 #include "config/aom_config.h" 16 #include "config/av1_rtcd.h" 17 18 #include "aom_dsp/aom_dsp_common.h" 19 #include "aom_ports/mem.h" 20 #include "av1/common/convolve.h" 21 #include "av1/common/filter.h" 22 #include "av1/common/riscv/convolve_rvv.h" 23 24 static inline vuint8mf2_t convolve12_4_x_rvv( 25 const vint16m1_t s0, const vint16m1_t s1, const vint16m1_t s2, 26 const vint16m1_t s3, const vint16m1_t s4, const vint16m1_t s5, 27 const vint16m1_t s6, const vint16m1_t s7, const vint16m1_t s8, 28 const vint16m1_t s9, const vint16m1_t s10, const vint16m1_t s11, 29 const int16_t *filter, const int32_t horiz_const, size_t vl) { 30 vint32m2_t sum = __riscv_vwmul_vx_i32m2(s0, filter[0], vl); 31 sum = __riscv_vwmacc_vx_i32m2(sum, filter[1], s1, vl); 32 sum = __riscv_vwmacc_vx_i32m2(sum, filter[2], s2, vl); 33 sum = __riscv_vwmacc_vx_i32m2(sum, filter[3], s3, vl); 34 sum = __riscv_vwmacc_vx_i32m2(sum, filter[4], s4, vl); 35 sum = __riscv_vwmacc_vx_i32m2(sum, filter[5], s5, vl); 36 sum = __riscv_vwmacc_vx_i32m2(sum, filter[6], s6, vl); 37 sum = __riscv_vwmacc_vx_i32m2(sum, filter[7], s7, vl); 38 sum = __riscv_vwmacc_vx_i32m2(sum, filter[8], s8, vl); 39 sum = __riscv_vwmacc_vx_i32m2(sum, filter[9], s9, vl); 40 sum = __riscv_vwmacc_vx_i32m2(sum, filter[10], s10, vl); 41 sum = __riscv_vwmacc_vx_i32m2(sum, filter[11], s11, vl); 42 sum = __riscv_vwadd_wx_i32m2(sum, horiz_const + (1 << 
(FILTER_BITS - 1)), vl); 43 44 // Round and shift 45 vint16m1_t i16_sum = __riscv_vnsra_wx_i16m1(sum, FILTER_BITS, vl); 46 vint16m1_t iclip_sum = 47 __riscv_vmin_vx_i16m1(__riscv_vmax_vx_i16m1(i16_sum, 0, vl), 255, vl); 48 49 // Convert to 8-bit 50 return __riscv_vncvt_x_x_w_u8mf2( 51 __riscv_vreinterpret_v_i16m1_u16m1(iclip_sum), vl); 52 } 53 54 static inline void convolve_x_sr_12tap_rvv(const uint8_t *src_ptr, 55 int src_stride, uint8_t *dst_ptr, 56 const int dst_stride, int w, int h, 57 const int16_t *x_filter_ptr) { 58 const int32_t horiz_const = (1 << (ROUND0_BITS - 1)); 59 size_t vl = __riscv_vsetvl_e16m1(w); 60 61 do { 62 const uint8_t *s = src_ptr; 63 uint8_t *d = dst_ptr; 64 int width = w; 65 66 do { 67 // Load 68 vuint8mf2_t t0 = __riscv_vle8_v_u8mf2(s + 0, vl); 69 vuint8mf2_t t1 = __riscv_vle8_v_u8mf2(s + 1, vl); 70 vuint8mf2_t t2 = __riscv_vle8_v_u8mf2(s + 2, vl); 71 vuint8mf2_t t3 = __riscv_vle8_v_u8mf2(s + 3, vl); 72 vuint8mf2_t t4 = __riscv_vle8_v_u8mf2(s + 4, vl); 73 vuint8mf2_t t5 = __riscv_vle8_v_u8mf2(s + 5, vl); 74 vuint8mf2_t t6 = __riscv_vle8_v_u8mf2(s + 6, vl); 75 vuint8mf2_t t7 = __riscv_vle8_v_u8mf2(s + 7, vl); 76 vuint8mf2_t t8 = __riscv_vle8_v_u8mf2(s + 8, vl); 77 vuint8mf2_t t9 = __riscv_vle8_v_u8mf2(s + 9, vl); 78 vuint8mf2_t t10 = __riscv_vle8_v_u8mf2(s + 10, vl); 79 vuint8mf2_t t11 = __riscv_vle8_v_u8mf2(s + 11, vl); 80 81 // Convert to 16-bit integers 82 vint16m1_t s0 = 83 __riscv_vreinterpret_v_u16m1_i16m1(__riscv_vzext_vf2_u16m1(t0, vl)); 84 vint16m1_t s1 = 85 __riscv_vreinterpret_v_u16m1_i16m1(__riscv_vzext_vf2_u16m1(t1, vl)); 86 vint16m1_t s2 = 87 __riscv_vreinterpret_v_u16m1_i16m1(__riscv_vzext_vf2_u16m1(t2, vl)); 88 vint16m1_t s3 = 89 __riscv_vreinterpret_v_u16m1_i16m1(__riscv_vzext_vf2_u16m1(t3, vl)); 90 vint16m1_t s4 = 91 __riscv_vreinterpret_v_u16m1_i16m1(__riscv_vzext_vf2_u16m1(t4, vl)); 92 vint16m1_t s5 = 93 __riscv_vreinterpret_v_u16m1_i16m1(__riscv_vzext_vf2_u16m1(t5, vl)); 94 vint16m1_t s6 = 95 
__riscv_vreinterpret_v_u16m1_i16m1(__riscv_vzext_vf2_u16m1(t6, vl)); 96 vint16m1_t s7 = 97 __riscv_vreinterpret_v_u16m1_i16m1(__riscv_vzext_vf2_u16m1(t7, vl)); 98 vint16m1_t s8 = 99 __riscv_vreinterpret_v_u16m1_i16m1(__riscv_vzext_vf2_u16m1(t8, vl)); 100 vint16m1_t s9 = 101 __riscv_vreinterpret_v_u16m1_i16m1(__riscv_vzext_vf2_u16m1(t9, vl)); 102 vint16m1_t s10 = 103 __riscv_vreinterpret_v_u16m1_i16m1(__riscv_vzext_vf2_u16m1(t10, vl)); 104 vint16m1_t s11 = 105 __riscv_vreinterpret_v_u16m1_i16m1(__riscv_vzext_vf2_u16m1(t11, vl)); 106 107 // Perform convolution 108 vuint8mf2_t d0 = 109 convolve12_4_x_rvv(s0, s1, s2, s3, s4, s5, s6, s7, s8, s9, s10, s11, 110 x_filter_ptr, horiz_const, vl); 111 112 // Store result 113 __riscv_vse8_v_u8mf2(d, d0, vl); 114 115 s += vl; 116 d += vl; 117 width -= vl; 118 } while (width != 0); 119 src_ptr += src_stride; 120 dst_ptr += dst_stride; 121 } while (--h != 0); 122 } 123 124 static inline vuint8mf2_t convolve4_8_x_rvv( 125 const vint16m1_t s0, const vint16m1_t s1, const vint16m1_t s2, 126 const vint16m1_t s3, const int16_t *filter, const int16_t horiz_const, 127 size_t vl) { 128 vint16m1_t sum = __riscv_vmul_vx_i16m1(s0, filter[0], vl); 129 sum = __riscv_vmacc_vx_i16m1(sum, filter[1], s1, vl); 130 sum = __riscv_vmacc_vx_i16m1(sum, filter[2], s2, vl); 131 sum = __riscv_vmacc_vx_i16m1(sum, filter[3], s3, vl); 132 sum = __riscv_vadd_vx_i16m1(sum, horiz_const, vl); 133 134 // Round and shift 135 // We halved the filter values so -1 from right shift 136 vuint16m1_t d0 = 137 __riscv_vreinterpret_v_i16m1_u16m1(__riscv_vmax_vx_i16m1(sum, 0, vl)); 138 139 return __riscv_vnclipu_wx_u8mf2(d0, FILTER_BITS - 1, __RISCV_VXRM_RNU, vl); 140 } 141 142 static inline void load_u8_8x4(const uint8_t *s, const ptrdiff_t p, 143 vuint8mf2_t *const s0, vuint8mf2_t *const s1, 144 vuint8mf2_t *const s2, vuint8mf2_t *const s3, 145 size_t vl) { 146 *s0 = __riscv_vle8_v_u8mf2(s, vl); 147 s += p; 148 *s1 = __riscv_vle8_v_u8mf2(s, vl); 149 s += p; 150 *s2 = 
__riscv_vle8_v_u8mf2(s, vl); 151 s += p; 152 *s3 = __riscv_vle8_v_u8mf2(s, vl); 153 } 154 155 static inline void store_u8_8x2(uint8_t *s, ptrdiff_t p, const vuint8mf2_t s0, 156 const vuint8mf2_t s1, size_t vl) { 157 __riscv_vse8_v_u8mf2(s, s0, vl); 158 s += p; 159 __riscv_vse8_v_u8mf2(s, s1, vl); 160 } 161 162 static inline void convolve_x_sr_4tap_rvv(const uint8_t *src_ptr, 163 int src_stride, uint8_t *dst_ptr, 164 const int dst_stride, int w, int h, 165 const int16_t *x_filter_ptr) { 166 size_t vl; 167 const int16_t horiz_const = (1 << ((ROUND0_BITS - 1) - 1)); 168 169 // All filter values are even, halve to reduce intermediate precision 170 // requirements. 171 int16_t filter[4]; 172 for (int i = 0; i < 4; i++) filter[i] = x_filter_ptr[2 + i] >> 1; 173 174 if (w == 4) { 175 vl = 8; 176 do { 177 // Load 8 pixels for each row 178 vuint8mf2_t t00, t01, t02, t03; 179 t00 = load_strided_u8_4xN((uint8_t *)src_ptr + 0, src_stride, vl); 180 t01 = load_strided_u8_4xN((uint8_t *)src_ptr + 1, src_stride, vl); 181 t02 = load_strided_u8_4xN((uint8_t *)src_ptr + 2, src_stride, vl); 182 t03 = load_strided_u8_4xN((uint8_t *)src_ptr + 3, src_stride, vl); 183 184 // Convert to 16-bit integers 185 vint16m1_t s00, s01, s02, s03; 186 s00 = 187 __riscv_vreinterpret_v_u16m1_i16m1(__riscv_vzext_vf2_u16m1(t00, vl)); 188 s01 = 189 __riscv_vreinterpret_v_u16m1_i16m1(__riscv_vzext_vf2_u16m1(t01, vl)); 190 s02 = 191 __riscv_vreinterpret_v_u16m1_i16m1(__riscv_vzext_vf2_u16m1(t02, vl)); 192 s03 = 193 __riscv_vreinterpret_v_u16m1_i16m1(__riscv_vzext_vf2_u16m1(t03, vl)); 194 195 // Perform convolution 196 vuint8mf2_t d01 = 197 convolve4_8_x_rvv(s00, s01, s02, s03, filter, horiz_const, vl); 198 199 // Store result 200 store_strided_u8_4xN(dst_ptr + 0 * dst_stride, d01, dst_stride, vl); 201 202 src_ptr += 2 * src_stride; 203 dst_ptr += 2 * dst_stride; 204 h -= 2; 205 } while (h != 0); 206 } else { 207 vl = __riscv_vsetvl_e16m1(w); 208 do { 209 int width = w; 210 const uint8_t *s = src_ptr; 211 
uint8_t *d = dst_ptr; 212 213 do { 214 vuint8mf2_t t00, t01, t02, t03; 215 vuint8mf2_t t10, t11, t12, t13; 216 load_u8_8x4(s + 0 * src_stride, 1, &t00, &t01, &t02, &t03, vl); 217 load_u8_8x4(s + 1 * src_stride, 1, &t10, &t11, &t12, &t13, vl); 218 219 // Convert to 16-bit integers 220 vint16m1_t s00, s01, s02, s03; 221 s00 = __riscv_vreinterpret_v_u16m1_i16m1( 222 __riscv_vzext_vf2_u16m1(t00, vl)); 223 s01 = __riscv_vreinterpret_v_u16m1_i16m1( 224 __riscv_vzext_vf2_u16m1(t01, vl)); 225 s02 = __riscv_vreinterpret_v_u16m1_i16m1( 226 __riscv_vzext_vf2_u16m1(t02, vl)); 227 s03 = __riscv_vreinterpret_v_u16m1_i16m1( 228 __riscv_vzext_vf2_u16m1(t03, vl)); 229 230 vint16m1_t s10, s11, s12, s13; 231 s10 = __riscv_vreinterpret_v_u16m1_i16m1( 232 __riscv_vzext_vf2_u16m1(t10, vl)); 233 s11 = __riscv_vreinterpret_v_u16m1_i16m1( 234 __riscv_vzext_vf2_u16m1(t11, vl)); 235 s12 = __riscv_vreinterpret_v_u16m1_i16m1( 236 __riscv_vzext_vf2_u16m1(t12, vl)); 237 s13 = __riscv_vreinterpret_v_u16m1_i16m1( 238 __riscv_vzext_vf2_u16m1(t13, vl)); 239 240 // Perform convolution 241 vuint8mf2_t d0 = 242 convolve4_8_x_rvv(s00, s01, s02, s03, filter, horiz_const, vl); 243 vuint8mf2_t d1 = 244 convolve4_8_x_rvv(s10, s11, s12, s13, filter, horiz_const, vl); 245 246 // Store result 247 store_u8_8x2(d, dst_stride, d0, d1, vl); 248 249 s += vl; 250 d += vl; 251 width -= vl; 252 } while (width > 0); 253 src_ptr += 2 * src_stride; 254 dst_ptr += 2 * dst_stride; 255 h -= 2; 256 } while (h != 0); 257 } 258 } 259 260 static inline vuint8mf2_t convolve8_8_x_rvv( 261 const vint16m1_t s0, const vint16m1_t s1, const vint16m1_t s2, 262 const vint16m1_t s3, const vint16m1_t s4, const vint16m1_t s5, 263 const vint16m1_t s6, const vint16m1_t s7, const int16_t *filter, 264 const int16_t horiz_const, size_t vl) { 265 vint16m1_t sum = __riscv_vmul_vx_i16m1(s0, filter[0], vl); 266 sum = __riscv_vmacc_vx_i16m1(sum, filter[1], s1, vl); 267 sum = __riscv_vmacc_vx_i16m1(sum, filter[2], s2, vl); 268 sum = 
__riscv_vmacc_vx_i16m1(sum, filter[3], s3, vl); 269 sum = __riscv_vmacc_vx_i16m1(sum, filter[4], s4, vl); 270 sum = __riscv_vmacc_vx_i16m1(sum, filter[5], s5, vl); 271 sum = __riscv_vmacc_vx_i16m1(sum, filter[6], s6, vl); 272 sum = __riscv_vmacc_vx_i16m1(sum, filter[7], s7, vl); 273 sum = __riscv_vadd_vx_i16m1(sum, horiz_const, vl); 274 275 // Round and shift 276 // We halved the filter values so -1 from right shift 277 vuint16m1_t d0 = 278 __riscv_vreinterpret_v_i16m1_u16m1(__riscv_vmax_vx_i16m1(sum, 0, vl)); 279 280 return __riscv_vnclipu_wx_u8mf2(d0, FILTER_BITS - 1, __RISCV_VXRM_RNU, vl); 281 } 282 283 static inline void load_u8_8x8(const uint8_t *s, int p, vuint8mf2_t *const s0, 284 vuint8mf2_t *const s1, vuint8mf2_t *const s2, 285 vuint8mf2_t *const s3, vuint8mf2_t *const s4, 286 vuint8mf2_t *const s5, vuint8mf2_t *const s6, 287 vuint8mf2_t *const s7, size_t vl) { 288 *s0 = __riscv_vle8_v_u8mf2(s, vl); 289 s += p; 290 *s1 = __riscv_vle8_v_u8mf2(s, vl); 291 s += p; 292 *s2 = __riscv_vle8_v_u8mf2(s, vl); 293 s += p; 294 *s3 = __riscv_vle8_v_u8mf2(s, vl); 295 s += p; 296 *s4 = __riscv_vle8_v_u8mf2(s, vl); 297 s += p; 298 *s5 = __riscv_vle8_v_u8mf2(s, vl); 299 s += p; 300 *s6 = __riscv_vle8_v_u8mf2(s, vl); 301 s += p; 302 *s7 = __riscv_vle8_v_u8mf2(s, vl); 303 } 304 305 static inline void convolve_x_sr_8tap_rvv(const uint8_t *src_ptr, 306 int src_stride, uint8_t *dst_ptr, 307 const int dst_stride, int w, int h, 308 const int16_t *x_filter_ptr) { 309 // This shim of 1 << ((ROUND0_BITS - 1) - 1) enables us to use a single 310 // rounding right shift by FILTER_BITS - instead of a first rounding right 311 // shift by ROUND0_BITS, followed by second rounding right shift by 312 // FILTER_BITS - ROUND0_BITS. 313 // The outermost -1 is needed because we will halve the filter values. 314 const int32_t horiz_const = 1 << ((ROUND0_BITS - 1) - 1); 315 316 // Filter values are even so halve to reduce precision requirements. 
317 int16_t filter[8]; 318 for (int i = 0; i < 8; i++) filter[i] = x_filter_ptr[i] >> 1; 319 320 size_t vl = __riscv_vsetvl_e16m1(w); 321 while (h-- != 0) { 322 int width = w; 323 const uint8_t *s = src_ptr; 324 uint8_t *d = dst_ptr; 325 326 do { 327 // Load 328 vuint8mf2_t t0, t1, t2, t3, t4, t5, t6, t7; 329 load_u8_8x8(s, 1, &t0, &t1, &t2, &t3, &t4, &t5, &t6, &t7, vl); 330 331 // Convert to 16-bit integers 332 vint16m1_t s0 = 333 __riscv_vreinterpret_v_u16m1_i16m1(__riscv_vzext_vf2_u16m1(t0, vl)); 334 vint16m1_t s1 = 335 __riscv_vreinterpret_v_u16m1_i16m1(__riscv_vzext_vf2_u16m1(t1, vl)); 336 vint16m1_t s2 = 337 __riscv_vreinterpret_v_u16m1_i16m1(__riscv_vzext_vf2_u16m1(t2, vl)); 338 vint16m1_t s3 = 339 __riscv_vreinterpret_v_u16m1_i16m1(__riscv_vzext_vf2_u16m1(t3, vl)); 340 vint16m1_t s4 = 341 __riscv_vreinterpret_v_u16m1_i16m1(__riscv_vzext_vf2_u16m1(t4, vl)); 342 vint16m1_t s5 = 343 __riscv_vreinterpret_v_u16m1_i16m1(__riscv_vzext_vf2_u16m1(t5, vl)); 344 vint16m1_t s6 = 345 __riscv_vreinterpret_v_u16m1_i16m1(__riscv_vzext_vf2_u16m1(t6, vl)); 346 vint16m1_t s7 = 347 __riscv_vreinterpret_v_u16m1_i16m1(__riscv_vzext_vf2_u16m1(t7, vl)); 348 349 // Perform convolution 350 vuint8mf2_t d0 = convolve8_8_x_rvv(s0, s1, s2, s3, s4, s5, s6, s7, filter, 351 horiz_const, vl); 352 353 // Store result 354 __riscv_vse8_v_u8mf2(d, d0, vl); 355 356 s += vl; 357 d += vl; 358 width -= vl; 359 } while (width > 0); 360 src_ptr += src_stride; 361 dst_ptr += dst_stride; 362 } 363 } 364 365 void av1_convolve_x_sr_rvv(const uint8_t *src, int src_stride, uint8_t *dst, 366 int dst_stride, int w, int h, 367 const InterpFilterParams *filter_params_x, 368 const int subpel_x_qn, ConvolveParams *conv_params) { 369 if (w == 2 || h == 2) { 370 av1_convolve_x_sr_c(src, src_stride, dst, dst_stride, w, h, filter_params_x, 371 subpel_x_qn, conv_params); 372 return; 373 } 374 375 int filter_taps = get_filter_tap(filter_params_x, subpel_x_qn & SUBPEL_MASK); 376 const uint8_t horiz_offset = 
filter_params_x->taps / 2 - 1; 377 const uint8_t *src_rvv = src - horiz_offset; 378 379 const int16_t *x_filter_ptr = av1_get_interp_filter_subpel_kernel( 380 filter_params_x, subpel_x_qn & SUBPEL_MASK); 381 382 if (filter_taps > 8) { 383 convolve_x_sr_12tap_rvv(src_rvv, src_stride, dst, dst_stride, w, h, 384 x_filter_ptr); 385 return; 386 } 387 388 if (filter_taps <= 4) { 389 convolve_x_sr_4tap_rvv(src_rvv + 2, src_stride, dst, dst_stride, w, h, 390 x_filter_ptr); 391 return; 392 } 393 394 convolve_x_sr_8tap_rvv(src_rvv, src_stride, dst, dst_stride, w, h, 395 x_filter_ptr); 396 return; 397 } 398 399 static inline void store_u8_8x4(uint8_t *s, int p, const vuint8mf2_t s0, 400 const vuint8mf2_t s1, const vuint8mf2_t s2, 401 const vuint8mf2_t s3, size_t vl) { 402 __riscv_vse8_v_u8mf2(s, s0, vl); 403 s += p; 404 __riscv_vse8_v_u8mf2(s, s1, vl); 405 s += p; 406 __riscv_vse8_v_u8mf2(s, s2, vl); 407 s += p; 408 __riscv_vse8_v_u8mf2(s, s3, vl); 409 } 410 411 static inline vuint8mf2_t convolve4_8_y_rvv(const vint16m1_t s0, 412 const vint16m1_t s1, 413 const vint16m1_t s2, 414 const vint16m1_t s3, 415 const int16_t *filter, size_t vl) { 416 vint16m1_t sum = __riscv_vmul_vx_i16m1(s0, filter[0], vl); 417 sum = __riscv_vmacc_vx_i16m1(sum, filter[1], s1, vl); 418 sum = __riscv_vmacc_vx_i16m1(sum, filter[2], s2, vl); 419 sum = __riscv_vmacc_vx_i16m1(sum, filter[3], s3, vl); 420 421 // Round and shift 422 // We halved the filter values so -1 from right shift 423 vuint16m1_t d0 = 424 __riscv_vreinterpret_v_i16m1_u16m1(__riscv_vmax_vx_i16m1(sum, 0, vl)); 425 426 return __riscv_vnclipu_wx_u8mf2(d0, FILTER_BITS - 1, __RISCV_VXRM_RNU, vl); 427 } 428 429 static inline void convolve_y_sr_4tap_rvv(const uint8_t *src, 430 const int src_stride, uint8_t *dst, 431 const int dst_stride, int w, int h, 432 const int16_t *filter_y) { 433 const int16_t *filter = filter_y + 2; 434 435 if (w == 4) { 436 size_t vl = 8; 437 438 // Load initial data 439 vuint8mf2_t t01 = 440 
load_strided_u8_4xN((uint8_t *)src + 0 * src_stride, src_stride, vl); 441 vuint8mf2_t t12 = 442 load_strided_u8_4xN((uint8_t *)src + 1 * src_stride, src_stride, vl); 443 444 // Convert to 16-bit 445 vint16m1_t s01 = 446 __riscv_vreinterpret_v_u16m1_i16m1(__riscv_vzext_vf2_u16m1(t01, vl)); 447 vint16m1_t s12 = 448 __riscv_vreinterpret_v_u16m1_i16m1(__riscv_vzext_vf2_u16m1(t12, vl)); 449 450 src += 2 * src_stride; 451 452 do { 453 // Load next set of data 454 vuint8mf2_t t23 = 455 load_strided_u8_4xN((uint8_t *)src + 0 * src_stride, src_stride, vl); 456 vuint8mf2_t t34 = 457 load_strided_u8_4xN((uint8_t *)src + 1 * src_stride, src_stride, vl); 458 vuint8mf2_t t45 = 459 load_strided_u8_4xN((uint8_t *)src + 2 * src_stride, src_stride, vl); 460 vuint8mf2_t t56 = 461 load_strided_u8_4xN((uint8_t *)src + 3 * src_stride, src_stride, vl); 462 463 // Convert to 16-bit 464 vint16m1_t s23 = 465 __riscv_vreinterpret_v_u16m1_i16m1(__riscv_vzext_vf2_u16m1(t23, vl)); 466 vint16m1_t s34 = 467 __riscv_vreinterpret_v_u16m1_i16m1(__riscv_vzext_vf2_u16m1(t34, vl)); 468 vint16m1_t s45 = 469 __riscv_vreinterpret_v_u16m1_i16m1(__riscv_vzext_vf2_u16m1(t45, vl)); 470 vint16m1_t s56 = 471 __riscv_vreinterpret_v_u16m1_i16m1(__riscv_vzext_vf2_u16m1(t56, vl)); 472 473 // Perform convolution 474 vuint8mf2_t d01 = convolve4_8_y_rvv(s01, s12, s23, s34, filter, vl); 475 vuint8mf2_t d23 = convolve4_8_y_rvv(s23, s34, s45, s56, filter, vl); 476 477 // Store results 478 store_strided_u8_4xN(dst + 0 * dst_stride, d01, dst_stride, vl); 479 store_strided_u8_4xN(dst + 2 * dst_stride, d23, dst_stride, vl); 480 481 s01 = __riscv_vmv_v_v_i16m1(s45, vl); 482 s12 = __riscv_vmv_v_v_i16m1(s56, vl); 483 484 src += 4 * src_stride; 485 dst += 4 * dst_stride; 486 h -= 4; 487 } while (h != 0); 488 } else { 489 // Handle width > 4 case 490 size_t vl = __riscv_vsetvl_e16m1(w); 491 do { 492 // Load initial 3 rows of data 493 vuint8mf2_t t0 = __riscv_vle8_v_u8mf2(src + 0 * src_stride, vl); 494 vuint8mf2_t t1 = 
__riscv_vle8_v_u8mf2(src + 1 * src_stride, vl); 495 vuint8mf2_t t2 = __riscv_vle8_v_u8mf2(src + 2 * src_stride, vl); 496 497 // Convert to 16-bit 498 vint16m1_t s0 = 499 __riscv_vreinterpret_v_u16m1_i16m1(__riscv_vzext_vf2_u16m1(t0, vl)); 500 vint16m1_t s1 = 501 __riscv_vreinterpret_v_u16m1_i16m1(__riscv_vzext_vf2_u16m1(t1, vl)); 502 vint16m1_t s2 = 503 __riscv_vreinterpret_v_u16m1_i16m1(__riscv_vzext_vf2_u16m1(t2, vl)); 504 505 int height = h; 506 const uint8_t *s = src + 3 * src_stride; 507 uint8_t *d = dst; 508 509 do { 510 // Load next 4 rows of data 511 vuint8mf2_t t3; 512 load_u8_8x4(s, src_stride, &t0, &t1, &t2, &t3, vl); 513 514 // Convert to 16-bit 515 vint16m1_t s3 = 516 __riscv_vreinterpret_v_u16m1_i16m1(__riscv_vzext_vf2_u16m1(t0, vl)); 517 vint16m1_t s4 = 518 __riscv_vreinterpret_v_u16m1_i16m1(__riscv_vzext_vf2_u16m1(t1, vl)); 519 vint16m1_t s5 = 520 __riscv_vreinterpret_v_u16m1_i16m1(__riscv_vzext_vf2_u16m1(t2, vl)); 521 vint16m1_t s6 = 522 __riscv_vreinterpret_v_u16m1_i16m1(__riscv_vzext_vf2_u16m1(t3, vl)); 523 524 // Perform convolution 525 vuint8mf2_t d0 = convolve4_8_y_rvv(s0, s1, s2, s3, filter, vl); 526 vuint8mf2_t d1 = convolve4_8_y_rvv(s1, s2, s3, s4, filter, vl); 527 vuint8mf2_t d2 = convolve4_8_y_rvv(s2, s3, s4, s5, filter, vl); 528 vuint8mf2_t d3 = convolve4_8_y_rvv(s3, s4, s5, s6, filter, vl); 529 530 // Store results 531 store_u8_8x4(d, dst_stride, d0, d1, d2, d3, vl); 532 533 s0 = __riscv_vmv_v_v_i16m1(s4, vl); 534 s1 = __riscv_vmv_v_v_i16m1(s5, vl); 535 s2 = __riscv_vmv_v_v_i16m1(s6, vl); 536 537 s += 4 * src_stride; 538 d += 4 * dst_stride; 539 height -= 4; 540 } while (height != 0); 541 src += vl; 542 dst += vl; 543 w -= vl; 544 } while (w > 0); 545 } 546 } 547 548 static inline void load_u8_8x5(const uint8_t *s, int p, vuint8mf2_t *const s0, 549 vuint8mf2_t *const s1, vuint8mf2_t *const s2, 550 vuint8mf2_t *const s3, vuint8mf2_t *const s4, 551 size_t vl) { 552 *s0 = __riscv_vle8_v_u8mf2(s, vl); 553 s += p; 554 *s1 = 
__riscv_vle8_v_u8mf2(s, vl); 555 s += p; 556 *s2 = __riscv_vle8_v_u8mf2(s, vl); 557 s += p; 558 *s3 = __riscv_vle8_v_u8mf2(s, vl); 559 s += p; 560 *s4 = __riscv_vle8_v_u8mf2(s, vl); 561 } 562 563 static inline vuint8mf2_t convolve6_8_y_rvv( 564 const vint16m1_t s0, const vint16m1_t s1, const vint16m1_t s2, 565 const vint16m1_t s3, const vint16m1_t s4, const vint16m1_t s5, 566 const int16_t *filter, size_t vl) { 567 // Filter values at indices 0 and 7 are 0, so we start from index 1 568 vint16m1_t sum = __riscv_vmul_vx_i16m1(s0, filter[1], vl); 569 sum = __riscv_vmacc_vx_i16m1(sum, filter[2], s1, vl); 570 sum = __riscv_vmacc_vx_i16m1(sum, filter[3], s2, vl); 571 sum = __riscv_vmacc_vx_i16m1(sum, filter[4], s3, vl); 572 sum = __riscv_vmacc_vx_i16m1(sum, filter[5], s4, vl); 573 sum = __riscv_vmacc_vx_i16m1(sum, filter[6], s5, vl); 574 575 // Round and shift 576 // We halved the filter values so -1 from right shift 577 vuint16m1_t d0 = 578 __riscv_vreinterpret_v_i16m1_u16m1(__riscv_vmax_vx_i16m1(sum, 0, vl)); 579 580 return __riscv_vnclipu_wx_u8mf2(d0, FILTER_BITS - 1, __RISCV_VXRM_RNU, vl); 581 } 582 583 static inline void convolve_y_sr_6tap_rvv(const uint8_t *src_ptr, 584 int src_stride, uint8_t *dst_ptr, 585 const int dst_stride, int w, int h, 586 const int16_t *y_filter) { 587 size_t vl = __riscv_vsetvl_e16m1(w); 588 do { 589 const uint8_t *s = src_ptr; 590 uint8_t *d = dst_ptr; 591 int height = h; 592 593 // Load initial 5 rows of data 594 vuint8mf2_t t0, t1, t2, t3, t4; 595 load_u8_8x5(s, src_stride, &t0, &t1, &t2, &t3, &t4, vl); 596 597 // Convert to 16-bit 598 vint16m1_t s0 = 599 __riscv_vreinterpret_v_u16m1_i16m1(__riscv_vzext_vf2_u16m1(t0, vl)); 600 vint16m1_t s1 = 601 __riscv_vreinterpret_v_u16m1_i16m1(__riscv_vzext_vf2_u16m1(t1, vl)); 602 vint16m1_t s2 = 603 __riscv_vreinterpret_v_u16m1_i16m1(__riscv_vzext_vf2_u16m1(t2, vl)); 604 vint16m1_t s3 = 605 __riscv_vreinterpret_v_u16m1_i16m1(__riscv_vzext_vf2_u16m1(t3, vl)); 606 vint16m1_t s4 = 607 
__riscv_vreinterpret_v_u16m1_i16m1(__riscv_vzext_vf2_u16m1(t4, vl)); 608 609 s += 5 * src_stride; 610 611 do { 612 // Load next row of data 613 vuint8mf2_t t5 = __riscv_vle8_v_u8mf2(s + 0 * src_stride, vl); 614 vuint8mf2_t t6 = __riscv_vle8_v_u8mf2(s + 1 * src_stride, vl); 615 vuint8mf2_t t7 = __riscv_vle8_v_u8mf2(s + 2 * src_stride, vl); 616 vuint8mf2_t t8 = __riscv_vle8_v_u8mf2(s + 3 * src_stride, vl); 617 618 // Convert to 16-bit 619 vint16m1_t s5 = 620 __riscv_vreinterpret_v_u16m1_i16m1(__riscv_vzext_vf2_u16m1(t5, vl)); 621 vint16m1_t s6 = 622 __riscv_vreinterpret_v_u16m1_i16m1(__riscv_vzext_vf2_u16m1(t6, vl)); 623 vint16m1_t s7 = 624 __riscv_vreinterpret_v_u16m1_i16m1(__riscv_vzext_vf2_u16m1(t7, vl)); 625 vint16m1_t s8 = 626 __riscv_vreinterpret_v_u16m1_i16m1(__riscv_vzext_vf2_u16m1(t8, vl)); 627 628 // Perform convolution 629 vuint8mf2_t d0 = convolve6_8_y_rvv(s0, s1, s2, s3, s4, s5, y_filter, vl); 630 vuint8mf2_t d1 = convolve6_8_y_rvv(s1, s2, s3, s4, s5, s6, y_filter, vl); 631 vuint8mf2_t d2 = convolve6_8_y_rvv(s2, s3, s4, s5, s6, s7, y_filter, vl); 632 vuint8mf2_t d3 = convolve6_8_y_rvv(s3, s4, s5, s6, s7, s8, y_filter, vl); 633 634 // Store result 635 store_u8_8x4(d, dst_stride, d0, d1, d2, d3, vl); 636 637 // Update sliding window 638 s0 = __riscv_vmv_v_v_i16m1(s4, vl); 639 s1 = __riscv_vmv_v_v_i16m1(s5, vl); 640 s2 = __riscv_vmv_v_v_i16m1(s6, vl); 641 s3 = __riscv_vmv_v_v_i16m1(s7, vl); 642 s4 = __riscv_vmv_v_v_i16m1(s8, vl); 643 s += 4 * src_stride; 644 d += 4 * dst_stride; 645 height -= 4; 646 } while (height != 0); 647 src_ptr += vl; 648 dst_ptr += vl; 649 w -= vl; 650 } while (w > 0); 651 } 652 653 static inline void load_u8_8x7(const uint8_t *s, int p, vuint8mf2_t *const s0, 654 vuint8mf2_t *const s1, vuint8mf2_t *const s2, 655 vuint8mf2_t *const s3, vuint8mf2_t *const s4, 656 vuint8mf2_t *const s5, vuint8mf2_t *const s6, 657 size_t vl) { 658 *s0 = __riscv_vle8_v_u8mf2(s, vl); 659 s += p; 660 *s1 = __riscv_vle8_v_u8mf2(s, vl); 661 s += p; 662 *s2 = 
__riscv_vle8_v_u8mf2(s, vl); 663 s += p; 664 *s3 = __riscv_vle8_v_u8mf2(s, vl); 665 s += p; 666 *s4 = __riscv_vle8_v_u8mf2(s, vl); 667 s += p; 668 *s5 = __riscv_vle8_v_u8mf2(s, vl); 669 s += p; 670 *s6 = __riscv_vle8_v_u8mf2(s, vl); 671 } 672 673 static inline vuint8mf2_t convolve8_8_y_rvv( 674 const vint16m1_t s0, const vint16m1_t s1, const vint16m1_t s2, 675 const vint16m1_t s3, const vint16m1_t s4, const vint16m1_t s5, 676 const vint16m1_t s6, const vint16m1_t s7, const int16_t *filter, 677 size_t vl) { 678 vint16m1_t sum = __riscv_vmul_vx_i16m1(s0, filter[0], vl); 679 sum = __riscv_vmacc_vx_i16m1(sum, filter[1], s1, vl); 680 sum = __riscv_vmacc_vx_i16m1(sum, filter[2], s2, vl); 681 sum = __riscv_vmacc_vx_i16m1(sum, filter[3], s3, vl); 682 sum = __riscv_vmacc_vx_i16m1(sum, filter[4], s4, vl); 683 sum = __riscv_vmacc_vx_i16m1(sum, filter[5], s5, vl); 684 sum = __riscv_vmacc_vx_i16m1(sum, filter[6], s6, vl); 685 sum = __riscv_vmacc_vx_i16m1(sum, filter[7], s7, vl); 686 687 // Round and shift 688 // We halved the filter values so -1 from right shift 689 vuint16m1_t d0 = 690 __riscv_vreinterpret_v_i16m1_u16m1(__riscv_vmax_vx_i16m1(sum, 0, vl)); 691 692 return __riscv_vnclipu_wx_u8mf2(d0, FILTER_BITS - 1, __RISCV_VXRM_RNU, vl); 693 } 694 695 static inline void convolve_y_sr_8tap_rvv(const uint8_t *src_ptr, 696 int src_stride, uint8_t *dst_ptr, 697 const int dst_stride, int w, int h, 698 const int16_t *y_filter) { 699 size_t vl = __riscv_vsetvl_e16m1(w); 700 do { 701 const uint8_t *s = src_ptr; 702 uint8_t *d = dst_ptr; 703 int height = h; 704 705 // Load initial 7 rows of data 706 vuint8mf2_t t0, t1, t2, t3, t4, t5, t6; 707 load_u8_8x7(s, src_stride, &t0, &t1, &t2, &t3, &t4, &t5, &t6, vl); 708 709 // Convert to 16-bit 710 vint16m1_t s0 = 711 __riscv_vreinterpret_v_u16m1_i16m1(__riscv_vzext_vf2_u16m1(t0, vl)); 712 vint16m1_t s1 = 713 __riscv_vreinterpret_v_u16m1_i16m1(__riscv_vzext_vf2_u16m1(t1, vl)); 714 vint16m1_t s2 = 715 
__riscv_vreinterpret_v_u16m1_i16m1(__riscv_vzext_vf2_u16m1(t2, vl)); 716 vint16m1_t s3 = 717 __riscv_vreinterpret_v_u16m1_i16m1(__riscv_vzext_vf2_u16m1(t3, vl)); 718 vint16m1_t s4 = 719 __riscv_vreinterpret_v_u16m1_i16m1(__riscv_vzext_vf2_u16m1(t4, vl)); 720 vint16m1_t s5 = 721 __riscv_vreinterpret_v_u16m1_i16m1(__riscv_vzext_vf2_u16m1(t5, vl)); 722 vint16m1_t s6 = 723 __riscv_vreinterpret_v_u16m1_i16m1(__riscv_vzext_vf2_u16m1(t6, vl)); 724 725 s += 7 * src_stride; 726 727 do { 728 // Load next row 729 vuint8mf2_t t7 = __riscv_vle8_v_u8mf2(s + 0 * src_stride, vl); 730 vuint8mf2_t t8 = __riscv_vle8_v_u8mf2(s + 1 * src_stride, vl); 731 vuint8mf2_t t9 = __riscv_vle8_v_u8mf2(s + 2 * src_stride, vl); 732 vuint8mf2_t t10 = __riscv_vle8_v_u8mf2(s + 3 * src_stride, vl); 733 734 // Convert to 16-bit 735 vint16m1_t s7 = 736 __riscv_vreinterpret_v_u16m1_i16m1(__riscv_vzext_vf2_u16m1(t7, vl)); 737 vint16m1_t s8 = 738 __riscv_vreinterpret_v_u16m1_i16m1(__riscv_vzext_vf2_u16m1(t8, vl)); 739 vint16m1_t s9 = 740 __riscv_vreinterpret_v_u16m1_i16m1(__riscv_vzext_vf2_u16m1(t9, vl)); 741 vint16m1_t s10 = 742 __riscv_vreinterpret_v_u16m1_i16m1(__riscv_vzext_vf2_u16m1(t10, vl)); 743 744 // Perform 8-tap vertical convolution 745 vuint8mf2_t d0 = 746 convolve8_8_y_rvv(s0, s1, s2, s3, s4, s5, s6, s7, y_filter, vl); 747 vuint8mf2_t d1 = 748 convolve8_8_y_rvv(s1, s2, s3, s4, s5, s6, s7, s8, y_filter, vl); 749 vuint8mf2_t d2 = 750 convolve8_8_y_rvv(s2, s3, s4, s5, s6, s7, s8, s9, y_filter, vl); 751 vuint8mf2_t d3 = 752 convolve8_8_y_rvv(s3, s4, s5, s6, s7, s8, s9, s10, y_filter, vl); 753 754 // Store result 755 store_u8_8x4(d, dst_stride, d0, d1, d2, d3, vl); 756 757 // Update sliding window 758 s0 = __riscv_vmv_v_v_i16m1(s4, vl); 759 s1 = __riscv_vmv_v_v_i16m1(s5, vl); 760 s2 = __riscv_vmv_v_v_i16m1(s6, vl); 761 s3 = __riscv_vmv_v_v_i16m1(s7, vl); 762 s4 = __riscv_vmv_v_v_i16m1(s8, vl); 763 s5 = __riscv_vmv_v_v_i16m1(s9, vl); 764 s6 = __riscv_vmv_v_v_i16m1(s10, vl); 765 s += 4 * src_stride; 
766 d += 4 * dst_stride; 767 height -= 4; 768 } while (height > 0); 769 src_ptr += vl; 770 dst_ptr += vl; 771 w -= vl; 772 } while (w > 0); 773 } 774 775 static inline void load_u8_8x11(const uint8_t *s, int p, vuint8mf2_t *const s0, 776 vuint8mf2_t *const s1, vuint8mf2_t *const s2, 777 vuint8mf2_t *const s3, vuint8mf2_t *const s4, 778 vuint8mf2_t *const s5, vuint8mf2_t *const s6, 779 vuint8mf2_t *const s7, vuint8mf2_t *const s8, 780 vuint8mf2_t *const s9, vuint8mf2_t *const s10, 781 size_t vl) { 782 *s0 = __riscv_vle8_v_u8mf2(s, vl); 783 s += p; 784 *s1 = __riscv_vle8_v_u8mf2(s, vl); 785 s += p; 786 *s2 = __riscv_vle8_v_u8mf2(s, vl); 787 s += p; 788 *s3 = __riscv_vle8_v_u8mf2(s, vl); 789 s += p; 790 *s4 = __riscv_vle8_v_u8mf2(s, vl); 791 s += p; 792 *s5 = __riscv_vle8_v_u8mf2(s, vl); 793 s += p; 794 *s6 = __riscv_vle8_v_u8mf2(s, vl); 795 s += p; 796 *s7 = __riscv_vle8_v_u8mf2(s, vl); 797 s += p; 798 *s8 = __riscv_vle8_v_u8mf2(s, vl); 799 s += p; 800 *s9 = __riscv_vle8_v_u8mf2(s, vl); 801 s += p; 802 *s10 = __riscv_vle8_v_u8mf2(s, vl); 803 } 804 805 static inline vuint8mf2_t convolve12_8_y_rvv( 806 const vint16m1_t s0, const vint16m1_t s1, const vint16m1_t s2, 807 const vint16m1_t s3, const vint16m1_t s4, const vint16m1_t s5, 808 const vint16m1_t s6, const vint16m1_t s7, const vint16m1_t s8, 809 const vint16m1_t s9, const vint16m1_t s10, const vint16m1_t s11, 810 const int16_t *y_filter, size_t vl) { 811 // Initialize sum with first multiplication 812 vint32m2_t sum = __riscv_vwmul_vx_i32m2(s0, y_filter[0], vl); 813 sum = __riscv_vwmacc_vx_i32m2(sum, y_filter[1], s1, vl); 814 sum = __riscv_vwmacc_vx_i32m2(sum, y_filter[2], s2, vl); 815 sum = __riscv_vwmacc_vx_i32m2(sum, y_filter[3], s3, vl); 816 sum = __riscv_vwmacc_vx_i32m2(sum, y_filter[4], s4, vl); 817 sum = __riscv_vwmacc_vx_i32m2(sum, y_filter[5], s5, vl); 818 sum = __riscv_vwmacc_vx_i32m2(sum, y_filter[6], s6, vl); 819 sum = __riscv_vwmacc_vx_i32m2(sum, y_filter[7], s7, vl); 820 sum = 
__riscv_vwmacc_vx_i32m2(sum, y_filter[8], s8, vl);
  sum = __riscv_vwmacc_vx_i32m2(sum, y_filter[9], s9, vl);
  sum = __riscv_vwmacc_vx_i32m2(sum, y_filter[10], s10, vl);
  sum = __riscv_vwmacc_vx_i32m2(sum, y_filter[11], s11, vl);

  // Round, shift down by FILTER_BITS, then clamp to the 8-bit pixel range.
  sum = __riscv_vadd_vx_i32m2(sum, 1 << (FILTER_BITS - 1), vl);
  vint16m1_t i16_sum = __riscv_vnsra_wx_i16m1(sum, FILTER_BITS, vl);
  vint16m1_t iclip_sum =
      __riscv_vmin_vx_i16m1(__riscv_vmax_vx_i16m1(i16_sum, 0, vl), 255, vl);

  // Narrow the clamped 16-bit result back to 8-bit pixels.
  return __riscv_vncvt_x_x_w_u8mf2(
      __riscv_vreinterpret_v_i16m1_u16m1(iclip_sum), vl);
}

// Vertical 12-tap convolution over a w x h block. The block is processed in
// vl-wide column strips; each strip keeps a sliding window of eleven widened
// rows (s0..s10) and produces four output rows per inner-loop iteration.
// NOTE(review): the inner loop decrements height by 4 until it reaches 0, so
// h is assumed to be a multiple of 4 — confirm at call sites.
static inline void convolve_y_sr_12tap_rvv(const uint8_t *src_ptr,
                                           int src_stride, uint8_t *dst_ptr,
                                           const int dst_stride, int w, int h,
                                           const int16_t *y_filter) {
  size_t vl = __riscv_vsetvl_e16m1(w);
  do {
    const uint8_t *s = src_ptr;
    uint8_t *d = dst_ptr;
    int height = h;

    // Load initial 11 rows of data
    vuint8mf2_t t0, t1, t2, t3, t4, t5, t6, t7, t8, t9, t10;
    load_u8_8x11(s, src_stride, &t0, &t1, &t2, &t3, &t4, &t5, &t6, &t7, &t8,
                 &t9, &t10, vl);

    // Zero-extend the 8-bit rows to 16-bit for the widening MAC chain.
    vint16m1_t s0 =
        __riscv_vreinterpret_v_u16m1_i16m1(__riscv_vzext_vf2_u16m1(t0, vl));
    vint16m1_t s1 =
        __riscv_vreinterpret_v_u16m1_i16m1(__riscv_vzext_vf2_u16m1(t1, vl));
    vint16m1_t s2 =
        __riscv_vreinterpret_v_u16m1_i16m1(__riscv_vzext_vf2_u16m1(t2, vl));
    vint16m1_t s3 =
        __riscv_vreinterpret_v_u16m1_i16m1(__riscv_vzext_vf2_u16m1(t3, vl));
    vint16m1_t s4 =
        __riscv_vreinterpret_v_u16m1_i16m1(__riscv_vzext_vf2_u16m1(t4, vl));
    vint16m1_t s5 =
        __riscv_vreinterpret_v_u16m1_i16m1(__riscv_vzext_vf2_u16m1(t5, vl));
    vint16m1_t s6 =
        __riscv_vreinterpret_v_u16m1_i16m1(__riscv_vzext_vf2_u16m1(t6, vl));
    vint16m1_t s7 =
        __riscv_vreinterpret_v_u16m1_i16m1(__riscv_vzext_vf2_u16m1(t7, vl));
    vint16m1_t s8 =
        __riscv_vreinterpret_v_u16m1_i16m1(__riscv_vzext_vf2_u16m1(t8, vl));
    vint16m1_t s9 =
        __riscv_vreinterpret_v_u16m1_i16m1(__riscv_vzext_vf2_u16m1(t9, vl));
    vint16m1_t s10 =
        __riscv_vreinterpret_v_u16m1_i16m1(__riscv_vzext_vf2_u16m1(t10, vl));

    s += 11 * src_stride;

    do {
      // Load next 4 rows
      vuint8mf2_t t11 = __riscv_vle8_v_u8mf2(s + 0 * src_stride, vl);
      vuint8mf2_t t12 = __riscv_vle8_v_u8mf2(s + 1 * src_stride, vl);
      vuint8mf2_t t13 = __riscv_vle8_v_u8mf2(s + 2 * src_stride, vl);
      vuint8mf2_t t14 = __riscv_vle8_v_u8mf2(s + 3 * src_stride, vl);

      // Convert to 16-bit
      vint16m1_t s11 =
          __riscv_vreinterpret_v_u16m1_i16m1(__riscv_vzext_vf2_u16m1(t11, vl));
      vint16m1_t s12 =
          __riscv_vreinterpret_v_u16m1_i16m1(__riscv_vzext_vf2_u16m1(t12, vl));
      vint16m1_t s13 =
          __riscv_vreinterpret_v_u16m1_i16m1(__riscv_vzext_vf2_u16m1(t13, vl));
      vint16m1_t s14 =
          __riscv_vreinterpret_v_u16m1_i16m1(__riscv_vzext_vf2_u16m1(t14, vl));

      // Perform the 12-tap convolution for four successive output rows.
      vuint8mf2_t d0 = convolve12_8_y_rvv(s0, s1, s2, s3, s4, s5, s6, s7, s8,
                                          s9, s10, s11, y_filter, vl);
      vuint8mf2_t d1 = convolve12_8_y_rvv(s1, s2, s3, s4, s5, s6, s7, s8, s9,
                                          s10, s11, s12, y_filter, vl);
      vuint8mf2_t d2 = convolve12_8_y_rvv(s2, s3, s4, s5, s6, s7, s8, s9, s10,
                                          s11, s12, s13, y_filter, vl);
      vuint8mf2_t d3 = convolve12_8_y_rvv(s3, s4, s5, s6, s7, s8, s9, s10, s11,
                                          s12, s13, s14, y_filter, vl);

      // Store results
      store_u8_8x4(d, dst_stride, d0, d1, d2, d3, vl);

      // Slide the 11-row window down by four rows for the next iteration.
      s0 = __riscv_vmv_v_v_i16m1(s4, vl);
      s1 = __riscv_vmv_v_v_i16m1(s5, vl);
      s2 = __riscv_vmv_v_v_i16m1(s6, vl);
      s3 = __riscv_vmv_v_v_i16m1(s7, vl);
      s4 = __riscv_vmv_v_v_i16m1(s8, vl);
      s5 = __riscv_vmv_v_v_i16m1(s9, vl);
      s6 = __riscv_vmv_v_v_i16m1(s10, vl);
      s7 = __riscv_vmv_v_v_i16m1(s11, vl);
      s8 = __riscv_vmv_v_v_i16m1(s12, vl);
      s9 = __riscv_vmv_v_v_i16m1(s13, vl);
      s10 = __riscv_vmv_v_v_i16m1(s14, vl);
      s += 4 * src_stride;
      d += 4 * dst_stride;
      height -= 4;
    } while (height != 0);
    src_ptr += vl;
    dst_ptr += vl;
    w -= vl;
  } while (w > 0);
}

// AV1 vertical-only single-reference convolution, RVV path. 2-wide or 2-high
// blocks fall back to the C implementation; otherwise dispatch on the number
// of filter taps.
void av1_convolve_y_sr_rvv(const uint8_t *src, int src_stride, uint8_t *dst,
                           int dst_stride, int w, int h,
                           const InterpFilterParams *filter_params_y,
                           const int subpel_y_qn) {
  if (w == 2 || h == 2) {
    av1_convolve_y_sr_c(src, src_stride, dst, dst_stride, w, h, filter_params_y,
                        subpel_y_qn);
    return;
  }

  const int y_filter_taps = get_filter_tap(filter_params_y, subpel_y_qn);
  // Filters shorter than 4 taps are treated as 4-tap for the source offset.
  const int clamped_y_taps = y_filter_taps < 4 ? 4 : y_filter_taps;
  const int vert_offset = clamped_y_taps / 2 - 1;
  const uint8_t *src_rvv = src - vert_offset * src_stride;
  const int16_t *y_filter_ptr = av1_get_interp_filter_subpel_kernel(
      filter_params_y, subpel_y_qn & SUBPEL_MASK);

  if (y_filter_taps > 8) {
    // 12-tap path uses the full-precision (un-halved) filter.
    convolve_y_sr_12tap_rvv(src_rvv, src_stride, dst, dst_stride, w, h,
                            y_filter_ptr);
    return;
  }

  // Filter values are even so halve to reduce precision requirements.
// In RVV, we need to create a temporary array for the halved filter values.
  int16_t halved_filter[8];
  for (int i = 0; i < 8; i++) {
    halved_filter[i] = y_filter_ptr[i] >> 1;
  }

  // Dispatch on tap count; all of these paths consume the halved filter.
  if (y_filter_taps <= 4) {
    convolve_y_sr_4tap_rvv(src_rvv, src_stride, dst, dst_stride, w, h,
                           halved_filter);
  } else if (y_filter_taps == 6) {
    convolve_y_sr_6tap_rvv(src_rvv, src_stride, dst, dst_stride, w, h,
                           halved_filter);
  } else {
    convolve_y_sr_8tap_rvv(src_rvv, src_stride, dst, dst_stride, w, h,
                           halved_filter);
  }
}

// Horizontal 12-tap filter for the 2D first pass, 4-wide variant. The twelve
// taps are distributed across the low/high halves of six vector filters (see
// the filter0..filter5 tables built in convolve_2d_sr_horiz_12tap_rvv); after
// the widening MACs the upper half of the register is slid down and added
// onto the lower half, so the low vl/2 lanes carry the complete 12-tap sums.
static inline vint16m1_t convolve12_4_2d_h_rvv(
    const vint16m1_t s0, const vint16m1_t s1, const vint16m1_t s2,
    const vint16m1_t s3, const vint16m1_t s4, const vint16m1_t s5,
    const vint16m1_t filter0, const vint16m1_t filter1,
    const vint16m1_t filter2, const vint16m1_t filter3,
    const vint16m1_t filter4, const vint16m1_t filter5,
    const int16_t horiz_const, size_t vl) {
  vint32m2_t sum = __riscv_vwmul_vv_i32m2(s0, filter0, vl);
  sum = __riscv_vwmacc_vv_i32m2(sum, filter1, s1, vl);
  sum = __riscv_vwmacc_vv_i32m2(sum, filter2, s2, vl);
  sum = __riscv_vwmacc_vv_i32m2(sum, filter3, s3, vl);
  sum = __riscv_vwmacc_vv_i32m2(sum, filter4, s4, vl);
  sum = __riscv_vwmacc_vv_i32m2(sum, filter5, s5, vl);

  // Fold the upper vl/2 lanes onto the lower vl/2 lanes, add the rounding
  // offset, then narrow with an arithmetic shift by ROUND0_BITS.
  sum = __riscv_vadd_vv_i32m2(
      sum, __riscv_vslidedown_vx_i32m2(sum, vl >> 1, vl), vl >> 1);
  sum = __riscv_vadd_vx_i32m2(sum, horiz_const, vl >> 1);

  return __riscv_vnsra_wx_i16m1(sum, ROUND0_BITS, vl >> 1);
}

// Horizontal 12-tap filter for the 2D first pass, full-width variant: one
// widening multiply plus eleven widening MACs, then round and narrow by
// ROUND0_BITS.
static inline vint16m1_t convolve12_8_2d_h_rvv(
    const vint16m1_t s0, const vint16m1_t s1, const vint16m1_t s2,
    const vint16m1_t s3, const vint16m1_t s4, const vint16m1_t s5,
    const vint16m1_t s6, const vint16m1_t s7, const vint16m1_t s8,
    const vint16m1_t s9, const vint16m1_t s10, const vint16m1_t s11,
    const int16_t *x_filter, const int16_t horiz_const, size_t vl) {
  vint32m2_t sum = __riscv_vwmul_vx_i32m2(s0,
x_filter[0], vl);
  sum = __riscv_vwmacc_vx_i32m2(sum, x_filter[1], s1, vl);
  sum = __riscv_vwmacc_vx_i32m2(sum, x_filter[2], s2, vl);
  sum = __riscv_vwmacc_vx_i32m2(sum, x_filter[3], s3, vl);
  sum = __riscv_vwmacc_vx_i32m2(sum, x_filter[4], s4, vl);
  sum = __riscv_vwmacc_vx_i32m2(sum, x_filter[5], s5, vl);
  sum = __riscv_vwmacc_vx_i32m2(sum, x_filter[6], s6, vl);
  sum = __riscv_vwmacc_vx_i32m2(sum, x_filter[7], s7, vl);
  sum = __riscv_vwmacc_vx_i32m2(sum, x_filter[8], s8, vl);
  sum = __riscv_vwmacc_vx_i32m2(sum, x_filter[9], s9, vl);
  sum = __riscv_vwmacc_vx_i32m2(sum, x_filter[10], s10, vl);
  sum = __riscv_vwmacc_vx_i32m2(sum, x_filter[11], s11, vl);

  // Add the rounding offset and narrow by ROUND0_BITS to the 16-bit
  // intermediate precision consumed by the vertical pass.
  sum = __riscv_vadd_vx_i32m2(sum, horiz_const, vl);

  return __riscv_vnsra_wx_i16m1(sum, ROUND0_BITS, vl);
}

// Horizontal (first) pass of the 12-tap 2D convolution: writes rounded
// 16-bit intermediate values to dst. For w == 4 the vector length is doubled
// and the twelve taps are folded into vector halves; wider blocks use the
// straightforward per-row scalar-tap kernel.
static inline void convolve_2d_sr_horiz_12tap_rvv(
    const uint8_t *src, int src_stride, int16_t *dst, const int dst_stride,
    int w, int h, const int16_t *x_filter_ptr, size_t vl) {
  const int bd = 8;
  // First-pass rounding/offset constant (paired with the ROUND0_BITS shift).
  const int16_t horiz_const =
      (1 << (bd + FILTER_BITS - 1)) + (1 << ((ROUND0_BITS - 1)));

  const int16_t xf0 = x_filter_ptr[0];
  const int16_t xf1 = x_filter_ptr[1];
  const int16_t xf2 = x_filter_ptr[2];
  const int16_t xf3 = x_filter_ptr[3];
  const int16_t xf4 = x_filter_ptr[4];
  const int16_t xf5 = x_filter_ptr[5];
  const int16_t xf6 = x_filter_ptr[6];
  const int16_t xf7 = x_filter_ptr[7];
  const int16_t xf8 = x_filter_ptr[8];
  const int16_t xf9 = x_filter_ptr[9];
  const int16_t xf10 = x_filter_ptr[10];
  const int16_t xf11 = x_filter_ptr[11];

  if (w == 4) {
    uint8_t *s = (uint8_t *)src;
    int16_t *d = dst;

    // Double the vector length so both register halves are used.
    vl = vl << 1;

    // Pair up taps so each vector filter covers two tap positions: the low
    // half lanes carry one tap, the high half lanes another.
    const int16_t filter0[8] = { xf0, xf0, xf0, xf0, xf4, xf4, xf4, xf4 };
    const int16_t filter1[8] = { xf1, xf1, xf1, xf1, xf5, xf5, xf5, xf5 };
    const int16_t filter2[8] = { xf2, xf2, xf2, xf2, xf6, xf6, xf6,
xf6 };
    const int16_t filter3[8] = { xf3, xf3, xf3, xf3, xf7, xf7, xf7, xf7 };
    const int16_t filter4[8] = { xf8, xf8, xf8, xf8, xf9, xf9, xf9, xf9 };
    const int16_t filter5[8] = {
      xf10, xf10, xf10, xf10, xf11, xf11, xf11, xf11
    };

    const vint16m1_t vfilter0 = __riscv_vle16_v_i16m1(filter0, vl);
    const vint16m1_t vfilter1 = __riscv_vle16_v_i16m1(filter1, vl);
    const vint16m1_t vfilter2 = __riscv_vle16_v_i16m1(filter2, vl);
    const vint16m1_t vfilter3 = __riscv_vle16_v_i16m1(filter3, vl);
    const vint16m1_t vfilter4 = __riscv_vle16_v_i16m1(filter4, vl);
    const vint16m1_t vfilter5 = __riscv_vle16_v_i16m1(filter5, vl);

    do {
      // Row 0: t0..t3 cover taps 0-7 via the low/high filter halves,
      // t4/t5 cover taps 8-11.
      vuint8mf2_t t0 = __riscv_vle8_v_u8mf2(s, vl);
      vuint8mf2_t t1 = __riscv_vle8_v_u8mf2(s + 1, vl);
      vuint8mf2_t t2 = __riscv_vle8_v_u8mf2(s + 2, vl);
      vuint8mf2_t t3 = __riscv_vle8_v_u8mf2(s + 3, vl);
      vuint8mf2_t t4 = load_strided_u8_4xN(s + 8, 1, vl);
      vuint8mf2_t t5 = load_strided_u8_4xN(s + 10, 1, vl);

      // Row 1: same layout one source stride down.
      vuint8mf2_t t6 = __riscv_vle8_v_u8mf2(s + src_stride, vl);
      vuint8mf2_t t7 = __riscv_vle8_v_u8mf2(s + src_stride + 1, vl);
      vuint8mf2_t t8 = __riscv_vle8_v_u8mf2(s + src_stride + 2, vl);
      vuint8mf2_t t9 = __riscv_vle8_v_u8mf2(s + src_stride + 3, vl);
      vuint8mf2_t t10 = load_strided_u8_4xN(s + src_stride + 8, 1, vl);
      vuint8mf2_t t11 = load_strided_u8_4xN(s + src_stride + 10, 1, vl);

      // Zero-extend to 16-bit.
      vint16m1_t s0 =
          __riscv_vreinterpret_v_u16m1_i16m1(__riscv_vzext_vf2_u16m1(t0, vl));
      vint16m1_t s1 =
          __riscv_vreinterpret_v_u16m1_i16m1(__riscv_vzext_vf2_u16m1(t1, vl));
      vint16m1_t s2 =
          __riscv_vreinterpret_v_u16m1_i16m1(__riscv_vzext_vf2_u16m1(t2, vl));
      vint16m1_t s3 =
          __riscv_vreinterpret_v_u16m1_i16m1(__riscv_vzext_vf2_u16m1(t3, vl));
      vint16m1_t s4 =
          __riscv_vreinterpret_v_u16m1_i16m1(__riscv_vzext_vf2_u16m1(t4, vl));
      vint16m1_t s5 =
          __riscv_vreinterpret_v_u16m1_i16m1(__riscv_vzext_vf2_u16m1(t5, vl));
      vint16m1_t s6 =
          __riscv_vreinterpret_v_u16m1_i16m1(__riscv_vzext_vf2_u16m1(t6, vl));
      vint16m1_t s7 =
          __riscv_vreinterpret_v_u16m1_i16m1(__riscv_vzext_vf2_u16m1(t7, vl));
      vint16m1_t s8 =
          __riscv_vreinterpret_v_u16m1_i16m1(__riscv_vzext_vf2_u16m1(t8, vl));
      vint16m1_t s9 =
          __riscv_vreinterpret_v_u16m1_i16m1(__riscv_vzext_vf2_u16m1(t9, vl));
      vint16m1_t s10 =
          __riscv_vreinterpret_v_u16m1_i16m1(__riscv_vzext_vf2_u16m1(t10, vl));
      vint16m1_t s11 =
          __riscv_vreinterpret_v_u16m1_i16m1(__riscv_vzext_vf2_u16m1(t11, vl));

      vint16m1_t d0 = convolve12_4_2d_h_rvv(
          s0, s1, s2, s3, s4, s5, vfilter0, vfilter1, vfilter2, vfilter3,
          vfilter4, vfilter5, horiz_const, vl);
      vint16m1_t d1 = convolve12_4_2d_h_rvv(
          s6, s7, s8, s9, s10, s11, vfilter0, vfilter1, vfilter2, vfilter3,
          vfilter4, vfilter5, horiz_const, vl);

      // Only the folded low half (vl/2 lanes) is stored per row.
      __riscv_vse16_v_i16m1(d, d0, vl >> 1);
      __riscv_vse16_v_i16m1(d + dst_stride, d1, vl >> 1);

      s += src_stride << 1;
      d += dst_stride << 1;
      h -= 2;
    } while (h > 0);
  } else {
    do {
      const uint8_t *s = src;
      int16_t *d = dst;
      int width = w;

      do {
        // Twelve shifted views of the source row, one per tap.
        vuint8mf2_t t0 = __riscv_vle8_v_u8mf2(s, vl);
        vuint8mf2_t t1 = __riscv_vle8_v_u8mf2(s + 1, vl);
        vuint8mf2_t t2 = __riscv_vle8_v_u8mf2(s + 2, vl);
        vuint8mf2_t t3 = __riscv_vle8_v_u8mf2(s + 3, vl);
        vuint8mf2_t t4 = __riscv_vle8_v_u8mf2(s + 4, vl);
        vuint8mf2_t t5 = __riscv_vle8_v_u8mf2(s + 5, vl);
        vuint8mf2_t t6 = __riscv_vle8_v_u8mf2(s + 6, vl);
        vuint8mf2_t t7 = __riscv_vle8_v_u8mf2(s + 7, vl);
        vuint8mf2_t t8 = __riscv_vle8_v_u8mf2(s + 8, vl);
        vuint8mf2_t t9 = __riscv_vle8_v_u8mf2(s + 9, vl);
        vuint8mf2_t t10 = __riscv_vle8_v_u8mf2(s + 10, vl);
        vuint8mf2_t t11 = __riscv_vle8_v_u8mf2(s + 11, vl);

        // Zero-extend to 16-bit.
        vint16m1_t s0 =
            __riscv_vreinterpret_v_u16m1_i16m1(__riscv_vzext_vf2_u16m1(t0, vl));
        vint16m1_t s1 =
            __riscv_vreinterpret_v_u16m1_i16m1(__riscv_vzext_vf2_u16m1(t1, vl));
        vint16m1_t s2 =
            __riscv_vreinterpret_v_u16m1_i16m1(__riscv_vzext_vf2_u16m1(t2, vl));
        vint16m1_t s3 =
            __riscv_vreinterpret_v_u16m1_i16m1(__riscv_vzext_vf2_u16m1(t3, vl));
        vint16m1_t s4 =
            __riscv_vreinterpret_v_u16m1_i16m1(__riscv_vzext_vf2_u16m1(t4, vl));
        vint16m1_t s5 =
            __riscv_vreinterpret_v_u16m1_i16m1(__riscv_vzext_vf2_u16m1(t5, vl));
        vint16m1_t s6 =
            __riscv_vreinterpret_v_u16m1_i16m1(__riscv_vzext_vf2_u16m1(t6, vl));
        vint16m1_t s7 =
            __riscv_vreinterpret_v_u16m1_i16m1(__riscv_vzext_vf2_u16m1(t7, vl));
        vint16m1_t s8 =
            __riscv_vreinterpret_v_u16m1_i16m1(__riscv_vzext_vf2_u16m1(t8, vl));
        vint16m1_t s9 =
            __riscv_vreinterpret_v_u16m1_i16m1(__riscv_vzext_vf2_u16m1(t9, vl));
        vint16m1_t s10 = __riscv_vreinterpret_v_u16m1_i16m1(
            __riscv_vzext_vf2_u16m1(t10, vl));
        vint16m1_t s11 = __riscv_vreinterpret_v_u16m1_i16m1(
            __riscv_vzext_vf2_u16m1(t11, vl));

        vint16m1_t d0 =
            convolve12_8_2d_h_rvv(s0, s1, s2, s3, s4, s5, s6, s7, s8, s9, s10,
                                  s11, x_filter_ptr, horiz_const, vl);

        __riscv_vse16_v_i16m1(d, d0, vl);

        s += vl;
        d += vl;
        width -= vl;
      } while (width != 0);
      src += src_stride;
      dst += dst_stride;
    } while (--h != 0);
  }
}

// 4-tap horizontal helper: plain 16-bit MAC chain. The caller pre-halves the
// filter taps (see convolve_2d_sr_horiz_4tap_rvv) and this helper shifts by
// ROUND0_BITS - 1 to compensate — presumably to keep the 16-bit accumulator
// in range; confirm against the filter coefficient bounds.
static inline vint16m1_t convolve4_2d_h_rvv(
    const vint16m1_t s0, const vint16m1_t s1, const vint16m1_t s2,
    const vint16m1_t s3, const int16_t *x_filter, const int16_t horiz_const,
    size_t vl) {
  vint16m1_t sum = __riscv_vmul_vx_i16m1(s0, x_filter[0], vl);
  sum = __riscv_vmacc_vx_i16m1(sum, x_filter[1], s1, vl);
  sum = __riscv_vmacc_vx_i16m1(sum, x_filter[2], s2, vl);
  sum = __riscv_vmacc_vx_i16m1(sum, x_filter[3], s3, vl);

  sum = __riscv_vadd_vx_i16m1(sum, horiz_const, vl);

  // Halved taps pair with a shift of ROUND0_BITS - 1 instead of ROUND0_BITS.
  return __riscv_vsra_vx_i16m1(sum, ROUND0_BITS - 1, vl);
}

// Horizontal (first) pass of the 4-tap 2D convolution. The 4-tap kernel
// occupies offsets 2..5 of the 8-tap filter array, hence filter_x + 2.
// Filter taps are halved here; convolve4_2d_h_rvv shifts by ROUND0_BITS - 1
// to compensate.
static inline void convolve_2d_sr_horiz_4tap_rvv(
    const uint8_t *src, ptrdiff_t src_stride, int16_t *dst,
    ptrdiff_t dst_stride, int w, int h, const int16_t *filter_x, size_t vl) {
  const int bd = 8;
  const int16_t *filter = filter_x + 2;
  // Halved rounding/offset constant to match the halved filter taps.
  const int16_t horiz_const =
      (1 << (bd + FILTER_BITS - 2)) + (1 << ((ROUND0_BITS - 1) - 1));

  const int16_t xf0 = filter[0] >> 1;
  const int16_t xf1 = filter[1] >> 1;
  const int16_t xf2 = filter[2] >> 1;
  const int16_t xf3 = filter[3] >> 1;
  const int16_t xfilter[4] = { xf0, xf1, xf2, xf3 };

  if (w <= 4) {
    // Pack two 4-wide rows into one vector per tap via strided loads.
    vl = vl << 1;

    do {
      vuint8mf2_t t0 = load_strided_u8_4xN((uint8_t *)src + 0, src_stride, vl);
      vuint8mf2_t t1 = load_strided_u8_4xN((uint8_t *)src + 1, src_stride, vl);
      vuint8mf2_t t2 = load_strided_u8_4xN((uint8_t *)src + 2, src_stride, vl);
      vuint8mf2_t t3 = load_strided_u8_4xN((uint8_t *)src + 3, src_stride, vl);

      // Zero-extend to 16-bit.
      vint16m1_t s0 =
          __riscv_vreinterpret_v_u16m1_i16m1(__riscv_vzext_vf2_u16m1(t0, vl));
      vint16m1_t s1 =
          __riscv_vreinterpret_v_u16m1_i16m1(__riscv_vzext_vf2_u16m1(t1, vl));
      vint16m1_t s2 =
          __riscv_vreinterpret_v_u16m1_i16m1(__riscv_vzext_vf2_u16m1(t2, vl));
      vint16m1_t s3 =
          __riscv_vreinterpret_v_u16m1_i16m1(__riscv_vzext_vf2_u16m1(t3, vl));

      vint16m1_t d0 =
          convolve4_2d_h_rvv(s0, s1, s2, s3, xfilter, horiz_const, vl);

      store_strided_i16_4xN(dst, d0, dst_stride, vl);

      src += src_stride << 1;
      dst += dst_stride << 1;
      h -= 2;
    } while (h > 0);
  } else {
    do {
      int width = w;
      const uint8_t *s = src;
      int16_t *d = dst;

      do {
        // Four shifted views of row 0 and of row 1, one per tap.
        vuint8mf2_t t0 = __riscv_vle8_v_u8mf2(s + 0, vl);
        vuint8mf2_t t1 = __riscv_vle8_v_u8mf2(s + 1, vl);
        vuint8mf2_t t2 = __riscv_vle8_v_u8mf2(s + 2, vl);
        vuint8mf2_t t3 = __riscv_vle8_v_u8mf2(s + 3, vl);

        vuint8mf2_t t4 = __riscv_vle8_v_u8mf2(s + src_stride, vl);
        vuint8mf2_t t5 = __riscv_vle8_v_u8mf2(s + src_stride + 1, vl);
        vuint8mf2_t t6 = __riscv_vle8_v_u8mf2(s + src_stride + 2, vl);
        vuint8mf2_t t7 = __riscv_vle8_v_u8mf2(s + src_stride + 3, vl);

        // Zero-extend to 16-bit.
        vint16m1_t s0 =
            __riscv_vreinterpret_v_u16m1_i16m1(__riscv_vzext_vf2_u16m1(t0, vl));
        vint16m1_t s1 =
            __riscv_vreinterpret_v_u16m1_i16m1(__riscv_vzext_vf2_u16m1(t1, vl));
        vint16m1_t s2 =
            __riscv_vreinterpret_v_u16m1_i16m1(__riscv_vzext_vf2_u16m1(t2, vl));
        vint16m1_t s3 =
            __riscv_vreinterpret_v_u16m1_i16m1(__riscv_vzext_vf2_u16m1(t3, vl));

        vint16m1_t s4 =
            __riscv_vreinterpret_v_u16m1_i16m1(__riscv_vzext_vf2_u16m1(t4, vl));
        vint16m1_t s5 =
            __riscv_vreinterpret_v_u16m1_i16m1(__riscv_vzext_vf2_u16m1(t5, vl));
        vint16m1_t s6 =
            __riscv_vreinterpret_v_u16m1_i16m1(__riscv_vzext_vf2_u16m1(t6, vl));
        vint16m1_t s7 =
            __riscv_vreinterpret_v_u16m1_i16m1(__riscv_vzext_vf2_u16m1(t7, vl));

        vint16m1_t d0 =
            convolve4_2d_h_rvv(s0, s1, s2, s3, xfilter, horiz_const, vl);
        vint16m1_t d1 =
            convolve4_2d_h_rvv(s4, s5, s6, s7, xfilter, horiz_const, vl);

        __riscv_vse16_v_i16m1(d, d0, vl);
        __riscv_vse16_v_i16m1(d + dst_stride, d1, vl);

        s += vl;
        d += vl;
        width -= vl;
      } while (width != 0);
      src += src_stride << 1;
      dst += dst_stride << 1;
      h -= 2;
    } while (h > 0);
  }
}

// 8-tap horizontal helper, 4-wide variant: taps are paired across vector
// halves (see the filter tables in convolve_2d_sr_horiz_8tap_rvv); the upper
// half is later folded onto the lower half so the low vl/2 lanes carry the
// full 8-tap sums.
static inline vint16m1_t convolve8_4_2d_h_rvv(
    const vint16m1_t s0, const vint16m1_t s1, const vint16m1_t s2,
    const vint16m1_t s3, const vint16m1_t x_filter0, const vint16m1_t x_filter1,
    const vint16m1_t x_filter2, const vint16m1_t x_filter3,
    const int16_t horiz_const, size_t vl) {
  vint16m1_t sum = __riscv_vmul_vv_i16m1(s0, x_filter0, vl);
  sum = __riscv_vmacc_vv_i16m1(sum, x_filter1, s1, vl);
  sum = __riscv_vmacc_vv_i16m1(sum, x_filter2, s2,
vl);
  sum = __riscv_vmacc_vv_i16m1(sum, x_filter3, s3, vl);

  // Fold the upper half onto the lower half, round, then shift by
  // ROUND0_BITS - 1 (the caller pre-halves the filter taps).
  sum = __riscv_vadd_vv_i16m1(
      sum, __riscv_vslidedown_vx_i16m1(sum, vl >> 1, vl), vl >> 1);
  sum = __riscv_vadd_vx_i16m1(sum, horiz_const, vl >> 1);

  return __riscv_vsra_vx_i16m1(sum, ROUND0_BITS - 1, vl >> 1);
}

// 8-tap horizontal helper, full-width variant: scalar-tap 16-bit MAC chain,
// then round and shift by ROUND0_BITS - 1 (taps are pre-halved by the
// caller).
static inline vint16m1_t convolve8_8_2d_h_rvv(
    const vint16m1_t s0, const vint16m1_t s1, const vint16m1_t s2,
    const vint16m1_t s3, const vint16m1_t s4, const vint16m1_t s5,
    const vint16m1_t s6, const vint16m1_t s7, const int16_t *x_filter,
    const int16_t horiz_const, size_t vl) {
  vint16m1_t sum = __riscv_vmul_vx_i16m1(s0, x_filter[0], vl);
  sum = __riscv_vmacc_vx_i16m1(sum, x_filter[1], s1, vl);
  sum = __riscv_vmacc_vx_i16m1(sum, x_filter[2], s2, vl);
  sum = __riscv_vmacc_vx_i16m1(sum, x_filter[3], s3, vl);
  sum = __riscv_vmacc_vx_i16m1(sum, x_filter[4], s4, vl);
  sum = __riscv_vmacc_vx_i16m1(sum, x_filter[5], s5, vl);
  sum = __riscv_vmacc_vx_i16m1(sum, x_filter[6], s6, vl);
  sum = __riscv_vmacc_vx_i16m1(sum, x_filter[7], s7, vl);

  sum = __riscv_vadd_vx_i16m1(sum, horiz_const, vl);

  return __riscv_vsra_vx_i16m1(sum, ROUND0_BITS - 1, vl);
}

// Horizontal (first) pass of the 8-tap 2D convolution over a w x im_h block.
// Filter taps are halved to lower the intermediate precision requirement; the
// ROUND0_BITS - 1 shift in the helpers compensates. For w <= 4 the taps are
// paired across vector halves and two rows are produced per iteration.
static inline void convolve_2d_sr_horiz_8tap_rvv(
    const uint8_t *src, ptrdiff_t src_stride, int16_t *dst,
    ptrdiff_t dst_stride, int w, int im_h, const int16_t *x_filter_ptr,
    size_t vl) {
  const int bd = 8;
  // Halved rounding/offset constant to match the halved filter taps.
  const int16_t horiz_const =
      (1 << (bd + FILTER_BITS - 2)) + (1 << ((ROUND0_BITS - 1) - 1));

  int height = im_h;

  const int16_t xf0 = x_filter_ptr[0] >> 1;
  const int16_t xf1 = x_filter_ptr[1] >> 1;
  const int16_t xf2 = x_filter_ptr[2] >> 1;
  const int16_t xf3 = x_filter_ptr[3] >> 1;
  const int16_t xf4 = x_filter_ptr[4] >> 1;
  const int16_t xf5 = x_filter_ptr[5] >> 1;
  const int16_t xf6 = x_filter_ptr[6] >> 1;
  const int16_t xf7 = x_filter_ptr[7] >> 1;

  if (w <= 4) {
    vl = vl << 1;

    // Pair taps k and k+4 into the low/high halves of each vector filter.
    const int16_t filter0[8] = { xf0, xf0, xf0, xf0, xf4, xf4, xf4, xf4 };
    const int16_t filter1[8] = { xf1, xf1, xf1, xf1, xf5, xf5, xf5, xf5 };
    const int16_t filter2[8] = { xf2, xf2, xf2, xf2, xf6, xf6, xf6, xf6 };
    const int16_t filter3[8] = { xf3, xf3, xf3, xf3, xf7, xf7, xf7, xf7 };

    const vint16m1_t vfilter0 = __riscv_vle16_v_i16m1(filter0, vl);
    const vint16m1_t vfilter1 = __riscv_vle16_v_i16m1(filter1, vl);
    const vint16m1_t vfilter2 = __riscv_vle16_v_i16m1(filter2, vl);
    const vint16m1_t vfilter3 = __riscv_vle16_v_i16m1(filter3, vl);

    do {
      vuint8mf2_t t0 = __riscv_vle8_v_u8mf2(src, vl);
      vuint8mf2_t t1 = __riscv_vle8_v_u8mf2(src + 1, vl);
      vuint8mf2_t t2 = __riscv_vle8_v_u8mf2(src + 2, vl);
      vuint8mf2_t t3 = __riscv_vle8_v_u8mf2(src + 3, vl);

      vuint8mf2_t t4 = __riscv_vle8_v_u8mf2(src + src_stride, vl);
      vuint8mf2_t t5 = __riscv_vle8_v_u8mf2(src + src_stride + 1, vl);
      vuint8mf2_t t6 = __riscv_vle8_v_u8mf2(src + src_stride + 2, vl);
      vuint8mf2_t t7 = __riscv_vle8_v_u8mf2(src + src_stride + 3, vl);

      // Zero-extend to 16-bit.
      vint16m1_t s0 =
          __riscv_vreinterpret_v_u16m1_i16m1(__riscv_vzext_vf2_u16m1(t0, vl));
      vint16m1_t s1 =
          __riscv_vreinterpret_v_u16m1_i16m1(__riscv_vzext_vf2_u16m1(t1, vl));
      vint16m1_t s2 =
          __riscv_vreinterpret_v_u16m1_i16m1(__riscv_vzext_vf2_u16m1(t2, vl));
      vint16m1_t s3 =
          __riscv_vreinterpret_v_u16m1_i16m1(__riscv_vzext_vf2_u16m1(t3, vl));
      vint16m1_t s4 =
          __riscv_vreinterpret_v_u16m1_i16m1(__riscv_vzext_vf2_u16m1(t4, vl));
      vint16m1_t s5 =
          __riscv_vreinterpret_v_u16m1_i16m1(__riscv_vzext_vf2_u16m1(t5, vl));
      vint16m1_t s6 =
          __riscv_vreinterpret_v_u16m1_i16m1(__riscv_vzext_vf2_u16m1(t6, vl));
      vint16m1_t s7 =
          __riscv_vreinterpret_v_u16m1_i16m1(__riscv_vzext_vf2_u16m1(t7, vl));

      vint16m1_t d0 = convolve8_4_2d_h_rvv(s0, s1, s2, s3, vfilter0, vfilter1,
                                           vfilter2, vfilter3, horiz_const, vl);
      vint16m1_t d1 = convolve8_4_2d_h_rvv(s4, s5, s6, s7, vfilter0, vfilter1,
                                           vfilter2, vfilter3, horiz_const, vl);

      // Only the folded low half (vl/2 lanes) is stored per row.
      __riscv_vse16_v_i16m1(dst, d0, vl >> 1);
      __riscv_vse16_v_i16m1(dst + dst_stride, d1, vl >> 1);

      src += src_stride << 1;
      dst += dst_stride << 1;
      height -= 2;
    } while (height > 0);
  } else {
    const int16_t xfilter[8] = { xf0, xf1, xf2, xf3, xf4, xf5, xf6, xf7 };

    do {
      const uint8_t *s = src;
      int16_t *d = dst;
      int width = w;

      do {
        // Eight shifted views of the source row, one per tap.
        vuint8mf2_t t0 = __riscv_vle8_v_u8mf2(s, vl);
        vuint8mf2_t t1 = __riscv_vle8_v_u8mf2(s + 1, vl);
        vuint8mf2_t t2 = __riscv_vle8_v_u8mf2(s + 2, vl);
        vuint8mf2_t t3 = __riscv_vle8_v_u8mf2(s + 3, vl);
        vuint8mf2_t t4 = __riscv_vle8_v_u8mf2(s + 4, vl);
        vuint8mf2_t t5 = __riscv_vle8_v_u8mf2(s + 5, vl);
        vuint8mf2_t t6 = __riscv_vle8_v_u8mf2(s + 6, vl);
        vuint8mf2_t t7 = __riscv_vle8_v_u8mf2(s + 7, vl);

        // Zero-extend to 16-bit.
        vint16m1_t s0 =
            __riscv_vreinterpret_v_u16m1_i16m1(__riscv_vzext_vf2_u16m1(t0, vl));
        vint16m1_t s1 =
            __riscv_vreinterpret_v_u16m1_i16m1(__riscv_vzext_vf2_u16m1(t1, vl));
        vint16m1_t s2 =
            __riscv_vreinterpret_v_u16m1_i16m1(__riscv_vzext_vf2_u16m1(t2, vl));
        vint16m1_t s3 =
            __riscv_vreinterpret_v_u16m1_i16m1(__riscv_vzext_vf2_u16m1(t3, vl));
        vint16m1_t s4 =
            __riscv_vreinterpret_v_u16m1_i16m1(__riscv_vzext_vf2_u16m1(t4, vl));
        vint16m1_t s5 =
            __riscv_vreinterpret_v_u16m1_i16m1(__riscv_vzext_vf2_u16m1(t5, vl));
        vint16m1_t s6 =
            __riscv_vreinterpret_v_u16m1_i16m1(__riscv_vzext_vf2_u16m1(t6, vl));
        vint16m1_t s7 =
            __riscv_vreinterpret_v_u16m1_i16m1(__riscv_vzext_vf2_u16m1(t7, vl));

        vint16m1_t d0 = convolve8_8_2d_h_rvv(s0, s1, s2, s3, s4, s5, s6, s7,
                                             xfilter, horiz_const, vl);

        __riscv_vse16_v_i16m1(d, d0, vl);

        s += vl;
        d += vl;
        width -= vl;
      } while (width != 0);
      src += src_stride;
      dst += dst_stride;
    } while (--height != 0);
  }
}

// AV1 2D (horizontal then vertical) single-reference convolution, RVV path.
// Filters horizontally into a 16-bit intermediate block (im_block), then
// filters that block vertically into dst. 2-wide/2-high blocks fall back to
// the C implementation.
void av1_convolve_2d_sr_rvv(const uint8_t *src, int src_stride, uint8_t *dst,
                            int dst_stride, int w, int h,
                            const InterpFilterParams *filter_params_x,
                            const InterpFilterParams *filter_params_y,
                            const int subpel_x_qn, const int subpel_y_qn,
                            ConvolveParams *conv_params) {
  if (w == 2 || h == 2) {
    av1_convolve_2d_sr_c(src, src_stride, dst, dst_stride, w, h,
                         filter_params_x, filter_params_y, subpel_x_qn,
                         subpel_y_qn, conv_params);
    return;
  }

  const int y_filter_taps = get_filter_tap(filter_params_y, subpel_y_qn);
  const int x_filter_taps = get_filter_tap(filter_params_x, subpel_x_qn);
  const int clamped_y_taps = y_filter_taps < 4 ? 4 : y_filter_taps;
  // The intermediate block needs extra rows for the vertical filter taps.
  const int im_h = h + clamped_y_taps - 1;
  const int im_stride = MAX_SB_SIZE;
  const int vert_offset = clamped_y_taps / 2 - 1;
  const int horiz_offset = filter_params_x->taps / 2 - 1;
  const uint8_t *src_ptr = src - vert_offset * src_stride - horiz_offset;

  const int16_t *x_filter_ptr = av1_get_interp_filter_subpel_kernel(
      filter_params_x, subpel_x_qn & SUBPEL_MASK);
  const int16_t *y_filter_ptr = av1_get_interp_filter_subpel_kernel(
      filter_params_y, subpel_y_qn & SUBPEL_MASK);

  size_t vl = __riscv_vsetvl_e16m1(w);

  if (filter_params_x->taps > 8) {
    DECLARE_ALIGNED(16, int16_t,
                    im_block[(MAX_SB_SIZE + MAX_FILTER_TAP - 1) * MAX_SB_SIZE]);

    convolve_2d_sr_horiz_12tap_rvv(src_ptr, src_stride, im_block, im_stride, w,
                                   im_h, x_filter_ptr, vl);
    convolve_2d_sr_vert_12tap_rvv(im_block, im_stride, dst, dst_stride, w, h,
                                  y_filter_ptr, vl);
  } else {
    DECLARE_ALIGNED(16, int16_t,
                    im_block[(MAX_SB_SIZE + MAX_FILTER_TAP - 1) * MAX_SB_SIZE]);

    // horizontal filter: the 4-tap kernel lives at offsets 2..5 of the 8-tap
    // filter array, so the source pointer is advanced by 2 to match.
    if (x_filter_taps <= 4) {
      convolve_2d_sr_horiz_4tap_rvv(src_ptr + 2,
src_stride, im_block,
                                    im_stride, w, im_h, x_filter_ptr, vl);
    } else {
      convolve_2d_sr_horiz_8tap_rvv(src_ptr, src_stride, im_block, im_stride, w,
                                    im_h, x_filter_ptr, vl);
    }

    // vertical filter, dispatched on the clamped tap count
    if (clamped_y_taps <= 4) {
      convolve_2d_sr_vert_4tap_rvv(im_block, im_stride, dst, dst_stride, w, h,
                                   y_filter_ptr, vl);
    } else if (clamped_y_taps == 6) {
      convolve_2d_sr_vert_6tap_rvv(im_block, im_stride, dst, dst_stride, w, h,
                                   y_filter_ptr, vl);
    } else {
      convolve_2d_sr_vert_8tap_rvv(im_block, im_stride, dst, dst_stride, w, h,
                                   y_filter_ptr, vl);
    }
  }
}

// Intrabc horizontal "convolution": with subpel_x_qn == 8 and a 2-tap filter
// (asserted below) this reduces to a rounded average of horizontally
// adjacent pixels, implemented with the vaaddu averaging add using
// round-to-nearest-up (__RISCV_VXRM_RNU).
void av1_convolve_x_sr_intrabc_rvv(const uint8_t *src, int src_stride,
                                   uint8_t *dst, int dst_stride, int w, int h,
                                   const InterpFilterParams *filter_params_x,
                                   const int subpel_x_qn,
                                   ConvolveParams *conv_params) {
  assert(subpel_x_qn == 8);
  assert(filter_params_x->taps == 2);
  assert((conv_params->round_0 + conv_params->round_1) == 2 * FILTER_BITS);
  (void)filter_params_x;
  (void)subpel_x_qn;
  (void)conv_params;

  size_t vl = __riscv_vsetvl_e8m1(w);
  if (w <= 8) {
    do {
      // Load two rows, each at horizontal offsets 0 and +1.
      vuint8mf2_t s0_0 = __riscv_vle8_v_u8mf2(src, vl);
      vuint8mf2_t s0_1 = __riscv_vle8_v_u8mf2(src + 1, vl);
      vuint8mf2_t s1_0 = __riscv_vle8_v_u8mf2(src + src_stride, vl);
      vuint8mf2_t s1_1 = __riscv_vle8_v_u8mf2(src + src_stride + 1, vl);

      // Average the values
      vuint8mf2_t d0 =
          __riscv_vaaddu_vv_u8mf2(s0_0, s0_1, __RISCV_VXRM_RNU, vl);
      vuint8mf2_t d1 =
          __riscv_vaaddu_vv_u8mf2(s1_0, s1_1, __RISCV_VXRM_RNU, vl);

      __riscv_vse8_v_u8mf2(dst, d0, vl);
      __riscv_vse8_v_u8mf2(dst + dst_stride, d1, vl);

      src += src_stride << 1;
      dst += dst_stride << 1;
      h -= 2;
    } while (h > 0);
  } else {
    do {
      const uint8_t *src_ptr = src;
      uint8_t *dst_ptr = dst;
      int width = w;

      do {
        // Load
vuint8m1_t s0 = __riscv_vle8_v_u8m1(src_ptr, vl);
        vuint8m1_t s1 = __riscv_vle8_v_u8m1(src_ptr + 1, vl);
        vuint8m1_t s2 = __riscv_vle8_v_u8m1(src_ptr + src_stride, vl);
        vuint8m1_t s3 = __riscv_vle8_v_u8m1(src_ptr + src_stride + 1, vl);

        // Average the values
        vuint8m1_t d0 = __riscv_vaaddu_vv_u8m1(s0, s1, __RISCV_VXRM_RNU, vl);
        vuint8m1_t d1 = __riscv_vaaddu_vv_u8m1(s2, s3, __RISCV_VXRM_RNU, vl);

        // Store
        __riscv_vse8_v_u8m1(dst_ptr, d0, vl);
        __riscv_vse8_v_u8m1(dst_ptr + dst_stride, d1, vl);

        src_ptr += vl;
        dst_ptr += vl;
        width -= vl;
      } while (width > 0);
      src += src_stride << 1;
      dst += dst_stride << 1;
      h -= 2;
    } while (h > 0);
  }
}

// Intrabc vertical "convolution": rounded average of vertically adjacent
// pixels. The bottom row (s2) is carried over as the next iteration's top
// row so each source row is loaded only once.
void av1_convolve_y_sr_intrabc_rvv(const uint8_t *src, int src_stride,
                                   uint8_t *dst, int dst_stride, int w, int h,
                                   const InterpFilterParams *filter_params_y,
                                   const int subpel_y_qn) {
  assert(subpel_y_qn == 8);
  assert(filter_params_y->taps == 2);
  (void)filter_params_y;
  (void)subpel_y_qn;

  size_t vl = __riscv_vsetvl_e8m1(w);
  if (w <= 8) {
    vuint8mf2_t s0 = __riscv_vle8_v_u8mf2(src, vl);

    do {
      vuint8mf2_t s1 = __riscv_vle8_v_u8mf2(src + src_stride, vl);
      vuint8mf2_t s2 = __riscv_vle8_v_u8mf2(src + 2 * src_stride, vl);

      // Average the values
      vuint8mf2_t d0 = __riscv_vaaddu_vv_u8mf2(s0, s1, __RISCV_VXRM_RNU, vl);
      vuint8mf2_t d1 = __riscv_vaaddu_vv_u8mf2(s1, s2, __RISCV_VXRM_RNU, vl);

      __riscv_vse8_v_u8mf2(dst, d0, vl);
      __riscv_vse8_v_u8mf2(dst + dst_stride, d1, vl);

      s0 = s2;
      src += src_stride << 1;
      dst += dst_stride << 1;
      h -= 2;
    } while (h > 0);
  } else {
    do {
      const uint8_t *src_ptr = src;
      uint8_t *dst_ptr = dst;
      int height = h;

      vuint8m1_t s0 = __riscv_vle8_v_u8m1(src_ptr, vl);

      do {
        vuint8m1_t s1 = __riscv_vle8_v_u8m1(src_ptr + src_stride, vl);
        vuint8m1_t s2 = __riscv_vle8_v_u8m1(src_ptr + 2 * src_stride, vl);

        // Average the values
        vuint8m1_t d0 = __riscv_vaaddu_vv_u8m1(s0, s1, __RISCV_VXRM_RNU, vl);
        vuint8m1_t d1 = __riscv_vaaddu_vv_u8m1(s1, s2, __RISCV_VXRM_RNU, vl);

        // Store
        __riscv_vse8_v_u8m1(dst_ptr, d0, vl);
        __riscv_vse8_v_u8m1(dst_ptr + dst_stride, d1, vl);

        s0 = s2;
        src_ptr += src_stride << 1;
        dst_ptr += dst_stride << 1;
        height -= 2;
      } while (height > 0);
      src += vl;
      dst += vl;
      w -= vl;
    } while (w > 0);
  }
}

// Intrabc 2D "convolution": horizontal pairwise sums are widened to 16 bits,
// then the vertical pass adds two row sums and narrows with a clipping shift
// of 2 — i.e. (a + b + c + d) / 4 with round-to-nearest-up. The last row sum
// (sum2) is reused as the next iteration's first row sum.
void av1_convolve_2d_sr_intrabc_rvv(const uint8_t *src, int src_stride,
                                    uint8_t *dst, int dst_stride, int w, int h,
                                    const InterpFilterParams *filter_params_x,
                                    const InterpFilterParams *filter_params_y,
                                    const int subpel_x_qn,
                                    const int subpel_y_qn,
                                    ConvolveParams *conv_params) {
  assert(subpel_x_qn == 8);
  assert(subpel_y_qn == 8);
  assert(filter_params_x->taps == 2 && filter_params_y->taps == 2);
  assert((conv_params->round_0 + conv_params->round_1) == 2 * FILTER_BITS);
  (void)filter_params_x;
  (void)subpel_x_qn;
  (void)filter_params_y;
  (void)subpel_y_qn;
  (void)conv_params;

  size_t vl = __riscv_vsetvl_e16m1(w);

  if (w <= 8) {
    // Horizontal filter.
    vuint8mf2_t s0 = __riscv_vle8_v_u8mf2(src, vl);
    vuint8mf2_t s1 = __riscv_vle8_v_u8mf2(src + 1, vl);
    src += src_stride;

    vuint16m1_t sum0 = __riscv_vwaddu_vv_u16m1(s0, s1, vl);

    do {
      vuint8mf2_t s2 = __riscv_vle8_v_u8mf2(src, vl);
      vuint8mf2_t s3 = __riscv_vle8_v_u8mf2(src + 1, vl);
      src += src_stride;
      vuint8mf2_t s4 = __riscv_vle8_v_u8mf2(src, vl);
      vuint8mf2_t s5 = __riscv_vle8_v_u8mf2(src + 1, vl);
      src += src_stride;

      vuint16m1_t sum1 = __riscv_vwaddu_vv_u16m1(s2, s3, vl);
      vuint16m1_t sum2 = __riscv_vwaddu_vv_u16m1(s4, s5, vl);

      // Vertical filter.
      vuint8mf2_t d0 = __riscv_vnclipu_wx_u8mf2(
          __riscv_vadd_vv_u16m1(sum0, sum1, vl), 2, __RISCV_VXRM_RNU, vl);
      vuint8mf2_t d1 = __riscv_vnclipu_wx_u8mf2(
          __riscv_vadd_vv_u16m1(sum1, sum2, vl), 2, __RISCV_VXRM_RNU, vl);

      __riscv_vse8_v_u8mf2(dst, d0, vl);
      dst += dst_stride;
      __riscv_vse8_v_u8mf2(dst, d1, vl);
      dst += dst_stride;

      // Carry the last horizontal sum into the next pair of rows.
      sum0 = sum2;
      h -= 2;
    } while (h != 0);
  } else {
    do {
      uint8_t *src_ptr = (uint8_t *)src;
      uint8_t *dst_ptr = dst;
      int height = h;

      // Horizontal filter.
      vuint8mf2_t s0 = __riscv_vle8_v_u8mf2(src_ptr, vl);
      vuint8mf2_t s1 = __riscv_vle8_v_u8mf2(src_ptr + 1, vl);
      src_ptr += src_stride;

      vuint16m1_t sum0 = __riscv_vwaddu_vv_u16m1(s0, s1, vl);

      do {
        vuint8mf2_t s2 = __riscv_vle8_v_u8mf2(src_ptr, vl);
        vuint8mf2_t s3 = __riscv_vle8_v_u8mf2(src_ptr + 1, vl);
        src_ptr += src_stride;
        vuint8mf2_t s4 = __riscv_vle8_v_u8mf2(src_ptr, vl);
        vuint8mf2_t s5 = __riscv_vle8_v_u8mf2(src_ptr + 1, vl);
        src_ptr += src_stride;

        vuint16m1_t sum1 = __riscv_vwaddu_vv_u16m1(s2, s3, vl);
        vuint16m1_t sum2 = __riscv_vwaddu_vv_u16m1(s4, s5, vl);

        // Vertical filter.
        vuint8mf2_t d0 = __riscv_vnclipu_wx_u8mf2(
            __riscv_vadd_vv_u16m1(sum0, sum1, vl), 2, __RISCV_VXRM_RNU, vl);
        vuint8mf2_t d1 = __riscv_vnclipu_wx_u8mf2(
            __riscv_vadd_vv_u16m1(sum1, sum2, vl), 2, __RISCV_VXRM_RNU, vl);

        __riscv_vse8_v_u8mf2(dst_ptr, d0, vl);
        dst_ptr += dst_stride;
        __riscv_vse8_v_u8mf2(dst_ptr, d1, vl);
        dst_ptr += dst_stride;

        sum0 = sum2;
        height -= 2;
      } while (height != 0);

      src += vl;
      dst += vl;
      w -= vl;
    } while (w != 0);
  }
}