highbd_convolve_rvv.c (71814B)
1 /* 2 * Copyright (c) 2025, Alliance for Open Media. All rights reserved. 3 * 4 * This source code is subject to the terms of the BSD 2 Clause License and 5 * the Alliance for Open Media Patent License 1.0. If the BSD 2 Clause License 6 * was not distributed with this source code in the LICENSE file, you can 7 * obtain it at www.aomedia.org/license/software. If the Alliance for Open 8 * Media Patent License 1.0 was not distributed with this source code in the 9 * PATENTS file, you can obtain it at www.aomedia.org/license/patent. 10 */ 11 12 #include <assert.h> 13 14 #include "config/aom_config.h" 15 #include "config/av1_rtcd.h" 16 17 #include "aom_dsp/riscv/mem_rvv.h" 18 #include "aom_ports/mem.h" 19 #include "av1/common/filter.h" 20 #include "av1/common/riscv/convolve_rvv.h" 21 22 static inline vuint16mf2_t highbd_convolve6_4_y_rvv( 23 const vint16mf2_t s0, const vint16mf2_t s1, const vint16mf2_t s2, 24 const vint16mf2_t s3, const vint16mf2_t s4, const vint16mf2_t s5, 25 const int16_t *filter, const uint16_t max, size_t vl) { 26 // Values at indices 0 and 7 of y_filter are zero. 27 vint32m1_t sum = __riscv_vwmul_vx_i32m1(s0, filter[1], vl); 28 sum = __riscv_vwmacc_vx_i32m1(sum, filter[2], s1, vl); 29 sum = __riscv_vwmacc_vx_i32m1(sum, filter[3], s2, vl); 30 sum = __riscv_vwmacc_vx_i32m1(sum, filter[4], s3, vl); 31 sum = __riscv_vwmacc_vx_i32m1(sum, filter[5], s4, vl); 32 sum = __riscv_vwmacc_vx_i32m1(sum, filter[6], s5, vl); 33 34 // Add rounding constant and shift 35 sum = __riscv_vadd_vx_i32m1(sum, 1 << (COMPOUND_ROUND1_BITS - 1), vl); 36 37 // Narrow result to 16-bit with rounding and saturation 38 vint16mf2_t res = __riscv_vnsra_wx_i16mf2(sum, COMPOUND_ROUND1_BITS, vl); 39 40 // Clamp result to max value 41 vuint16mf2_t d0 = 42 __riscv_vreinterpret_v_i16mf2_u16mf2(__riscv_vmax_vx_i16mf2(res, 0, vl)); 43 return __riscv_vminu_vx_u16mf2(d0, max, vl); 44 } 45 46 static inline vuint16m1_t highbd_convolve6_8_y_rvv( 47 const vint16m1_t s0, const vint16m1_t s1, const vint16m1_t s2, 48 const vint16m1_t s3, const vint16m1_t s4, const vint16m1_t s5, 49 const int16_t *filter, const uint16_t max, size_t vl) { 50 // Values at indices 0 and 7 of y_filter are zero. 51 vint32m2_t sum = __riscv_vwmul_vx_i32m2(s0, filter[1], vl); 52 sum = __riscv_vwmacc_vx_i32m2(sum, filter[2], s1, vl); 53 sum = __riscv_vwmacc_vx_i32m2(sum, filter[3], s2, vl); 54 sum = __riscv_vwmacc_vx_i32m2(sum, filter[4], s3, vl); 55 sum = __riscv_vwmacc_vx_i32m2(sum, filter[5], s4, vl); 56 sum = __riscv_vwmacc_vx_i32m2(sum, filter[6], s5, vl); 57 58 // Add rounding constant and shift 59 sum = __riscv_vadd_vx_i32m2(sum, 1 << (COMPOUND_ROUND1_BITS - 1), vl); 60 61 // Narrow result to 16-bit with rounding and saturation 62 vint16m1_t res = __riscv_vnsra_wx_i16m1(sum, COMPOUND_ROUND1_BITS, vl); 63 64 // Clamp result to max value 65 vuint16m1_t d0 = 66 __riscv_vreinterpret_v_i16m1_u16m1(__riscv_vmax_vx_i16m1(res, 0, vl)); 67 return __riscv_vminu_vx_u16m1(d0, max, vl); 68 } 69 70 static inline void highbd_convolve_y_sr_6tap_rvv( 71 const uint16_t *src_ptr, int src_stride, uint16_t *dst_ptr, int dst_stride, 72 int w, int h, const int16_t *y_filter, int bd) { 73 const uint16_t max = (1 << bd) - 1; 74 size_t vl = __riscv_vsetvl_e16m1(w); 75 76 if (w == 4) { 77 const int16_t *s = (const int16_t *)(src_ptr + src_stride); 78 uint16_t *d = dst_ptr; 79 80 // Load initial 5 rows of data 81 vint16mf2_t s0, s1, s2, s3, s4; 82 load_s16_4x5(s, src_stride, &s0, &s1, &s2, &s3, &s4, vl); 83 s += 5 * src_stride; 84 85 do { 86 // Load next 4 rows of data 87 vint16mf2_t s5, s6, s7, s8; 88 load_s16_4x4(s, src_stride, &s5, &s6, &s7, &s8, vl); 89 90 // Perform 6-tap convolution for 4 rows 91 vuint16mf2_t d0 = 92 highbd_convolve6_4_y_rvv(s0, s1, s2, s3, s4, s5, y_filter, max, vl); 93 vuint16mf2_t d1 = 94 highbd_convolve6_4_y_rvv(s1, s2, s3, s4, s5, s6, y_filter, max, vl); 95 vuint16mf2_t d2 = 96 highbd_convolve6_4_y_rvv(s2, s3, s4, s5, s6, s7, y_filter, max, vl); 97 vuint16mf2_t d3 = 98 highbd_convolve6_4_y_rvv(s3, s4, s5, s6, s7, s8, y_filter, max, vl); 99 100 // Store results 101 store_u16_4x4(d, dst_stride, d0, d1, d2, d3, vl); 102 103 // Update source pointers for next iteration 104 s0 = __riscv_vmv_v_v_i16mf2(s4, vl); 105 s1 = __riscv_vmv_v_v_i16mf2(s5, vl); 106 s2 = __riscv_vmv_v_v_i16mf2(s6, vl); 107 s3 = __riscv_vmv_v_v_i16mf2(s7, vl); 108 s4 = __riscv_vmv_v_v_i16mf2(s8, vl); 109 110 s += 4 * src_stride; 111 d += 4 * dst_stride; 112 h -= 4; 113 } while (h != 0); 114 } else { 115 do { 116 int height = h; 117 const int16_t *s = (const int16_t *)(src_ptr + src_stride); 118 uint16_t *d = dst_ptr; 119 120 // Load initial 5 rows of data 121 vint16m1_t s0, s1, s2, s3, s4; 122 load_s16_8x5(s, src_stride, &s0, &s1, &s2, &s3, &s4, vl); 123 s += 5 * src_stride; 124 125 do { 126 // Load next 4 rows of data 127 vint16m1_t s5, s6, s7, s8; 128 load_s16_8x4(s, src_stride, &s5, &s6, &s7, &s8, vl); 129 130 // Perform 6-tap convolution for 4 rows 131 vuint16m1_t d0 = 132 highbd_convolve6_8_y_rvv(s0, s1, s2, s3, s4, s5, y_filter, max, vl); 133 vuint16m1_t d1 = 134 highbd_convolve6_8_y_rvv(s1, s2, s3, s4, s5, s6, y_filter, max, vl); 135 vuint16m1_t d2 = 136 highbd_convolve6_8_y_rvv(s2, s3, s4, s5, s6, s7, y_filter, max, vl); 137 vuint16m1_t d3 = 138 highbd_convolve6_8_y_rvv(s3, s4, s5, s6, s7, s8, y_filter, max, vl); 139 140 // Store results 141 store_u16_8x4(d, dst_stride, d0, d1, d2, d3, vl); 142 143 // Update source pointers for next iteration 144 s0 = __riscv_vmv_v_v_i16m1(s4, vl); 145 s1 = __riscv_vmv_v_v_i16m1(s5, vl); 146 s2 = __riscv_vmv_v_v_i16m1(s6, vl); 147 s3 = __riscv_vmv_v_v_i16m1(s7, vl); 148 s4 = __riscv_vmv_v_v_i16m1(s8, vl); 149 150 s += 4 * src_stride; 151 d += 4 * dst_stride; 152 height -= 4; 153 } while (height != 0); 154 155 src_ptr += vl; 156 dst_ptr += vl; 157 w -= vl; 158 } while (w > 0); 159 } 160 } 161 162 static inline vuint16mf2_t highbd_convolve8_4_y_rvv( 163 const vint16mf2_t s0, const vint16mf2_t s1, const vint16mf2_t s2, 164 const vint16mf2_t s3, const vint16mf2_t s4, const vint16mf2_t s5, 165 const vint16mf2_t s6, const vint16mf2_t s7, const int16_t *filter, 166 const uint16_t max, size_t vl) { 167 vint32m1_t sum = __riscv_vwmul_vx_i32m1(s0, filter[0], vl); 168 sum = __riscv_vwmacc_vx_i32m1(sum, filter[1], s1, vl); 169 sum = __riscv_vwmacc_vx_i32m1(sum, filter[2], s2, vl); 170 sum = __riscv_vwmacc_vx_i32m1(sum, filter[3], s3, vl); 171 sum = __riscv_vwmacc_vx_i32m1(sum, filter[4], s4, vl); 172 sum = __riscv_vwmacc_vx_i32m1(sum, filter[5], s5, vl); 173 sum = __riscv_vwmacc_vx_i32m1(sum, filter[6], s6, vl); 174 sum = __riscv_vwmacc_vx_i32m1(sum, filter[7], s7, vl); 175 176 // Convert to unsigned 16-bit with saturation 177 vuint32m1_t d0 = 178 __riscv_vreinterpret_v_i32m1_u32m1(__riscv_vmax_vx_i32m1(sum, 0, vl)); 179 vuint16mf2_t res = 180 __riscv_vnclipu_wx_u16mf2(d0, COMPOUND_ROUND1_BITS, __RISCV_VXRM_RNU, vl); 181 182 // Clamp to max 183 return __riscv_vminu_vx_u16mf2(res, max, vl); 184 } 185 186 static inline vuint16m1_t highbd_convolve8_8_y_rvv( 187 const vint16m1_t s0, const vint16m1_t s1, const vint16m1_t s2, 188 const vint16m1_t s3, const vint16m1_t s4, const vint16m1_t s5, 189 const vint16m1_t s6, const vint16m1_t s7, const int16_t *filter, 190 const uint16_t max, size_t vl) { 191 vint32m2_t sum = __riscv_vwmul_vx_i32m2(s0, filter[0], vl); 192 sum = __riscv_vwmacc_vx_i32m2(sum, filter[1], s1, vl); 193 sum = __riscv_vwmacc_vx_i32m2(sum, filter[2], s2, vl); 194 sum = __riscv_vwmacc_vx_i32m2(sum, filter[3], s3, vl); 195 sum = __riscv_vwmacc_vx_i32m2(sum, filter[4], s4, vl); 196 sum = __riscv_vwmacc_vx_i32m2(sum, filter[5], s5, vl); 197 sum = __riscv_vwmacc_vx_i32m2(sum, filter[6], s6, vl); 198 sum = __riscv_vwmacc_vx_i32m2(sum, filter[7], s7, vl); 199 200 // Convert to unsigned 16-bit with saturation 201 vuint32m2_t d0 = 202 __riscv_vreinterpret_v_i32m2_u32m2(__riscv_vmax_vx_i32m2(sum, 0, vl)); 203 vuint16m1_t res = 204 __riscv_vnclipu_wx_u16m1(d0, COMPOUND_ROUND1_BITS, __RISCV_VXRM_RNU, vl); 205 206 // Clamp to max 207 return __riscv_vminu_vx_u16m1(res, max, vl); 208 } 209 210 static inline void highbd_convolve_y_sr_8tap_rvv( 211 const uint16_t *src_ptr, int src_stride, uint16_t *dst_ptr, int dst_stride, 212 int w, int h, const int16_t *y_filter, int bd) { 213 const uint16_t max = (1 << bd) - 1; 214 size_t vl = __riscv_vsetvl_e16m1(w); 215 216 if (w == 4) { 217 const int16_t *s = (const int16_t *)src_ptr; 218 uint16_t *d = dst_ptr; 219 220 // Load initial 7 rows of data 221 vint16mf2_t s0, s1, s2, s3, s4, s5, s6; 222 load_s16_4x7(s, src_stride, &s0, &s1, &s2, &s3, &s4, &s5, &s6, vl); 223 s += 7 * src_stride; 224 225 do { 226 // Load next 4 rows of data 227 vint16mf2_t s7, s8, s9, s10; 228 load_s16_4x4(s, src_stride, &s7, &s8, &s9, &s10, vl); 229 230 // Perform 8-tap convolution for 4 rows 231 vuint16mf2_t d0 = highbd_convolve8_4_y_rvv(s0, s1, s2, s3, s4, s5, s6, s7, 232 y_filter, max, vl); 233 vuint16mf2_t d1 = highbd_convolve8_4_y_rvv(s1, s2, s3, s4, s5, s6, s7, s8, 234 y_filter, max, vl); 235 vuint16mf2_t d2 = highbd_convolve8_4_y_rvv(s2, s3, s4, s5, s6, s7, s8, s9, 236 y_filter, max, vl); 237 vuint16mf2_t d3 = highbd_convolve8_4_y_rvv(s3, s4, s5, s6, s7, s8, s9, 238 s10, y_filter, max, vl); 239 240 // Store results 241 store_u16_4x4(d, dst_stride, d0, d1, d2, d3, vl); 242 243 // Update source pointers for next iteration 244 s0 = __riscv_vmv_v_v_i16mf2(s4, vl); 245 s1 = __riscv_vmv_v_v_i16mf2(s5, vl); 246 s2 = __riscv_vmv_v_v_i16mf2(s6, vl); 247 s3 = __riscv_vmv_v_v_i16mf2(s7, vl); 248 s4 = __riscv_vmv_v_v_i16mf2(s8, vl); 249 s5 = __riscv_vmv_v_v_i16mf2(s9, vl); 250 s6 = __riscv_vmv_v_v_i16mf2(s10, vl); 251 252 s += 4 * src_stride; 253 d += 4 * dst_stride; 254 h -= 4; 255 } while (h != 0); 256 } else { 257 do { 258 int height = h; 259 const int16_t *s = (const int16_t *)src_ptr; 260 uint16_t *d = dst_ptr; 261 262 // Load initial 7 rows of data 263 vint16m1_t s0, s1, s2, s3, s4, s5, s6; 264 load_s16_8x7(s, src_stride, &s0, &s1, &s2, &s3, &s4, &s5, &s6, vl); 265 s += 7 * src_stride; 266 267 do { 268 // Load next 4 rows of data 269 vint16m1_t s7, s8, s9, s10; 270 load_s16_8x4(s, src_stride, &s7, &s8, &s9, &s10, vl); 271 272 // Perform 8-tap convolution for 4 rows 273 vuint16m1_t d0 = highbd_convolve8_8_y_rvv(s0, s1, s2, s3, s4, s5, s6, 274 s7, y_filter, max, vl); 275 vuint16m1_t d1 = highbd_convolve8_8_y_rvv(s1, s2, s3, s4, s5, s6, s7, 276 s8, y_filter, max, vl); 277 vuint16m1_t d2 = highbd_convolve8_8_y_rvv(s2, s3, s4, s5, s6, s7, s8, 278 s9, y_filter, max, vl); 279 vuint16m1_t d3 = highbd_convolve8_8_y_rvv(s3, s4, s5, s6, s7, s8, s9, 280 s10, y_filter, max, vl); 281 282 // Store results 283 store_u16_8x4(d, dst_stride, d0, d1, d2, d3, vl); 284 285 // Update source pointers for next iteration 286 s0 = __riscv_vmv_v_v_i16m1(s4, vl); 287 s1 = __riscv_vmv_v_v_i16m1(s5, vl); 288 s2 = __riscv_vmv_v_v_i16m1(s6, vl); 289 s3 = __riscv_vmv_v_v_i16m1(s7, vl); 290 s4 = __riscv_vmv_v_v_i16m1(s8, vl); 291 s5 = __riscv_vmv_v_v_i16m1(s9, vl); 292 s6 = __riscv_vmv_v_v_i16m1(s10, vl); 293 294 s += 4 * src_stride; 295 d += 4 * dst_stride; 296 height -= 4; 297 } while (height != 0); 298 299 src_ptr += vl; 300 dst_ptr += vl; 301 w -= vl; 302 } while (w > 0); 303 } 304 } 305 306 static inline vuint16mf2_t highbd_convolve12_4_y_rvv( 307 const vint16mf2_t s0, const vint16mf2_t s1, const vint16mf2_t s2, 308 const vint16mf2_t s3, const vint16mf2_t s4, const vint16mf2_t s5, 309 const vint16mf2_t s6, const vint16mf2_t s7, const vint16mf2_t s8, 310 const vint16mf2_t s9, const vint16mf2_t s10, const vint16mf2_t s11, 311 const int16_t *filter, const uint16_t max, size_t vl) { 312 vint32m1_t sum = __riscv_vwmul_vx_i32m1(s0, filter[0], vl); 313 sum = __riscv_vwmacc_vx_i32m1(sum, filter[1], s1, vl); 314 sum = __riscv_vwmacc_vx_i32m1(sum, filter[2], s2, vl); 315 sum = __riscv_vwmacc_vx_i32m1(sum, filter[3], s3, vl); 316 sum = __riscv_vwmacc_vx_i32m1(sum, filter[4], s4, vl); 317 sum = __riscv_vwmacc_vx_i32m1(sum, filter[5], s5, vl); 318 sum = __riscv_vwmacc_vx_i32m1(sum, filter[6], s6, vl); 319 sum = __riscv_vwmacc_vx_i32m1(sum, filter[7], s7, vl); 320 sum = __riscv_vwmacc_vx_i32m1(sum, filter[8], s8, vl); 321 sum = __riscv_vwmacc_vx_i32m1(sum, filter[9], s9, vl); 322 sum = __riscv_vwmacc_vx_i32m1(sum, filter[10], s10, vl); 323 sum = __riscv_vwmacc_vx_i32m1(sum, filter[11], s11, vl); 324 325 // Convert to unsigned 16-bit with saturation 326 vuint32m1_t d0 = 327 __riscv_vreinterpret_v_i32m1_u32m1(__riscv_vmax_vx_i32m1(sum, 0, vl)); 328 vuint16mf2_t res = 329 __riscv_vnclipu_wx_u16mf2(d0, COMPOUND_ROUND1_BITS, __RISCV_VXRM_RNU, vl); 330 331 // Clamp to max 332 return __riscv_vminu_vx_u16mf2(res, max, vl); 333 } 334 335 static inline vuint16m1_t highbd_convolve12_8_y_rvv( 336 const vint16m1_t s0, const vint16m1_t s1, const vint16m1_t s2, 337 const vint16m1_t s3, const vint16m1_t s4, const vint16m1_t s5, 338 const vint16m1_t s6, const vint16m1_t s7, const vint16m1_t s8, 339 const vint16m1_t s9, const vint16m1_t s10, const vint16m1_t s11, 340 const int16_t *filter, const uint16_t max, size_t vl) { 341 vint32m2_t sum = __riscv_vwmul_vx_i32m2(s0, filter[0], vl); 342 sum = __riscv_vwmacc_vx_i32m2(sum, filter[1], s1, vl); 343 sum = __riscv_vwmacc_vx_i32m2(sum, filter[2], s2, vl); 344 sum = __riscv_vwmacc_vx_i32m2(sum, filter[3], s3, vl); 345 sum = __riscv_vwmacc_vx_i32m2(sum, filter[4], s4, vl); 346 sum = __riscv_vwmacc_vx_i32m2(sum, filter[5], s5, vl); 347 sum = __riscv_vwmacc_vx_i32m2(sum, filter[6], s6, vl); 348 sum = __riscv_vwmacc_vx_i32m2(sum, filter[7], s7, vl); 349 sum = __riscv_vwmacc_vx_i32m2(sum, filter[8], s8, vl); 350 sum = __riscv_vwmacc_vx_i32m2(sum, filter[9], s9, vl); 351 sum = __riscv_vwmacc_vx_i32m2(sum, filter[10], s10, vl); 352 sum = __riscv_vwmacc_vx_i32m2(sum, filter[11], s11, vl); 353 354 // Convert to unsigned 16-bit with saturation 355 vuint32m2_t d0 = 356 __riscv_vreinterpret_v_i32m2_u32m2(__riscv_vmax_vx_i32m2(sum, 0, vl)); 357 vuint16m1_t res = 358 __riscv_vnclipu_wx_u16m1(d0, COMPOUND_ROUND1_BITS, __RISCV_VXRM_RNU, vl); 359 360 // Clamp to max 361 return __riscv_vminu_vx_u16m1(res, max, vl); 362 } 363 364 static inline void highbd_convolve_y_sr_12tap_rvv( 365 const uint16_t *src_ptr, int src_stride, uint16_t *dst_ptr, int dst_stride, 366 int w, int h, const int16_t *y_filter, int bd) { 367 const uint16_t max = (1 << bd) - 1; 368 size_t vl = __riscv_vsetvl_e16m1(w); 369 370 if (w == 4) { 371 const int16_t *s = (const int16_t *)src_ptr; 372 uint16_t *d = dst_ptr; 373 374 // Load initial 11 rows of data 375 vint16mf2_t s0, s1, s2, s3, s4, s5, s6, s7, s8, s9, s10; 376 load_s16_4x11(s, src_stride, &s0, &s1, &s2, &s3, &s4, &s5, &s6, &s7, &s8, 377 &s9, &s10, vl); 378 s += 11 * src_stride; 379 380 do { 381 // Load next 4 rows of data 382 vint16mf2_t s11, s12, s13, s14; 383 load_s16_4x4(s, src_stride, &s11, &s12, &s13, &s14, vl); 384 385 // Perform 12-tap convolution for 4 rows 386 vuint16mf2_t d0 = highbd_convolve12_4_y_rvv( 387 s0, s1, s2, s3, s4, s5, s6, s7, s8, s9, s10, s11, y_filter, max, vl); 388 vuint16mf2_t d1 = highbd_convolve12_4_y_rvv( 389 s1, s2, s3, s4, s5, s6, s7, s8, s9, s10, s11, s12, y_filter, max, vl); 390 vuint16mf2_t d2 = 391 highbd_convolve12_4_y_rvv(s2, s3, s4, s5, s6, s7, s8, s9, s10, s11, 392 s12, s13, y_filter, max, vl); 393 vuint16mf2_t d3 = 394 highbd_convolve12_4_y_rvv(s3, s4, s5, s6, s7, s8, s9, s10, s11, s12, 395 s13, s14, y_filter, max, vl); 396 397 // Store results 398 store_u16_4x4(d, dst_stride, d0, d1, d2, d3, vl); 399 400 // Update source pointers for next iteration 401 s0 = __riscv_vmv_v_v_i16mf2(s4, vl); 402 s1 = __riscv_vmv_v_v_i16mf2(s5, vl); 403 s2 = __riscv_vmv_v_v_i16mf2(s6, vl); 404 s3 = __riscv_vmv_v_v_i16mf2(s7, vl); 405 s4 = __riscv_vmv_v_v_i16mf2(s8, vl); 406 s5 = __riscv_vmv_v_v_i16mf2(s9, vl); 407 s6 = __riscv_vmv_v_v_i16mf2(s10, vl); 408 s7 = __riscv_vmv_v_v_i16mf2(s11, vl); 409 s8 = __riscv_vmv_v_v_i16mf2(s12, vl); 410 s9 = __riscv_vmv_v_v_i16mf2(s13, vl); 411 s10 = __riscv_vmv_v_v_i16mf2(s14, vl); 412 413 s += 4 * src_stride; 414 d += 4 * dst_stride; 415 h -= 4; 416 } while (h != 0); 417 } else { 418 do { 419 int height = h; 420 const int16_t *s = (const int16_t *)src_ptr; 421 uint16_t *d = dst_ptr; 422 423 // Load initial 11 rows of data 424 vint16m1_t s0, s1, s2, s3, s4, s5, s6, s7, s8, s9, s10; 425 load_s16_8x11(s, src_stride, &s0, &s1, &s2, &s3, &s4, &s5, &s6, &s7, &s8, 426 &s9, &s10, vl); 427 s += 11 * src_stride; 428 429 do { 430 // Load next 4 rows of data 431 vint16m1_t s11, s12, s13, s14; 432 load_s16_8x4(s, src_stride, &s11, &s12, &s13, &s14, vl); 433 434 // Perform 12-tap convolution for 4 rows 435 vuint16m1_t d0 = 436 highbd_convolve12_8_y_rvv(s0, s1, s2, s3, s4, s5, s6, s7, s8, s9, 437 s10, s11, y_filter, max, vl); 438 vuint16m1_t d1 = 439 highbd_convolve12_8_y_rvv(s1, s2, s3, s4, s5, s6, s7, s8, s9, s10, 440 s11, s12, y_filter, max, vl); 441 vuint16m1_t d2 = 442 highbd_convolve12_8_y_rvv(s2, s3, s4, s5, s6, s7, s8, s9, s10, s11, 443 s12, s13, y_filter, max, vl); 444 vuint16m1_t d3 = 445 highbd_convolve12_8_y_rvv(s3, s4, s5, s6, s7, s8, s9, s10, s11, s12, 446 s13, s14, y_filter, max, vl); 447 448 // Store results 449 store_u16_8x4(d, dst_stride, d0, d1, d2, d3, vl); 450 451 // Update source pointers for next iteration 452 s0 = __riscv_vmv_v_v_i16m1(s4, vl); 453 s1 = __riscv_vmv_v_v_i16m1(s5, vl); 454 s2 = __riscv_vmv_v_v_i16m1(s6, vl); 455 s3 = __riscv_vmv_v_v_i16m1(s7, vl); 456 s4 = __riscv_vmv_v_v_i16m1(s8, vl); 457 s5 = __riscv_vmv_v_v_i16m1(s9, vl); 458 s6 = __riscv_vmv_v_v_i16m1(s10, vl); 459 s7 = __riscv_vmv_v_v_i16m1(s11, vl); 460 s8 = __riscv_vmv_v_v_i16m1(s12, vl); 461 s9 = __riscv_vmv_v_v_i16m1(s13, vl); 462 s10 = __riscv_vmv_v_v_i16m1(s14, vl); 463 464 s += 4 * src_stride; 465 d += 4 * dst_stride; 466 height -= 4; 467 } while (height != 0); 468 469 src_ptr += vl; 470 dst_ptr += vl; 471 w -= vl; 472 } while (w > 0); 473 } 474 } 475 476 void av1_highbd_convolve_y_sr_rvv(const uint16_t *src, int src_stride, 477 uint16_t *dst, int dst_stride, int w, int h, 478 const InterpFilterParams *filter_params_y, 479 const int subpel_y_qn, int bd) { 480 if (w == 2 || h == 2) { 481 av1_highbd_convolve_y_sr_c(src, src_stride, dst, dst_stride, w, h, 482 filter_params_y, subpel_y_qn, bd); 483 return; 484 } 485 486 const int y_filter_taps = get_filter_tap(filter_params_y, subpel_y_qn); 487 const int vert_offset = filter_params_y->taps / 2 - 1; 488 const int16_t *y_filter_ptr = av1_get_interp_filter_subpel_kernel( 489 filter_params_y, subpel_y_qn & SUBPEL_MASK); 490 491 src -= vert_offset * src_stride; 492 493 if (y_filter_taps > 8) { 494 highbd_convolve_y_sr_12tap_rvv(src, src_stride, dst, dst_stride, w, h, 495 y_filter_ptr, bd); 496 return; 497 } 498 if (y_filter_taps < 8) { 499 highbd_convolve_y_sr_6tap_rvv(src, src_stride, dst, dst_stride, w, h, 500 y_filter_ptr, bd); 501 return; 502 } 503 504 highbd_convolve_y_sr_8tap_rvv(src, src_stride, dst, dst_stride, w, h, 505 y_filter_ptr, bd); 506 } 507 508 static inline vuint16m1_t highbd_convolve6_8_x_rvv( 509 const vint16m1_t s0, const vint16m1_t s1, const vint16m1_t s2, 510 const vint16m1_t s3, const vint16m1_t s4, const vint16m1_t s5, 511 const int16_t *filter, const int32_t offset, const uint16_t max, 512 size_t vl) { 513 // Values at indices 0 and 7 of y_filter are zero. 514 vint32m2_t sum = __riscv_vwmul_vx_i32m2(s0, filter[1], vl); 515 sum = __riscv_vwmacc_vx_i32m2(sum, filter[2], s1, vl); 516 sum = __riscv_vwmacc_vx_i32m2(sum, filter[3], s2, vl); 517 sum = __riscv_vwmacc_vx_i32m2(sum, filter[4], s3, vl); 518 sum = __riscv_vwmacc_vx_i32m2(sum, filter[5], s4, vl); 519 sum = __riscv_vwmacc_vx_i32m2(sum, filter[6], s5, vl); 520 521 // Add rounding constant and offset 522 sum = __riscv_vadd_vx_i32m2(sum, (1 << (FILTER_BITS - 1)) + offset, vl); 523 524 // Narrow result to 16-bit with rounding and saturation 525 vint16m1_t res = __riscv_vnsra_wx_i16m1(sum, FILTER_BITS, vl); 526 527 // Clamp result to max value 528 vuint16m1_t d0 = 529 __riscv_vreinterpret_v_i16m1_u16m1(__riscv_vmax_vx_i16m1(res, 0, vl)); 530 return __riscv_vminu_vx_u16m1(d0, max, vl); 531 } 532 533 static inline void highbd_convolve_x_sr_6tap_rvv( 534 const uint16_t *src_ptr, int src_stride, uint16_t *dst_ptr, int dst_stride, 535 int w, int h, const int16_t *x_filter, ConvolveParams *conv_params, 536 int bd) { 537 const uint16_t max = (1 << bd) - 1; 538 // This shim allows to do only one rounding shift instead of two. 539 const int32_t offset = 1 << (conv_params->round_0 - 1); 540 541 int height = h; 542 size_t vl = __riscv_vsetvl_e16m1(w); 543 544 do { 545 int width = w; 546 const int16_t *s = (const int16_t *)src_ptr; 547 uint16_t *d = dst_ptr; 548 549 do { 550 vint16m1_t s00, s01, s02, s03, s04, s05; 551 vint16m1_t s10, s11, s12, s13, s14, s15; 552 vint16m1_t s20, s21, s22, s23, s24, s25; 553 vint16m1_t s30, s31, s32, s33, s34, s35; 554 555 // Load 6 elements for each of 4 rows 556 load_s16_8x6(s + 0 * src_stride, 1, &s00, &s01, &s02, &s03, &s04, &s05, 557 vl); 558 load_s16_8x6(s + 1 * src_stride, 1, &s10, &s11, &s12, &s13, &s14, &s15, 559 vl); 560 load_s16_8x6(s + 2 * src_stride, 1, &s20, &s21, &s22, &s23, &s24, &s25, 561 vl); 562 load_s16_8x6(s + 3 * src_stride, 1, &s30, &s31, &s32, &s33, &s34, &s35, 563 vl); 564 565 // Perform convolution 566 vuint16m1_t d0 = highbd_convolve6_8_x_rvv(s00, s01, s02, s03, s04, s05, 567 x_filter, offset, max, vl); 568 vuint16m1_t d1 = highbd_convolve6_8_x_rvv(s10, s11, s12, s13, s14, s15, 569 x_filter, offset, max, vl); 570 vuint16m1_t d2 = highbd_convolve6_8_x_rvv(s20, s21, s22, s23, s24, s25, 571 x_filter, offset, max, vl); 572 vuint16m1_t d3 = highbd_convolve6_8_x_rvv(s30, s31, s32, s33, s34, s35, 573 x_filter, offset, max, vl); 574 575 // Store results 576 store_u16_8x4(d, dst_stride, d0, d1, d2, d3, vl); 577 578 s += vl; 579 d += vl; 580 width -= vl; 581 } while (width > 0); 582 583 src_ptr += 4 * src_stride; 584 dst_ptr += 4 * dst_stride; 585 height -= 4; 586 } while (height != 0); 587 } 588 589 static inline vuint16mf2_t highbd_convolve4_4_x_rvv( 590 const vint16mf2_t s0, const vint16mf2_t s1, const vint16mf2_t s2, 591 const vint16mf2_t s3, const int16_t *filter, const int32_t offset, 592 const uint16_t max, size_t vl) { 593 vint32m1_t sum = __riscv_vwmul_vx_i32m1(s0, filter[0], vl); 594 sum = __riscv_vwmacc_vx_i32m1(sum, filter[1], s1, vl); 595 sum = __riscv_vwmacc_vx_i32m1(sum, filter[2], s2, vl); 596 sum = __riscv_vwmacc_vx_i32m1(sum, filter[3], s3, vl); 597 598 // Add rounding constant and offset 599 sum = __riscv_vadd_vx_i32m1(sum, (1 << (FILTER_BITS - 1)) + offset, vl); 600 601 // Narrow result to 16-bit with rounding and saturation 602 vint16mf2_t res = __riscv_vnsra_wx_i16mf2(sum, FILTER_BITS, vl); 603 604 // Clamp result to max value 605 vuint16mf2_t d0 = 606 __riscv_vreinterpret_v_i16mf2_u16mf2(__riscv_vmax_vx_i16mf2(res, 0, vl)); 607 return __riscv_vminu_vx_u16mf2(d0, max, vl); 608 } 609 610 static inline vuint16m1_t highbd_convolve8_8_x_rvv( 611 const vint16m1_t s0, const vint16m1_t s1, const vint16m1_t s2, 612 const vint16m1_t s3, const vint16m1_t s4, const vint16m1_t s5, 613 const vint16m1_t s6, const vint16m1_t s7, const int16_t *filter, 614 const int32_t offset, const uint16_t max, size_t vl) { 615 vint32m2_t sum = __riscv_vwmul_vx_i32m2(s0, filter[0], vl); 616 sum = __riscv_vwmacc_vx_i32m2(sum, filter[1], s1, vl); 617 sum = __riscv_vwmacc_vx_i32m2(sum, filter[2], s2, vl); 618 sum = __riscv_vwmacc_vx_i32m2(sum, filter[3], s3, vl); 619 sum = __riscv_vwmacc_vx_i32m2(sum, filter[4], s4, vl); 620 sum = __riscv_vwmacc_vx_i32m2(sum, filter[5], s5, vl); 621 sum = __riscv_vwmacc_vx_i32m2(sum, filter[6], s6, vl); 622 sum = __riscv_vwmacc_vx_i32m2(sum, filter[7], s7, vl); 623 624 sum = __riscv_vwadd_wx_i32m2(sum, offset, vl); 625 626 // Convert to unsigned 16-bit with saturation 627 vuint32m2_t d0 = 628 __riscv_vreinterpret_v_i32m2_u32m2(__riscv_vmax_vx_i32m2(sum, 0, vl)); 629 vuint16m1_t res = 630 __riscv_vnclipu_wx_u16m1(d0, FILTER_BITS, __RISCV_VXRM_RNU, vl); 631 632 // Clamp to max 633 return __riscv_vminu_vx_u16m1(res, max, vl); 634 } 635 636 static inline void highbd_convolve_x_sr_rvv(const uint16_t *src_ptr, 637 int src_stride, uint16_t *dst_ptr, 638 int dst_stride, int w, int h, 639 const int16_t *x_filter, 640 ConvolveParams *conv_params, 641 int bd) { 642 // This shim allows to do only one rounding shift instead of two. 643 const int32_t offset = 1 << (conv_params->round_0 - 1); 644 const uint16_t max = (1 << bd) - 1; 645 size_t vl = __riscv_vsetvl_e16m1(w); 646 647 if (w == 4) { 648 // 4-tap filters are used for blocks having width == 4. 649 const int16_t *s = (const int16_t *)(src_ptr + 2); 650 uint16_t *d = dst_ptr; 651 const int16_t *x_filter_ptr = x_filter + 2; 652 653 do { 654 vint16mf2_t s00, s01, s02, s03; 655 vint16mf2_t s10, s11, s12, s13; 656 vint16mf2_t s20, s21, s22, s23; 657 vint16mf2_t s30, s31, s32, s33; 658 659 // Load pixels from each of 4 rows 660 load_s16_4x4(s + 0 * src_stride, 1, &s00, &s01, &s02, &s03, vl); 661 load_s16_4x4(s + 1 * src_stride, 1, &s10, &s11, &s12, &s13, vl); 662 load_s16_4x4(s + 2 * src_stride, 1, &s20, &s21, &s22, &s23, vl); 663 load_s16_4x4(s + 3 * src_stride, 1, &s30, &s31, &s32, &s33, vl); 664 665 // Perform convolution for 4 rows 666 vuint16mf2_t d0 = highbd_convolve4_4_x_rvv(s00, s01, s02, s03, 667 x_filter_ptr, offset, max, vl); 668 vuint16mf2_t d1 = highbd_convolve4_4_x_rvv(s10, s11, s12, s13, 669 x_filter_ptr, offset, max, vl); 670 vuint16mf2_t d2 = highbd_convolve4_4_x_rvv(s20, s21, s22, s23, 671 x_filter_ptr, offset, max, vl); 672 vuint16mf2_t d3 = highbd_convolve4_4_x_rvv(s30, s31, s32, s33, 673 x_filter_ptr, offset, max, vl); 674 675 // Store results 676 store_u16_4x4(d, dst_stride, d0, d1, d2, d3, vl); 677 678 s += 4 * src_stride; 679 d += 4 * dst_stride; 680 h -= 4; 681 } while (h != 0); 682 } else { 683 int height = h; 684 do { 685 int width = w; 686 const int16_t *s = (const int16_t *)src_ptr; 687 uint16_t *d = dst_ptr; 688 689 do { 690 vint16m1_t s00, s01, s02, s03, s04, s05, s06, s07; 691 vint16m1_t s10, s11, s12, s13, s14, s15, s16, s17; 692 vint16m1_t s20, s21, s22, s23, s24, s25, s26, s27; 693 vint16m1_t s30, s31, s32, s33, s34, s35, s36, s37; 694 695 // Load elements for each of 4 rows 696 load_s16_8x8(s + 0 * src_stride, 1, &s00, &s01, &s02, &s03, &s04, &s05, 697 &s06, &s07, vl); 698 load_s16_8x8(s + 1 * src_stride, 1, &s10, &s11, &s12, &s13, &s14, &s15, 699 &s16, &s17, vl); 700 load_s16_8x8(s + 2 * src_stride, 1, &s20, &s21, &s22, &s23, &s24, &s25, 701 &s26, &s27, vl); 702 load_s16_8x8(s + 3 * src_stride, 1, &s30, &s31, &s32, &s33, &s34, &s35, 703 &s36, &s37, vl); 704 705 // Perform convolution 706 vuint16m1_t d0 = highbd_convolve8_8_x_rvv( 707 s00, s01, s02, s03, s04, s05, s06, s07, x_filter, offset, max, vl); 708 vuint16m1_t d1 = highbd_convolve8_8_x_rvv( 709 s10, s11, s12, s13, s14, s15, s16, s17, x_filter, offset, max, vl); 710 vuint16m1_t d2 = highbd_convolve8_8_x_rvv( 711 s20, s21, s22, s23, s24, s25, s26, s27, x_filter, offset, max, vl); 712 vuint16m1_t d3 = highbd_convolve8_8_x_rvv( 713 s30, s31, s32, s33, s34, s35, s36, s37, x_filter, offset, max, vl); 714 715 // Store results 716 store_u16_8x4(d, dst_stride, d0, d1, d2, d3, vl); 717 718 s += vl; 719 d += vl; 720 width -= vl; 721 } while (width > 0); 722 723 src_ptr += 4 * src_stride; 724 dst_ptr += 4 * dst_stride; 725 height -= 4; 726 } while (height != 0); 727 } 728 } 729 730 static inline vuint16mf2_t highbd_convolve12_4_x_rvv( 731 const vint16mf2_t s0, const vint16mf2_t s1, const vint16mf2_t s2, 732 const vint16mf2_t s3, const vint16mf2_t s4, const vint16mf2_t s5, 733 const vint16mf2_t s6, const vint16mf2_t s7, const vint16mf2_t s8, 734 const vint16mf2_t s9, const vint16mf2_t s10, const vint16mf2_t s11, 735 const int16_t *filter, const int32_t offset, const uint16_t max, 736 size_t vl) { 737 vint32m1_t sum = __riscv_vwmul_vx_i32m1(s0, filter[0], vl); 738 sum = __riscv_vwmacc_vx_i32m1(sum, filter[1], s1, vl); 739 sum = __riscv_vwmacc_vx_i32m1(sum, filter[2], s2, vl); 740 sum = __riscv_vwmacc_vx_i32m1(sum, filter[3], s3, vl); 741 sum = __riscv_vwmacc_vx_i32m1(sum, filter[4], s4, vl); 742 sum = __riscv_vwmacc_vx_i32m1(sum, filter[5], s5, vl); 743 sum = __riscv_vwmacc_vx_i32m1(sum, filter[6], s6, vl); 744 sum = __riscv_vwmacc_vx_i32m1(sum, filter[7], s7, vl); 745 sum = __riscv_vwmacc_vx_i32m1(sum, filter[8], s8, vl); 746 sum = __riscv_vwmacc_vx_i32m1(sum, filter[9], s9, vl); 747 sum = __riscv_vwmacc_vx_i32m1(sum, filter[10], s10, vl); 748 sum = __riscv_vwmacc_vx_i32m1(sum, filter[11], s11, vl); 749 sum = __riscv_vwadd_wx_i32m1(sum, offset, vl); 750 751 // Convert to unsigned 16-bit with saturation 752 vuint32m1_t d0 = 753 __riscv_vreinterpret_v_i32m1_u32m1(__riscv_vmax_vx_i32m1(sum, 0, vl)); 754 vuint16mf2_t res = 755 __riscv_vnclipu_wx_u16mf2(d0, FILTER_BITS, __RISCV_VXRM_RNU, vl); 756 757 // Clamp to max 758 return __riscv_vminu_vx_u16mf2(res, max, vl); 759 } 760 761 static inline vuint16m1_t highbd_convolve12_8_x_rvv( 762 const vint16m1_t s0, const vint16m1_t s1, const vint16m1_t s2, 763 const vint16m1_t s3, const vint16m1_t s4, const vint16m1_t s5, 764 const vint16m1_t s6, const vint16m1_t s7, const vint16m1_t s8, 765 const vint16m1_t s9, const vint16m1_t s10, const vint16m1_t s11, 766 const int16_t *filter, const int32_t offset, const uint16_t max, 767 size_t vl) { 768 vint32m2_t sum = __riscv_vwmul_vx_i32m2(s0, filter[0], vl); 769 sum = __riscv_vwmacc_vx_i32m2(sum, filter[1], s1, vl); 770 sum = __riscv_vwmacc_vx_i32m2(sum, filter[2], s2, vl); 771 sum = __riscv_vwmacc_vx_i32m2(sum, filter[3], s3, vl); 772 sum = __riscv_vwmacc_vx_i32m2(sum, filter[4], s4, vl); 773 sum = __riscv_vwmacc_vx_i32m2(sum, filter[5], s5, vl); 774 sum = __riscv_vwmacc_vx_i32m2(sum, filter[6], s6, vl); 775 sum = __riscv_vwmacc_vx_i32m2(sum, filter[7], s7, vl); 776 sum = __riscv_vwmacc_vx_i32m2(sum, filter[8], s8, vl); 777 sum = __riscv_vwmacc_vx_i32m2(sum, filter[9], s9, vl); 778 sum = __riscv_vwmacc_vx_i32m2(sum, filter[10], s10, vl); 779 sum = __riscv_vwmacc_vx_i32m2(sum, filter[11], s11, vl); 780 sum = __riscv_vwadd_wx_i32m2(sum, offset, vl); 781 782 // Convert to unsigned 16-bit with saturation 783 vuint32m2_t d0 = 784 __riscv_vreinterpret_v_i32m2_u32m2(__riscv_vmax_vx_i32m2(sum, 0, vl)); 785 vuint16m1_t res = 786 __riscv_vnclipu_wx_u16m1(d0, FILTER_BITS, __RISCV_VXRM_RNU, vl); 787 788 // Clamp to max 789 return __riscv_vminu_vx_u16m1(res, max, vl); 790 } 791 792 static inline void highbd_convolve_x_sr_12tap_rvv( 793 const uint16_t *src_ptr, int src_stride, uint16_t *dst_ptr, int dst_stride, 794 int w, int h, const int16_t *x_filter, ConvolveParams *conv_params, 795 int bd) { 796 // This shim allows to do only one rounding shift instead of two. 797 const int32_t offset = 1 << (conv_params->round_0 - 1); 798 const uint16_t max = (1 << bd) - 1; 799 size_t vl = __riscv_vsetvl_e16m1(w); 800 801 if (w == 4) { 802 const int16_t *s = (const int16_t *)src_ptr; 803 uint16_t *d = dst_ptr; 804 805 do { 806 vint16mf2_t s00, s01, s02, s03, s04, s05, s06, s07, s08, s09, s010, s011; 807 vint16mf2_t s10, s11, s12, s13, s14, s15, s16, s17, s18, s19, s110, s111; 808 vint16mf2_t s20, s21, s22, s23, s24, s25, s26, s27, s28, s29, s210, s211; 809 vint16mf2_t s30, s31, s32, s33, s34, s35, s36, s37, s38, s39, s310, s311; 810 811 // Load elements for each of 4 rows 812 load_s16_4x12(s + 0 * src_stride, 1, &s00, &s01, &s02, &s03, &s04, &s05, 813 &s06, &s07, &s08, &s09, &s010, &s011, vl); 814 load_s16_4x12(s + 1 * src_stride, 1, &s10, &s11, &s12, &s13, &s14, &s15, 815 &s16, &s17, &s18, &s19, &s110, &s111, vl); 816 load_s16_4x12(s + 2 * src_stride, 1, &s20, &s21, &s22, &s23, &s24, &s25, 817 &s26, &s27, &s28, &s29, &s210, &s211, vl); 818 load_s16_4x12(s + 3 * src_stride, 1, &s30, &s31, &s32, &s33, &s34, &s35, 819 &s36, &s37, &s38, &s39, &s310, &s311, vl); 820 821 // Perform convolution 822 vuint16mf2_t d0 = 823 highbd_convolve12_4_x_rvv(s00, s01, s02, s03, s04, s05, s06, s07, s08, 824 s09, s010, s011, x_filter, offset, max, vl); 825 vuint16mf2_t d1 = 826 highbd_convolve12_4_x_rvv(s10, s11, s12, s13, s14, s15, s16, s17, s18, 827 s19, s110, s111, x_filter, offset, max, vl); 828 vuint16mf2_t d2 = 829 highbd_convolve12_4_x_rvv(s20, s21, s22, s23, s24, s25, s26, s27, s28, 830 s29, s210, s211, x_filter, offset, max, vl); 831 vuint16mf2_t d3 = 832 highbd_convolve12_4_x_rvv(s30, s31, s32, s33, s34, s35, s36, s37, s38, 833 s39, s310, s311, x_filter, offset, max, vl); 834 835 // Store results 836 store_u16_4x4(d, dst_stride, d0, d1, d2, d3, vl); 837 838 s += 4 * src_stride; 839 d += 4 * dst_stride; 840 h -= 4; 841 } while (h != 0); 842 } else { 843 int height = h; 844 do { 845 const int16_t *s = (const int16_t *)src_ptr; 846 uint16_t *d = dst_ptr; 847 int width = w; 848 849 do { 850 vint16m1_t s00, s01, s02, s03, s04, s05, s06, s07, s08, s09, s010, s011; 851 vint16m1_t s10, s11, s12, s13, s14, s15, s16, s17, s18, s19, s110, s111; 852 vint16m1_t s20, s21, s22, s23, s24, s25, s26, s27, s28, s29, s210, s211; 853 vint16m1_t s30, s31, s32, s33, s34, s35, s36, s37, s38, s39, s310, s311; 854 855 // Load elements for each of 4 rows 856 load_s16_8x12(s + 0 * src_stride, 1, &s00, &s01, &s02, &s03, &s04, &s05, 857 &s06, &s07, &s08, &s09, &s010, &s011, vl); 858 load_s16_8x12(s + 1 * src_stride, 1, &s10, &s11, &s12, &s13, &s14, &s15, 859 &s16, &s17, &s18, &s19, &s110, &s111, vl); 860 load_s16_8x12(s + 2 * src_stride, 1, &s20, &s21, &s22, &s23, &s24, &s25, 861 &s26, &s27, &s28, &s29, &s210, &s211, vl); 862 load_s16_8x12(s + 3 * src_stride, 1, &s30, &s31, &s32, &s33, &s34, &s35, 863 &s36, &s37, &s38, &s39, &s310, &s311, vl); 864 865 // Perform convolution 866 vuint16m1_t d0 = highbd_convolve12_8_x_rvv( 867 s00, s01, s02, s03, s04, s05, s06, s07, s08, s09, s010, s011, 868 x_filter, offset, max, vl); 869 vuint16m1_t d1 = highbd_convolve12_8_x_rvv( 870 s10, s11, s12, s13, s14, s15, s16, s17, s18, s19, s110, s111, 871 x_filter, offset, max, vl); 872 vuint16m1_t d2 = highbd_convolve12_8_x_rvv( 873 s20, s21, s22, s23, s24, s25, s26, s27, s28, s29, s210, s211, 874 x_filter, offset, max, vl); 875 vuint16m1_t d3 = highbd_convolve12_8_x_rvv( 876 s30, s31, s32, s33, s34, s35, s36, s37, s38, s39, s310, s311, 877 x_filter, offset, max, vl); 878 879 // Store results 880 store_u16_8x4(d, dst_stride, d0, d1, d2, d3, vl); 881 882 s += vl; 883 d += vl; 884 width -= vl; 885 } while (width > 0); 886 887 src_ptr += 4 * src_stride; 888 dst_ptr += 4 * dst_stride; 889 height -= 4; 890 } while (height != 0); 891 } 892 } 893 894 void av1_highbd_convolve_x_sr_rvv(const uint16_t *src, int src_stride, 895 uint16_t *dst, int dst_stride, int w, int h, 896 const InterpFilterParams *filter_params_x, 897 const int subpel_x_qn, 898 ConvolveParams *conv_params, int bd) { 899 if (w == 2 || h == 2) { 900 av1_highbd_convolve_x_sr_c(src, src_stride, dst, dst_stride, w, h, 901 filter_params_x, subpel_x_qn, conv_params, bd); 902 return; 903 } 904 const int x_filter_taps = get_filter_tap(filter_params_x, subpel_x_qn); 905 const int horiz_offset = filter_params_x->taps / 2 - 1; 906 const int16_t *x_filter_ptr = av1_get_interp_filter_subpel_kernel( 907 filter_params_x, subpel_x_qn & SUBPEL_MASK); 908 909 src -= horiz_offset; 910 911 if (x_filter_taps > 8) { 912 highbd_convolve_x_sr_12tap_rvv(src, src_stride, dst, dst_stride, w, h, 913 x_filter_ptr, conv_params, bd); 914 return; 915 } 916 if (x_filter_taps <= 6 && w != 4) { 917 highbd_convolve_x_sr_6tap_rvv(src + 1, src_stride, dst, dst_stride, w, h, 918 x_filter_ptr, conv_params, bd); 919 return; 920 } 921 922 highbd_convolve_x_sr_rvv(src, src_stride, dst, dst_stride, w, h, x_filter_ptr, 923 conv_params, bd); 924 } 925 926 // store_strided_u16_4xN 927 static inline void store_strided_u16_4xN(uint16_t *addr, vuint16m1_t vdst, 928 ptrdiff_t stride, size_t vl) { 929 __riscv_vse16_v_u16m1(addr, vdst, vl >> 1); 930 vdst = __riscv_vslidedown_vx_u16m1(vdst, vl >> 1, vl); 931 __riscv_vse16_v_u16m1(addr + stride, vdst, vl >> 1); 932 } 933 934 static inline vuint16m1_t highbd_convolve12_2d_v_rvv( 935 const vint16m1_t s0, const vint16m1_t s1, const vint16m1_t s2, 936 const vint16m1_t s3, const vint16m1_t s4, const vint16m1_t s5, 937 const vint16m1_t s6, const vint16m1_t s7, const vint16m1_t s8, 938 const vint16m1_t s9, const vint16m1_t s10, const vint16m1_t s11, 939 const int16_t *y_filter, const int32_t offset, const int32_t shift, 940 const uint16_t max, size_t vl) { 941 vint32m2_t sum = __riscv_vwmul_vx_i32m2(s0, y_filter[0], vl); 942 sum = __riscv_vwmacc_vx_i32m2(sum, y_filter[1], s1, vl); 943 sum = __riscv_vwmacc_vx_i32m2(sum, y_filter[2], s2, vl); 944 sum = __riscv_vwmacc_vx_i32m2(sum, y_filter[3], s3, vl); 945 sum = __riscv_vwmacc_vx_i32m2(sum, y_filter[4], s4, vl); 946 sum = __riscv_vwmacc_vx_i32m2(sum, y_filter[5], s5, vl); 947 sum = __riscv_vwmacc_vx_i32m2(sum, y_filter[6], s6, vl); 948 sum = __riscv_vwmacc_vx_i32m2(sum, y_filter[7], s7, vl); 949 sum = __riscv_vwmacc_vx_i32m2(sum, y_filter[8], s8, vl); 950 sum = __riscv_vwmacc_vx_i32m2(sum, y_filter[9], s9, vl); 951 sum = __riscv_vwmacc_vx_i32m2(sum, y_filter[10], s10, vl); 952 sum = __riscv_vwmacc_vx_i32m2(sum, y_filter[11], s11, vl); 953 sum = __riscv_vadd_vx_i32m2(sum, offset, vl); 954 955 vint16m1_t i16_sum = __riscv_vnsra_wx_i16m1(sum, shift, vl); 956 vint16m1_t iclip_sum = 957 __riscv_vmin_vx_i16m1(__riscv_vmax_vx_i16m1(i16_sum, 0, vl), max, vl); 958 return __riscv_vreinterpret_v_i16m1_u16m1(iclip_sum); 959 } 960 961 static inline void highbd_convolve_2d_sr_vert_12tap_rvv( 962 const uint16_t *src_ptr, int src_stride, uint16_t *dst_ptr, int dst_stride, 963 int w, int h, const int16_t *y_filter_ptr, ConvolveParams *conv_params, 964 const int bd, const int offset, size_t vl) { 965 const int32_t shift_s32 = conv_params->round_1; 966 const int32_t offset_s32 = offset; 967 const uint16_t max_u16 = (1 << bd) - 1; 968 969 if (w == 4) { 970 int16_t *s = (int16_t *)src_ptr; 971 vl = vl << 1; 972 973 vint16m1_t s0 = load_strided_i16_4xN(s, src_stride, vl); 974 s += src_stride; 975 vint16m1_t s1 = load_strided_i16_4xN(s, src_stride, vl); 976 s += src_stride; 977 vint16m1_t s2 = load_strided_i16_4xN(s, src_stride, vl); 978 s += src_stride; 979 vint16m1_t s3 = load_strided_i16_4xN(s, src_stride, vl); 980 s += src_stride; 981 vint16m1_t s4 = load_strided_i16_4xN(s, src_stride, vl); 982 s += src_stride; 983 vint16m1_t s5 = load_strided_i16_4xN(s, src_stride, vl); 984 s += src_stride; 985 vint16m1_t s6 = load_strided_i16_4xN(s, src_stride, vl); 986 s += src_stride; 987 vint16m1_t s7 = load_strided_i16_4xN(s, src_stride, vl); 988 s += src_stride; 989 vint16m1_t s8 = load_strided_i16_4xN(s, src_stride, vl); 990 s += src_stride; 991 vint16m1_t s9 = load_strided_i16_4xN(s, src_stride, vl); 992 s += src_stride; 993 994 do { 995 vint16m1_t s10 = load_strided_i16_4xN(s, src_stride, vl); 996 s += src_stride; 997 vint16m1_t s11 = load_strided_i16_4xN(s, src_stride, vl); 998 s += src_stride; 999 vint16m1_t s12 = load_strided_i16_4xN(s, src_stride, vl); 1000 s += src_stride; 1001 vint16m1_t s13 = load_strided_i16_4xN(s, src_stride, vl); 1002 s += src_stride; 1003 1004 vuint16m1_t d0 = highbd_convolve12_2d_v_rvv( 1005 s0, s1, s2, s3, s4, s5, s6, s7, s8, s9, s10, s11, y_filter_ptr, 1006 offset_s32, shift_s32, max_u16, vl); 1007 vuint16m1_t d1 = highbd_convolve12_2d_v_rvv( 1008 s2, s3, s4, s5, s6, s7, s8, s9, s10, s11, s12, s13, y_filter_ptr, 1009 offset_s32, shift_s32, max_u16, vl); 1010 1011 store_strided_u16_4xN(dst_ptr, d0, dst_stride, vl); 1012 dst_ptr += dst_stride << 1; 1013 store_strided_u16_4xN(dst_ptr, d1, dst_stride, vl); 1014 dst_ptr += dst_stride << 1; 1015 1016 s0 = s4; 1017 s1 = s5; 1018 s2 = s6; 1019 s3 = s7; 1020 s4 = s8; 1021 s5 = s9; 1022 s6 = s10; 1023 s7 = s11; 1024 s8 = s12; 1025 s9 = s13; 1026 1027 h -= 4; 1028 } while (h != 0); 1029 } else { 1030 do { 1031 int height = h; 1032 int16_t *s = (int16_t *)src_ptr; 1033 uint16_t *d = dst_ptr; 1034 1035 vint16m1_t s0 = __riscv_vle16_v_i16m1(s, vl); 1036 s += src_stride; 1037 vint16m1_t s1 = __riscv_vle16_v_i16m1(s, vl); 1038 s += src_stride; 1039 vint16m1_t s2 = __riscv_vle16_v_i16m1(s, vl); 1040 s += src_stride; 1041 vint16m1_t s3 = __riscv_vle16_v_i16m1(s, vl); 1042 s += src_stride; 1043 vint16m1_t s4 = __riscv_vle16_v_i16m1(s, vl); 1044 s += src_stride; 1045 vint16m1_t s5 = __riscv_vle16_v_i16m1(s, vl); 1046 s += src_stride; 1047 vint16m1_t s6 = __riscv_vle16_v_i16m1(s, vl); 1048 s += src_stride; 1049 vint16m1_t s7 = __riscv_vle16_v_i16m1(s, vl); 1050 s += src_stride; 1051 vint16m1_t s8 = __riscv_vle16_v_i16m1(s, vl); 1052 s += src_stride; 1053 vint16m1_t s9 = __riscv_vle16_v_i16m1(s, vl); 1054 s += src_stride; 1055 vint16m1_t s10 = __riscv_vle16_v_i16m1(s, vl); 1056 s += src_stride; 1057 1058 do { 1059 vint16m1_t s11 = __riscv_vle16_v_i16m1(s, vl); 1060 s += src_stride; 1061 vint16m1_t s12 = __riscv_vle16_v_i16m1(s, vl); 1062 s += src_stride; 1063 vint16m1_t s13 = __riscv_vle16_v_i16m1(s, vl); 1064 s += src_stride; 1065 vint16m1_t s14 = __riscv_vle16_v_i16m1(s, vl); 1066 s += src_stride; 1067 1068 vuint16m1_t d0 = highbd_convolve12_2d_v_rvv( 1069 s0, s1, s2, s3, s4, s5, s6, s7, s8, s9, s10, s11, y_filter_ptr, 1070 offset_s32, shift_s32, max_u16, vl); 1071 vuint16m1_t d1 = highbd_convolve12_2d_v_rvv( 1072 s1, s2, s3, s4, s5, s6, s7, s8, s9, s10, s11, s12, y_filter_ptr, 1073 offset_s32, shift_s32, max_u16, vl); 1074 vuint16m1_t d2 = highbd_convolve12_2d_v_rvv( 1075 s2, s3, s4, s5, s6, s7, s8, s9, s10, s11, s12, s13, y_filter_ptr, 1076 offset_s32, shift_s32, max_u16, vl); 1077 vuint16m1_t d3 = highbd_convolve12_2d_v_rvv( 1078 s3, s4, s5, s6, s7, s8, s9, s10, s11, s12, s13, s14, y_filter_ptr, 1079 offset_s32, shift_s32, max_u16, vl); 1080 1081 __riscv_vse16_v_u16m1(d, d0, vl); 1082 d += dst_stride; 1083 __riscv_vse16_v_u16m1(d, d1, vl); 1084 d += dst_stride; 1085 __riscv_vse16_v_u16m1(d, d2, vl); 1086 d += dst_stride; 1087 __riscv_vse16_v_u16m1(d, d3, vl); 1088 d += dst_stride; 1089 1090 s0 = s4; 1091 s1 = s5; 1092 s2 = s6; 1093 s3 = s7; 1094 s4 = s8; 1095 s5 = s9; 1096 s6 = s10; 1097 s7 = s11; 1098 s8 = s12; 1099 s9 = s13; 1100 s10 = s14; 1101 1102 height -= 4; 1103 } while (height != 0); 1104 1105 src_ptr += vl; 1106 dst_ptr += vl; 1107 w -= vl; 1108 } while (w != 0); 1109 } 1110 } 1111 1112 static inline vuint16m1_t highbd_convolve8_2d_v_rvv( 1113 const vint16m1_t s0, const vint16m1_t s1, const vint16m1_t s2, 1114 const vint16m1_t s3, const vint16m1_t s4, const vint16m1_t s5, 1115 const vint16m1_t s6, const vint16m1_t s7, const int16_t *y_filter, 1116 const int32_t offset, const int32_t shift, const uint16_t max, size_t vl) { 1117 vint32m2_t sum = __riscv_vwmul_vx_i32m2(s0, y_filter[0], vl); 1118 sum = __riscv_vwmacc_vx_i32m2(sum, y_filter[1], s1, vl); 1119 sum = __riscv_vwmacc_vx_i32m2(sum, y_filter[2], s2, vl); 1120 sum = __riscv_vwmacc_vx_i32m2(sum, y_filter[3], s3, vl); 1121 sum = __riscv_vwmacc_vx_i32m2(sum, y_filter[4], s4, vl); 1122 sum = __riscv_vwmacc_vx_i32m2(sum, y_filter[5], s5, vl); 1123 sum = __riscv_vwmacc_vx_i32m2(sum, y_filter[6], s6, vl); 1124 sum = __riscv_vwmacc_vx_i32m2(sum, y_filter[7], s7, vl); 1125 sum = __riscv_vadd_vx_i32m2(sum, offset, vl); 1126 1127 vint16m1_t i16_sum = __riscv_vnsra_wx_i16m1(sum, shift, vl); 1128 vint16m1_t iclip_sum = 1129 __riscv_vmin_vx_i16m1(__riscv_vmax_vx_i16m1(i16_sum, 0, vl), max, vl); 1130 return __riscv_vreinterpret_v_i16m1_u16m1(iclip_sum); 1131 } 1132 1133 static inline void highbd_convolve_2d_sr_vert_8tap_rvv( 1134 const uint16_t *src_ptr, int src_stride, uint16_t *dst_ptr, int dst_stride, 1135 int w, int h, const int16_t *y_filter_ptr, ConvolveParams *conv_params, 1136 int bd, const int offset, size_t vl) { 1137 const int32_t shift_s32 = conv_params->round_1; 1138 const int32_t offset_s32 = offset; 1139 const uint16_t max_u16 = (1 << bd) - 1; 1140 1141 if (w <= 4) { 1142 int16_t *s = (int16_t *)src_ptr; 1143 vl = vl << 1; 1144 1145 vint16m1_t s0 = load_strided_i16_4xN(s, src_stride, vl); 1146 s += src_stride; 1147 vint16m1_t s1 = load_strided_i16_4xN(s, src_stride, vl); 1148 s += src_stride; 1149 vint16m1_t s2 = load_strided_i16_4xN(s, src_stride, vl); 1150 s += src_stride; 1151 vint16m1_t s3 = load_strided_i16_4xN(s, src_stride, vl); 1152 s += src_stride; 1153 vint16m1_t s4 = load_strided_i16_4xN(s, src_stride, vl); 1154 s += src_stride; 1155 vint16m1_t s5 = load_strided_i16_4xN(s, src_stride, vl); 1156 s += src_stride; 1157 1158 do { 1159 vint16m1_t s6 = load_strided_i16_4xN(s, src_stride, vl); 1160 s += src_stride; 1161 vint16m1_t s7 = load_strided_i16_4xN(s, src_stride, vl); 1162 s += src_stride; 1163 1164 vuint16m1_t d0 = highbd_convolve8_2d_v_rvv(s0, s1, s2, s3, s4, s5, s6, s7, 1165 y_filter_ptr, offset_s32, 1166 shift_s32, max_u16, vl); 1167 1168 store_strided_u16_4xN(dst_ptr, d0, dst_stride, vl); 1169 dst_ptr += dst_stride << 1; 1170 1171 s0 = s2; 1172 s1 = s3; 1173 s2 = s4; 1174 s3 = s5; 1175 s4 = s6; 1176 s5 = s7; 1177 1178 h -= 2; 1179 } while (h != 0); 1180 } else { 1181 do { 1182 int height = h; 1183 int16_t *s = (int16_t *)src_ptr; 1184 uint16_t *d = dst_ptr; 1185 1186 vint16m1_t s0 = __riscv_vle16_v_i16m1(s, vl); 1187 s += src_stride; 1188 vint16m1_t s1 = __riscv_vle16_v_i16m1(s, vl); 1189 s += src_stride; 1190 vint16m1_t s2 = __riscv_vle16_v_i16m1(s, vl); 1191 s += src_stride; 1192 vint16m1_t s3 = __riscv_vle16_v_i16m1(s, vl); 1193 s += src_stride; 1194 vint16m1_t s4 = __riscv_vle16_v_i16m1(s, vl); 1195 s += src_stride; 1196 vint16m1_t s5 = __riscv_vle16_v_i16m1(s, vl); 1197 s += src_stride; 1198 vint16m1_t s6 = __riscv_vle16_v_i16m1(s, vl); 1199 s += src_stride; 1200 1201 do { 1202 vint16m1_t s7 = __riscv_vle16_v_i16m1(s, vl); 1203 vuint16m1_t d0 = highbd_convolve8_2d_v_rvv(s0, s1, s2, s3, s4, s5, s6, 1204 s7, y_filter_ptr, offset_s32, 1205 shift_s32, max_u16, vl); 1206 __riscv_vse16_v_u16m1(d, d0, vl); 1207 1208 s0 = s1; 1209 s1 = s2; 1210 s2 = s3; 1211 s3 = s4; 1212 s4 = s5; 1213 s5 = s6; 1214 s6 = s7; 1215 s += src_stride; 1216 d += dst_stride; 1217 height--; 1218 } while (height != 0); 1219 1220 src_ptr += vl; 1221 dst_ptr += vl; 1222 w -= vl; 1223 } while (w != 0); 1224 } 1225 } 1226 1227 static inline vuint16m1_t highbd_convolve6_2d_v_rvv( 1228 const vint16m1_t s0, const vint16m1_t s1, const vint16m1_t s2, 1229 const vint16m1_t s3, const vint16m1_t s4, const vint16m1_t s5, 1230 const int16_t *y_filter, const int32_t offset, const int32_t shift, 1231 const uint16_t max, size_t vl) { 1232 vint32m2_t sum = __riscv_vwmul_vx_i32m2(s0, y_filter[0], vl); 1233 sum = __riscv_vwmacc_vx_i32m2(sum, y_filter[1], s1, vl); 1234 sum = __riscv_vwmacc_vx_i32m2(sum, y_filter[2], s2, vl); 1235 sum = __riscv_vwmacc_vx_i32m2(sum, y_filter[3], s3, vl); 1236 sum = __riscv_vwmacc_vx_i32m2(sum, y_filter[4], s4, vl); 1237 sum = __riscv_vwmacc_vx_i32m2(sum, y_filter[5], s5, vl); 1238 sum = __riscv_vadd_vx_i32m2(sum, offset, vl); 1239 1240 vint16m1_t i16_sum = __riscv_vnsra_wx_i16m1(sum, shift, vl); 1241 vint16m1_t iclip_sum = 1242 __riscv_vmin_vx_i16m1(__riscv_vmax_vx_i16m1(i16_sum, 0, vl), max, vl); 1243 return __riscv_vreinterpret_v_i16m1_u16m1(iclip_sum); 1244 } 1245 1246 static inline void highbd_convolve_2d_sr_vert_6tap_rvv( 1247 const uint16_t *src_ptr, int src_stride, uint16_t *dst_ptr, int dst_stride, 1248 int w, int h, const int16_t *y_filter_ptr, ConvolveParams *conv_params, 1249 int bd, const int offset, size_t vl) { 1250 const int32_t shift_s32 = conv_params->round_1; 1251 const int32_t offset_s32 = offset; 1252 const uint16_t max_u16 = (1 << bd) - 1; 1253 const int16_t *yfilter_6tap = y_filter_ptr + 1; 1254 1255 if (w == 4) { 1256 int16_t *s = (int16_t *)src_ptr; 1257 vl = vl << 1; 1258 1259 vint16m1_t s0 = load_strided_i16_4xN(s, src_stride, vl); 1260 s += src_stride; 1261 vint16m1_t s1 = load_strided_i16_4xN(s, src_stride, vl); 1262 s += src_stride; 1263 vint16m1_t s2 = load_strided_i16_4xN(s, src_stride, vl); 1264 s += src_stride; 1265 vint16m1_t s3 = load_strided_i16_4xN(s, src_stride, vl); 1266 s += src_stride; 1267 1268 do { 1269 vint16m1_t s4 = load_strided_i16_4xN(s, src_stride, vl); 1270 s += src_stride; 1271 vint16m1_t s5 = load_strided_i16_4xN(s, src_stride, vl); 1272 s += src_stride; 1273 vint16m1_t s6 = load_strided_i16_4xN(s, src_stride, vl); 1274 s += src_stride; 1275 vint16m1_t s7 = load_strided_i16_4xN(s, src_stride, vl); 1276 s += src_stride; 1277 1278 vuint16m1_t d0 = 1279 highbd_convolve6_2d_v_rvv(s0, s1, s2, s3, s4, s5, yfilter_6tap, 1280 offset_s32, shift_s32, max_u16, vl); 1281 vuint16m1_t d1 = 1282 highbd_convolve6_2d_v_rvv(s2, s3, s4, s5, s6, s7, yfilter_6tap, 1283 offset_s32, shift_s32, max_u16, vl); 1284 1285 store_strided_u16_4xN(dst_ptr, d0, dst_stride, vl); 1286 dst_ptr += dst_stride << 1; 1287 store_strided_u16_4xN(dst_ptr, d1, dst_stride, vl); 1288 dst_ptr += dst_stride << 1; 1289 1290 s0 = s4; 1291 s1 = s5; 1292 s2 = s6; 1293 s3 = s7; 1294 1295 h -= 4; 1296 } while (h != 0); 1297 } else { 1298 do { 1299 int height = h; 1300 int16_t *s = (int16_t *)src_ptr; 1301 uint16_t *d = dst_ptr; 1302 1303 vint16m1_t s0 = __riscv_vle16_v_i16m1(s, vl); 1304 s += src_stride; 1305 vint16m1_t s1 = __riscv_vle16_v_i16m1(s, vl); 1306 s += src_stride; 1307 vint16m1_t s2 = __riscv_vle16_v_i16m1(s, vl); 1308 s += src_stride; 1309 vint16m1_t s3 = __riscv_vle16_v_i16m1(s, vl); 1310 s += src_stride; 1311 vint16m1_t s4 = __riscv_vle16_v_i16m1(s, vl); 1312 s += src_stride; 1313 1314 do { 1315 vint16m1_t s5 = __riscv_vle16_v_i16m1(s, vl); 1316 s += src_stride; 1317 vint16m1_t s6 = __riscv_vle16_v_i16m1(s, vl); 1318 s += src_stride; 1319 vint16m1_t s7 = __riscv_vle16_v_i16m1(s, vl); 1320 s += src_stride; 1321 vint16m1_t s8 = __riscv_vle16_v_i16m1(s, vl); 1322 s += src_stride; 1323 1324 vuint16m1_t d0 = 1325 highbd_convolve6_2d_v_rvv(s0, s1, s2, s3, s4, s5, yfilter_6tap, 1326 offset_s32, shift_s32, max_u16, vl); 1327 vuint16m1_t d1 = 1328 highbd_convolve6_2d_v_rvv(s1, s2, s3, s4, s5, s6, yfilter_6tap, 1329 offset_s32, shift_s32, max_u16, vl); 1330 vuint16m1_t d2 = 1331 highbd_convolve6_2d_v_rvv(s2, s3, s4, s5, s6, s7, yfilter_6tap, 1332 offset_s32, shift_s32, max_u16, vl); 1333 vuint16m1_t d3 = 1334 highbd_convolve6_2d_v_rvv(s3, s4, s5, s6, s7, s8, yfilter_6tap, 1335 offset_s32, shift_s32, max_u16, vl); 1336 1337 __riscv_vse16_v_u16m1(d, d0, vl); 1338 d += dst_stride; 1339 __riscv_vse16_v_u16m1(d, d1, vl); 1340 d += dst_stride; 1341 __riscv_vse16_v_u16m1(d, d2, vl); 1342 d += dst_stride; 1343 __riscv_vse16_v_u16m1(d, d3, vl); 1344 d += dst_stride; 1345 1346 s0 = s4; 1347 s1 = s5; 1348 s2 = s6; 1349 s3 = s7; 1350 s4 = s8; 1351 1352 height -= 4; 1353 } while (height != 0); 1354 1355 src_ptr += vl; 1356 dst_ptr += vl; 1357 w -= vl; 1358 } while (w != 0); 1359 } 1360 } 1361 1362 static inline vint16m1_t highbd_convolve12_8_2d_h_rvv( 1363 const vint16m1_t s0, const vint16m1_t s1, const vint16m1_t s2, 1364 const vint16m1_t s3, const vint16m1_t s4, const vint16m1_t s5, 1365 const vint16m1_t s6, const vint16m1_t s7, const vint16m1_t s8, 1366 const vint16m1_t s9, const vint16m1_t s10, const vint16m1_t s11, 1367 const int16_t *x_filter, const int32_t offset, const int32_t shift, 1368 size_t vl) { 1369 vint32m2_t sum = __riscv_vwmul_vx_i32m2(s0, x_filter[0], vl); 1370 sum = __riscv_vwmacc_vx_i32m2(sum, x_filter[1], s1, vl); 1371 sum = __riscv_vwmacc_vx_i32m2(sum, x_filter[2], s2, vl); 1372 sum = __riscv_vwmacc_vx_i32m2(sum, x_filter[3], s3, vl); 1373 sum = __riscv_vwmacc_vx_i32m2(sum, x_filter[4], s4, vl); 1374 sum = __riscv_vwmacc_vx_i32m2(sum, x_filter[5], s5, vl); 1375 sum = __riscv_vwmacc_vx_i32m2(sum, x_filter[6], s6, vl); 1376 sum = __riscv_vwmacc_vx_i32m2(sum, x_filter[7], s7, vl); 1377 sum = __riscv_vwmacc_vx_i32m2(sum, x_filter[8], s8, vl); 1378 sum = __riscv_vwmacc_vx_i32m2(sum, x_filter[9], s9, vl); 1379 sum = __riscv_vwmacc_vx_i32m2(sum, x_filter[10], s10, vl); 1380 sum = __riscv_vwmacc_vx_i32m2(sum, x_filter[11], s11, vl); 1381 1382 sum = __riscv_vadd_vx_i32m2(sum, offset, vl); 1383 1384 return __riscv_vnclip_wx_i16m1(sum, shift, __RISCV_VXRM_RNU, vl); 1385 } 1386 1387 static inline void highbd_convolve_2d_sr_horiz_12tap_rvv( 1388 const uint16_t *src_ptr, int src_stride, uint16_t *dst_ptr, int dst_stride, 1389 int w, int h, const int16_t *x_filter_ptr, ConvolveParams *conv_params, 1390 const int offset, size_t vl) { 1391 assert(h >= 5); 1392 const int32_t shift_s32 = conv_params->round_0; 1393 const int32_t offset_s32 = offset; 1394 1395 if (w == 4) { 1396 const int16_t *s = (int16_t *)src_ptr; 1397 int16_t *d = (int16_t *)dst_ptr; 1398 1399 do { 1400 vint16m1_t t0, t1, t2, t3, t4, t5, t6, t7, t8, t9, t10, t11; 1401 1402 load_s16_8x12(s, 1, &t0, &t1, &t2, &t3, &t4, &t5, &t6, &t7, &t8, &t9, 1403 &t10, &t11, vl); 1404 1405 vint16m1_t d0 = highbd_convolve12_8_2d_h_rvv( 1406 t0, t1, t2, t3, t4, t5, t6, t7, t8, t9, t10, t11, x_filter_ptr, 1407 offset_s32, shift_s32, vl); 1408 1409 __riscv_vse16_v_i16m1(d, d0, vl); 1410 1411 s += src_stride; 1412 d += dst_stride; 1413 1414 } while (--h != 0); 1415 } else { 1416 int height = h; 1417 1418 do { 1419 const int16_t *s = (int16_t *)src_ptr; 1420 int16_t *d = (int16_t *)dst_ptr; 1421 int width = w; 1422 1423 do { 1424 vint16m1_t t0, t1, t2, t3, t4, t5, t6, t7, t8, t9, t10, t11; 1425 1426 load_s16_8x12(s, 1, &t0, &t1, &t2, &t3, &t4, &t5, &t6, &t7, &t8, &t9, 1427 &t10, &t11, vl); 1428 1429 vint16m1_t d0 = highbd_convolve12_8_2d_h_rvv( 1430 t0, t1, t2, t3, t4, t5, t6, t7, t8, t9, t10, t11, x_filter_ptr, 1431 offset_s32, shift_s32, vl); 1432 1433 __riscv_vse16_v_i16m1(d, d0, vl); 1434 1435 s += vl; 1436 d += vl; 1437 width -= vl; 1438 } while (width != 0); 1439 src_ptr += src_stride; 1440 dst_ptr += dst_stride; 1441 } while (--height != 0); 1442 } 1443 } 1444 1445 static inline vint16m1_t highbd_convolve8_4_2d_h_rvv( 1446 const vint16m1_t s0, const vint16m1_t s1, const vint16m1_t s2, 1447 const vint16m1_t s3, const int16_t *x_filter, const int32_t offset, 1448 const int32_t shift, size_t vl) { 1449 vint32m2_t sum = __riscv_vwmul_vx_i32m2(s0, x_filter[0], vl); 1450 sum = __riscv_vwmacc_vx_i32m2(sum, x_filter[1], s1, vl); 1451 sum = __riscv_vwmacc_vx_i32m2(sum, x_filter[2], s2, vl); 1452 sum = __riscv_vwmacc_vx_i32m2(sum, x_filter[3], s3, vl); 1453 1454 sum = __riscv_vadd_vx_i32m2(sum, offset, vl); 1455 1456 return __riscv_vnclip_wx_i16m1(sum, shift, __RISCV_VXRM_RNU, vl); 1457 } 1458 1459 static inline vint16m1_t highbd_convolve8_8_2d_h_rvv( 1460 const vint16m1_t s0, const vint16m1_t s1, const vint16m1_t s2, 1461 const vint16m1_t s3, const vint16m1_t s4, const vint16m1_t s5, 1462 const vint16m1_t s6, const vint16m1_t s7, const int16_t *x_filter, 1463 const int32_t offset, const int32_t shift, size_t vl) { 1464 vint32m2_t sum = __riscv_vwmul_vx_i32m2(s0, x_filter[0], vl); 1465 sum = __riscv_vwmacc_vx_i32m2(sum, x_filter[1], s1, vl); 1466 sum = __riscv_vwmacc_vx_i32m2(sum, x_filter[2], s2, vl); 1467 sum = __riscv_vwmacc_vx_i32m2(sum, x_filter[3], s3, vl); 1468 sum = __riscv_vwmacc_vx_i32m2(sum, x_filter[4], s4, vl); 1469 sum = __riscv_vwmacc_vx_i32m2(sum, x_filter[5], s5, vl); 1470 sum = __riscv_vwmacc_vx_i32m2(sum, x_filter[6], s6, vl); 1471 sum = __riscv_vwmacc_vx_i32m2(sum, x_filter[7], s7, vl); 1472 1473 sum = __riscv_vadd_vx_i32m2(sum, offset, vl); 1474 1475 return __riscv_vnclip_wx_i16m1(sum, shift, __RISCV_VXRM_RNU, vl); 1476 } 1477 1478 static inline void highbd_convolve_2d_sr_horiz_rvv( 1479 const uint16_t *src_ptr, int src_stride, uint16_t *dst_ptr, int dst_stride, 1480 int w, int h, const int16_t *x_filter_ptr, ConvolveParams *conv_params, 1481 const int offset, size_t vl) { 1482 assert(h >= 5); 1483 const int32_t shift_s32 = conv_params->round_0; 1484 const int32_t offset_s32 = offset; 1485 1486 if (w == 4) { 1487 const int16_t *x_filter = (x_filter_ptr + 2); 1488 const int16_t *s = (int16_t *)(src_ptr + 1); 1489 int16_t *d = (int16_t *)dst_ptr; 1490 1491 do { 1492 vint16m1_t t0, t1, t2, t3; 1493 1494 load_s16_8x4(s, 1, &t0, &t1, &t2, &t3, vl); 1495 1496 vint16m1_t d0 = highbd_convolve8_4_2d_h_rvv(t0, t1, t2, t3, x_filter, 1497 offset_s32, shift_s32, vl); 1498 1499 __riscv_vse16_v_i16m1(d, d0, vl); 1500 1501 s += src_stride; 1502 d += dst_stride; 1503 } while (--h != 0); 1504 } else { 1505 do { 1506 const int16_t *s = (int16_t *)src_ptr; 1507 int16_t *d = (int16_t *)dst_ptr; 1508 int width = w; 1509 1510 do { 1511 vint16m1_t t0, t1, t2, t3, t4, t5, t6, t7; 1512 1513 load_s16_8x8(s, 1, &t0, &t1, &t2, &t3, &t4, &t5, &t6, &t7, vl); 1514 1515 vint16m1_t d0 = highbd_convolve8_8_2d_h_rvv(t0, t1, t2, t3, t4, t5, t6, 1516 t7, x_filter_ptr, 1517 offset_s32, shift_s32, vl); 1518 __riscv_vse16_v_i16m1(d, d0, vl); 1519 1520 s += vl; 1521 d += vl; 1522 width -= vl; 1523 } while (width != 0); 1524 src_ptr += src_stride; 1525 dst_ptr += dst_stride; 1526 } while (--h != 0); 1527 } 1528 } 1529 1530 static inline vint16m1_t highbd_convolve6_8_2d_h_rvv( 1531 const vint16m1_t s0, const vint16m1_t s1, const vint16m1_t s2, 1532 const vint16m1_t s3, const vint16m1_t s4, const vint16m1_t s5, 1533 const int16_t *x_filter, const int32_t offset, const int32_t shift, 1534 size_t vl) { 1535 vint32m2_t sum = __riscv_vwmul_vx_i32m2(s0, x_filter[0], vl); 1536 sum = __riscv_vwmacc_vx_i32m2(sum, x_filter[1], s1, vl); 1537 sum = __riscv_vwmacc_vx_i32m2(sum, x_filter[2], s2, vl); 1538 sum = __riscv_vwmacc_vx_i32m2(sum, x_filter[3], s3, vl); 1539 sum = __riscv_vwmacc_vx_i32m2(sum, x_filter[4], s4, vl); 1540 sum = __riscv_vwmacc_vx_i32m2(sum, x_filter[5], s5, vl); 1541 1542 sum = __riscv_vadd_vx_i32m2(sum, offset, vl); 1543 1544 return __riscv_vnclip_wx_i16m1(sum, shift, __RISCV_VXRM_RNU, vl); 1545 } 1546 1547 static inline void highbd_convolve_2d_sr_horiz_6tap_rvv( 1548 const uint16_t *src_ptr, int src_stride, uint16_t *dst_ptr, int dst_stride, 1549 int w, int h, const int16_t *x_filter_ptr, ConvolveParams *conv_params, 1550 const int offset, size_t vl) { 1551 assert(h >= 5); 1552 const int32_t shift_s32 = conv_params->round_0; 1553 const int32_t offset_s32 = offset; 1554 const int16_t *x_filter = (x_filter_ptr + 1); 1555 1556 do { 1557 const int16_t *s = (int16_t *)src_ptr; 1558 int16_t *d = (int16_t *)dst_ptr; 1559 int width = w; 1560 1561 do { 1562 vint16m1_t t0, t1, t2, t3, t4, t5; 1563 1564 load_s16_8x6(s, 1, &t0, &t1, &t2, &t3, &t4, &t5, vl); 1565 1566 vint16m1_t d0 = highbd_convolve6_8_2d_h_rvv( 1567 t0, t1, t2, t3, t4, t5, x_filter, offset_s32, shift_s32, vl); 1568 1569 __riscv_vse16_v_i16m1(d, d0, vl); 1570 1571 s += vl; 1572 d += vl; 1573 width -= vl; 1574 } while (width != 0); 1575 src_ptr += src_stride; 1576 dst_ptr += dst_stride; 1577 } while (--h != 0); 1578 } 1579 1580 void av1_highbd_convolve_2d_sr_rvv(const uint16_t *src, int src_stride, 1581 uint16_t *dst, int dst_stride, int w, int h, 1582 const InterpFilterParams *filter_params_x, 1583 const InterpFilterParams *filter_params_y, 1584 const int subpel_x_qn, const int subpel_y_qn, 1585 ConvolveParams *conv_params, int bd) { 1586 if (w == 2 || h == 2) { 1587 av1_highbd_convolve_2d_sr_c(src, src_stride, dst, dst_stride, w, h, 1588 filter_params_x, filter_params_y, subpel_x_qn, 1589 subpel_y_qn, conv_params, bd); 1590 return; 1591 } 1592 DECLARE_ALIGNED(16, uint16_t, 1593 im_block[(MAX_SB_SIZE + MAX_FILTER_TAP) * MAX_SB_SIZE]); 1594 const int x_filter_taps = get_filter_tap(filter_params_x, subpel_x_qn); 1595 const int clamped_x_taps = x_filter_taps < 6 ? 6 : x_filter_taps; 1596 1597 const int y_filter_taps = get_filter_tap(filter_params_y, subpel_y_qn); 1598 const int clamped_y_taps = y_filter_taps < 6 ? 6 : y_filter_taps; 1599 const int im_h = h + clamped_y_taps - 1; 1600 const int im_stride = MAX_SB_SIZE; 1601 const int vert_offset = clamped_y_taps / 2 - 1; 1602 const int horiz_offset = clamped_x_taps / 2 - 1; 1603 const int x_offset_initial = (1 << (bd + FILTER_BITS - 1)); 1604 const int y_offset_bits = bd + 2 * FILTER_BITS - conv_params->round_0; 1605 // The extra shim of (1 << (conv_params->round_1 - 1)) allows us to do a 1606 // simple shift left instead of a rounding saturating shift left. 1607 const int y_offset = 1608 (1 << (conv_params->round_1 - 1)) - (1 << (y_offset_bits - 1)); 1609 1610 const uint16_t *src_ptr = src - vert_offset * src_stride - horiz_offset; 1611 1612 const int16_t *x_filter_ptr = av1_get_interp_filter_subpel_kernel( 1613 filter_params_x, subpel_x_qn & SUBPEL_MASK); 1614 const int16_t *y_filter_ptr = av1_get_interp_filter_subpel_kernel( 1615 filter_params_y, subpel_y_qn & SUBPEL_MASK); 1616 1617 size_t vl = __riscv_vsetvl_e16m1(w); 1618 1619 if (x_filter_taps > 8) { 1620 highbd_convolve_2d_sr_horiz_12tap_rvv(src_ptr, src_stride, im_block, 1621 im_stride, w, im_h, x_filter_ptr, 1622 conv_params, x_offset_initial, vl); 1623 1624 highbd_convolve_2d_sr_vert_12tap_rvv(im_block, im_stride, dst, dst_stride, 1625 w, h, y_filter_ptr, conv_params, bd, 1626 y_offset, vl); 1627 return; 1628 } 1629 if (x_filter_taps <= 6 && w != 4) { 1630 highbd_convolve_2d_sr_horiz_6tap_rvv(src_ptr, src_stride, im_block, 1631 im_stride, w, im_h, x_filter_ptr, 1632 conv_params, x_offset_initial, vl); 1633 } else { 1634 highbd_convolve_2d_sr_horiz_rvv(src_ptr, src_stride, im_block, im_stride, w, 1635 im_h, x_filter_ptr, conv_params, 1636 x_offset_initial, vl); 1637 } 1638 1639 if (y_filter_taps <= 6) { 1640 highbd_convolve_2d_sr_vert_6tap_rvv(im_block, im_stride, dst, dst_stride, w, 1641 h, y_filter_ptr, conv_params, bd, 1642 y_offset, vl); 1643 } else { 1644 highbd_convolve_2d_sr_vert_8tap_rvv(im_block, im_stride, dst, dst_stride, w, 1645 h, y_filter_ptr, conv_params, bd, 1646 y_offset, vl); 1647 } 1648 } 1649 1650 // Filter used is [64, 64]. 1651 void av1_highbd_convolve_x_sr_intrabc_rvv( 1652 const uint16_t *src, int src_stride, uint16_t *dst, int dst_stride, int w, 1653 int h, const InterpFilterParams *filter_params_x, const int subpel_x_qn, 1654 ConvolveParams *conv_params, int bd) { 1655 assert(subpel_x_qn == 8); 1656 assert(filter_params_x->taps == 2); 1657 assert((conv_params->round_0 + conv_params->round_1) == 2 * FILTER_BITS); 1658 (void)filter_params_x; 1659 (void)subpel_x_qn; 1660 (void)conv_params; 1661 (void)bd; 1662 1663 size_t vl = __riscv_vsetvl_e16m1(w); 1664 if (w <= 4) { 1665 do { 1666 // Load 1667 vuint16mf2_t s0_0 = __riscv_vle16_v_u16mf2(src, vl); 1668 vuint16mf2_t s0_1 = __riscv_vle16_v_u16mf2(src + 1, vl); 1669 vuint16mf2_t s1_0 = __riscv_vle16_v_u16mf2(src + src_stride, vl); 1670 vuint16mf2_t s1_1 = __riscv_vle16_v_u16mf2(src + src_stride + 1, vl); 1671 1672 // Average the values 1673 vuint16mf2_t d0 = 1674 __riscv_vaaddu_vv_u16mf2(s0_0, s0_1, __RISCV_VXRM_RNU, vl); 1675 vuint16mf2_t d1 = 1676 __riscv_vaaddu_vv_u16mf2(s1_0, s1_1, __RISCV_VXRM_RNU, vl); 1677 1678 // Store 1679 __riscv_vse16_v_u16mf2(dst, d0, vl); 1680 __riscv_vse16_v_u16mf2(dst + dst_stride, d1, vl); 1681 1682 src += src_stride << 1; 1683 dst += dst_stride << 1; 1684 h -= 2; 1685 } while (h > 0); 1686 } else { 1687 do { 1688 const uint16_t *src_ptr = src; 1689 uint16_t *dst_ptr = dst; 1690 int width = w; 1691 1692 do { 1693 // Load 1694 vuint16m1_t s0 = __riscv_vle16_v_u16m1(src_ptr, vl); 1695 vuint16m1_t s1 = __riscv_vle16_v_u16m1(src_ptr + 1, vl); 1696 vuint16m1_t s2 = __riscv_vle16_v_u16m1(src_ptr + src_stride, vl); 1697 vuint16m1_t s3 = __riscv_vle16_v_u16m1(src_ptr + src_stride + 1, vl); 1698 1699 // Average the values 1700 vuint16m1_t d0 = __riscv_vaaddu_vv_u16m1(s0, s1, __RISCV_VXRM_RNU, vl); 1701 vuint16m1_t d1 = __riscv_vaaddu_vv_u16m1(s2, s3, __RISCV_VXRM_RNU, vl); 1702 1703 // Store 1704 __riscv_vse16_v_u16m1(dst_ptr, d0, vl); 1705 __riscv_vse16_v_u16m1(dst_ptr + dst_stride, d1, vl); 1706 1707 src_ptr += vl; 1708 dst_ptr += vl; 1709 width -= vl; 1710 } while (width > 0); 1711 src += src_stride << 1; 1712 dst += dst_stride << 1; 1713 h -= 2; 1714 } while (h > 0); 1715 } 1716 } 1717 1718 // Filter used is [64, 64]. 1719 void av1_highbd_convolve_y_sr_intrabc_rvv( 1720 const uint16_t *src, int src_stride, uint16_t *dst, int dst_stride, int w, 1721 int h, const InterpFilterParams *filter_params_y, const int subpel_y_qn, 1722 int bd) { 1723 assert(subpel_y_qn == 8); 1724 assert(filter_params_y->taps == 2); 1725 (void)filter_params_y; 1726 (void)subpel_y_qn; 1727 (void)bd; 1728 1729 size_t vl = __riscv_vsetvl_e16m1(w); 1730 if (w <= 4) { 1731 vuint16mf2_t s0 = __riscv_vle16_v_u16mf2(src, vl); 1732 1733 do { 1734 vuint16mf2_t s1 = __riscv_vle16_v_u16mf2(src + src_stride, vl); 1735 vuint16mf2_t s2 = __riscv_vle16_v_u16mf2(src + 2 * src_stride, vl); 1736 1737 // Average the values 1738 vuint16mf2_t d0 = __riscv_vaaddu_vv_u16mf2(s0, s1, __RISCV_VXRM_RNU, vl); 1739 vuint16mf2_t d1 = __riscv_vaaddu_vv_u16mf2(s1, s2, __RISCV_VXRM_RNU, vl); 1740 1741 // Store 1742 __riscv_vse16_v_u16mf2(dst, d0, vl); 1743 __riscv_vse16_v_u16mf2(dst + dst_stride, d1, vl); 1744 1745 s0 = s2; 1746 src += src_stride << 1; 1747 dst += dst_stride << 1; 1748 h -= 2; 1749 } while (h > 0); 1750 } else { 1751 do { 1752 const uint16_t *src_ptr = src; 1753 uint16_t *dst_ptr = dst; 1754 int height = h; 1755 1756 vuint16m1_t s0 = __riscv_vle16_v_u16m1(src_ptr, vl); 1757 1758 do { 1759 vuint16m1_t s1 = __riscv_vle16_v_u16m1(src_ptr + src_stride, vl); 1760 vuint16m1_t s2 = __riscv_vle16_v_u16m1(src_ptr + 2 * src_stride, vl); 1761 1762 // Average the values 1763 vuint16m1_t d0 = __riscv_vaaddu_vv_u16m1(s0, s1, __RISCV_VXRM_RNU, vl); 1764 vuint16m1_t d1 = __riscv_vaaddu_vv_u16m1(s1, s2, __RISCV_VXRM_RNU, vl); 1765 1766 // Store 1767 __riscv_vse16_v_u16m1(dst_ptr, d0, vl); 1768 __riscv_vse16_v_u16m1(dst_ptr + dst_stride, d1, vl); 1769 1770 s0 = s2; 1771 src_ptr += src_stride << 1; 1772 dst_ptr += dst_stride << 1; 1773 height -= 2; 1774 } while (height > 0); 1775 src += vl; 1776 dst += vl; 1777 w -= vl; 1778 } while (w > 0); 1779 } 1780 } 1781 1782 // Both horizontal and vertical passes use the same 2-tap filter: [64, 64]. 1783 void av1_highbd_convolve_2d_sr_intrabc_rvv( 1784 const uint16_t *src, int src_stride, uint16_t *dst, int dst_stride, int w, 1785 int h, const InterpFilterParams *filter_params_x, 1786 const InterpFilterParams *filter_params_y, const int subpel_x_qn, 1787 const int subpel_y_qn, ConvolveParams *conv_params, int bd) { 1788 assert(subpel_x_qn == 8); 1789 assert(subpel_y_qn == 8); 1790 assert(filter_params_x->taps == 2 && filter_params_y->taps == 2); 1791 assert((conv_params->round_0 + conv_params->round_1) == 2 * FILTER_BITS); 1792 assert(w <= MAX_SB_SIZE && h <= MAX_SB_SIZE); 1793 (void)filter_params_x; 1794 (void)subpel_x_qn; 1795 (void)filter_params_y; 1796 (void)subpel_y_qn; 1797 (void)conv_params; 1798 (void)bd; 1799 1800 size_t vl = __riscv_vsetvl_e16m1(w); 1801 1802 if (w <= 8) { 1803 // Horizontal filter. 1804 vuint16m1_t s0 = __riscv_vle16_v_u16m1(src, vl); 1805 vuint16m1_t s1 = __riscv_vle16_v_u16m1(src + 1, vl); 1806 src += src_stride; 1807 1808 vuint16m1_t sum0 = __riscv_vadd_vv_u16m1(s0, s1, vl); 1809 1810 do { 1811 vuint16m1_t s2 = __riscv_vle16_v_u16m1(src, vl); 1812 vuint16m1_t s3 = __riscv_vle16_v_u16m1(src + 1, vl); 1813 src += src_stride; 1814 vuint16m1_t s4 = __riscv_vle16_v_u16m1(src, vl); 1815 vuint16m1_t s5 = __riscv_vle16_v_u16m1(src + 1, vl); 1816 src += src_stride; 1817 1818 vuint16m1_t sum1 = __riscv_vadd_vv_u16m1(s2, s3, vl); 1819 vuint16m1_t sum2 = __riscv_vadd_vv_u16m1(s4, s5, vl); 1820 1821 // Vertical filter. 1822 vuint16m1_t d0 = 1823 __riscv_vadd_vx_u16m1(__riscv_vadd_vv_u16m1(sum0, sum1, vl), 2, vl); 1824 vuint16m1_t d1 = 1825 __riscv_vadd_vx_u16m1(__riscv_vadd_vv_u16m1(sum1, sum2, vl), 2, vl); 1826 1827 d0 = __riscv_vsrl_vx_u16m1(d0, 2, vl); 1828 d1 = __riscv_vsrl_vx_u16m1(d1, 2, vl); 1829 1830 __riscv_vse16_v_u16m1(dst, d0, vl); 1831 dst += dst_stride; 1832 __riscv_vse16_v_u16m1(dst, d1, vl); 1833 dst += dst_stride; 1834 1835 sum0 = sum2; 1836 h -= 2; 1837 } while (h != 0); 1838 } else { 1839 do { 1840 uint16_t *src_ptr = (uint16_t *)src; 1841 uint16_t *dst_ptr = dst; 1842 int height = h; 1843 1844 // Horizontal filter. 1845 vuint16m1_t s0 = __riscv_vle16_v_u16m1(src_ptr, vl); 1846 vuint16m1_t s1 = __riscv_vle16_v_u16m1(src_ptr + 1, vl); 1847 src_ptr += src_stride; 1848 1849 vuint16m1_t sum0 = __riscv_vadd_vv_u16m1(s0, s1, vl); 1850 1851 do { 1852 vuint16m1_t s2 = __riscv_vle16_v_u16m1(src_ptr, vl); 1853 vuint16m1_t s3 = __riscv_vle16_v_u16m1(src_ptr + 1, vl); 1854 src_ptr += src_stride; 1855 vuint16m1_t s4 = __riscv_vle16_v_u16m1(src_ptr, vl); 1856 vuint16m1_t s5 = __riscv_vle16_v_u16m1(src_ptr + 1, vl); 1857 src_ptr += src_stride; 1858 1859 vuint16m1_t sum1 = __riscv_vadd_vv_u16m1(s2, s3, vl); 1860 vuint16m1_t sum2 = __riscv_vadd_vv_u16m1(s4, s5, vl); 1861 1862 // Vertical filter. 1863 vuint16m1_t d0 = 1864 __riscv_vadd_vx_u16m1(__riscv_vadd_vv_u16m1(sum0, sum1, vl), 2, vl); 1865 vuint16m1_t d1 = 1866 __riscv_vadd_vx_u16m1(__riscv_vadd_vv_u16m1(sum1, sum2, vl), 2, vl); 1867 1868 d0 = __riscv_vsrl_vx_u16m1(d0, 2, vl); 1869 d1 = __riscv_vsrl_vx_u16m1(d1, 2, vl); 1870 1871 __riscv_vse16_v_u16m1(dst_ptr, d0, vl); 1872 dst_ptr += dst_stride; 1873 __riscv_vse16_v_u16m1(dst_ptr, d1, vl); 1874 dst_ptr += dst_stride; 1875 1876 sum0 = __riscv_vmv_v_v_u16m1(sum2, vl); 1877 height -= 2; 1878 } while (height != 0); 1879 1880 src += vl; 1881 dst += vl; 1882 w -= vl; 1883 } while (w != 0); 1884 } 1885 }