ipred.S (11120B)
/******************************************************************************
 * Copyright © 2018, VideoLAN and dav1d authors
 * Copyright © 2024, Bogdan Gligorijevic
 * All rights reserved.
 *
 * Redistribution and use in source and binary forms, with or without
 * modification, are permitted provided that the following conditions are met:
 *
 * 1. Redistributions of source code must retain the above copyright notice, this
 *    list of conditions and the following disclaimer.
 *
 * 2. Redistributions in binary form must reproduce the above copyright notice,
 *    this list of conditions and the following disclaimer in the documentation
 *    and/or other materials provided with the distribution.
 *
 * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" AND
 * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED
 * WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE
 * DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT OWNER OR CONTRIBUTORS BE LIABLE FOR
 * ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES
 * (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES;
 * LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND
 * ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
 * (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS
 * SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
 *****************************************************************************/

#include "src/riscv/asm.S"

// void ipred_v_8bpc_rvv(pixel *dst, const ptrdiff_t stride,
//                       const pixel *const topleft,
//                       const int width, const int height, const int a,
//                       const int max_width, const int max_height)
function ipred_v_8bpc_rvv, export=1, ext="v,zba"
    csrw vxrm, zero
    addi a2, a2, 1
    vsetvli t0, a3, e8, m1, ta, ma
    bne t0, a3, 3f // Go to slow path - whole row doesn't fit in register
1:
    // Fast path - row fits in register
    add t1, a0, a1
    vle8.v v4, (a2)
2:
    vse8.v v4, (a0)
    sh1add a0, a1, a0
    vse8.v v4, (t1)
    sh1add t1, a1, t1
    addi a4, a4, -2
    bnez a4, 2b
    ret

    // Row doesn't fit in register.
3:
    vsetvli t0, a3, e8, m2, ta, ma // Try using 2 registers at once (LMUL=2)
    beq t0, a3, 1b // Back to fast path - now it fits

4:
    // No need for another vsetvli, since both width and VLEN are powers of 2, so there is no tail.
    vle8.v v4, (a2)
    mv t2, a0
    mv t1, a4
5:
    vse8.v v4, (t2)
    add t2, t2, a1
    addi t1, t1, -1
    bnez t1, 5b // Loop over rows.

    sub a3, a3, t0
    add a2, a2, t0
    add a0, a0, t0
    bnez a3, 4b // Loop over columns

    ret
endfunc


// dc_gen_8bpc_rvv: a0 = topleft, a1 = width, a2 = height.
// Sums the top and left edges and returns the DC value in a0;
// the return address is passed in t0.
function dc_gen_8bpc_rvv, export=1, ext="v,zbb"
.variant_cc dav1d_dc_gen_8bpc_rvv
    add t1, a1, a2
    srli t5, t1, 1
    mv t1, a1
    addi t2, a0, 1
    vsetvli zero, t1, e16, m4, ta, ma
    vmv.v.x v0, zero
1:
    vsetvli t3, t1, e8, m2, tu, ma
    vle8.v v4, (t2)
    vwaddu.wv v0, v0, v4

    sub t1, t1, t3
    add t2, t2, t3
    bnez t1, 1b

    mv t1, a2
    mv t2, a0
    vsetvli zero, t1, e16, m4, ta, ma
    vmv.v.x v8, zero
2:
    vsetvli t3, t1, e8, m2, tu, ma
    sub t2, t2, t3
    vle8.v v4, (t2)
    vwaddu.wv v8, v8, v4
    sub t1, t1, t3

    bnez t1, 2b

    vsetvli zero, zero, e32, m8, ta, ma
    vmv.s.x v16, t5
    vmv.s.x v12, zero
    vsetvli zero, a1, e16, m4, ta, ma
    vwredsum.vs v24, v0, v16
    vsetvli zero, a2, e16, m4, ta, ma
    vwredsum.vs v16, v8, v12
    vsetvli zero, zero, e32, m8, ta, ma
    vmv.x.s t5, v24
    vmv.x.s t1, v16
    add t5, t5, t1

    add t1, a1, a2
    ctz t1, t1

    srl a0, t5, t1

    beq a1, a2, 5f
    slli t1, a1, 1
    sltu t2, t1, a2
    slli t3, a2, 1
    sltu t1, t3, a1
    or t1, t1, t2
    bnez t1, 3f

    li t1, 0x5556 # ~2^16/3, rectangular block with a 2:1 aspect ratio
    j 4f
3:
    li t1, 0x3334 # ~2^16/5, rectangular block with a 4:1 aspect ratio
4:
    mul a0, a0, t1
    srli a0, a0, 16
5:
    jr t0
endfunc

// dc_gen_top_8bpc_rvv: a0 = topleft, a1 = width.
function dc_gen_top_8bpc_rvv, export=1, ext="v,zbb"
.variant_cc dav1d_dc_gen_top_8bpc_rvv
    mv t1, a1
    srli t5, a1, 1
    addi a0, a0, 1
    vsetvli zero, t1, e16, m4, ta, ma
    vmv.v.x v0, zero
1:
    vsetvli t3, t1, e8, m2, tu, ma
    vle8.v v4, (a0)
    vwaddu.wv v0, v0, v4
    sub t1, t1, t3

    add a0, a0, t3
    bnez t1, 1b
    j dc_gen_sum_up_8bpc_rvv
endfunc

// dc_gen_left_8bpc_rvv: a0 = topleft, a1 = height.
function dc_gen_left_8bpc_rvv, export=1, ext="v,zbb"
.variant_cc dav1d_dc_gen_left_8bpc_rvv
    mv t1, a1
    srli t5, a1, 1
    vsetvli t2, t1, e16, m4, ta, ma
    vmv.v.x v0, zero

1:
    vsetvli t3, t1, e8, m2, tu, ma
    sub a0, a0, t3
    vle8.v v4, (a0)
    vwaddu.wv v0, v0, v4
    sub t1, t1, t3
    bnez t1, 1b

    j dc_gen_sum_up_8bpc_rvv
endfunc

// dc_gen_sum_up_8bpc_rvv: reduces the partial sums in v0 (pixel count in a1,
// rounding bias in t5) and returns the DC value in a0.
function dc_gen_sum_up_8bpc_rvv, export=1, ext="v,zbb"
.variant_cc dav1d_dc_gen_sum_up_8bpc_rvv
    vsetvli zero, a1, e32, m8, ta, ma
    vmv.s.x v4, t5
    vsetvli zero, zero, e16, m4, ta, ma
    vwredsum.vs v8, v0, v4
    vsetvli zero, zero, e32, m8, ta, ma
    vmv.x.s t5, v8

    ctz t1, a1

    srl a0, t5, t1
    jr t0
endfunc

function cfl_pred_8bpc_rvv, export=1, ext="v,zba"
    csrw vxrm, zero
1:
    li t2, 0
    mv t3, a2
2:
    vsetvli t0, t3, e16, m2, ta, ma
    add t4, a0, t2
    vle16.v v0, (a5)
    sh1add a5, t0, a5

    vwmul.vx v4, v0, a6
    vsetvli zero, zero, e32, m4, ta, mu
    vneg.v v8, v4
    vmslt.vx v0, v4, x0
    vmax.vv v12, v8, v4
    vssra.vi v16, v12, 6
    vneg.v v16, v16, v0.t
    vadd.vx v20, v16, a4
    vmax.vx v0, v20, zero
    vsetvli zero, zero, e16, m2, ta, ma
    vnclipu.wi v4, v0, 0
    vsetvli zero, zero, e8, m1, ta, ma
    vnclipu.wi v0, v4, 0
    vse8.v v0, (t4)
    add t2, t0, t2
    sub t3, t3, t0
    bnez t3, 2b
    addi a3, a3, -1
    add a0, a0, a1

    bnez a3, 1b
    ret
endfunc

function ipred_cfl_8bpc_rvv, export=1, ext=v
    mv t6, a0 # dst
    mv a0, a2 # topleft
    mv t4, a1 # stride
    mv a1, a3 # width
    mv a2, a4 # height
    jal t0, dc_gen_8bpc_rvv
    mv a2, a3 # width
    mv a3, a4 # height
    mv a4, a0 # dc from dc_gen
    mv a0, t6 # dst
    mv a1, t4 # stride
    j cfl_pred_8bpc_rvv
endfunc

function ipred_cfl_128_8bpc_rvv, export=1, ext="v,zba"
    # dc = 128, then just rearrange registers
    mv a2, a3
    mv a3, a4
    li a4, 128

    j cfl_pred_8bpc_rvv
endfunc

function ipred_cfl_top_8bpc_rvv, export=1, ext=v
    mv t6, a0 # dst
    mv a0, a2 # topleft
    mv t4, a1 # stride
    mv a1, a3 # width
    jal t0, dc_gen_top_8bpc_rvv
    mv a3, a4 # height
    mv a4, a0 # dc from dc_gen_top
    mv a0, t6 # dst
    mv a2, a1 # width
    mv a1, t4 # stride
    j cfl_pred_8bpc_rvv
endfunc

function ipred_cfl_left_8bpc_rvv, export=1, ext="v,zba"
    mv t6, a0 # dst
    mv a0, a2 # topleft
    mv t4, a1 # stride
    mv a1, a4 # height
    mv a2, a3 # width
    jal t0, dc_gen_left_8bpc_rvv
    mv a3, a4 # height
    mv a4, a0 # dc from dc_gen_left
    mv a1, t4 # stride
    mv a0, t6 # dst
    j cfl_pred_8bpc_rvv
endfunc

function ipred_paeth_8bpc_rvv, export=1, ext="v,zba"
    csrw vxrm, zero
    li t0, 0
    mv t3, a2
    lbu t1, (a2)
    addi a6, a2, -1
    addi a2, a2, 1
1:
    lbu t2, (a6)
    mv t3, a3
2:
    sub t5, a3, t3
    add t5, a2, t5
    vsetvli t6, t3, e8, m1, ta, ma
    vle8.v v2, (t5)
    vwaddu.vx v4, v2, t2
    vsetvli zero, zero, e16, m2, ta, ma
    vwsub.vx v8, v4, t1

    vsetvli zero, zero, e32, m4, ta, mu
    vzext.vf4 v24, v2
    vsub.vx v12, v8, t1
    vmslt.vx v0, v12, zero
    vneg.v v12, v12, v0.t
    vsub.vx v16, v8, t2
    vmslt.vx v0, v16, zero
    vneg.v v16, v16, v0.t
    vsub.vv v20, v8, v24
    vmslt.vx v0, v20, zero
    vneg.v v20, v20, v0.t

    sub t5, a3, t3
    vmsleu.vv v4, v16, v20
    vmsleu.vv v5, v16, v12
    vmsgtu.vv v0, v20, v12
    vmand.mm v6, v4, v5

    vsetvli zero, zero, e8, m1, ta, ma
    vmerge.vxm v8, v2, t1, v0
    vmmv.m v0, v6
    add t5, a0, t5
    sub t3, t3, t6
    vmerge.vxm v4, v8, t2, v0

    vse8.v v4, (t5)

    bnez t3, 2b

    addi a4, a4, -1
    addi a6, a6, -1
    add a0, a0, a1
    bnez a4, 1b
    ret
endfunc

function ipred_smooth_8bpc_rvv, export=1, ext="v,zba"
    csrw vxrm, zero
    la t0, dav1d_sm_weights
    add t1, t0, a3
    add t2, a2, a3
    add t0, t0, a4
    lbu t2, (t2)
    sub t3, a2, a4
    addi a6, a2, -1
    addi a2, a2, 1
    lbu t3, (t3)
1:
    mv t6, a3

    lbu a7, (a6)
    lbu t4, (t0)
2:
    li a5, 256
    vsetvli t5, t6, e8, m1, ta, ma
    vle8.v v2, (t1)
    add t1, t1, t5
    vle8.v v4, (a2)
    add a2, a2, t5
    sub a5, a5, t4

    vwmulu.vx v8, v4, t4
    vsetvli zero, zero, e16, m2, ta, ma
    mul a5, a5, t3

    vadd.vx v4, v8, a5
    vsetvli zero, zero, e8, m1, ta, ma
    vwmulu.vx v8, v2, a7

    vneg.v v12, v2
    vwmaccu.vx v8, t2, v12
    vsetvli zero, zero, e16, m2, ta, ma
    vwaddu.vv v12, v4, v8

    sub a5, a3, t6
    sub t6, t6, t5
    add a5, a5, a0
    vnclipu.wi v2, v12, 9
    vsetvli zero, zero, e8, m1, ta, ma
    vnclipu.wi v0, v2, 0
    vse8.v v0, (a5)

    bnez t6, 2b

    sub t1, t1, a3
    add a0, a0, a1
    sub a2, a2, a3
    addi a4, a4, -1
    addi t0, t0, 1
    addi a6, a6, -1
    bnez a4, 1b

    ret
endfunc

function ipred_smooth_v_8bpc_rvv, export=1, ext="v,zba"
    csrw vxrm, zero
    la t0, dav1d_sm_weights
    add t2, a2, a3
    add t0, t0, a4
    sub t3, a2, a4
    addi a2, a2, 1
    lbu t3, (t3)
1:
    mv t6, a3

    lbu t4, (t0)
2:
    li a5, 256
    vsetvli t5, t6, e8, m1, ta, ma
    vle8.v v4, (a2)
    add a2, a2, t5
    sub a5, a5, t4

    vwmulu.vx v8, v4, t4
    vsetvli zero, zero, e16, m2, ta, ma
    mul a5, a5, t3
    vwaddu.vx v4, v8, a5

    sub a5, a3, t6
    sub t6, t6, t5
    add a5, a5, a0
    vsetvli zero, zero, e16, m2, ta, ma
    vnclipu.wi v2, v4, 8
    vsetvli zero, zero, e8, m1, ta, ma
    vnclipu.wi v0, v2, 0
    vse8.v v0, (a5)

    bnez t6, 2b

    add a0, a0, a1
    sub a2, a2, a3
    addi a4, a4, -1
    addi t0, t0, 1
    bnez a4, 1b

    ret
endfunc

function ipred_smooth_h_8bpc_rvv, export=1, ext="v,zba"
    csrw vxrm, zero
    la t0, dav1d_sm_weights
    add t1, t0, a3
    add t2, a2, a3
    lbu t2, (t2)
    addi a6, a2, -1
1:
    mv t6, a3

    lbu a7, (a6)
2:
    vsetvli t5, t6, e8, m1, ta, ma
    vle8.v v2, (t1)
    add t1, t1, t5

    vwmulu.vx v8, v2, a7

    vneg.v v12, v2
    vwmaccu.vx v8, t2, v12

    sub a5, a3, t6
    sub t6, t6, t5
    add a5, a5, a0
    vsetvli zero, zero, e8, m1, ta, ma
    vnclipu.wi v0, v8, 8
    vse8.v v0, (a5)

    bnez t6, 2b

    sub t1, t1, a3
    add a0, a0, a1
    addi a4, a4, -1
    addi a6, a6, -1
    bnez a4, 1b

    ret
endfunc

function pal_pred_8bpc_rvv, export=1, ext="v,zba"
    csrw vxrm, zero
    vsetivli t5, 8, e8, m1, ta, ma
    vle8.v v30, (a2)
    li t0, 2
    srli t1, a4, 1
1:
    mv t4, a4
2:
    vsetvli t5, t1, e8, m1, ta, ma
    vle8.v v0, (a3)
    add a3, a3, t5
    vsrl.vi v2, v0, 4
    sub t6, a4, t4
    vand.vi v1, v0, 7
    add t6, a0, t6
    vrgather.vv v3, v30, v1
    addi t2, t6, 1
    vrgather.vv v4, v30, v2
    slli t5, t5, 1
    vsse8.v v3, (t6), t0
    sub t4, t4, t5
    vsse8.v v4, (t2), t0

    bnez t4, 2b
    addi a5, a5, -1
    add a0, a0, a1
    bnez a5, 1b
    ret
endfunc