/******************************************************************************
 * Copyright © 2018, VideoLAN and dav1d authors
 * Copyright © 2024, Nathan Egge, Niklas Haas, Bogdan Gligorijevic
 * Copyright © 2025, Sungjoon Moon
 * All rights reserved.
 *
 * Redistribution and use in source and binary forms, with or without
 * modification, are permitted provided that the following conditions are met:
 *
 * 1. Redistributions of source code must retain the above copyright notice, this
 *    list of conditions and the following disclaimer.
 *
 * 2. Redistributions in binary form must reproduce the above copyright notice,
 *    this list of conditions and the following disclaimer in the documentation
 *    and/or other materials provided with the distribution.
 *
 * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" AND
 * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED
 * WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE
 * DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT OWNER OR CONTRIBUTORS BE LIABLE FOR
 * ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES
 * (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES;
 * LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND
 * ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
 * (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS
 * SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
 *****************************************************************************/

#include "src/riscv/asm.S"

function blend_vl256_8bpc_rvv, export=1, ext=zbb
        ctz t0, a3
        addi t0, t0, 0xc3
        j L(blend_epilog)
endfunc

function blend_8bpc_rvv, export=1, ext="v,zbb"
        ctz t0, a3
        addi t0, t0, 0xc4
L(blend_epilog):
        csrw vxrm, zero
        andi t0, t0, 0xc7
        vsetvl zero, a3, t0
        li t1, 64
1:
        addi a4, a4, -2
        vle8.v v4, (a2)
        add a2, a2, a3
        vle8.v v6, (a2)
        add a2, a2, a3
        vle8.v v8, (a5)
        add a5, a5, a3
        vle8.v v10, (a5)
        add a5, a5, a3
        vle8.v v0, (a0)
        add t0, a0, a1
        vle8.v v2, (t0)
        vwmulu.vv v16, v4, v8
        vwmulu.vv v20, v6, v10
        vrsub.vx v8, v8, t1
        vrsub.vx v10, v10, t1
        vwmaccu.vv v16, v0, v8
        vwmaccu.vv v20, v2, v10
        vnclipu.wi v0, v16, 6
        vnclipu.wi v2, v20, 6
        vse8.v v0, (a0)
        vse8.v v2, (t0)
        add a0, t0, a1
        bnez a4, 1b
        ret
endfunc

function blend_h_vl256_8bpc_rvv, export=1, ext=zbb
        srai t0, a3, 2
        li t2, 64
        ctz t0, t0
        addi t0, t0, 0xc5
        j L(blend_h_epilog)
endfunc

function blend_h_8bpc_rvv, export=1, ext="v,zbb"
        li t2, 64
        bgt a3, t2, 128f
        ctz t0, a3
        addi t0, t0, 0xc4
L(blend_h_epilog):
        csrw vxrm, zero
        andi t0, t0, 0xc7
        vsetvl zero, a3, t0
        la t1, dav1d_obmc_masks
        srai t0, a4, 2
        add t1, t1, a4
        sub a4, a4, t0
0:
        mv t5, ra
1:
        addi a4, a4, -2
        lbu t3, (t1)
        addi t1, t1, 1
        lbu t4, (t1)
        addi t1, t1, 1
        vle8.v v8, (a2)
        add a2, a2, a3
        vle8.v v12, (a2)
        add a2, a2, a3
        vle8.v v0, (a0)
        add t0, a0, a1
        vle8.v v4, (t0)
        vwmulu.vx v16, v8, t3
        vwmulu.vx v24, v12, t4
        sub t3, t2, t3
        sub t4, t2, t4
        vwmaccu.vx v16, t3, v0
        vwmaccu.vx v24, t4, v4
        vnclipu.wi v0, v16, 6
        vnclipu.wi v4, v24, 6
        vse8.v v0, (a0)
        vse8.v v4, (t0)
        add a0, t0, a1
        bgtz a4, 1b
        jr t5
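
        // Wide blend_h case (w > 64, i.e. w == 128): the row is handled as
        // two 64-pixel columns, reusing the loop at 1: above as a subroutine
        // entered via jal t5 and left via jr t5.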
128:
        csrw vxrm, zero
        vsetvli zero, t2, e8, m4, ta, ma
        la t1, dav1d_obmc_masks
        srai t0, a4, 2
        add t1, t1, a4
        sub a4, a4, t0
        mv a5, a0
        mv a6, a2
        mv a7, a4
        jal t5, 1b
        add t1, t1, a4
        add a0, a5, t2
        add a2, a6, t2
        mv a4, a7
        sub t1, t1, a4
        j 0b
endfunc

function blend_v_vl256_8bpc_rvv, export=1, ext=zbb
        srai t0, a3, 2
        ctz t0, t0
        addi t0, t0, 0xc5
        j L(blend_v_epilog)
endfunc

function blend_v_8bpc_rvv, export=1, ext="v,zbb"
        ctz t0, a3
        addi t0, t0, 0xc4
L(blend_v_epilog):
        andi t0, t0, 0xc7
        srai t1, a3, 2
        sub t1, a3, t1
        vsetvl zero, t1, t0
        csrw vxrm, zero
        la t1, dav1d_obmc_masks
        add t1, t1, a3
        vle8.v v8, (t1)
        li t0, 64
        vrsub.vx v10, v8, t0
1:
        addi a4, a4, -2
        vle8.v v4, (a2)
        add a2, a2, a3
        vle8.v v6, (a2)
        add a2, a2, a3
        vle8.v v0, (a0)
        add t0, a0, a1
        vle8.v v2, (t0)
        vwmulu.vv v12, v4, v8
        vwmulu.vv v16, v6, v8
        vwmaccu.vv v12, v0, v10
        vwmaccu.vv v16, v2, v10
        vnclipu.wi v0, v12, 6
        vnclipu.wi v2, v16, 6
        vse8.v v0, (a0)
        vse8.v v2, (t0)
        add a0, t0, a1
        bnez a4, 1b
        ret
endfunc

.macro avg va, vb, vm
        vadd.vv \va, \va, \vb
.endm

.macro w_avg va, vb, vm
        vwmul.vx v24, \va, a6
        vwmacc.vx v24, a7, \vb
        vnclip.wi \va, v24, 8
.endm

.macro mask va, vb, vm
        vwmul.vv v24, \va, \vm
        vrsub.vx \vm, \vm, a7
        vwmacc.vv v24, \vb, \vm
        vnclip.wi \va, v24, 10
.endm

.macro bidir_fn type, shift
function \type\()_8bpc_rvv, export=1, ext="v,zba,zbb"
.ifc \type, w_avg
        li a7, 16
        sub a7, a7, a6
.endif
.ifc \type, mask
        li a7, 64
.endif
        li t0, 4
        csrw vxrm, zero
        beq t0, a4, 4f
        csrr t0, vlenb
        ctz t1, a4
        ctz t0, t0
        li t2, 1
        sub t0, t1, t0
        li t4, -3
        bgt t0, t2, 2f
        max t0, t0, t4
        andi t1, t0, 0x7
        addi t0, t1, 1 # may overflow into E16 bit
        ori t0, t0, MA | TA | E16
        ori t1, t1, MA | TA | E8
1:
        addi a5, a5, -4
.rept 2
        vsetvl zero, a4, t0
        sh1add t3, a4, a2
        vle16.v v0, (a2)
        sh1add a2, a4, t3
        vle16.v v4, (t3)
        sh1add t3, a4, a3
        vle16.v v8, (a3)
        sh1add a3, a4, t3
        vle16.v v12, (t3)
.ifc \type, mask
        add t3, a4, a6
        vle8.v v24, (a6)
        add a6, a4, t3
        vle8.v v26, (t3)
        vzext.vf2 v16, v24
        vzext.vf2 v20, v26
.endif
        \type v0, v8, v16
        \type v4, v12, v20
        vmax.vx v8, v0, zero
        vmax.vx v12, v4, zero
        vsetvl zero, zero, t1
        vnclipu.wi v0, v8, \shift
        vnclipu.wi v2, v12, \shift
        add t3, a1, a0
        vse8.v v0, (a0)
        add a0, a1, t3
        vse8.v v2, (t3)
.endr
        bnez a5, 1b
        ret
2:
        mv t0, a0
        neg t4, a4
        add a0, a1, a0
        addi a5, a5, -1
20:
        vsetvli t2, a4, e16, m4, ta, ma
        sh1add t4, t2, t4
        sh1add t3, t2, a2
        vle16.v v0, (a2)
        sh1add a2, t2, t3
        vle16.v v4, (t3)
        sh1add t3, t2, a3
        vle16.v v8, (a3)
        sh1add a3, t2, t3
        vle16.v v12, (t3)
.ifc \type, mask
        add t3, t2, a6
        vle8.v v24, (a6)
        add a6, t2, t3
        vle8.v v26, (t3)
        vzext.vf2 v16, v24
        vzext.vf2 v20, v26
.endif
        \type v0, v8, v16
        \type v4, v12, v20
        vmax.vx v8, v0, zero
        vmax.vx v12, v4, zero
        vsetvli zero, zero, e8, m2, ta, ma
        vnclipu.wi v0, v8, \shift
        vnclipu.wi v2, v12, \shift
        add t3, t2, t0
        vse8.v v0, (t0)
        add t0, t2, t3
        vse8.v v2, (t3)
        bnez t4, 20b
        bnez a5, 2b
        ret
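
        // w == 4 path: h * 4 elements are processed per vsetvl, and each
        // group of four output pixels is written as a single 32-bit element
        // with a strided store (vsse32), one element per destination row.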
4:
        slli t0, a5, 2
        vsetvli t1, t0, e16, m4, ta, ma
        vle16.v v0, (a2)
        sh1add a2, t1, a2
        vle16.v v4, (a3)
        sh1add a3, t1, a3
.ifc \type, mask
        vle8.v v16, (a6)
        add a6, t1, a6
        vzext.vf2 v8, v16
.endif
        \type v0, v4, v8
        vmax.vx v8, v0, zero
        vsetvli zero, zero, e8, m2, ta, ma
        vnclipu.wi v0, v8, \shift
        vsetvli t1, a5, e32, m2, ta, ma
        vsse32.v v0, (a0), a1
        ctz t0, t1
        sub a5, a5, t1
        sll t0, a1, t0
        add a0, t0, a0
        bnez a5, 4b
        ret
endfunc
.endm

bidir_fn avg, 5
bidir_fn w_avg, 0
bidir_fn mask, 0

function warp_8x8_8bpc_rvv, export=1, ext="v"
        csrw vxrm, zero

        vsetivli zero, 8, e16, m1, ta, ma
        addi sp, sp, -2*15*8
        mv t5, sp
        li t0, 3
        mul t0, a3, t0
        sub a2, a2, t0
        addi a2, a2, -3

        li t0, 64
        addi a3, a3, -8
        li t1, 15
        la t2, dav1d_mc_warp_filter

        lh t6, (a4)
        lh t4, 2(a4)
        vid.v v30
        vwmul.vx v28, v30, t6
1:
        addi t1, t1, -1

        vsetvli zero, zero, e32, m2, ta, ma
        vadd.vx v4, v28, a5
        add a5, a5, t4
        vssra.vi v2, v4, 10
        vadd.vx v2, v2, t0
        vsll.vi v24, v2, 3
        vsetvli zero, zero, e8, mf2, ta, ma

        vluxseg8ei32.v v2, (t2), v24

        vsetvli zero, zero, e16, m1, ta, ma
.irp i, 2, 3, 4, 5, 6, 7, 8, 9
        vle8.v v10, (a2)
        addi a2, a2, 1

        vsext.vf2 v14, v\i
        vzext.vf2 v16, v10

.if \i == 2
        vwmulsu.vv v12, v14, v16
.else
        vwmaccsu.vv v12, v14, v16
.endif
.endr
        vnclip.wi v10, v12, 3

        add a2, a2, a3
        vse16.v v10, (t5)
        addi t5, t5, 16

        bnez t1, 1b

        mv t5, sp
        li t1, 8

        lh t6, 4(a4)
        lh t4, 6(a4)
        vwmul.vx v28, v30, t6
2:
        addi t1, t1, -1

        vsetvli zero, zero, e32, m2, ta, ma
        vadd.vx v4, v28, a6

        add a6, a6, t4
        vssra.vi v2, v4, 10
        vadd.vx v2, v2, t0
        vsll.vi v24, v2, 3
        vsetvli zero, zero, e8, mf2, ta, ma

        vluxseg8ei32.v v2, (t2), v24
        vsetvli zero, zero, e16, m1, ta, ma

.irp i, 2, 3, 4, 5, 6, 7, 8, 9
        vle16.v v10, (t5)
        addi t5, t5, 16

        vsext.vf2 v14, v\i

.if \i == 2
        vwmul.vv v12, v14, v10
.else
        vwmacc.vv v12, v14, v10
.endif
.endr
        addi t5, t5, -16*7
        vnclip.wi v10, v12, 11

        vmax.vx v10, v10, zero
        vsetvli zero, zero, e8, mf2, ta, ma

        vnclipu.wi v12, v10, 0

        vse8.v v12, (a0)
        add a0, a0, a1

        bnez t1, 2b

        addi sp, sp, 2*15*8

        ret
endfunc

function warp_8x8t_8bpc_rvv, export=1, ext="v,zba"
        csrw vxrm, zero

        vsetivli zero, 8, e16, m1, ta, ma
        addi sp, sp, -2*15*8
        mv t5, sp
        li t0, 3
        mul t0, a3, t0
        sub a2, a2, t0
        addi a2, a2, -3

        li t0, 64
        addi a3, a3, -8
        li t1, 15
        la t2, dav1d_mc_warp_filter

        lh t6, (a4)
        lh t4, 2(a4)
        vid.v v30
        vwmul.vx v28, v30, t6
1:
        addi t1, t1, -1

        vsetvli zero, zero, e32, m2, ta, ma
        vadd.vx v4, v28, a5
        add a5, a5, t4
        vssra.vi v2, v4, 10
        vadd.vx v2, v2, t0
        vsll.vi v24, v2, 3
        vsetvli zero, zero, e8, mf2, ta, ma

        vluxseg8ei32.v v2, (t2), v24

        vsetvli zero, zero, e16, m1, ta, ma
.irp i, 2, 3, 4, 5, 6, 7, 8, 9
        vle8.v v10, (a2)
        addi a2, a2, 1

        vsext.vf2 v14, v\i
        vzext.vf2 v16, v10

.if \i == 2
        vwmulsu.vv v12, v14, v16
.else
        vwmaccsu.vv v12, v14, v16
.endif
.endr
        vnclip.wi v10, v12, 3

        add a2, a2, a3
        vse16.v v10, (t5)
        addi t5, t5, 16

        bnez t1, 1b

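        // Vertical pass: 8-tap filtering of the 15 intermediate rows held on
        // the stack, producing 8 output rows; this "t" variant keeps the
        // intermediate 16-bit precision (vse16) instead of packing to pixels.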
        mv t5, sp
        li t1, 8

        lh t6, 4(a4)
        lh t4, 6(a4)
        vwmul.vx v28, v30, t6
2:
        addi t1, t1, -1

        vsetvli zero, zero, e32, m2, ta, ma
        vadd.vx v4, v28, a6
        add a6, a6, t4
        vssra.vi v2, v4, 10
        vadd.vx v2, v2, t0
        vsll.vi v24, v2, 3
        vsetvli zero, zero, e8, mf2, ta, ma

        vluxseg8ei32.v v2, (t2), v24
        vsetvli zero, zero, e16, m1, ta, ma

.irp i, 2, 3, 4, 5, 6, 7, 8, 9
        vle16.v v10, (t5)
        addi t5, t5, 16

        vsext.vf2 v14, v\i

.if \i == 2
        vwmul.vv v12, v14, v10
.else
        vwmacc.vv v12, v14, v10
.endif
.endr
        addi t5, t5, -16*7
        vnclip.wi v10, v12, 7

        vse16.v v10, (a0)
        sh1add a0, a1, a0

        bnez t1, 2b

        addi sp, sp, 2*15*8

        ret
endfunc

function emu_edge_8bpc_rvv, export=1, ext="v,zbb"
        ld t0, 0(sp)
        ld t1, 8(sp)

        // int cx = iclip((int) x, 0, (int) iw - 1);
        max t2, a4, zero
        addi t4, a2, -1
        min t2, t2, t4

        // int cy = iclip((int) y, 0, (int) ih - 1);
        max t3, a5, zero
        addi t5, a3, -1
        min t3, t3, t5

        // ref += cy*PXSTRIDE(ref_stride) + cx
        mul t3, t3, t1
        add t3, t3, t2

        add t0, t0, t3

        addi t4, a0, -1

        neg t2, a4
        add t3, a4, a0
        sub t3, t3, a2

        // int left_ext = iclip((int) -x, 0, (int) bw - 1);
        max t2, t2, zero
        min a2, t2, t4 # a2 = left_ext

        // int right_ext = iclip((int) (x + bw - iw), 0, (int) bw - 1);
        max t3, t3, zero
        min a4, t3, t4 # a4 = right_ext

        addi t6, a1, -1

        neg t4, a5
        add t5, a5, a1
        sub t5, t5, a3

        // int top_ext = iclip((int) -y, 0, (int) bh - 1);
        max t4, t4, zero
        min a3, t4, t6 # a3 = top_ext

        // int bottom_ext = iclip((int) (y + bh - ih), 0, (int) bh - 1);
        max t5, t5, zero
        min a5, t5, t6 # a5 = bottom_ext

        sub t4, a1, a3
        sub t4, t4, a5 # t4 = center_h = bh - top_ext - bottom_ext

        mul t5, a3, a7
        add a1, a6, t5 # blk = dst + top_ext * dst_stride

        sub t3, a0, a2
        sub t3, t3, a4 # t3 = center_w = bw - left_ext - right_ext

.macro v_loop need_left, need_right
9:
        # pixel_copy()
        add t5, a1, a2 # t5 = blk + left_ext
        mv t2, t0 # ref
0:
        vsetvli t6, t3, e8, m1, ta, ma
        vle8.v v8, (t2)
        add t2, t2, t6

        vse8.v v8, (t5)
        sub t3, t3, t6
        add t5, t5, t6
        bnez t3, 0b

        sub t3, a0, a2
        sub t3, t3, a4 # t3 = center_w = bw - left_ext - right_ext

.if \need_left
        lb t2, (t0) # ref[0]
        # pixel_set()
        vsetvli t6, a2, e8, m1, ta, ma
        vmv.v.x v8, t2
        mv t2, a2 # left_ext
        mv t5, a1 # blk
0:
        vse8.v v8, (t5)
        sub t2, t2, t6 # left_ext -= t6
        add t5, t5, t6 # blk += t6
        vsetvli t6, t2, e8, m1, ta, ma
        bnez t2, 0b
.endif

.if \need_right
        add t5, a1, a2 # t5 = blk + left_ext
        add t5, t5, t3 # t5 = blk + left_ext + center_w
        lb t2, -1(t5) # blk[left_ext + center_w - 1]
        # pixel_set()
        vsetvli t6, a4, e8, m1, ta, ma
        vmv.v.x v8, t2
        mv t2, a4 # right_ext
0:
        vse8.v v8, (t5)
        sub t2, t2, t6
        add t5, t5, t6
        vsetvli t6, t2, e8, m1, ta, ma
        bnez t2, 0b
.endif

        add t0, t0, t1 # ref += ref_stride
        add a1, a1, a7 # blk += dst_stride
        addi t4, t4, -1 # center_h--
        bnez t4, 9b
.endm

L(emu_edge_center):
        blez t4, L(emu_edge_bottom)

        beqz a2, 1f # left_ext == 0?
        beqz a4, 2f # right_ext == 0?
        v_loop 1, 1
        j L(emu_edge_bottom)

1:
        beqz a4, 3f
        v_loop 0, 1
        j L(emu_edge_bottom)

2:
        v_loop 1, 0
        j L(emu_edge_bottom)

3:
        v_loop 0, 0

L(emu_edge_bottom): # copy bottom
        blez a5, L(emu_edge_top)
        mv t2, a0 # bw
2:
        mv t5, a5 # bottom_ext
        mv t1, a1 # dst

        vsetvli t6, t2, e8, m1, ta, ma
        sub t0, t1, a7 # dst - dst_stride
        vle8.v v8, (t0)
0:
        vse8.v v8, (t1)
        add t1, t1, a7
        addi t5, t5, -1
        bnez t5, 0b

        sub t2, t2, t6
        add a1, a1, t6
        bnez t2, 2b

L(emu_edge_top): # copy top
        blez a3, L(emu_edge_end)
        mul t5, a3, a7
        add t1, a6, t5 # blk = dst + top_ext * PXSTRIDE(dst_stride)
        # a6 = dst
1:
        mv t0, a3 # top_ext
        mv t4, a6 # dst

        vsetvli t6, a0, e8, m1, ta, ma
        vle8.v v8, (t1)
0:
        vse8.v v8, (t4)
        add t4, t4, a7
        vse8.v v8, (t4)
        add t4, t4, a7
        addi t0, t0, -2
        bgtz t0, 0b

        sub a0, a0, t6
        add t1, t1, t6
        add a6, a6, t6

        bnez a0, 1b

L(emu_edge_end):
        ret
endfunc

.macro w_mask_fn type vlen
function w_mask_\type\()_\vlen\()8bpc_rvv, export=1, ext="v,zba,zbb"
        csrw vxrm, zero
        li t1, 38*256+8
.ifc \vlen, vl256_
        addi t0, zero, 64
        bgt a4, t0, 2f
        li t2, 0xCAC9C8CFCE0000
        li t3, 0xC1C0C7C6C50000
.else
        addi t0, zero, 32
        bgt a4, t0, 2f
        li t2, 0xCAC9C8CF0000
        li t3, 0xC1C0C7C60000
.endif
        ctz t4, a4
        slli t4, t4, 3
        srl t2, t2, t4
        andi t2, t2, 0xFF
        srl t3, t3, t4
        andi t3, t3, 0xFF

1:
.if \type == 444
        w_mask_body 444 narrow

        sh1add a0, a1, a0 # dst += dst_stride
        add a6, a6, a4 # mask += w
.elseif \type == 422
        w_mask_body 422 narrow

        sh1add a0, a1, a0 # dst += dst_stride
        srli t4, a4, 1
        add a6, a6, t4 # mask += w >> 1
.elseif \type == 420
        w_mask_body 420 narrow

        sh1add a0, a1, a0 # dst += dst_stride
.endif

        sh1add a2, a4, a2
        sh1add a3, a4, a3

        addi a5, a5, -2
        bnez a5, 1b

        ret

2:
        li t2, 0xca
        li t3, 0xc1

3:
        mv t5, zero

.if \type == 444
        w_mask_body 444 wide # VLEN>=256
.elseif \type == 422
        w_mask_body 422 wide # VLEN>=256
.elseif \type == 420
        w_mask_body 420 wide # VLEN>=256
.endif

        add t5, t5, t6
        bne t5, a4, 4b

        sh1add a0, a1, a0 # dst += dst_stride
.if \type == 444
        add a6, a6, a4 # mask += w
.elseif \type == 422
        srli t4, a4, 1
        add a6, a6, t4 # mask += w >> 1
.elseif \type == 420
.endif

        sh1add a2, a4, a2
        sh1add a3, a4, a3

        addi a5, a5, -2
        bnez a5, 3b

        ret

endfunc
.endm

.macro w_mask_body type size
        mv t0, a0 # dst

4:
        vsetvl t6, a4, t2

        # load tmp1 and tmp2 (two rows each)
        vle16.v v0, (a2) # tmp1[x], row 0

        sh1add t4, a4, a2 # &tmp1[w] (row 1)
        vle16.v v16, (t4) # tmp1[x], row 1
        sh1add a2, t6, a2 # tmp1 += vl

        vle16.v v4, (a3) # tmp2[x], row 0

        sh1add t4, a4, a3 # &tmp2[w] (row 1)
        vle16.v v20, (t4) # tmp2[x], row 1
        sh1add a3, t6, a3 # tmp2 += vl

        # v8, v24 = abs(tmp1[x] - tmp2[x])
        vsub.vv v12, v0, v4 # tmp1[x] - tmp2[x]
        vsub.vv v8, v4, v0 # tmp2[x] - tmp1[x]
        vmax.vv v8, v12, v8

        vsub.vv v28, v16, v20 # tmp1[x] - tmp2[x]
        vsub.vv v24, v20, v16 # tmp2[x] - tmp1[x]
        vmax.vv v24, v28, v24

        li t4, 64

        # m = min(38 + ((abs_diff + 8) >> 8), 64) -> min((abs_diff + 38*256 + 8) >> 8, 64)
        vadd.vx v8, v8, t1
        vsra.vi v8, v8, 8
        vmin.vx v8, v8, t4

        vadd.vx v24, v24, t1
        vsra.vi v24, v24, 8
        vmin.vx v24, v24, t4

        # dst[x] = (tmp1[x] - tmp2[x]) * m + 64 * tmp2[x];
        # v12, v28 = tmp1[x] - tmp2[x]
        # v8, v24 = {m,n}
        vwmul.vx v0, v4, t4
        vwmacc.vv v0, v8, v12
        vnclipu.wi v0, v0, 10
        vmax.vx v0, v0, zero

        vwmul.vx v16, v20, t4
        vwmacc.vv v16, v24, v28
        vnclipu.wi v16, v16, 10
        vmax.vx v16, v16, zero

.if \type == 444
        vsetvl zero, zero, t3

        vnclipu.wi v0, v0, 0
        vnclipu.wi v16, v16, 0

        vse8.v v0, (t0) # dst[x] =
        add t4, t0, a1
        vse8.v v16, (t4) # dst[x] =
        add t0, t0, t6

        vnsrl.wi v8, v8, 0
        vnsrl.wi v24, v24, 0

        vse8.v v8, (a6) # mask[x] = m
        add t4, a6, a4
        vse8.v v24, (t4) # mask[x] = m
        add a6, a6, t6

.elseif \type == 422
        # v4, v20 = m
        # v8, v24 = n
        vnsrl.wi v4, v8, 0
        vnsrl.wi v8, v8, 16

        vnsrl.wi v20, v24, 0
        vnsrl.wi v24, v24, 16

        # v8, v24 = m + n - sign
        vadd.vv v8, v4, v8
        vsub.vx v8, v8, a7

        vadd.vv v24, v20, v24
        vsub.vx v24, v24, a7

        vsetvl zero, zero, t3

        vnclipu.wi v0, v0, 0
        vnclipu.wi v16, v16, 0

        vse8.v v0, (t0) # dst[x] =
        add t4, t0, a1
        vse8.v v16, (t4) # dst[x] =
        add t0, t0, t6

        vnclipu.wi v8, v8, 1
        vnclipu.wi v24, v24, 1

.ifc \size, wide
        srli t4, t6, 1
        vsetvl zero, t4, t3
.endif

        vse8.v v8, (a6) # mask[x] = m + n + 1 - sign
        srli t4, a4, 1
        add t4, a6, t4
        vse8.v v24, (t4) # mask[x] = m + n + 1 - sign
        srli t4, t6, 1
        add a6, a6, t4
.elseif \type == 420
        # v4, v20 = m
        # v8, v24 = n
        vnsrl.wi v4, v8, 0
        vnsrl.wi v8, v8, 16

        vnsrl.wi v20, v24, 0
        vnsrl.wi v24, v24, 16

        # v8 = m + n (row 0) + m + n (row 1) - sign
        vadd.vv v8, v4, v8
        vadd.vv v24, v20, v24
        vadd.vv v8, v8, v24
        vsub.vx v8, v8, a7

        vsetvl zero, zero, t3

        vnclipu.wi v0, v0, 0
        vnclipu.wi v16, v16, 0

        vse8.v v0, (t0) # dst[x] =
        add t4, t0, a1
        vse8.v v16, (t4) # dst[x] =
        add t0, t0, t6

        vnclipu.wi v8, v8, 2

        vse8.v v8, (a6) # mask[x] = (m + n + mask[x >> 1] + 2 - sign) >> 2;
        srli t4, t6, 1
        add a6, a6, t4
.endif
.endm

w_mask_fn 444
w_mask_fn 444 vl256_
w_mask_fn 422
w_mask_fn 422 vl256_
w_mask_fn 420
w_mask_fn 420 vl256_