tor-browser

The Tor Browser
git clone https://git.dasho.dev/tor-browser.git

mc.S (18243B)


/******************************************************************************
 * Copyright © 2018, VideoLAN and dav1d authors
 * Copyright © 2024, Nathan Egge, Niklas Haas, Bogdan Gligorijevic
 * Copyright © 2025, Sungjoon Moon
 * All rights reserved.
 *
 * Redistribution and use in source and binary forms, with or without
 * modification, are permitted provided that the following conditions are met:
 *
 * 1. Redistributions of source code must retain the above copyright notice, this
 *    list of conditions and the following disclaimer.
 *
 * 2. Redistributions in binary form must reproduce the above copyright notice,
 *    this list of conditions and the following disclaimer in the documentation
 *    and/or other materials provided with the distribution.
 *
 * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" AND
 * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED
 * WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE
 * DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT OWNER OR CONTRIBUTORS BE LIABLE FOR
 * ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES
 * (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES;
 * LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND
 * ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
 * (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS
 * SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
 *****************************************************************************/

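// RISC-V Vector (RVV) motion compensation, 8 bpc.
// The calling-convention notes on the functions below are annotations that
// assume the upstream dav1d C prototypes (src/mc.h); register assignments
// are inferred from the code and the standard RISC-V calling convention
// (arguments in a0..a7, the ninth and later on the stack).
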
#include "src/riscv/asm.S"

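// blend: dst[x] = (dst[x]*(64 - m) + tmp[x]*m + 32) >> 6, two rows per
// iteration, with vxrm=0 (round-to-nearest-up) providing the +32 through
// vnclipu. Assumed prototype: blend(pixel *dst, ptrdiff_t dst_stride,
// const pixel *tmp, int w, int h, const uint8_t *mask), i.e. a0=dst,
// a1=dst_stride, a2=tmp, a3=w, a4=h, a5=mask.
// The 0xc3/0xc4 immediates build an e8 vtype value directly: ctz(w)
// selects an LMUL such that one register group holds a whole row (0xc3
// assumes VLEN >= 256, 0xc4 assumes VLEN >= 128), and "andi 0xc7" keeps
// only the vma/vta/sew/lmul bits of the resulting vtype.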
function blend_vl256_8bpc_rvv, export=1, ext=zbb
    ctz t0, a3
    addi t0, t0, 0xc3
    j L(blend_epilog)
endfunc

function blend_8bpc_rvv, export=1, ext="v,zbb"
    ctz t0, a3
    addi t0, t0, 0xc4
L(blend_epilog):
    csrw vxrm, zero
    andi t0, t0, 0xc7
    vsetvl zero, a3, t0
    li t1, 64
1:
    addi a4, a4, -2
    vle8.v v4, (a2)
    add a2, a2, a3
    vle8.v v6, (a2)
    add a2, a2, a3
    vle8.v v8, (a5)
    add a5, a5, a3
    vle8.v v10, (a5)
    add a5, a5, a3
    vle8.v v0, (a0)
    add t0, a0, a1
    vle8.v v2, (t0)
    vwmulu.vv v16, v4, v8
    vwmulu.vv v20, v6, v10
    vrsub.vx v8, v8, t1
    vrsub.vx v10, v10, t1
    vwmaccu.vv v16, v0, v8
    vwmaccu.vv v20, v2, v10
    vnclipu.wi v0, v16, 6
    vnclipu.wi v2, v20, 6
    vse8.v v0, (a0)
    vse8.v v2, (t0)
    add a0, t0, a1
    bnez a4, 1b
    ret
endfunc

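// blend_h: as blend, but the weight is per row, taken from
// dav1d_obmc_masks (vertical OBMC). Assumed prototype:
// blend_h(pixel *dst, ptrdiff_t dst_stride, const pixel *tmp, int w,
// int h); only the first h - h/4 rows are blended. Widths above 64 take
// the "128:" path, which runs the main loop as a subroutine
// (jal t5 / jr t5) once per 64-pixel column.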
function blend_h_vl256_8bpc_rvv, export=1, ext=zbb
    srai t0, a3, 2
    li t2, 64
    ctz t0, t0
    addi t0, t0, 0xc5
    j L(blend_h_epilog)
endfunc

function blend_h_8bpc_rvv, export=1, ext="v,zbb"
    li t2, 64
    bgt a3, t2, 128f
    ctz t0, a3
    addi t0, t0, 0xc4
L(blend_h_epilog):
    csrw vxrm, zero
    andi t0, t0, 0xc7
    vsetvl zero, a3, t0
    la t1, dav1d_obmc_masks
    srai t0, a4, 2
    add t1, t1, a4
    sub a4, a4, t0
0:
    mv t5, ra
1:
    addi a4, a4, -2
    lbu t3, (t1)
    addi t1, t1, 1
    lbu t4, (t1)
    addi t1, t1, 1
    vle8.v v8, (a2)
    add a2, a2, a3
    vle8.v v12, (a2)
    add a2, a2, a3
    vle8.v v0, (a0)
    add t0, a0, a1
    vle8.v v4, (t0)
    vwmulu.vx v16, v8, t3
    vwmulu.vx v24, v12, t4
    sub t3, t2, t3
    sub t4, t2, t4
    vwmaccu.vx v16, t3, v0
    vwmaccu.vx v24, t4, v4
    vnclipu.wi v0, v16, 6
    vnclipu.wi v4, v24, 6
    vse8.v v0, (a0)
    vse8.v v4, (t0)
    add a0, t0, a1
    bgtz a4, 1b
    jr t5
128:
    csrw vxrm, zero
    vsetvli zero, t2, e8, m4, ta, ma
    la t1, dav1d_obmc_masks
    srai t0, a4, 2
    add t1, t1, a4
    sub a4, a4, t0
    mv a5, a0
    mv a6, a2
    mv a7, a4
    jal t5, 1b
    add t1, t1, a4
    add a0, a5, t2
    add a2, a6, t2
    mv a4, a7
    sub t1, t1, a4
    j 0b
endfunc

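// blend_v: as blend_h, but the weight is per column (horizontal OBMC), so
// the dav1d_obmc_masks row is loaded once as a vector. Only the leftmost
// w - w/4 pixels of each row change, hence vl is set to 3*w/4 and the
// rightmost quarter is left untouched.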
function blend_v_vl256_8bpc_rvv, export=1, ext=zbb
    srai t0, a3, 2
    ctz t0, t0
    addi t0, t0, 0xc5
    j L(blend_v_epilog)
endfunc

function blend_v_8bpc_rvv, export=1, ext="v,zbb"
    ctz t0, a3
    addi t0, t0, 0xc4
L(blend_v_epilog):
    andi t0, t0, 0xc7
    srai t1, a3, 2
    sub t1, a3, t1
    vsetvl zero, t1, t0
    csrw vxrm, zero
    la t1, dav1d_obmc_masks
    add t1, t1, a3
    vle8.v v8, (t1)
    li t0, 64
    vrsub.vx v10, v8, t0
1:
    addi a4, a4, -2
    vle8.v v4, (a2)
    add a2, a2, a3
    vle8.v v6, (a2)
    add a2, a2, a3
    vle8.v v0, (a0)
    add t0, a0, a1
    vle8.v v2, (t0)
    vwmulu.vv v12, v4, v8
    vwmulu.vv v16, v6, v8
    vwmaccu.vv v12, v0, v10
    vwmaccu.vv v16, v2, v10
    vnclipu.wi v0, v12, 6
    vnclipu.wi v2, v16, 6
    vse8.v v0, (a0)
    vse8.v v2, (t0)
    add a0, t0, a1
    bnez a4, 1b
    ret
endfunc

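// avg/w_avg/mask: compound prediction from two int16 intermediate buffers.
// Assumed dav1d prototypes:
//   avg(pixel *dst, ptrdiff_t stride, const int16_t *tmp1,
//       const int16_t *tmp2, int w, int h)
//   w_avg(..., int weight)         // weight in a6
//   mask(..., const uint8_t *msk)  // per-pixel weights 0..64 in a6
// For 8 bpc these compute, with vxrm=0 supplying the rounding constants
// through vnclip(u):
//   avg:   (tmp1 + tmp2 + 16) >> 5
//   w_avg: (tmp1*weight + tmp2*(16 - weight) + 128) >> 8
//   mask:  (tmp1*m + tmp2*(64 - m) + 512) >> 10
// Each per-type macro leaves its partial result in \va; bidir_fn below
// applies the clamp and the final narrowing \shift.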
.macro avg va, vb, vm
    vadd.vv \va, \va, \vb
.endm

.macro w_avg va, vb, vm
    vwmul.vx v24, \va, a6
    vwmacc.vx v24, a7, \vb
    vnclip.wi \va, v24, 8
.endm

.macro mask va, vb, vm
    vwmul.vv v24, \va, \vm
    vrsub.vx \vm, \vm, a7
    vwmacc.vv v24, \vb, \vm
    vnclip.wi \va, v24, 10
.endm

.macro bidir_fn type, shift
function \type\()_8bpc_rvv, export=1, ext="v,zba,zbb"
.ifc \type, w_avg
    li a7, 16
    sub a7, a7, a6
.endif
.ifc \type, mask
    li a7, 64
.endif
    li t0, 4
    csrw vxrm, zero
    beq t0, a4, 4f
    csrr t0, vlenb
    ctz t1, a4
    ctz t0, t0
    li t2, 1
    sub t0, t1, t0
    li t4, -3
    bgt t0, t2, 2f
    max t0, t0, t4
    andi t1, t0, 0x7
    addi t0, t1, 1 # +1 may carry into the E16 bit: doubles LMUL for e16
    ori t0, t0, MA | TA | E16
    ori t1, t1, MA | TA | E8
1:
    addi a5, a5, -4
.rept 2
    vsetvl zero, a4, t0
    sh1add t3, a4, a2
    vle16.v v0, (a2)
    sh1add a2, a4, t3
    vle16.v v4, (t3)
    sh1add t3, a4, a3
    vle16.v v8, (a3)
    sh1add a3, a4, t3
    vle16.v v12, (t3)
.ifc \type, mask
    add t3, a4, a6
    vle8.v v24, (a6)
    add a6, a4, t3
    vle8.v v26, (t3)
    vzext.vf2 v16, v24
    vzext.vf2 v20, v26
.endif
    \type v0, v8, v16
    \type v4, v12, v20
    vmax.vx v8, v0, zero
    vmax.vx v12, v4, zero
    vsetvl zero, zero, t1
    vnclipu.wi v0, v8,  \shift
    vnclipu.wi v2, v12, \shift
    add t3, a1, a0
    vse8.v v0, (a0)
    add a0, a1, t3
    vse8.v v2, (t3)
.endr
    bnez a5, 1b
    ret
2:
    mv t0, a0
    neg t4, a4
    add a0, a1, a0
    addi a5, a5, -1
20:
    vsetvli t2, a4, e16, m4, ta, ma
    sh1add t4, t2, t4
    sh1add t3, t2, a2
    vle16.v v0, (a2)
    sh1add a2, t2, t3
    vle16.v v4, (t3)
    sh1add t3, t2, a3
    vle16.v v8, (a3)
    sh1add a3, t2, t3
    vle16.v v12, (t3)
.ifc \type, mask
    add t3, t2, a6
    vle8.v v24, (a6)
    add a6, t2, t3
    vle8.v v26, (t3)
    vzext.vf2 v16, v24
    vzext.vf2 v20, v26
.endif
    \type v0, v8, v16
    \type v4, v12, v20
    vmax.vx v8, v0, zero
    vmax.vx v12, v4, zero
    vsetvli zero, zero, e8, m2, ta, ma
    vnclipu.wi v0, v8,  \shift
    vnclipu.wi v2, v12, \shift
    add t3, t2, t0
    vse8.v v0, (t0)
    add t0, t2, t3
    vse8.v v2, (t3)
    bnez t4, 20b
    bnez a5, 2b
    ret
4:
    slli t0, a5, 2
    vsetvli t1, t0, e16, m4, ta, ma
    vle16.v v0, (a2)
    sh1add a2, t1, a2
    vle16.v v4, (a3)
    sh1add a3, t1, a3
.ifc \type, mask
    vle8.v v16, (a6)
    add a6, t1, a6
    vzext.vf2 v8, v16
.endif
    \type v0, v4, v8
    vmax.vx v8, v0, zero
    vsetvli zero, zero, e8, m2, ta, ma
    vnclipu.wi v0, v8, \shift
    vsetvli t1, a5, e32, m2, ta, ma
    vsse32.v v0, (a0), a1
    ctz t0, t1
    sub a5, a5, t1
    sll t0, a1, t0
    add a0, t0, a0
    bnez a5, 4b
    ret
endfunc
.endm

bidir_fn avg,   5
bidir_fn w_avg, 0
bidir_fn mask,  0

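// warp_8x8: 8x8 warped motion compensation. Assumed prototype:
// warp8x8(pixel *dst, ptrdiff_t dst_stride, const pixel *src,
// ptrdiff_t src_stride, const int16_t *abcd, int mx, int my).
// A horizontal pass filters 15 rows of 8 pixels into an int16 scratch
// buffer on the stack, then a vertical pass filters those columns into the
// 8 output rows. Per lane, the filter index is ((mx + x*abcd[0]) >> 10)
// + 64, scaled by 8 bytes per table row; vluxseg8ei32.v gathers all 8 taps
// of each lane's filter from dav1d_mc_warp_filter in one segment load.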
function warp_8x8_8bpc_rvv, export=1, ext="v"
    csrw vxrm, zero

    vsetivli zero, 8, e16, m1, ta, ma
    addi sp, sp, -2*15*8
    mv t5, sp
    li t0, 3
    mul t0, a3, t0
    sub a2, a2, t0
    addi a2, a2, -3

    li t0, 64
    addi a3, a3, -8
    li t1, 15
    la t2, dav1d_mc_warp_filter

    lh t6, (a4)
    lh t4, 2(a4)
    vid.v v30
    vwmul.vx v28, v30, t6
1:
    addi t1, t1, -1

    vsetvli zero, zero, e32, m2, ta, ma
    vadd.vx v4, v28, a5
    add a5, a5, t4
    vssra.vi v2, v4, 10
    vadd.vx v2, v2, t0
    vsll.vi v24, v2, 3
    vsetvli zero, zero, e8, mf2, ta, ma

    vluxseg8ei32.v v2, (t2), v24

    vsetvli zero, zero, e16, m1, ta, ma
.irp i, 2, 3, 4, 5, 6, 7, 8, 9
    vle8.v v10, (a2)
    addi a2, a2, 1

    vsext.vf2 v14, v\i
    vzext.vf2 v16, v10

.if \i == 2
    vwmulsu.vv v12, v14, v16
.else
    vwmaccsu.vv v12, v14, v16
.endif
.endr
    vnclip.wi v10, v12, 3

    add a2, a2, a3
    vse16.v v10, (t5)
    addi t5, t5, 16

    bnez t1, 1b

    mv t5, sp
    li t1, 8

    lh t6, 4(a4)
    lh t4, 6(a4)
    vwmul.vx v28, v30, t6
2:
    addi t1, t1, -1

    vsetvli zero, zero, e32, m2, ta, ma
    vadd.vx v4, v28, a6
    add a6, a6, t4
    vssra.vi v2, v4, 10
    vadd.vx v2, v2, t0
    vsll.vi v24, v2, 3
    vsetvli zero, zero, e8, mf2, ta, ma

    vluxseg8ei32.v v2, (t2), v24
    vsetvli zero, zero, e16, m1, ta, ma

.irp i, 2, 3, 4, 5, 6, 7, 8, 9
    vle16.v v10, (t5)
    addi t5, t5, 16

    vsext.vf2 v14, v\i

.if \i == 2
    vwmul.vv v12, v14, v10
.else
    vwmacc.vv v12, v14, v10
.endif
.endr
    addi t5, t5, -16*7
    vnclip.wi v10, v12, 11

    vmax.vx v10, v10, zero
    vsetvli zero, zero, e8, mf2, ta, ma

    vnclipu.wi v12, v10, 0

    vse8.v v12, (a0)
    add a0, a0, a1

    bnez t1, 2b

    addi sp, sp, 2*15*8

    ret
endfunc

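// warp_8x8t: same warp as above, but for compound prediction it stores the
// 16-bit intermediate instead of pixels: the vertical pass shifts by 7
// rather than 11 and skips the clamp, and the destination stride a1 is in
// int16_t units (hence sh1add).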
function warp_8x8t_8bpc_rvv, export=1, ext="v,zba"
    csrw vxrm, zero

    vsetivli zero, 8, e16, m1, ta, ma
    addi sp, sp, -2*15*8
    mv t5, sp
    li t0, 3
    mul t0, a3, t0
    sub a2, a2, t0
    addi a2, a2, -3

    li t0, 64
    addi a3, a3, -8
    li t1, 15
    la t2, dav1d_mc_warp_filter

    lh t6, (a4)
    lh t4, 2(a4)
    vid.v v30
    vwmul.vx v28, v30, t6
1:
    addi t1, t1, -1

    vsetvli zero, zero, e32, m2, ta, ma
    vadd.vx v4, v28, a5
    add a5, a5, t4
    vssra.vi v2, v4, 10
    vadd.vx v2, v2, t0
    vsll.vi v24, v2, 3
    vsetvli zero, zero, e8, mf2, ta, ma

    vluxseg8ei32.v v2, (t2), v24

    vsetvli zero, zero, e16, m1, ta, ma
.irp i, 2, 3, 4, 5, 6, 7, 8, 9
    vle8.v v10, (a2)
    addi a2, a2, 1

    vsext.vf2 v14, v\i
    vzext.vf2 v16, v10

.if \i == 2
    vwmulsu.vv v12, v14, v16
.else
    vwmaccsu.vv v12, v14, v16
.endif
.endr
    vnclip.wi v10, v12, 3

    add a2, a2, a3
    vse16.v v10, (t5)
    addi t5, t5, 16

    bnez t1, 1b

    mv t5, sp
    li t1, 8

    lh t6, 4(a4)
    lh t4, 6(a4)
    vwmul.vx v28, v30, t6
2:
    addi t1, t1, -1

    vsetvli zero, zero, e32, m2, ta, ma
    vadd.vx v4, v28, a6
    add a6, a6, t4
    vssra.vi v2, v4, 10
    vadd.vx v2, v2, t0
    vsll.vi v24, v2, 3
    vsetvli zero, zero, e8, mf2, ta, ma

    vluxseg8ei32.v v2, (t2), v24
    vsetvli zero, zero, e16, m1, ta, ma

.irp i, 2, 3, 4, 5, 6, 7, 8, 9
    vle16.v v10, (t5)
    addi t5, t5, 16

    vsext.vf2 v14, v\i

.if \i == 2
    vwmul.vv v12, v14, v10
.else
    vwmacc.vv v12, v14, v10
.endif
.endr
    addi t5, t5, -16*7
    vnclip.wi v10, v12, 7

    vse16.v v10, (a0)
    sh1add a0, a1, a0

    bnez t1, 2b

    addi sp, sp, 2*15*8

    ret
endfunc

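// emu_edge: copy a bw x bh block whose source window [x, x+bw) x [y, y+bh)
// may lie partly outside the iw x ih reference plane, replicating edge
// pixels into the out-of-bounds regions. Assumed prototype:
// emu_edge(intptr_t bw, intptr_t bh, intptr_t iw, intptr_t ih, intptr_t x,
// intptr_t y, pixel *dst, ptrdiff_t dst_stride, const pixel *ref,
// ptrdiff_t ref_stride), i.e. bw..dst_stride in a0..a7 and ref/ref_stride
// on the stack.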
function emu_edge_8bpc_rvv, export=1, ext="v,zbb"
    ld t0, 0(sp)
    ld t1, 8(sp)

    // int cx = iclip((int) x, 0, (int) iw - 1);
    max t2, a4, zero
    addi t4, a2, -1
    min t2, t2, t4

    // int cy = iclip((int) y, 0, (int) ih - 1);
    max t3, a5, zero
    addi t5, a3, -1
    min t3, t3, t5

    // ref += cy*PXSTRIDE(ref_stride) + cx
    mul t3, t3, t1
    add t3, t3, t2

    add t0, t0, t3

    addi t4, a0, -1

    neg t2, a4
    add t3, a4, a0
    sub t3, t3, a2

    // int left_ext = iclip((int) -x, 0, (int) bw - 1);
    max t2, t2, zero
    min a2, t2, t4 # a2 = left_ext

    // int right_ext = iclip((int) (x + bw - iw), 0, (int) bw - 1);
    max t3, t3, zero
    min a4, t3, t4 # a4 = right_ext

    addi t6, a1, -1

    neg t4, a5
    add t5, a5, a1
    sub t5, t5, a3

    // int top_ext = iclip((int) -y, 0, (int) bh - 1);
    max t4, t4, zero
    min a3, t4, t6 # a3 = top_ext

    // int bottom_ext = iclip((int) (y + bh - ih), 0, (int) bh - 1);
    max t5, t5, zero
    min a5, t5, t6 # a5 = bottom_ext

    sub t4, a1, a3
    sub t4, t4, a5 # t4 = center_h = bh - top_ext - bottom_ext

    mul t5, a3, a7
    add a1, a6, t5 # blk = dst + top_ext * dst_stride

    sub t3, a0, a2
    sub t3, t3, a4 # t3 = center_w = bw - left_ext - right_ext

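// v_loop: emit the center_h middle rows. Each row copies center_w pixels
// from ref, then (depending on the need_left/need_right macro flags)
// splats the first/last valid pixel across the left_ext/right_ext margins.
// It is expanded four times below so the left/right checks are hoisted out
// of the per-row loop.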
.macro v_loop need_left, need_right
9:
    # pixel_copy()
    add t5, a1, a2 # t5 = blk + left_ext
    mv t2, t0 # ref
0:
    vsetvli t6, t3, e8, m1, ta, ma
    vle8.v v8, (t2)
    add t2, t2, t6

    vse8.v v8, (t5)
    sub t3, t3, t6
    add t5, t5, t6
    bnez t3, 0b

    sub t3, a0, a2
    sub t3, t3, a4 # t3 = center_w = bw - left_ext - right_ext

.if \need_left
    lb t2, (t0) # ref[0]
    # pixel_set()
    vsetvli t6, a2, e8, m1, ta, ma
    vmv.v.x v8, t2
    mv t2, a2 # left_ext
    mv t5, a1 # blk
0:
    vse8.v v8, (t5)
    sub t2, t2, t6 # left_ext -= t6
    add t5, t5, t6 # blk += t6
    vsetvli t6, t2, e8, m1, ta, ma
    bnez t2, 0b
.endif

.if \need_right
    add t5, a1, a2 # t5 = blk + left_ext
    add t5, t5, t3 # t5 = blk + left_ext + center_w
    lb t2, -1(t5) # blk[left_ext + center_w - 1]
    # pixel_set()
    vsetvli t6, a4, e8, m1, ta, ma
    vmv.v.x v8, t2
    mv t2, a4 # right_ext
0:
    vse8.v v8, (t5)
    sub t2, t2, t6
    add t5, t5, t6
    vsetvli t6, t2, e8, m1, ta, ma
    bnez t2, 0b
.endif

    add t0, t0, t1 # ref += ref_stride
    add a1, a1, a7 # blk += dst_stride
    addi t4, t4, -1 # center_h--
    bnez t4, 9b
.endm

L(emu_edge_center):
    blez t4, L(emu_edge_bottom)

    beqz a2, 1f # if (left_ext)
    beqz a4, 2f # if (right_ext)
    v_loop 1, 1
    j L(emu_edge_bottom)

1:
    beqz a4, 3f
    v_loop 0, 1
    j L(emu_edge_bottom)

2:
    v_loop 1, 0
    j L(emu_edge_bottom)

3:
    v_loop 0, 0

L(emu_edge_bottom): # copy bottom
    blez a5, L(emu_edge_top)
    mv t2, a0 # bw
2:
    mv t5, a5 # bottom_ext
    mv t1, a1 # dst

    vsetvli t6, t2, e8, m1, ta, ma
    sub t0, t1, a7 # dst - dst_stride
    vle8.v v8, (t0)
0:
    vse8.v v8, (t1)
    add t1, t1, a7
    addi t5, t5, -1
    bnez t5, 0b

    sub t2, t2, t6
    add a1, a1, t6
    bnez t2, 2b

L(emu_edge_top): # copy top
    blez a3, L(emu_edge_end)
    mul t5, a3, a7
    add t1, a6, t5 # blk = dst + top_ext * PXSTRIDE(dst_stride)
    # a6 = dst
1:
    mv t0, a3 # top_ext
    mv t4, a6 # dst

    vsetvli t6, a0, e8, m1, ta, ma
    vle8.v v8, (t1)
0:
    vse8.v v8, (t4)
    add t4, t4, a7
    vse8.v v8, (t4)
    add t4, t4, a7
    addi t0, t0, -2
    bgtz t0, 0b

    sub a0, a0, t6
    add t1, t1, t6
    add a6, a6, t6

    bnez a0, 1b

L(emu_edge_end):
    ret
endfunc

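// w_mask: compound prediction that also derives the blend mask from the
// two intermediates. Assumed prototype: w_mask(pixel *dst,
// ptrdiff_t stride, const int16_t *tmp1, const int16_t *tmp2, int w,
// int h, uint8_t *mask, int sign), i.e. a6=mask, a7=sign. Per pixel,
//   m = min(38 + ((abs(tmp1 - tmp2) + 8) >> 8), 64)
//   dst = (tmp1*m + tmp2*(64 - m) + 512) >> 10
// and the mask is stored at full resolution (444), or downsampled
// horizontally (422) or in 2x2 blocks (420). The 64-bit immediates below
// are byte tables of e16 (t2) and e8 (t3) vtype values indexed by
// ctz(w)*8, again picking an LMUL that fits a whole row.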
.macro w_mask_fn type vlen
function w_mask_\type\()_\vlen\()8bpc_rvv, export=1, ext="v,zba,zbb"
    csrw vxrm, zero
    li t1, 38*256+8
.ifc \vlen, vl256_
    addi t0, zero, 64
    bgt a4, t0, 2f
    li t2, 0xCAC9C8CFCE0000
    li t3, 0xC1C0C7C6C50000
.else
    addi t0, zero, 32
    bgt a4, t0, 2f
    li t2, 0xCAC9C8CF0000
    li t3, 0xC1C0C7C60000
.endif
    ctz t4, a4
    slli t4, t4, 3
    srl t2, t2, t4
    andi t2, t2, 0xFF
    srl t3, t3, t4
    andi t3, t3, 0xFF

1:
.if \type == 444
    w_mask_body 444 narrow

    sh1add a0, a1, a0 # dst += dst_stride
    add a6, a6, a4 # mask += w
.elseif \type == 422
    w_mask_body 422 narrow

    sh1add a0, a1, a0 # dst += dst_stride
    srli t4, a4, 1
    add a6, a6, t4 # mask += w >> 1
.elseif \type == 420
    w_mask_body 420 narrow

    sh1add a0, a1, a0 # dst += dst_stride
.endif

    sh1add a2, a4, a2
    sh1add a3, a4, a3

    addi a5, a5, -2
    bnez a5, 1b

    ret

2:
    li t2, 0xca
    li t3, 0xc1

3:
    mv t5, zero

.if \type == 444
    w_mask_body 444 wide # strip-mined across the row
.elseif \type == 422
    w_mask_body 422 wide # strip-mined across the row
.elseif \type == 420
    w_mask_body 420 wide # strip-mined across the row
.endif

    add t5, t5, t6
    bne t5, a4, 4b

    sh1add a0, a1, a0 # dst += dst_stride
.if \type == 444
    add a6, a6, a4 # mask += w
.elseif \type == 422
    srli t4, a4, 1
    add a6, a6, t4 # mask += w >> 1
.elseif \type == 420
.endif

    sh1add a2, a4, a2
    sh1add a3, a4, a3

    addi a5, a5, -2
    bnez a5, 3b

    ret

endfunc
.endm

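// w_mask_body: one pass over a pair of rows. \size selects how much of the
// row one pass covers (narrow: vsetvl fits the whole row; wide: the
// caller's "4:" loop strip-mines until t5 == w). On entry t2/t3 hold the
// e16/e8 vtype values, t1 = 38*256 + 8 folds the mask rounding into one
// add, and a7 = sign. v0/v16 hold tmp1 and v4/v20 tmp2 for the two rows;
// v8/v24 end up holding the masks m.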
.macro w_mask_body type size
    mv t0, a0 # dst

4:
    vsetvl t6, a4, t2

    # load tmp1 and tmp2
    vle16.v v0, (a2) # tmp1[x], row 0

    sh1add t4, a4, a2 # tmp1 + w (row 1)
    vle16.v v16, (t4) # tmp1[x], row 1
    sh1add a2, t6, a2 # tmp1 += vl

    vle16.v v4, (a3) # tmp2[x], row 0

    sh1add t4, a4, a3 # tmp2 + w (row 1)
    vle16.v v20, (t4) # tmp2[x], row 1
    sh1add a3, t6, a3 # tmp2 += vl

    # v8, v24 = abs(tmp1[x] - tmp2[x])
    vsub.vv v12, v0, v4 # tmp1[x] - tmp2[x]
    vsub.vv v8, v4, v0 # tmp2[x] - tmp1[x]
    vmax.vv v8, v12, v8

    vsub.vv v28, v16, v20 # tmp1[x] - tmp2[x]
    vsub.vv v24, v20, v16 # tmp2[x] - tmp1[x]
    vmax.vv v24, v28, v24

    li t4, 64

    # m = min(38 + ((abs_diff + 8) >> 8), 64)
    #   = min((abs_diff + 38*256 + 8) >> 8, 64)
    vadd.vx v8, v8, t1
    vsra.vi v8, v8, 8
    vmin.vx v8, v8, t4

    vadd.vx v24, v24, t1
    vsra.vi v24, v24, 8
    vmin.vx v24, v24, t4

    # dst[x] = (tmp1[x] - tmp2[x]) * m + 64 * tmp2[x];
    # v12, v28 = tmp1[x] - tmp2[x]
    # v8, v24 = m (rows 0 and 1)
    vwmul.vx v0, v4, t4
    vwmacc.vv v0, v8, v12
    vnclipu.wi v0, v0, 10
    vmax.vx v0, v0, zero

    vwmul.vx v16, v20, t4
    vwmacc.vv v16, v24, v28
    vnclipu.wi v16, v16, 10
    vmax.vx v16, v16, zero

.if \type == 444
    vsetvl zero, zero, t3

    vnclipu.wi v0, v0, 0
    vnclipu.wi v16, v16, 0

    vse8.v v0, (t0) # dst[x] =
    add t4, t0, a1
    vse8.v v16, (t4) # dst[x] =
    add t0, t0, t6

    vnsrl.wi v8, v8, 0
    vnsrl.wi v24, v24, 0

    vse8.v v8, (a6) # mask[x] = m
    add t4, a6, a4
    vse8.v v24, (t4) # mask[x] = m
    add a6, a6, t6

.elseif \type == 422
    # v4, v20 = m (even lanes)
    # v8, v24 = n (odd lanes)
    vnsrl.wi v4, v8, 0
    vnsrl.wi v8, v8, 16

    vnsrl.wi v20, v24, 0
    vnsrl.wi v24, v24, 16

    # v8, v24 = m + n - sign
    vadd.vv v8, v4, v8
    vsub.vx v8, v8, a7

    vadd.vv v24, v20, v24
    vsub.vx v24, v24, a7

    vsetvl zero, zero, t3

    vnclipu.wi v0, v0, 0
    vnclipu.wi v16, v16, 0

    vse8.v v0, (t0) # dst[x] =
    add t4, t0, a1
    vse8.v v16, (t4) # dst[x] =
    add t0, t0, t6

    vnclipu.wi v8, v8, 1
    vnclipu.wi v24, v24, 1

.ifc \size, wide
    srli t4, t6, 1
    vsetvl zero, t4, t3
.endif

    vse8.v v8, (a6) # mask[x] = (m + n + 1 - sign) >> 1
    srli t4, a4, 1
    add t4, a6, t4
    vse8.v v24, (t4) # mask[x] = (m + n + 1 - sign) >> 1
    srli t4, t6, 1
    add a6, a6, t4
.elseif \type == 420
    # v4, v20 = m (even lanes)
    # v8, v24 = n (odd lanes)
    vnsrl.wi v4, v8, 0
    vnsrl.wi v8, v8, 16

    vnsrl.wi v20, v24, 0
    vnsrl.wi v24, v24, 16

    # v8 = m + n summed over both rows, - sign
    vadd.vv v8, v4, v8
    vadd.vv v24, v20, v24
    vadd.vv v8, v8, v24
    vsub.vx v8, v8, a7

    vsetvl zero, zero, t3

    vnclipu.wi v0, v0, 0
    vnclipu.wi v16, v16, 0

    vse8.v v0, (t0) # dst[x] =
    add t4, t0, a1
    vse8.v v16, (t4) # dst[x] =
    add t0, t0, t6

    vnclipu.wi v8, v8, 2

    vse8.v v8, (a6) # mask[x] = (m + n + mask[x >> 1] + 2 - sign) >> 2;
    srli t4, t6, 1
    add a6, a6, t4
.endif
.endm

w_mask_fn 444
w_mask_fn 444 vl256_
w_mask_fn 422
w_mask_fn 422 vl256_
w_mask_fn 420
w_mask_fn 420 vl256_